In [1]:
import re
import pandas as pd
import numpy as np

from word2number import w2n


In [2]:
ufo_us = pd.read_csv("Data/ufo_data_us_cleaned_v1.csv")
ufo_non_us = pd.read_csv("Data/ufo_data_non_us_cleaned_v1.csv")

First, let's try and set a 'standard' capture to filter out 'easily' converted notations

In [3]:
standard = r"[0-9]+\+? *(?:m(?:in)?(?:ute)?s?\.?|s(?:ec)?(?:ond)?s?\.?|h(?:ou)?rs?\.?)"

us_weird_dur = ufo_us[~ufo_us["Duration"].str.contains(standard, na=True, flags=re.IGNORECASE)]["Duration"]

There are a few things we can do to normalize entries, such as remove "approx." some other things:

- remove 'about'
- replace a -> 1
- replace 'all night'/'all day' -> 12 hours
- word2num (one->1 etc)


In [4]:
ufo_us["Duration"].isna().sum()

3015

In [5]:
# create a function to do these things
def fix_entries(x):
    x = str(x).lower()
    if re.search(r"[a-z]+ +(?:m(?:in)?(?:ute)?s?\.?|s(?:ec)?(?:ond)?s?\.?|h(?:ou)?rs?\.?)", x, flags=re.IGNORECASE):
        split = x.split(" ")
        try:
            y = str(w2n.word_to_num(split[0])) + " " + " ".join(split[1:])
            print("{} -> {}".format(x, y))
            x = y
        except ValueError:
            pass
    if re.search(r"than", x, flags=re.IGNORECASE):
        y = re.sub(r"than", "", x, flags=re.IGNORECASE)
        print("{} -> {}".format(x, y))
        x = y
    if re.search("a? ?few", x, flags=re.IGNORECASE):
        y = re.sub("a? ?few", "3", x, flags=re.IGNORECASE)
        print("{} -> {}".format(x, y))
        x = y
    if re.search("a? ?couple", x, flags=re.IGNORECASE):
        y = re.sub("a? ?couple", "2", x, flags=re.IGNORECASE)
        print("{} -> {}".format(x, y))
        x = y    
    if re.search("about", x, flags=re.IGNORECASE):
        y = re.sub("about", "", x, flags=re.IGNORECASE)
        print("{} -> {}".format(x, y))
        x = y
    if re.search(r"approx\.?", x, flags=re.IGNORECASE):
        y = re.sub(r"approx\.?", "", x, flags=re.IGNORECASE)
        print("{} -> {}".format(x, y))
        x = y
    if re.search(r"one", x, flags=re.IGNORECASE):
        y = re.sub(r"one", "1", x, flags=re.IGNORECASE)
        print("{} -> {}".format(x, y))
        x = y
    if re.search(r"all +(?:day|night)", x, flags=re.IGNORECASE):
        y = re.sub(r"all +(?:day|night)", "12 hours", x, flags=re.IGNORECASE)
        print("{} -> {}".format(x, y))
        x = y
    if re.search(r"an? ", x, flags=re.IGNORECASE):
        y = re.sub(r"an? ", "1 ", x, flags=re.IGNORECASE)
        print("{} -> {}".format(x, y))
        x = y
    
    return x
    


In [6]:
ufo_us["Duration"] = ufo_us["Duration"].apply(fix_entries)
ufo_us["Duration"].replace("nan", np.nan, inplace=True)

econds
less than a second -> less  a second
less  a second -> less  1 second
all night -> 12 hours
all night -> 12 hours
90 plus minutes -> 90 plus minutes
twenty five seconds -> 20 five seconds
a few minutes -> 3 minutes
few seconds -> 3 seconds
a few seconds, multiple o -> 3 seconds, multiple o
few seconds -> 3 seconds
all night -> 12 hours
about 3 minutes ->  3 minutes
an hour -> 1 hour
a few seconds -> 3 seconds
a few seconds -> 3 seconds
one-half second -> 1 second
ten seconds -> 10 seconds
three hours -> 3 hours
one minute -> 1 minute
few seconds -> 3 seconds
a few seconds -> 3 seconds
few seconds -> 3 seconds
few seconds -> 3 seconds
less than 5 minutes -> less  5 minutes
a couple of seconds -> 2 of seconds
about 5 minutes ->  5 minutes
twenty minutes -> 20 minutes
five minutes -> 5 minutes
a few minutes -> 3 minutes
a couple of seconds -> 2 of seconds
a couple of seconds -> 2 of seconds
30 minutes or so -> 30 minutes or so
approx.3 hrs. -> 3 hrs.
one minute -> 1 minute
few seco

In [7]:
standard = r"[0-9]+\+? *(?:m(?:in)?(?:ute)?s?\.?|s(?:ec)?(?:ond)?s?\.?|h(?:ou)?rs?\.?)"

us_weird_dur_fix = ufo_us[~ufo_us["Duration"].str.contains(standard, na=True, flags=re.IGNORECASE)]["Duration"]

In [8]:
us_fix_useable = ufo_us[ufo_us["Duration"].str.contains(standard, na=False, flags=re.IGNORECASE)]["Duration"]

Ok, now that we have a general way to get most cases, let's apply it to replace everything in the dataframe

In [9]:
def to_seconds(x):
    mults = {
        r"mo(?:nths?)?": int(2.628e+6),
        r"m(?:in)?(?:ute)?s?": 60,
        r"h(?:ou)?rs?": 60*60,
        r"d(?:ays?)?": 60*60*24,
        r"y(?:ea)?rs?": int(3.154e+7),
        r"s(?:ec)?(?:ond)?s?": 1,
    }
    x = str(x).lower()
    match = re.search(r"([0-9]+-)?([0-9]+)\+? *(m(?:in)?(?:ute)?s?|h(?:ou)?rs?|d(?:ays?)?|mo(?:nths?)?|y(?:ea)?rs?|s(?:ec)?(?:ond)?s?)\.?", x, flags=re.IGNORECASE)
    if match:
        if match.groups()[0] is not None:
            number = (float(match.groups()[0][:-1]) + float(match.groups()[1])) / 2
        else:
            number = float(match.groups()[1])
        mult = [m for r, m in mults.items() if re.search(r, match.groups()[2], flags=re.IGNORECASE)][0]


        return number * mult
    else:
        return np.nan

        
        


In [10]:
ufo_us["Duration"] = ufo_us["Duration"].apply(to_seconds)

In [11]:
ufo_us.to_csv("ufo_data_us_cleaned_v2.csv", index=False)

In [12]:
ufo_non_us["Duration"] = ufo_non_us["Duration"].apply(fix_entries)
ufo_non_us["Duration"].replace("nan", np.nan, inplace=True)
ufo_non_us["Duration"] = ufo_non_us["Duration"].apply(to_seconds)
ufo_non_us.to_csv("ufo_data_non_us_cleaned_v2.csv", index=False)

less than 1 second -> less  1 second
20 min approx -> 20 min 
approx 2 minutes ->  2 minutes
five minutes -> 5 minutes
flyby approx 20 sec. -> flyby  20 sec.
about 15 minutes each tim ->  15 minutes each tim
none -> n1
about 10 minutes ->  10 minutes
ten minutes -> 10 minutes
half an hour -> half 1 hour
less than  2 min -> less   2 min
few seconds -> 3 seconds
few seconds -> 3 seconds
none -> n1
less than 2 hours -> less  2 hours
more than ten minutes -> more  ten minutes
few seconds -> 3 seconds
about 10 min ->  10 min
less than a minute -> less  a minute
less  a minute -> less  1 minute
about 3 or 4 hours. ->  3 or 4 hours.
at least one hour -> at least 1 hour
approx 2hrs ->  2hrs
all night -> 12 hours
few minutes -> 3 minutes
approx 1 hour ->  1 hour
about 15 sec ->  15 sec
half an hour -> half 1 hour
a few minutes, maybe 10 m -> 3 minutes, maybe 10 m
approx 1 minute ->  1 minute
two minutes -> 2 minutes
greater than 10 minutes,  -> greater  10 minutes, 
all night -> 12 hours
five m