In [9]:
import pandas as pd
from pathlib import Path
from openpyxl import load_workbook
from colorama import Fore, Back, Style

FILENAME = 'UFO Data.xlsx'
CWD = Path.cwd()

file_path = Path(CWD.parent, 'data', FILENAME)
df = pd.read_excel(file_path)
df


Unnamed: 0,Date / Time,City,State,Shape,Duration,Summary,Posted
0,12/23/20 07:30,Canadian Lakes,MI,Light,5 seconds,"light in sky ,no sound",12/23/20
1,12/23/20 03:18,Grants,CA,Light,2 Seconds,Bright Orb seen below plane level at Mach 1+ S...,12/23/20
2,12/23/20 03:00,East Quogue,NY,Other,30 mins,"flickering colored lights in the night sky, in...",12/23/20
3,12/22/20 22:45,Indian River Shores,FL,Changing,4 minutes,Orange orb over Indian River Shores Florida,12/23/20
4,12/22/20 22:05,lake elsinore,CA,Light,30 seconds,saw a glowing object.shape not discerned. trav...,12/23/20
...,...,...,...,...,...,...,...
256,12/1/20 14:45,Weidman,MI,Light,4-5 minutes,Extremely bright mirror like light that left v...,12/23/20
257,12/1/20 12:00,Millstone,NJ,Circle,5 mins,It flew across highway dropped to the ground a...,12/23/20
258,12/1/20 09:00,Barnegat Township,NJ,Unknown,30 min,Witnessed object being chased by fighter jets ...,12/23/20
259,12/1/20 06:00,Hyde Park,VT,Diamond,30 sec,I saw a very bright light in the sky and then ...,12/23/20


In [2]:
df.columns

Index(['Date / Time', 'City', 'State', 'Shape', 'Duration', 'Summary',
       'Posted'],
      dtype='object')

In [6]:
def get_sheetnames_xlsx(file_path):
    wb = load_workbook(file_path, read_only=True, keep_links=False)
    return wb.sheetnames


sheet_names = get_sheetnames_xlsx(file_path)
sheet_names

['12th',
 '11th',
 '10th',
 '9th',
 '8th',
 '7th',
 '6th',
 '5th',
 '4th',
 '3rd',
 '2nd',
 '1st']

In [5]:
def load_frame(file_path, sheet_name):
    raw_data = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
    # looking for the header row
    for index, row in raw_data.iterrows():
        
        if row.notnull().all():
            data = raw_data.iloc[(index + 1):].reset_index(drop=True)
            data.columns = list(raw_data.iloc[index])
            break
    # transforming columns to numeric where possible
    for c in data.columns:
        data[c] = pd.to_numeric(data[c], errors='ignore')

    return data

data = load_frame(file_path, sheet_names[10])
data.head()


Unnamed: 0,Date / Time,City,State,Shape,Duration,Summary,Posted
0,2/29/20 23:38,Livonia,MI,Other,30 seconds,Object was moving very fast and low in the nig...,2020-09-04 00:00:00
1,2/29/20 21:30,New Haven,MI,Fireball,30 seconds,Fiery porthole.,2020-09-04 00:00:00
2,2/29/20 21:00,Solon,ME,Circle,1 hour,"Red Pulsating light left, white on right, much...",2020-09-04 00:00:00
3,2/29/20 20:54,Clear Spring,MD,Light,30 minutes,You can see in the video. I was trying my best...,2020-09-04 00:00:00
4,2/29/20 20:40,Spring Branch,TX,Circle,15 minutes,It was just above tree line and was moving fir...,2020-09-04 00:00:00


In [11]:
def load_wb(file_path):
    names = get_sheetnames_xlsx(file_path)
    frames = []
    for n in names:
        print(f'> {Fore.CYAN}loading sheet{Style.RESET_ALL} "{n}"')
        df = load_frame(file_path, n)
        print(f'found {len(df)} records')
        frames.append(df)
    return pd.concat(frames)

df = load_wb(file_path)
df.head()

> [36mloading sheet[0m "12th"
found 261 records
> [36mloading sheet[0m "11th"
found 468 records
> [36mloading sheet[0m "10th"
found 508 records
> [36mloading sheet[0m "9th"
found 440 records
> [36mloading sheet[0m "8th"
found 700 records
> [36mloading sheet[0m "7th"
found 639 records
> [36mloading sheet[0m "6th"
found 367 records
> [36mloading sheet[0m "5th"
found 563 records
> [36mloading sheet[0m "4th"
found 1040 records
> [36mloading sheet[0m "3rd"
found 816 records
> [36mloading sheet[0m "2nd"
found 607 records
> [36mloading sheet[0m "1st"
found 611 records


Unnamed: 0,Date / Time,City,State,Shape,Duration,Summary,Posted
0,12/23/20 07:30,Canadian Lakes,MI,Light,5 seconds,"light in sky ,no sound",12/23/20
1,12/23/20 03:18,Grants,CA,Light,2 Seconds,Bright Orb seen below plane level at Mach 1+ S...,12/23/20
2,12/23/20 03:00,East Quogue,NY,Other,30 mins,"flickering colored lights in the night sky, in...",12/23/20
3,12/22/20 22:45,Indian River Shores,FL,Changing,4 minutes,Orange orb over Indian River Shores Florida,12/23/20
4,12/22/20 22:05,lake elsinore,CA,Light,30 seconds,saw a glowing object.shape not discerned. trav...,12/23/20


In [12]:
frames = [load_frame(file_path, n) for n in sheet_names]
df = pd.concat(frames)
df.head()

Unnamed: 0,Date / Time,City,State,Shape,Duration,Summary,Posted
0,12/23/20 07:30,Canadian Lakes,MI,Light,5 seconds,"light in sky ,no sound",12/23/20
1,12/23/20 03:18,Grants,CA,Light,2 Seconds,Bright Orb seen below plane level at Mach 1+ S...,12/23/20
2,12/23/20 03:00,East Quogue,NY,Other,30 mins,"flickering colored lights in the night sky, in...",12/23/20
3,12/22/20 22:45,Indian River Shores,FL,Changing,4 minutes,Orange orb over Indian River Shores Florida,12/23/20
4,12/22/20 22:05,lake elsinore,CA,Light,30 seconds,saw a glowing object.shape not discerned. trav...,12/23/20


In [13]:
df.sample(20)

Unnamed: 0,Date / Time,City,State,Shape,Duration,Summary,Posted
184,1/18/20 19:05,Alamo,TX,Light,7 minutes,My uncle had seen it moving across the sky...,2020-07-02 00:00:00
116,12/7/20 17:56,North Java,,Light,6 to 10 min.,Took dog out. Large bright white light directl...,12/23/20
177,2/22/20,Hixson,TN,Circle,30-45 minutes,While arriving home we saw a large bright ligh...,2/25/20
434,1/7/20 05:57,Abilene,TX,Cigar,10 minutes,It was up and down about an inch long that slo...,1/31/20
33,3/30/20 21:00,Hillandale,MD,Circle,30 minutes,((Starlink satellites)),6/25/20
16,10/31/20 09:30,Dade City,FL,Oval,6h 30 mins,Ufo above Northern Dade City Florida,12/23/20
259,8/19/20 00:10,N. Richland Hills,TX,Disk,1,Craft had mainly white light with red and gree...,8/20/20
464,7/10/20,Morristown,TN,Light,2 months,"For the past two months, at night, I've been w...",7/23/20
153,6/17/20 01:38,Fairview,NJ,Triangle,10 minutes,A bunch of unexplained beams of lights coming ...,6/25/20
526,4/17/20 21:26,Zagreb (Croatia),,Circle,10 minutes,"We saw 30 circle moving on sky, it was lasting...",6/25/20


In [14]:
len(df)

7020

In [15]:
len(df['Duration'].unique())

899

In [30]:
import os
import sys

sys.path.insert(0, os.path.abspath('../lib'))

import dehumanize

 False
> [36mREGULAR CASE:[0m ['ninety', 'five', 'hrs']
> [36mninety five hrs[0m 	=> 34200.0 Seconds
> [32mOriginal String:[0m ninety six hrs
> [32mHas number:[0m False
> [36mREGULAR CASE:[0m ['ninety', 'six', 'hrs']
> [36mninety six hrs[0m 	=> 34560.0 Seconds
> [32mOriginal String:[0m ninety seven hrs
> [32mHas number:[0m False
> [36mREGULAR CASE:[0m ['ninety', 'seven', 'hrs']
> [36mninety seven hrs[0m 	=> 34920.0 Seconds
> [32mOriginal String:[0m ninety eight hrs
> [32mHas number:[0m False
> [36mREGULAR CASE:[0m ['ninety', 'eight', 'hrs']
> [36mninety eight hrs[0m 	=> 35280.0 Seconds
> [32mOriginal String:[0m ninety nine hrs
> [32mHas number:[0m False
> [36mREGULAR CASE:[0m ['ninety', 'nine', 'hrs']
> [36mninety nine hrs[0m 	=> 35640.0 Seconds
> [32mOriginal String:[0m 91h
> [32mHas number:[0m True
> [36mREGULAR CASE:[0m ['91h']
> [36m91h[0m 	=> 32760.0 Seconds
> [32mOriginal String:[0m 91 h
> [32mHas number:[0m True
> [36mREGULAR CASE

In [23]:
class UnknownTimeFrameException(Exception):
    '''
    Exception raised when no timeframe is found.

    Attributes:
        timeframe -- time frame
        message -- explaination of the erro
    '''

    def __init__(self, timeframe, message='Could not recognize time frame: {}'):
        self.timeframe = timeframe
        self.message = message.format(timeframe)
        super().__init__(self.message)


class CouldNotConvertTimeFrameException(Exception):
    '''
    Exception raised when timeframe is not convertible.

    Attributes:
        value -- user defined input
        timeframe -- time frame
        message -- explaination of the erro
    '''

    def __init__(self, value, timeframe, message='"{}" does note translate to a time frame [{} Seconds]'):
        self.value = value
        self.timeframe = timeframe
        self.message = message.format(value, timeframe)
        super().__init__(self.message)


class MatchingTimeNotFoundException(Exception):
    '''
    Exception raised when timeframe is not convertible.

    Attributes:
        duration -- user defined input
        pattern -- time frame
        message -- explaination of the erro
    '''

    def __init__(self, duration, pattern, message='Failing to match anything in string: "{}" using pattern: "{}"'):
        self.duration = duration
        self.pattern = pattern
        self.message = message.format(duration, pattern)
        super().__init__(self.message)


class WrongTypeException(Exception):
    '''
    Exception raised when duration is of the wrong type.

    Attributes:
        duration -- user defined input
        message -- explaination of the erro
    '''

    def __init__(self, duration, message='Cannot convert "{}": wrong type "{}"'):
        self.duration = duration
        self.message = message.format(duration, type(duration))
        super().__init__(self.message)



In [17]:
sep = '\\s*'
range_separator = ['/', '-', 'to']
range_numbers = '|'.join(
    [f'\\d{{1,3}}{sep}{rs}{sep}\\d{{1,3}}' for rs in range_separator])

regex = f'(:?[~|approx\\.|Approximately|>]*){sep}(\\d{{1,3}}|{range_numbers}|about a)\\+*{sep}(\\w*)'
print(regex)

(:?[~|approx\.|Approximately|>]*)\s*(\d{1,3}|\d{1,3}\s*/\s*\d{1,3}|\d{1,3}\s*-\s*\d{1,3}|\d{1,3}\s*to\s*\d{1,3}|about a)\+*\s*(\w*)


In [24]:
import re
from random import randint
from text2digits import text2digits

UNITS = ['zero', 'one', 'two', 'three', 'four',
         'five', 'six', 'seven', 'eight', 'nine']
TEENS = ['ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen',
         'nineteen']
TENS = ['twenty', 'thirty', 'forty', 'fifty',
        'sixty', 'seventy', 'eighty', 'ninety']


def get_matches(human_readable_duration, regex):
    matches = []
    t2d = text2digits.Text2Digits(convert_ordinals=False)
    # special case
    SPECIAL_CASE = 'second'
    split_str = human_readable_duration.split(' ')
    time_frame = split_str[-1:][0]
    filtered = ' '.join([s for s in split_str if s != SPECIAL_CASE])

    if time_frame == SPECIAL_CASE and filtered in TENS:
        print(f'> {Fore.RED}SPECIAL CASE:{Style.RESET_ALL} {split_str}')
        print(f'> {Fore.RED}SPLITED:{Style.RESET_ALL} {filtered}')
        clean_str = t2d.convert(filtered)
        matches = [('', clean_str, time_frame)]
        print(f'> {Fore.RED}SPECIAL MATCHES:{Style.RESET_ALL} {matches}')
    else:
        print(f'> {Fore.CYAN}REGULAR CASE:{Style.RESET_ALL} {split_str}')
        clean_str = t2d.convert(human_readable_duration)
        matches = re.findall(regex, clean_str)

    return (matches, clean_str)

def extract_time(human_readable_duration, regex):
    duration = -1
    time_frame = False
    matches, clean_str = get_matches(human_readable_duration, regex)
    if len(matches) == 0:
        return (matches, time_frame, MatchingTimeNotFoundException(clean_str, regex))

    if len(matches) == 1:
        group = matches[0]
        duration = 1 if group[1] == 'about a' else int(group[1])
        time_frame = group[2]
    # if we have a range ie: 1-2, 1/2, 1 to 2
    elif len(matches) == 2:
        # averaging between groups
        group_min = matches[1]
        duration_min = int(group_min[1])
        group_max = matches[1]
        duration_max = int(group_max[1])
        duration = (duration_min + duration_max) / 2
        time_frame = group_max[2]

    return (duration, time_frame, None)


def to_seconds(human_readable_duration, regex):
    if not isinstance(human_readable_duration, str):
        return (None, WrongTypeException(human_readable_duration))
    duration, time_frame, err = extract_time(human_readable_duration, regex)
    if time_frame is not False:
        if time_frame.lower() in TF_SECONDS:
            return (duration, None)
        elif time_frame.lower() in TF_MINUTES:
            return (duration * 60, None)
        elif time_frame.lower() in TF_HOURS:
            return (duration * 360, None)
    return (None, UnknownTimeFrameException(time_frame))

In [27]:
def get_apnumbers():
    nums = []
    nums = [*nums, *UNITS, *TEENS]

    for n in TENS:
        for u in UNITS:
            curr = n if u == UNITS[0] else f'{n} {u}'
            nums.append(curr)
    return nums

TF_SECONDS = ['seconds', 'second', 'sec', 'secs', 's']
TF_MINUTES = ['minute', 'minutes', 'min', 'mins', 'm']
TF_HOURS = ['hour', 'hours', 'hr', 'hrs', 'h']
TIME_FRAMES = [*TF_SECONDS, *TF_MINUTES, *TF_HOURS]

durations = []
min_duration = randint(0, 99)
max_duration = min_duration + randint(1, 99)
for tf in TIME_FRAMES:
    durations.append(f'{min_duration}{tf}')
    durations.append(f'{min_duration} {tf}')
    durations.append(f'~{min_duration}{tf}')
    durations.append(f'~{min_duration} {tf}')
    durations.append(f'{min_duration}/{max_duration}{tf}')
    durations.append(f'{min_duration}/{max_duration} {tf}')
    durations.append(f'{min_duration} / {max_duration}{tf}')
    durations.append(f'{min_duration} / {max_duration} {tf}')
    durations.append(f'{min_duration}-{max_duration}{tf}')
    durations.append(f'{min_duration}-{max_duration} {tf}')
    durations.append(f'{min_duration} - {max_duration}{tf}')
    durations.append(f'{min_duration} - {max_duration} {tf}')
    durations.append(f'{min_duration} to {max_duration} {tf}')
    durations.append(f'about a {tf}')
    durations.append(f'approx. {min_duration} {tf}')
    durations.append(f'approx.{min_duration} {tf}')
    durations.append(f'Approx. {min_duration} {tf}')
    durations.append(f'Approx.{min_duration} {tf}')
    durations.append(f'Approximately {min_duration} {tf}')
    durations.append(f'>{min_duration} {tf}')
    durations.append(f'{min_duration}+ {tf}')

    for a in get_apnumbers():
        durations.append(f'{a} {tf}')

print(f'{Back.GREEN}Testing to_second{Style.RESET_ALL}')
count = 0
for d in durations:
    seconds = float(to_seconds(d, regex))
    if seconds == -1:
        raise CouldNotConvertTimeFrameException(d, seconds)
    else:
        print(f'> {Fore.CYAN}{d}{Style.RESET_ALL} \t=> {seconds} Seconds')
    count += 1

print(f'Tested {count} cases against {len(durations)} initial cases.')

[42mTesting to_second[0m
> [36mREGULAR CASE:[0m ['9seconds']


TypeError: float() argument must be a string or a number, not 'tuple'

In [25]:
df['Duration'].apply(lambda x: to_seconds(x, regex))

> [36mREGULAR CASE:[0m ['5', 'seconds']
> [36mREGULAR CASE:[0m ['2', 'Seconds']
> [36mREGULAR CASE:[0m ['30', 'mins']
> [36mREGULAR CASE:[0m ['4', 'minutes']
> [36mREGULAR CASE:[0m ['30', 'seconds']
> [36mREGULAR CASE:[0m ['45', 'seconds']
> [36mREGULAR CASE:[0m ['20', 'seconds']
> [36mREGULAR CASE:[0m ['While', 'taken', 'pictures']
> [36mREGULAR CASE:[0m ['3', 'minutes']
> [36mREGULAR CASE:[0m ['20', 'seconds']
> [36mREGULAR CASE:[0m ['5', 'seconds']
> [36mREGULAR CASE:[0m ['4', 'seconds']
> [36mREGULAR CASE:[0m ['4', 'seconds']
> [36mREGULAR CASE:[0m ['4', 'seconds']
> [36mREGULAR CASE:[0m ['1-3', 'seconds']
> [36mREGULAR CASE:[0m ['1-2', 'mins']
> [36mREGULAR CASE:[0m ['12', 'minutes']
> [36mREGULAR CASE:[0m ['15-20', 'min']
> [36mREGULAR CASE:[0m ['3', 'minutes']
> [36mREGULAR CASE:[0m ['2hrs']
> [36mREGULAR CASE:[0m ['2-3', 'minutes']
> [36mREGULAR CASE:[0m ['40', 'mins']
> [36mREGULAR CASE:[0m ['1', 'minute']
> [36mREGULAR CASE:[0m 

InvalidOperation: [<class 'decimal.ConversionSyntax'>]