In [None]:
import pandas as pd
import re
from dateutil import parser
from datetime import datetime, timedelta

In [None]:
# Read the raw events table; the handle is only needed while pandas parses it.
with open('../data/events.csv') as events_file:
    data = pd.read_csv(events_file)

# Free-text time columns that the cells below normalise.
starttime = data['starttime']
endtime = data['endtime']

In [None]:
def sub_pattern(time_list, compiled_pats, is_endtime=False):
    """Reduce each raw time string to the part captured by the first matching pattern.

    Parameters
    ----------
    time_list : iterable
        Raw time values; entries may be strings, None/NaN or ''.
    compiled_pats : sequence of compiled regular expressions
        Each pattern must define capture group 1. A pattern is tried with
        ``match`` (anchored at the start of the stripped string) and, when it
        matches, applied with ``sub(r'\\1', ...)``.
    is_endtime : bool, optional
        Currently unused; kept so existing callers that pass it keep working.

    Returns
    -------
    list
        One entry per input value, in order:
        * ``None`` for null or empty-string input;
        * the substituted string of the first matching pattern, provided at
          least one pattern produced a non-empty result;
        * otherwise ``None`` when exactly one pattern was supplied, or the
          literal ``'allday'`` when several were supplied.
    """
    collector = []

    for st in time_list:
        # Null / empty input carries no time information at all.
        if pd.isnull(st) or st == '':
            collector.append(None)
            continue

        # Some values carry leading whitespace, which would defeat the
        # '^'-anchored patterns.
        st = st.strip()

        # Use the patterns that were passed in, not a module-level global.
        check = [
            pat.sub(r'\1', st) if pat.match(st) else None
            for pat in compiled_pats
        ]

        if any(check):
            # Keep the result of the first pattern that matched.
            collector.append(next(item for item in check if item is not None))
        elif len(compiled_pats) == 1:
            # Single-pattern mode (e.g. looking for an end time embedded in a
            # start time): no match simply means "nothing found".
            collector.append(None)
        else:
            # Nothing time-like matched any pattern: treat as an all-day entry,
            # e.g. '(All day)', 'All day', 'May 21, 2013'.
            collector.append('allday')

    return collector

# TODO
def cleansing(ts_te_tuple):
    """Turn a pair of time strings into ``datetime.time`` objects.

    If any member of the tuple is falsy (None or ''), the tuple is returned
    unchanged. Otherwise every member has spaces and dots removed and is
    lower-cased; the marker ``'allday'`` is passed through as-is and anything
    else is parsed with ``dateutil.parser.parse`` and reduced to its time part.
    """
    # An incomplete pair cannot be converted: hand it back untouched.
    for member in ts_te_tuple:
        if not member:
            return ts_te_tuple

    # First pass: normalise the text ('9:00 a.m.' -> '9:00am').
    normalised = [
        raw.replace(' ', '').replace('.', '').lower()
        for raw in ts_te_tuple
    ]

    # Second pass: convert everything except the all-day marker.
    converted = []
    for text in normalised:
        if text == 'allday':
            converted.append(text)
        else:
            converted.append(parser.parse(text).time())
    return tuple(converted)

In [None]:
# Patterns that pull the time-of-day out of a raw starttime/endtime string.
# Each one defines a single capture group, which is what sub_pattern keeps.
patterns = [
    # 'Tuesday, May 21, 2013 - 9:00 am' -> everything after the ' - '
    r'^[a-zA-Z]+, [a-zA-Z]+ [0-9]{1,2}, [0-9]{4} - (.*)',
    # '9:00 am ...' -> leading HH:MM with an optional am/pm marker
    r'^([0-9]{1,2}:[0-9]{1,2}\s?[apmAPM\.]{0,4}).*',
    # '9 am ...' -> leading hour followed by an am/pm marker
    r'^([0-9]{1,2}\s?[apmAPM\.]{1,4}).*',
]

# compile to pass as argument
pats = [re.compile(pat) for pat in patterns]

starttime_subbed = sub_pattern(starttime, compiled_pats=pats)
endtime_subbed = sub_pattern(endtime, compiled_pats=pats, is_endtime=True)

# Pattern for an end time embedded in the starttime column,
# e.g. '9:00 am - 10:00 am' -> '10:00 am'.
patterns = [
    r'^[0-9]{1,2}:[0-9]{1,2}\s?[apmAPM\.]{0,4}\s?-\s?([0-9]{1,2}:[0-9]{1,2}\s?[apmAPM\.]{0,4})'
]

# compile to pass as argument
pats = [re.compile(pat) for pat in patterns]

endtime_in_starttime_subbed = sub_pattern(starttime, compiled_pats=pats)

# Merge the three extractions into one (start, end) pair per row.
# ts = extracted start, te = extracted end, k = end time found inside starttime.
time_showed = []
for ts, te, k in zip(starttime_subbed, endtime_subbed, endtime_in_starttime_subbed):
    if pd.isnull(te) and not pd.isnull(k):
        # No explicit end time, but starttime carried one: use it.
        time_showed.append((ts, k))
    elif not pd.isnull(ts) and ts != 'allday' and pd.isnull(te):
        # Start time known, end time missing: assume a one-hour event.
        # Compare with != (value equality); `is not` on a string literal tests
        # object identity and only works by accident of interning.
        te = (parser.parse(ts) + timedelta(hours=1)).strftime("%I:%M %p")
        time_showed.append((ts, te))
    elif ts == 'allday' and pd.isnull(te):
        # All-day start with no end: the end is all-day too.
        time_showed.append((ts, 'allday'))
    elif pd.isnull(ts) and pd.isnull(te):
        # Nothing known for this row.
        time_showed.append((None, None))
    else:
        # Both values already usable.
        time_showed.append((ts, te))
    
# TODO
# if endtime is NaN but starttime is a time, endtime = starttime + 1 hour
# if endtime is NaN but starttime is 'All day', endtime = 'All day'
# if endtime is NaN and starttime is NaN, endtime = None and starttime = None
# if endtime is NaN but starttime is 'XX:XX - YY:YY', endtime = YY:YY

In [None]:
# Spot-check a slice of the merged (start, end) pairs. Only the last expression
# of a cell is displayed, so the previously discarded zip(...) list was dropped.
time_showed[10:20]

In [None]:
# Run every (start, end) pair through cleansing() to get the final values.
final = list(map(cleansing, time_showed))

In [None]:
# Split the cleaned pairs into two parallel lists (start values, end values).
final_starttime = [pair[0] for pair in final]
final_endtime = [pair[1] for pair in final]
final_endtime

In [None]:
# Attach the cleaned start/end values as new columns and write the result out.
derived_columns = {'starttime_dt': final_starttime, 'endtime_dt': final_endtime}
new_df = data.assign(**derived_columns)
new_df.to_csv('../data/events_dt.csv', index=False)