In [1]:
import numpy as np
import pandas as pd

In [2]:
# Switch between the "_mv" export and the default export of the cleaned events.
use_mv = True

# Every column is read as text; the first CSV column becomes the row index.
all_events_data = pd.read_csv(
    "../cleaned_data/all_events_data_mv.csv" if use_mv else "../cleaned_data/all_events_data.csv",
    index_col=0,
    dtype=str,
)

In [3]:
# Event ids were loaded as text; convert them to integers.
all_events_data = all_events_data.astype({"eve_index": "int"})

In [4]:
# Parse timestamps with the fixed "%Y-%m-%d %H:%M:%S" layout.
# errors="coerce" turns unparseable strings into NaT instead of raising, so the
# filter below can actually remove them. `infer_datetime_format` is omitted: it
# is ignored when an explicit `format` is given and is deprecated in pandas 2.x.
all_events_data["time"] = pd.to_datetime(
    all_events_data["time"],
    format="%Y-%m-%d %H:%M:%S",
    errors="coerce",
)
# Delete all rows with an invalid (NaT) time and renumber the rows from 0.
all_events_data = all_events_data[all_events_data["time"].notna()].reset_index(drop=True)

In [5]:
# Pair each row with the timestamp of the row that follows it: dropping the
# first timestamp and renumbering from 0 lines row i up with the time of row i+1.
# The final row has no successor, so the left merge leaves its TIME_next missing.
time_next = all_events_data["time"].iloc[1:].reset_index(drop=True).to_frame(name="TIME_next")
time_merged = pd.merge(
    all_events_data,
    time_next,
    how="left",
    left_index=True,
    right_index=True,
)

In [6]:
# Gap between each event and the timestamp of the following row, plus a
# constant flag of 1.0 on every row.
time_token = (
    (time_merged["TIME_next"] - time_merged["time"])
    .to_frame(name="time_gap")
    .assign(flag=np.ones(len(time_merged)))
)

In [7]:
# Attach the gap and flag columns to the event rows by matching row index.
time_token = pd.merge(
    all_events_data,
    time_token,
    how="left",
    left_index=True,
    right_index=True,
)

In [8]:
# Get rid of the last time gap of each patient (it's meaningless): within each
# subject_id group keep every row except the final one.
time_token_clean = time_token.groupby("subject_id").apply(
    lambda patient_rows: patient_rows.iloc[:-1]
)

In [9]:
# Preview the first five rows of the per-patient frame.
time_token_clean.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,subject_id,events,time,eve_index,time_gap,flag
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10000019,0,10000019,V3000,2129-05-21 19:16:00,25177,0 days 00:00:00,1.0
10000019,1,10000019,V053,2129-05-21 19:16:00,24803,0 days 00:00:00,1.0
10000019,2,10000019,V290,2129-05-21 19:16:00,25156,0 days 00:00:00,1.0
10000019,3,10000019,V502,2129-05-21 19:16:00,25368,0 days 01:44:00,1.0
10000019,4,10000019,phytonadione (vitamin k1),2129-05-21 21:00:00,33312,0 days 00:00:00,1.0


In [10]:
# Remove observations with inconsistent time: a gap longer than the threshold
# (15000 days, about 41 years) is treated as a timestamp error.
max_plausible_gap = pd.Timedelta(days=15000)
rows_with_implausible_gap = time_token_clean[time_token_clean["time_gap"] > max_plausible_gap]
# time_token_clean is indexed by (subject_id, original row label) pairs at this
# point; the second element is the row label shared with all_events_data.
index_to_drop = [row_label for _, row_label in rows_with_implausible_gap.index]
# Remove from the original data.
all_events_data = all_events_data.drop(index=index_to_drop)
# Remove from the time-token data. `<=` is the exact complement of the `>` test
# above, so a gap of exactly the threshold is kept in both frames.
time_token_clean = time_token_clean[time_token_clean["time_gap"] <= max_plausible_gap]
# Replace the (subject_id, row label) index with a plain 0..n-1 index.
time_token_clean = time_token_clean.reset_index(drop=True)

In [11]:
# Keep only the whole-day component of each gap. The vectorized `.dt.days`
# accessor yields the same integers as calling `.days` on every element.
time_token_clean["time_gap"] = time_token_clean["time_gap"].dt.days

In [12]:
# Only gaps of at least one whole day produce a time token.
time_token_clean = time_token_clean.loc[lambda frame: frame["time_gap"] > 0]

In [13]:
# Preview the first five rows that still have a positive day gap.
time_token_clean.head(n=5)

Unnamed: 0,subject_id,events,time,eve_index,time_gap,flag
27,10000032,ipratropium bromide neb,2180-05-07 02:00:00,31147,1,1.0
29,10000032,spironolactone,2180-05-08 08:00:00,34340,49,1.0
52,10000032,albumin 25% (12.5g / 50ml),2180-06-27 18:00:00,27504,25,1.0
90,10000032,albuterol inhaler,2180-07-24 21:00:00,27528,12,1.0
137,10000074,phytonadione (vitamin k1) (for l&d to nbn orde...,2110-10-16 11:00:00,33313,1,1.0


In [14]:
# Create bins (right-closed, in days): (0,2], (2,7], (7,15], (15,90], (90,365], (365,inf).
# The last edge is np.inf rather than the observed maximum gap, so pd.cut still
# receives strictly increasing edges when no gap exceeds 365 days.
bin_edges = [0, 2, 7, 15, 90, 365, np.inf]
bin_names = ["timetoken0-2day", "timetoken3-7day", "timetoken8-15day",
             "timetoken16-90day", "timetoken91-365day", "timetoken366+day"]

# Time-token ids continue directly after the largest existing event id.
max_index = all_events_data["eve_index"].max()
bin_ids = [max_index + offset for offset in range(1, len(bin_names) + 1)]

# .assign builds a new frame, which avoids writing into the filtered slice
# produced by the previous step (SettingWithCopyWarning).
time_token_clean = time_token_clean.assign(
    Bin_indx=pd.cut(time_token_clean["time_gap"], bin_edges, labels=bin_ids),
    Bin=pd.cut(time_token_clean["time_gap"], bin_edges, labels=bin_names),
)

In [15]:
# Preview the first five rows with their bin id and bin name.
time_token_clean.head(n=5)

Unnamed: 0,subject_id,events,time,eve_index,time_gap,flag,Bin_indx,Bin
27,10000032,ipratropium bromide neb,2180-05-07 02:00:00,31147,1,1.0,35628,timetoken0-2day
29,10000032,spironolactone,2180-05-08 08:00:00,34340,49,1.0,35631,timetoken16-90day
52,10000032,albumin 25% (12.5g / 50ml),2180-06-27 18:00:00,27504,25,1.0,35631,timetoken16-90day
90,10000032,albuterol inhaler,2180-07-24 21:00:00,27528,12,1.0,35630,timetoken8-15day
137,10000074,phytonadione (vitamin k1) (for l&d to nbn orde...,2110-10-16 11:00:00,33313,1,1.0,35628,timetoken0-2day


In [16]:
# Discard the original event name/id and let the bin name/id take over those
# column names.
time_token_clean = (
    time_token_clean
    .drop(columns=["eve_index", "events"])
    .rename(columns={"Bin_indx": "eve_index", "Bin": "events"})
)

In [17]:
# Preview the first five rows after the rename.
time_token_clean.head(n=5)

Unnamed: 0,subject_id,time,time_gap,flag,eve_index,events
27,10000032,2180-05-07 02:00:00,1,1.0,35628,timetoken0-2day
29,10000032,2180-05-08 08:00:00,49,1.0,35631,timetoken16-90day
52,10000032,2180-06-27 18:00:00,25,1.0,35631,timetoken16-90day
90,10000032,2180-07-24 21:00:00,12,1.0,35630,timetoken8-15day
137,10000074,2110-10-16 11:00:00,1,1.0,35628,timetoken0-2day


In [18]:
# Shape the time tokens like the other events: same column set, "events" as
# plain objects and "eve_index" as integers.
time_token_clean_final = (
    time_token_clean
    .reindex(columns=["events", "subject_id", "time", "eve_index", "flag"])
    .astype({"events": "object", "eve_index": "int"})
)

In [19]:
# Concatenate the time tokens with the event data. Ordinary events get flag 0.0,
# and flag is the last sort key after subject and timestamp.
all_events_data["flag"] = np.zeros(all_events_data.shape[0])
all_events_w_time = pd.concat(
    [time_token_clean_final, all_events_data],
    axis=0,
).sort_values(by=["subject_id", "time", "flag"])

In [20]:
# The flag column was only needed as a sort key; remove it from the result.
all_events_w_time = all_events_w_time.drop(columns=["flag"])

In [21]:
# Write the combined event/time-token table next to the matching input file.
all_events_w_time.to_csv(
    "../cleaned_data/all_events_data_w_time_mv.csv"
    if use_mv
    else "../cleaned_data/all_events_data_w_time.csv"
)

In [22]:
# Lookup table of the distinct (eve_index, events) pairs, ordered by id.
event_id = (
    all_events_w_time[["eve_index", "events"]]
    .sort_values("eve_index")
    .drop_duplicates()
)
event_id

Unnamed: 0,eve_index,events
1168045,1,(0.9% nacl)
15424818,2,(apri) (desogestrel-ethinyl estradiol)
21699458,3,(celebrate) calcium citrate + d
10942411,4,(uptravi)selexipag
16513141,5,*n f pantoprazole
...,...,...
6156723,35629,timetoken3-7day
18571205,35630,timetoken8-15day
2445342,35631,timetoken16-90day
7107486,35632,timetoken91-365day


In [23]:
# Write the id/name lookup table for the matching dataset variant.
event_id.to_csv(
    "../cleaned_data/events_id_w_time_mv.csv"
    if use_mv
    else "../cleaned_data/events_id_w_time.csv"
)