In [1]:
import pandas as pd
import numpy as np

In [2]:
# Period log: read the raw table, then discard every row with a missing field.
period = pd.read_csv('pslovedata/Period.csv')
period = period.dropna()

# Symptom and user tables are loaded without any filtering.
symptom = pd.read_csv('pslovedata/Symptom.csv')
user = pd.read_csv('pslovedata/User.csv')

In [3]:
def extract_date(datestr):
    """Parse a ``day/month/year`` string into a timestamp.

    Two-digit years below 30 are read as 20xx, all other two-digit years as
    19xx. Fields after the third ``/``-separated part are ignored.

    Args:
        datestr: value to parse; it is converted with ``str()`` first, so
            non-string input (e.g. a float NaN) is accepted.

    Returns:
        A ``pandas.Timestamp`` for a valid date, otherwise ``np.nan`` (fewer
        than three parts, non-numeric parts, or an impossible calendar date).
    """
    parts = str(datestr).split("/")
    if len(parts) < 3:
        return np.nan
    day, month, year = parts[0], parts[1], parts[2]
    try:
        if len(year) == 2:
            # Expand the two-digit year with the 20xx / 19xx pivot at 30.
            year = ('20' if int(year) < 30 else '19') + year
        # pd.datetime was removed from pandas; Timestamp is a datetime subclass.
        return pd.Timestamp(year=int(year), month=int(month), day=int(day))
    except ValueError:
        # Non-numeric parts, impossible dates (e.g. 31/02) and out-of-range
        # years all mean "no usable date" for the cleaning step.
        return np.nan



In [4]:
# Parse the raw date strings. to_datetime guarantees a datetime64 column (needed
# for the .dt accessor below) and turns anything unparseable into NaT.
period['fstart'] = pd.to_datetime(period['start_date'].apply(extract_date), errors='coerce')
period['fend'] = pd.to_datetime(period['end_date'].apply(extract_date), errors='coerce')

# Drop rows whose dates could not be parsed, then keep periods that started in
# 2014 or later and lasted between 1 and 11 days (which also rules out
# fstart == fend).
period = period.dropna()
period = period[period['fstart'].dt.year >= 2014]
duration_days = (period['fend'] - period['fstart']).dt.days
period = period[duration_days.between(1, 11)]

# A user may have only one period per start date and one per end date.
period = period.drop_duplicates(subset=['User_id', 'fstart'])
# .copy() so that the column assignment below works on an independent frame
# instead of a filtered view (avoids SettingWithCopyWarning).
period = period.drop_duplicates(subset=['User_id', 'fend']).copy()

# Collect every user's (start, end) pairs and keep users with more than two periods.
period['fdates'] = list(zip(period['fstart'], period['fend']))
period_cleaned = period.groupby('User_id')['fdates'].apply(list).reset_index()
period_cleaned = period_cleaned[period_cleaned['fdates'].apply(len) > 2]

In [8]:
symptom_len = 10
# Layout of one day vector (length ``encoding_len``):
#   index 0                 -> 1 on the first day of a period
#   indices 1..symptom_len  -> 1 for every symptom logged on that day
#   index -2                -> 1 on the last day of a period
#   index -1                -> 1 only in the end-of-cycle token
# Hence three marker slots on top of the symptom slots.
encoding_len = symptom_len + 3
# Number of users encoded so far; create_vector updates it for progress output.
count = 0


def end_token():
    """Return a fresh end-of-cycle token: all zeros except a 1 in the last slot."""
    token = np.zeros(encoding_len)
    token[-1] = 1
    return token


# Shared token instance that create_vector appends after every period end.
et = end_token()

def day_vec(user, pmap, day, day_symptoms):
    """Encode one calendar day as a vector of length ``encoding_len``.

    Args:
        user: accepted but not used by this function.
        pmap: mapping from date to a period marker; ``'s'`` marks a period
            start, any other value is treated as a period end.
        day: the date being encoded (looked up in ``pmap``).
        day_symptoms: indexable of exactly ``symptom_len`` numeric values;
            a value greater than 0 means the symptom was logged.

    Returns:
        numpy array with slot 0 set for a period start, slot -2 set for a
        period end, and slots 1..symptom_len set for logged symptoms.

    Raises:
        AssertionError: if ``day_symptoms`` does not have ``symptom_len`` items.
    """
    encoded_day = np.zeros(encoding_len)
    if day in pmap:
        marker_slot = 0 if pmap[day] == 's' else -2
        encoded_day[marker_slot] = 1
    assert symptom_len == len(day_symptoms)
    # Symptom k (0-based) lives in slot k + 1, right after the start marker.
    for slot in range(1, symptom_len + 1):
        if day_symptoms[slot - 1] > 0:
            encoded_day[slot] = 1
    return encoded_day
    
    
def create_vector(user, periods):
    """Encode every day from a user's first period start to last period end.

    Args:
        user: user id; first element of the key used for the symptom lookup.
        periods: list of ``(start, end)`` timestamp pairs for this user.

    Returns:
        List of day vectors (see ``day_vec``) in calendar order, with the
        shared end token ``et`` inserted right after every period-end day.
        An empty list when ``periods`` is empty.

    Side effects:
        Increments the global ``count`` and prints progress every 500 users.
    """
    global count
    if len(periods) == 0:
        return []

    sorted_ps = sorted(periods, key=lambda p: p[0])
    # Assumes a period cannot start and end on the same date: a date that is
    # both a start and an end keeps only the marker written last.
    pmap = {}
    for p in periods:
        pmap[p[0]] = 's'
        pmap[p[1]] = 'e'
    start = sorted_ps[0][0]
    end = sorted_ps[-1][1]
    delta = (end - start).days

    vec = []
    for i in range(delta + 1):
        curr_date = start + pd.DateOffset(i)
        try:
            # NOTE(review): symptom_cleaned is not defined in the visible cells;
            # presumably a frame indexed by (user, date) with symptom_len
            # columns -- confirm. A NameError here now surfaces instead of
            # silently producing all-zero symptoms.
            rows = symptom_cleaned.loc[(user, curr_date)]
            # as_matrix() no longer exists in pandas; take the first matching
            # row whether the lookup returned one row or several.
            sym = np.atleast_2d(np.asarray(rows))[0]
        except (KeyError, IndexError):
            # Nothing logged for this user on this day.
            sym = np.zeros(symptom_len)
        vec.append(day_vec(user, pmap, curr_date, sym))
        if vec[-1][-2] == 1:
            # The day just appended closed a period: mark the end of the cycle.
            # The same ``et`` array object is appended every time.
            vec.append(et)
    count = count + 1
    if count % 500 == 0:
        print("done ", count)
    return vec
    
def markends(vector):
    """Positions in ``vector`` whose period-end slot (index -2) equals 1."""
    return marker_points(vector, point_pos=-2)


def markstarts(vector):
    """Positions in ``vector`` whose period-start slot (index 0) equals 1."""
    return marker_points(vector, point_pos=0)


def markendtokens(vector):
    """Positions in ``vector`` whose end-of-cycle slot (index -1) equals 1."""
    return marker_points(vector, point_pos=-1)


def marker_points(vector, point_pos):
    """Return the positions of the items of ``vector`` that have a 1 at ``point_pos``.

    Args:
        vector: sequence of indexable day vectors.
        point_pos: slot inspected in every day vector.

    Returns:
        List of 0-based positions, in ascending order.
    """
    return [position for position, day in enumerate(vector) if day[point_pos] == 1]

def add_position_markers(encoded):
    """Add ``ends``, ``starts`` and ``endtokens`` position lists to ``encoded``.

    The frame is modified in place and nothing is returned. Each new column
    holds, per row, the positions inside that row's ``vector`` where the
    corresponding marker slot is set.

    Args:
        encoded: DataFrame with a ``vector`` column of day-vector sequences.
    """
    # Only the 'vector' column is needed, so map over that column directly
    # instead of building a row object per record with apply(axis=1). This also
    # yields a Series (not a DataFrame) when the frame is empty.
    vectors = encoded['vector']
    encoded['ends'] = vectors.apply(markends)
    encoded['starts'] = vectors.apply(markstarts)
    encoded['endtokens'] = vectors.apply(markendtokens)
    
def encoded_splitting(encoded, max_gap=35):
    """Cut one user's sequence wherever two periods are too far apart.

    A cut is made in front of a period start when it lies more than
    ``max_gap`` positions after the previous period end. The part after the
    last cut is returned as well, so a sequence without any long gap comes
    back as a single segment instead of being dropped.

    Args:
        encoded: mapping/row with ``user``, ``vector``, ``starts`` and ``ends``.
        max_gap: largest allowed distance between a period end and the next
            period start inside one segment.

    Returns:
        List of ``(user, segment)`` tuples in sequence order. Empty when the
        number of starts and ends differ or when there are no periods.
    """
    starts = encoded['starts']
    ends = encoded['ends']
    if len(starts) != len(ends) or len(starts) == 0:
        return []

    user = encoded['user']
    vector = encoded['vector']
    splits = []
    first = 0
    for i in range(len(ends) - 1):
        next_start = starts[i + 1]
        if next_start - ends[i] > max_gap:
            splits.append((user, vector[first:next_start]))
            first = next_start
    # Everything after the last cut (the whole vector if there was none).
    splits.append((user, vector[first:]))
    return splits

# Number of cycles in the input of the first training pair; must be at least 2,
# otherwise the window in split() never advances.
block_size = 5
def split(user, vector, starts, ends, endtokens):
    """Turn one segment into (input, target) training pairs.

    The target is always one complete cycle: the days after one end token up
    to and including the next end token. With at most ``block_size`` cycles a
    single pair is produced (everything before the last cycle -> last cycle).
    Longer segments produce one pair per window; consecutive windows are
    ``block_size - 1`` cycles apart, and a final pair targets the last cycle
    if the windows did not reach it.

    Args:
        user: accepted but not used.
        vector: sequence of day vectors and end tokens.
        starts: positions of period starts in ``vector``.
        ends: accepted but not used.
        endtokens: positions of end-of-cycle tokens in ``vector``.

    Returns:
        List of ``(inp, out)`` slices of ``vector``; empty when there are
        fewer than two end tokens.
    """
    n_cycles = len(endtokens)
    if n_cycles < 2:
        # Not enough cycles to form an input and a target.
        return []

    last_inp_end = endtokens[-2] + 1
    last_out_end = endtokens[-1] + 1
    if n_cycles <= block_size:
        # The target stops at the last end token, exactly as in the long
        # branch below, so days trailing the final token are not included.
        return [(vector[:last_inp_end], vector[last_inp_end:last_out_end])]

    splits = []
    start = 0
    split_index = 0
    while split_index + block_size < n_cycles:
        split_index += block_size - 1
        boundary = endtokens[split_index] + 1
        splits.append((vector[start:boundary],
                       vector[boundary:endtokens[split_index + 1] + 1]))
        start = starts[split_index]
    if split_index < n_cycles - 2:
        # Case where the last few cycles were not covered by a full window.
        splits.append((vector[start:last_inp_end], vector[last_inp_end:last_out_end]))
    return splits
            
    
    
def _encode_user(row):
    """Pair a user id with the day-by-day encoding of that user's periods."""
    return (row['User_id'], create_vector(row['User_id'], row['fdates']))


# One (user id, encoded sequence) tuple per row of period_cleaned.
encodings = period_cleaned.apply(_encode_user, axis=1)

done  500
done  1000
done  1500
done  2000


In [9]:
encoded = pd.DataFrame(encodings.to_list(), columns=['user', 'vector'])
add_position_markers(encoded)

# Cut every user's sequence into segments (see encoded_splitting) and flatten
# the per-user lists into one row per (user, segment). A plain comprehension
# replaces the apply(pd.Series) -> stack() round trip, which depended on
# stack() dropping the NaN padding of shorter lists.
encoded_cleaned = encoded.apply(encoded_splitting, axis=1)
segments = [segment for user_segments in encoded_cleaned for segment in user_segments]
encoded_cleaned_final = pd.DataFrame(segments, columns=['user', 'vector'])
add_position_markers(encoded_cleaned_final)
# Keep only segments with more than two end-of-cycle tokens.
encoded_cleaned_final = encoded_cleaned_final[encoded_cleaned_final['endtokens'].apply(len) > 2]

# Build the (input, target) pairs of every segment: column 0 is the user,
# column 1 the list of pairs returned by split().
augmented = encoded_cleaned_final.apply(
    lambda row: (row['user'],
                 split(row['user'], row['vector'], row['starts'], row['ends'], row['endtokens'])),
    axis=1,
)
augmented = pd.DataFrame(augmented.to_list())

# One entry per pair, indexed by the row number of the segment it came from so
# that the join below repeats the user for each of its pairs.
pairs_per_row = augmented[1]
aug_inp_out = pd.Series(
    [pair for pairs in pairs_per_row for pair in pairs],
    index=[row_id for row_id, pairs in pairs_per_row.items() for _ in pairs],
    dtype=object,
    name='inp_out',
)
aug = augmented.drop(1, axis=1).join(aug_inp_out)
aug[['inp', 'out']] = pd.DataFrame(aug['inp_out'].tolist(), index=aug.index)
aug = aug.rename({0: 'user'}, axis=1)
# Discard pairs whose target spans 40 or more positions.
aug = aug[aug['out'].apply(len) < 40]



In [12]:
# Persist the training pairs (columns: user, inp_out, inp, out) as a pickle file
# named 'augmented_data' in the current working directory.
aug.to_pickle('augmented_data')