In [1]:
import os, pandas as pd, numpy as np, re
from functools import reduce
pd.set_option('display.max_columns', 500)

#def main():
sage = '/home/wraikes/Dropbox/partnership/dmt/data/sage_not_final'
#sage = r'C:\Users\williamr.PDFK\Dropbox\partnership\dmt_temp\Sage (Mobile Phone) Data 8.31.17'
os.chdir(sage)
files_to_exclude = ['digital-marshmallow-status_8.31.17.csv',
                    'digital-marshmallow-appVersion_8.31.17.csv']
end_string = '_8.31.17.csv'

In [2]:
def parse_directory(directory, exclude_files, end_string):
    '''
    Go through and process all files in directory.
    If extra processing is needed, use a special function 'new_df_func'.
    Return a single dataframe, of all data merged.
    '''
    
    new_dfs = []
    
    for file in directory:
        name = create_df_name(file, end_string)
        
        if file not in exclude_files:
            df = pd.read_csv(file)
            df = remove_dupes(df)
            df = create_new_df(df, name)
            
            new_dfs.append(df)
    
    final_df = reduce(lambda left, right: pd.merge(left, right, how = 'outer',
                                                   on='externalId'), 
                      new_dfs)
    
    return final_df

In [16]:
def create_new_df(df, df_name):
    '''
    Each dataframe is cleaned with special instructions based on the filters.
    A cleaned dataframe is returned.
    '''
    if len(df.externalId) == len(df.externalId.unique()):
        df = col_relabel(df, df_name)
        
    elif df_name == 'bart-v4':
        attributes = [
            ['baseline', 'BART0.25'],
            ['baseline', 'BART250.00'],
            ['21-day-assessment', 'BART0.25'],
            ['21-day-assessment', 'BART250.00']
        ]
                
        df = spread_cols(df, df_name, attributes, var_2='data.json.variable_label')
        
    elif df_name == 'delay_discounting_raw-v6':
        bl = 'baseline'
        _21 = '21-day-assessment'
        
        attributes = [
            [bl, 'dd_time_6_months'],
            [bl, 'dd_money_6_month'],
            [bl, 'dd_money_1_month'],
            [bl, 'dd_time_1_year'],
            [_21, 'dd_time_6_month'],
            [_21, 'dd_money_6_month'],
            [_21, 'dd_money_1_month'],
            [_21, 'dd_time_1_year']            
        ]
        
        df = spread_cols(df, df_name, attributes, var_2='data.json.variableLabel')

    elif df_name == 'demographics-v2' or df_name == 'generally_sem_diff_bl-v2':
        indices = df.index[df.externalId == 'yQ7pYy']
        df.drop(indices[-1], axis=0, inplace=True)

    elif df_name == 'behavior_choices_4_bl-v2':
        attributes = [
            'baseline'        
        ]
        
        df = spread_cols(df, df_name, attributes)
        
    elif df_name == 'discounting_raw-v2':
        bl = 'baseline'
        _21 = '21-day-assessment'

        attributes = [
            [bl, 'pd_constant_money'],
            [bl, 'pd_constant_probabiliy'],
            [_21, 'pd_constant_money'],
            [_21, 'pd_constant_probability']
        ]
        
        df = spread_cols(df, df_name, attributes, var_2='data.json.variableLabel')
    
    elif df_name == 'evening_notification_time-v2':
        attributes = ['baseline']

        df = spread_cols(df, df_name, attributes)
                        
    elif df_name == 'goNoGo-v2':
        bl = 'baseline'
        _21 = '21-day-assessment'

        attributes = [
            [bl, 'go_no_go_stable_stimulus_active_task'],
            [bl, 'go_no_go_variable_stimulus_active_task'],
            [_21, 'go_no_go_stable_stimulus_active_task'],
            [_21, 'go_no_go_variable_stimulus_active_task']
        ]
        
        indices = df.index[df.externalId == 'ksJM3Y']
        df.drop(indices[-1], axis=0, inplace=True)
        
        df = spread_cols(df, df_name, attributes, var_2='data.json.variable_label')
    
    elif df_name == 'morning_notification_time-v3':
        attributes = ['baseline']
        
        df = spread_cols(df, df_name, attributes)
    
    elif df_name == 'pam_multiple-v2':
        attributes = [
            'baseline',
            '21-day-assessment'
        ]
        
        df = spread_cols(df, df_name, attributes)
    
    else:
        print('Danger!!!')
                                 
    return df

In [4]:
def spread_cols(df, df_name, attributes, var_1='metadata.json.taskIdentifier', var_2=None):
    '''
    Merge the dataframes into one, with new columns.
    '''
    
    dfs = []
    
    for att in attributes:
        new_df = filter_df(df, att, var_1, var_2)
        new_df = col_relabel(new_df, df_name, att)
        dfs.append(new_df)
    
    new_df = reduce(lambda left, right: pd.merge(left, 
                                                 right, 
                                                 how = 'outer',
                                                 on='externalId'),
                    dfs)
    
    return new_df

In [5]:
def filter_df(df, att, var_1, var_2=None):
    '''
    Filter a dataframe based off of var_1 and var_2 variables.
    '''
    
    cols = df.columns
    
    new_df = pd.DataFrame(columns=cols)
        
    for ix, row in df.iterrows():
        if var_2:
            if row[var_1] == att[0] and row[var_2] == att[1]:
                new_df = new_df.append(row, ignore_index=True)
        else:
            if row[var_1] == att[0]:
                new_df = new_df.append(row, ignore_index=True)
    
    return new_df

In [6]:
def col_relabel(df, prefix, att=None):
    '''
    Remove: 'metadata.json.' and 'data.json.'.
    Replace spaces with underscores.
    Append new name to existing column name, except if externalId.
    '''
    new_cols = []
    col_re = re.compile('metadata.json.|data.json.')
    
    
    for col in df.columns:
        new_col = re.sub(col_re, '', col)

        if new_col != 'externalId':
            if att:
                if len(att) == 2:
                    new_col = '{}_{}_{}_{}___{}'.format('SAGE', 
                                                        prefix, 
                                                        att[0], 
                                                        att[1], 
                                                        new_col
                                                        )
                else:
                    new_col = '{}_{}_{}___{}'.format('SAGE', 
                                                     prefix, 
                                                     att[0], 
                                                     new_col
                                                     )                
            
            else:
                new_col = '{}_{}___{}'.format('SAGE', 
                                              prefix, 
                                              new_col
                                              ) 
                                              
        new_col = new_col.replace(' ', '_')
        new_col = new_col.replace('-', '_')
        new_cols.append(new_col)
    
    df.columns = new_cols
    
    return df

In [7]:
def create_df_name(string, end_string):
    string = string.replace('digital-marshmallow-', '')
    string = string.replace(end_string, '')
    
    return string      

In [8]:
def remove_dupes(df):
    
    test_users = [
        'ThpMV2Achc', 
        'SEkQVTCe6j', 
        'Wh8NSX3DHL', 
        'SaXFr2kPZa', 
        'VWUcSp4TeH', 
        'yXEfAmW682', 
        'gwEpQR8j9B',
        'WbbNWM4RAF', 
        'D5bzYrfd8E', 
        'LJcmEFWp74', 
        'ULoF3MM1nN'
    ]

    diff_study = [
        'rL8eA3',
        'rLg5xs',
        'rLrD9h',
        'rLP7H2',
        'rL6s6h',
        'aOyzBg',
        'aORA43',
        'aOh48U',
        'aOLu4K',
        'aOQtxv',
        'aO5TvQ',
        'mPC9S8',
        'mPgquX',
        'mP5xkB',
        'mPSQvh',
        'mPYk2p',
        'mP3rbd'
    ]

    
    return df[~df.externalId.isin(test_users + diff_study)]

In [9]:
os.listdir()

['digital-marshmallow-morning_notification_time-v3_8.31.17.csv',
 'digital-marshmallow-generally_sem_diff_bl-v2_8.31.17.csv',
 'digital-marshmallow-bart-v4_8.31.17.csv',
 'digital-marshmallow-past_year_likert_bl-v3_8.31.17.csv',
 'digital-marshmallow-goNoGo-v2_8.31.17.csv',
 'digital-marshmallow-morning_yesterday_sem_diff-v2_8.31.17.csv',
 'digital-marshmallow-as_a_child_likert_bl-v2_8.31.17.csv',
 'digital-marshmallow-delay_discounting_raw-v6_8.31.17.csv',
 'digital-marshmallow-evening_sem_diff-v2_8.31.17.csv',
 'digital-marshmallow-comments-v2_8.31.17.csv',
 'digital-marshmallow-behavior_likert_21-v2_8.31.17.csv',
 'digital-marshmallow-discounting_raw-v2_8.31.17.csv',
 'digital-marshmallow-appVersion_8.31.17.csv',
 'digital-marshmallow-morning_sem_diff-v2_8.31.17.csv',
 'digital-marshmallow-past_year_likert_21-v2_8.31.17.csv',
 'digital-marshmallow-pam-v2_8.31.17.csv',
 'digital-marshmallow-morning_behavior_choices-v1_8.31.17.csv',
 'digital-marshmallow-pam_multiple-v2_8.31.17.csv',


In [19]:
_list = os.listdir()[0:1]
_list

['digital-marshmallow-morning_notification_time-v3_8.31.17.csv']

In [22]:
test = pd.read_csv(_list[0])
test = remove_dupes(test)

In [24]:
len(test.externalId) == len(set(test.externalId))

False

In [28]:
len(test.externalId)

136

In [29]:
test.externalId.duplicated()

20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
30      True
31     False
32     False
33     False
35     False
36     False
37      True
38     False
39     False
40     False
41     False
42     False
43     False
44     False
45     False
46     False
47      True
48      True
49     False
50     False
       ...  
144     True
145     True
146     True
147     True
148     True
149     True
150     True
152     True
153     True
154     True
155     True
157     True
158     True
159     True
160     True
161     True
162     True
163     True
164     True
165     True
166     True
171    False
172     True
173    False
174    False
175     True
176     True
177     True
178     True
182    False
Name: externalId, dtype: bool

In [27]:
len((test.externalId.duplicated()))

136

In [30]:
test.externalId

20     GpRu74
21     Dwr9Wq
22     gSWn9N
23     4chdWY
24     uS9Yn3
25     up3ALb
26     6fjRmX
27     Uusd4p
28     fG7hUR
29     zqjJ2m
30     uS9Yn3
31     3MhSZ8
32     4s9KFs
33     ksJM3Y
35     UXR4bY
36     3YSsRf
37     ksJM3Y
38     V9Yycg
39     kT6Rjc
40     tKr2P3
41     Jc7DYR
42     7f7Gx2
43     nx7zFF
44     8zVXXt
45     8sWLxd
46     rKAU4r
47     8sWLxd
48     tKr2P3
49     L67rRS
50     h9XRjL
        ...  
144    Wh4HL2
145    nMMw8W
146    nMMw8W
147    nMMw8W
148    nMMw8W
149    nMMw8W
150    nMMw8W
152    nMMw8W
153    nMMw8W
154    nMMw8W
155    nMMw8W
157    nMMw8W
158    nMMw8W
159    nMMw8W
160    65CanL
161    nMMw8W
162    65CanL
163    cAr3x9
164    nMMw8W
165    nMMw8W
166    nMMw8W
171    m2L8ka
172    nMMw8W
173    KSzr5A
174    bWr8MN
175    nMMw8W
176    nMMw8W
177    nMMw8W
178    nMMw8W
182    bdyP3M
Name: externalId, dtype: object

In [20]:
test = parse_directory(_list, files_to_exclude, end_string)

In [21]:
test

Unnamed: 0,SAGE_morning_notification_time_v3_b___ROW_ID,SAGE_morning_notification_time_v3_b___ROW_VERSION,SAGE_morning_notification_time_v3_b___recordId,SAGE_morning_notification_time_v3_b___appVersion,SAGE_morning_notification_time_v3_b___phoneInfo,SAGE_morning_notification_time_v3_b___uploadDate,SAGE_morning_notification_time_v3_b___healthCode,externalId,SAGE_morning_notification_time_v3_b___dataGroups,SAGE_morning_notification_time_v3_b___createdOn,SAGE_morning_notification_time_v3_b___createdOnTimeZone,SAGE_morning_notification_time_v3_b___userSharingScope,SAGE_morning_notification_time_v3_b___UUID,SAGE_morning_notification_time_v3_b___taskIdentifier,SAGE_morning_notification_time_v3_b___taskRunUUID,SAGE_morning_notification_time_v3_b___startDate,SAGE_morning_notification_time_v3_b___startDate.timezone,SAGE_morning_notification_time_v3_b___endDate,SAGE_morning_notification_time_v3_b___endDate.timezone,SAGE_morning_notification_time_v3_b___groupLabel,SAGE_morning_notification_time_v3_b___notification_time
