## Sage Data Merge

In [1]:
import os, pandas as pd, numpy as np, re
from functools import reduce
# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

# NOTE TO SELF: redo the relative paths.
# Absolute, machine-specific location of the raw Sage CSV exports.
sage = '/home/wraikes/Dropbox/partnership/DMTBilly data - Copy/Sage Data'
# Every later read_csv / listdir call uses bare file names, so the working
# directory is switched to the data folder here.
os.chdir(sage)

In [2]:
def remove_dupes(df):
    """Return the rows of ``df`` whose ``externalId`` is not in the exclusion list.

    The excluded IDs are hard-coded below (the original code calls them test
    users).  Removal of these IDs still has to be confirmed, especially
    ThpMV2Achc.
    """
    excluded_ids = ['ULoF3MM1nN', 'gSWn9N', 'D5bzYrfd8E', 'ThpMV2Achc']
    is_excluded = df['externalId'].isin(excluded_ids)
    return df.loc[~is_excluded]

def dupe_check(df):
    """Return True when ``df`` is non-empty and every ``externalId`` is distinct."""
    ids = df.externalId
    total = len(ids)
    if total == 0:
        # An empty frame never passes the check.
        return False
    return total == len(ids.unique())

def new_cols(df, append):
    """Return a copy of ``df`` with cleaned, suffixed column names.

    Each column name has every occurrence of the literal text
    ``metadata.json.`` or ``data.json.`` removed, and then ``append`` is added
    to the end.  The column that ends up as ``externalId`` is left without the
    suffix so it can serve as the merge key.

    The dots in the pattern are escaped so that only the literal prefixes are
    removed (an unescaped ``.`` would match any character), and the input
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.
    append : str
        Identifier appended to every column name except ``externalId``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the renamed columns.
    """
    prefix_re = re.compile(r'metadata\.json\.|data\.json\.')

    renamed = []
    for col in df.columns:
        stripped = prefix_re.sub('', col)
        # Keep the join key unsuffixed.
        renamed.append(stripped if stripped == 'externalId' else stripped + append)

    # Work on a copy so the caller's frame keeps its original labels.
    out = df.copy()
    out.columns = renamed
    return out

def create_new_df(df, att, var_1, var_2=None):
    """Return the rows of ``df`` that match the values listed in ``att``.

    A row is kept when ``df[var_1] == att[1]`` and, if ``var_2`` is given,
    also ``df[var_2] == att[2]``.  ``att[0]`` is not used here.

    The selection is done with a boolean mask instead of appending row by
    row: ``DataFrame.append`` no longer exists in pandas 2.x, and the mask
    keeps the original column dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    att : sequence
        ``[suffix, value_for_var_1, value_for_var_2 (optional)]``.
    var_1 : str
        Name of the first column to filter on.
    var_2 : str, optional
        Name of the second column to filter on.

    Returns
    -------
    pandas.DataFrame
        Matching rows, with the same columns as ``df`` and a fresh
        0..n-1 index.  Empty (but with the same columns) when nothing matches.
    """
    keep = df[var_1] == att[1]
    if var_2 is not None:
        keep = keep & (df[var_2] == att[2])

    return df[keep].reset_index(drop=True)

def df_merge(df, attributes, var_1, var_2=None):
    """Split ``df`` by each entry of ``attributes`` and join the pieces side by side.

    For every ``att`` in ``attributes`` the matching rows are selected with
    ``create_new_df`` and their columns are renamed with ``new_cols`` using
    ``att[0]`` as the suffix.  The resulting frames are then outer-merged on
    ``externalId``, left to right.
    """
    pieces = [
        new_cols(create_new_df(df, spec, var_1, var_2), spec[0])
        for spec in attributes
    ]

    def _outer_join(left, right):
        return left.merge(right, how='outer', on='externalId')

    return reduce(_outer_join, pieces)

### Process files that do not need cleaning / restructuring.

In [3]:
test_users = ['ULoF3MM1nN', 'gSWn9N', 'D5bzYrfd8E', 'ThpMV2Achc']
files_to_exclude = [
    'digital-marshmallow-status_8.8.17.csv',
    'digital-marshmallow-appVersion_8.8.17.csv',
]

# Files whose externalId values are already unique once the excluded IDs are
# removed; these need no restructuring.
files_as_is = []

# NOTE(review): os.listdir() returns entries in arbitrary order, and every
# entry in the working directory is passed to read_csv.
for file in os.listdir():
    if file in files_to_exclude:
        continue
    df = remove_dupes(pd.read_csv(file))
    if dupe_check(df):
        files_as_is.append(file)

In [4]:
# Column-name suffixes / dictionary keys for the "as is" files.  A later cell
# pairs these positionally with files_as_is via zip(), so the order here has
# to match the order in which the files were collected.
names_as_is = ['_-_past_year_likert_21', 
               '_-_behavior_choices_1',
               '_-_comments_21',
               '_-_demos',
               '_-_behavior_lk_21',
               '_-_behavior_lk_bl',
               '_-_generally_sem_bl',
               '_-_generally_sem_21',
               '_-_as_a_child',
               '_-_past_year_likert_bl']

In [5]:
# Cleaned frames keyed by their identifier; everything in here is merged
# into the final table at the end of the notebook.
new_dfs = {}

# zip() stops at the shorter input, so a count mismatch would silently drop
# files or names.  Fail loudly instead.
if len(files_as_is) != len(names_as_is):
    raise ValueError(
        'files_as_is has {} entries but names_as_is has {}'.format(
            len(files_as_is), len(names_as_is)))

# Files and names are paired purely by position.
for file, name in zip(files_as_is, names_as_is):
    df = pd.read_csv(file)
    df = remove_dupes(df)
    df = new_cols(df, name)
    new_dfs[name] = df

In [6]:
# True only if every stored frame is non-empty with unique externalId values.
all(dupe_check(frame) for frame in new_dfs.values())

True

### DataFrame: Bart_V4

In [7]:
# Load the BART v4 export and drop the excluded IDs in one step.
bart_v4 = remove_dupes(pd.read_csv('digital-marshmallow-bart-v4_8.8.17.csv'))

# Displayed result: does the raw frame already have one row per externalId?
dupe_check(bart_v4)

False

In [8]:
# Each entry is [column suffix, taskIdentifier value, variable label value].
# One entry per (assessment, BART variant) combination; the third row used to
# repeat 'BART250.00', which made the two 21-day entries select the same rows.
bart_attributes = [
    ['_-_bart_v4_bl_0.25', 'baseline', 'BART0.25'],
    ['_-_bart_v4_bl_250', 'baseline', 'BART250.00'],
    ['_-_bart_v4_21_0.25', '21-day-assessment', 'BART0.25'],
    ['_-_bart_v4_21_250', '21-day-assessment', 'BART250.00']
]

In [13]:
# Reshape to one row per externalId: one column group per bart_attributes entry.
# NOTE(review): this filters on 'data.json.variable_label' while the delay and
# discounting cells use 'data.json.variableLabel' - confirm the column name.
bart_v4 = df_merge(
    bart_v4,
    bart_attributes,
    var_1='metadata.json.taskIdentifier',
    var_2='data.json.variable_label',
)

In [None]:
# Only keep the reshaped frame if it now has unique externalId values.
if not dupe_check(bart_v4):
    print('False')
else:
    new_dfs['bart_v4'] = bart_v4
    print('Done!')

### DataFrame: Behavior_choices_4

In [None]:
# Load the behavior_choices_4 export and drop the excluded IDs.
behavior_4 = remove_dupes(
    pd.read_csv('digital-marshmallow-behavior_choices_4_bl-v2_8.8.17.csv')
)

# Displayed result of the uniqueness check on the raw frame.
dupe_check(behavior_4)

In [None]:
# Single entry: [column suffix, taskIdentifier value].
behavior_4_attributes = [['_-_behavior_4_bl', 'baseline']]

In [None]:
# Keep only the rows matching behavior_4_attributes and suffix their columns.
behavior_4 = df_merge(
    behavior_4,
    behavior_4_attributes,
    var_1='metadata.json.taskIdentifier',
)

In [None]:
# Store the frame for the final merge only when externalId is unique.
if not dupe_check(behavior_4):
    print('False')
else:
    new_dfs['behave_4'] = behavior_4
    print('Done!')

### DataFrame: Delay Discounting

In [None]:
# Load the delay-discounting export and drop the excluded IDs.
delay = remove_dupes(
    pd.read_csv('digital-marshmallow-delay_discounting_raw-v6_8.8.17.csv')
)

# Displayed result of the uniqueness check on the raw frame.
dupe_check(delay)

In [None]:
bl = 'baseline'
_21 = '21-day-assessment'

# Entries are [column suffix, taskIdentifier value, variable label value]:
# all four baseline variants first, then the same four for the 21-day task.
delay_attributes = [
    ['_-_delay_' + tag + '_' + variant, task, 'dd_' + variant]
    for tag, task in (('bl', bl), ('21', _21))
    for variant in ('time_6_month', 'money_6_month', 'money_1_month', 'time_1_year')
]

In [None]:
# Reshape to one column group per delay_attributes entry.
delay = df_merge(
    delay,
    delay_attributes,
    var_1='metadata.json.taskIdentifier',
    var_2='data.json.variableLabel',
)

In [None]:
# Store the frame for the final merge only when externalId is unique.
if not dupe_check(delay):
    print('False')
else:
    new_dfs['delay'] = delay
    print('Done!')

### DataFrame: Discounting Raw

In [None]:
# Load the discounting export and drop the excluded IDs.
discount = remove_dupes(
    pd.read_csv('digital-marshmallow-discounting_raw-v2_8.8.17.csv')
)

# Displayed result of the uniqueness check on the raw frame.
dupe_check(discount)

In [None]:
bl = 'baseline'
_21 = '21-day-assessment'

# Each entry is [column suffix, taskIdentifier value, variable label value].
# The baseline probability label was misspelled ('pd_constant_probabiliy');
# it now matches the spelling used for the 21-day entry.
# NOTE(review): confirm 'pd_constant_probability' is the label in the CSV.
discount_attributes = [
    ['_-_discount_bl_money', bl, 'pd_constant_money'],
    ['_-_discount_bl_prob', bl, 'pd_constant_probability'],
    ['_-_discount_21_money', _21, 'pd_constant_money'],
    ['_-_discount_21_prob', _21, 'pd_constant_probability']
]

# Reshape to one column group per discount_attributes entry.
discount = df_merge(
    discount,
    discount_attributes,
    var_1='metadata.json.taskIdentifier',
    var_2='data.json.variableLabel',
)

In [None]:
# Store the frame for the final merge only when externalId is unique.
if not dupe_check(discount):
    print("False")
else:
    new_dfs['discount'] = discount
    print('Done!')

### DataFrame: Evening Notification

In [None]:
# Load the evening-notification export and drop the excluded IDs.
evening_note = remove_dupes(
    pd.read_csv('digital-marshmallow-evening_notification_time-v2_8.8.17.csv')
)

# Displayed result of the uniqueness check on the raw frame.
dupe_check(evening_note)

In [None]:
# Single entry: [column suffix, taskIdentifier value].
evening_note_attributes = [['_-_evening_note_bl', 'baseline']]

# The filtered result goes into a separate name; evening_note itself is kept.
evening_note_bl = df_merge(
    evening_note,
    evening_note_attributes,
    var_1='metadata.json.taskIdentifier',
)

In [None]:
# Store the frame for the final merge only when externalId is unique.
if not dupe_check(evening_note_bl):
    print("False")
else:
    new_dfs['evening_note'] = evening_note_bl
    print('Done!')

### DataFrame: GoNoGo - PLACEHOLDER (Extra Record - ksJM3Y)

In [None]:
# Load the Go/No-Go export and drop the excluded IDs.
gonogo = remove_dupes(pd.read_csv('digital-marshmallow-goNoGo-v2_8.8.17.csv'))

# Displayed result of the uniqueness check on the raw frame.
dupe_check(gonogo)

In [None]:
bl = 'baseline'
_21 = '21-day-assessment'

# Each entry is [column suffix, taskIdentifier value, variable label value].
# The suffix must agree with the assessment it filters on: previously the
# baseline-variable row was tagged '_21_' and the 21-day-stable row '_bl_',
# which produced two pairs of identically suffixed column groups.
gonogo_attributes = [
    ['_-_gonogo_bl_stable', bl, 'go_no_go_stable_stimulus_active_task'],
    ['_-_gonogo_bl_variable', bl, 'go_no_go_variable_stimulus_active_task'],
    ['_-_gonogo_21_stable', _21, 'go_no_go_stable_stimulus_active_task'],
    ['_-_gonogo_21_variable', _21, 'go_no_go_variable_stimulus_active_task']
]

# Reshape to one column group per gonogo_attributes entry.
# NOTE(review): this filters on 'data.json.variable_label' while the delay and
# discounting cells use 'data.json.variableLabel' - confirm the column name.
gonogo = df_merge(
    gonogo,
    gonogo_attributes,
    var_1='metadata.json.taskIdentifier',
    var_2='data.json.variable_label',
)

In [None]:
# Store the frame for the final merge only when externalId is unique.
if not dupe_check(gonogo):
    print('False')
else:
    new_dfs['gonogo'] = gonogo
    print('Done!')

In [None]:
# Show the externalId values that occur more than once (repeat occurrences only).
gonogo.loc[gonogo.externalId.duplicated(), 'externalId']

### DataFrame: Morning Notifications

In [None]:
# Load the morning-notification export and drop the excluded IDs.
morning_note = remove_dupes(
    pd.read_csv('digital-marshmallow-morning_notification_time-v3_8.8.17.csv')
)

# Displayed result of the uniqueness check on the raw frame.
dupe_check(morning_note)

In [None]:
# Single entry: [column suffix, taskIdentifier value].
morning_note_attributes = [['_-_morning_note_bl', 'baseline']]

# Keep only the matching rows and suffix their columns.
morning_note = df_merge(
    morning_note,
    morning_note_attributes,
    var_1='metadata.json.taskIdentifier',
)

In [None]:
# Store the frame for the final merge only when externalId is unique.
if not dupe_check(morning_note):
    print('False')
else:
    new_dfs['morning_note'] = morning_note
    print('Done!')

### DataFrame: PAM Multiple

In [None]:
# Load the PAM-multiple export and drop the excluded IDs.
pam_mult = remove_dupes(
    pd.read_csv('digital-marshmallow-pam_multiple-v2_8.8.17.csv')
)

# Displayed result of the uniqueness check on the raw frame.
dupe_check(pam_mult)

In [None]:
# Entries are [column suffix, taskIdentifier value]: baseline and 21-day.
pam_mult_attributes = [
    ['_-_pam_mult_bl', 'baseline'],
    ['_-_pam_mult_21', '21-day-assessment'],
]

# Reshape to one column group per assessment.
pam_mult = df_merge(
    pam_mult,
    pam_mult_attributes,
    var_1='metadata.json.taskIdentifier',
)

In [None]:
# Store the frame for the final merge only when externalId is unique.
if not dupe_check(pam_mult):
    print('False')
else:
    new_dfs['pam_mult'] = pam_mult
    print('Done!')

### Final Merge of All Sage Data

In [None]:
# Outer-join every stored frame on externalId, in dictionary order.
final_df = reduce(
    lambda merged, nxt: merged.merge(nxt, how='outer', on='externalId'),
    new_dfs.values(),
)

In [None]:
# Displayed result: True when the merged table is non-empty and has exactly
# one row per externalId.
dupe_check(final_df)

In [None]:
# Switch to the (absolute, machine-specific) output folder and write the
# merged table there.  to_csv is called with defaults, so the row index is
# written as the first, unnamed column.
os.chdir('/home/wraikes/Programming/Partnership/dmt/merged_data/')
final_df.to_csv('FINAL_SAGE.csv')