## Stata Part 3 Replication

### Discussion Questions:
- Move validation code into another layer.
- Coder IDs are not consistent between files (e.g., Rebekah is coder 8 in one file, while Arielle is coder 8 in a different file).

In [1]:
import pandas as pd
import numpy as np
from utils import clean_columns,generate_score_variables,generate_behavior_columns,generate_calculated_columns,generate_duplicate_column,drop_cols_by_name

In [2]:
def performance_clean(csv_name,isExcel=False,sheet_name=None,sid=2,time=0):
    '''
    Base function that handles the core cleaning shared by all of the part 3 files.

    Parameters:
        csv_name: file name without extension. Read from ../data/ as .xlsx when
            isExcel is True, otherwise as .csv.
        isExcel: whether the source file is an Excel workbook.
        sheet_name: worksheet to read (Excel only). None means the first sheet.
        sid: value written to every row of the 'sid' column.
        time: value written to every row of the 'time' column.

    Returns:
        The DataFrame with cleaned column names, the 'sid', 'time' and 'vid'
        columns added, and the utils generate_* helpers called on it.
    '''
    # Rows 1 and 2 are skipped on read
    # (presumably extra header rows from the survey export -- TODO confirm).
    if isExcel:
        # read_excel(sheet_name=None) returns a dict of every sheet instead of a
        # DataFrame, which would break the column handling below, so None falls
        # back to the first sheet.
        sheet = 0 if sheet_name is None else sheet_name
        data = pd.read_excel(f"../data/{csv_name}.xlsx",sheet_name=sheet,skiprows=[1,2])
    else:
        data = pd.read_csv(f"../data/{csv_name}.csv",skiprows=[1,2])
    # Get cleaned column names; the flag tells clean_columns whether this is the
    # Fall 2018 baseline export (how it is treated differently is defined in utils).
    is_fall_2018_baseline = csv_name == 'Fall 2018 Behavioral Redirections Baseline_July 24, 2019_14.06'
    data.columns = clean_columns(data.columns,is_fall_2018_baseline)
    # Set sid and time
    data['sid']=sid
    data['time']=time
    # Construct vid column as "<id>_<sid>"
    data['vid'] = data['id'].astype(str) + "_" + data['sid'].astype(str)
    # The helpers below are called for their effect on `data`; their return
    # values are not used.
    # Generate duplicate column
    generate_duplicate_column(data)
    # Get score variables
    generate_score_variables(data)
    # Get Behavior columns
    generate_behavior_columns(data)
    # Get calculated columns
    generate_calculated_columns(data)
    return data

In [3]:
def performance_clean_pt_1(csv_name,isExcel=False,sheet_name=None):
    '''
    3a cleaning: run the base cleaning and, for the Fall 2018 baseline file only,
    drop two extra named column ranges.

    Takes the same csv_name / isExcel / sheet_name arguments as performance_clean
    and returns the cleaned DataFrame.
    '''
    cleaned = performance_clean(csv_name,isExcel,sheet_name)
    # Every file other than the Fall 2018 baseline needs no further work
    if csv_name != 'Fall 2018 Behavioral Redirections Baseline_July 24, 2019_14.06':
        return cleaned
    # Fall 2018 baseline only: remove the two named column ranges, in this order
    for first_col,last_col in (('q25','q8'),('q20','q26')):
        drop_cols_by_name(cleaned,first_col,last_col)
    return cleaned

In [4]:
# Perform 3a cleaning on the two files
data11 = performance_clean_pt_1('Summer 2018 Behavioral Redirections Baseline',isExcel=True, sheet_name='Summer 2018 Behavioral Redirect')
data12 = performance_clean_pt_1('Fall 2018 Behavioral Redirections Baseline_July 24, 2019_14.06')
# Stack the two cleaned files row-wise. DataFrame.append was removed in pandas 2.0;
# pd.concat with its defaults gives the same result (original row labels are kept).
data1 = pd.concat([data11, data12])


In [5]:
def performance_clean_pt_2(csv_name,isExcel=False,sheet_name=None):
    '''
    3b cleaning: run the base cleaning, drop two named column ranges, then attach
    the coaching assignment and tracker columns.

    Returns the merged DataFrame. Rows with no matching coaching record are kept
    (left join on id and cid); rows whose id is absent from the tracker are
    dropped (inner join on id).
    '''
    cleaned = performance_clean(csv_name,isExcel,sheet_name)
    # Drop the two named column ranges
    for first_col,last_col in (('startdate','finished'),('responseid','userlanguage')):
        drop_cols_by_name(cleaned,first_col,last_col)
    # Coaching assignments, restricted to the columns used here
    coaching_cols = ['id','coder_num','codingtype','cid']
    coaching = pd.read_excel('../data/S19 BR Coaching Video Assignments_Cleaned.xls')[coaching_cols]
    # Tracker data; id_student is renamed so it can be joined on 'id'
    tracker_cols = ['id_student','email','id_coach','id_interactor']
    tracker = pd.read_excel('../data/tracker.xls')[tracker_cols].rename(columns={'id_student':'id'})
    # Coaching first (left join), then tracker (inner join)
    return (
        cleaned
        .merge(coaching,on=['id','cid'],how='left')
        .merge(tracker,on='id',how='inner')
    )

In [6]:
# Run the 3b cleaning on the Spring 2019 coded performance outcomes workbook
data2 = performance_clean_pt_2(
    'Spring2019_BR_CodedPerformanceOutcomes',
    isExcel=True,
    sheet_name='Sheet1',
)

In [7]:
def performance_clean_pt_3(csv_name,isExcel=False,sheet_name=None):
    '''
    3c cleaning: run the base cleaning, then drop three named column ranges.

    Takes the same csv_name / isExcel / sheet_name arguments as performance_clean
    and returns the cleaned DataFrame.
    '''
    cleaned = performance_clean(csv_name,isExcel,sheet_name)
    # Column ranges to remove, given as (first column, last column), applied in order
    column_ranges = (
        ('startdate','finished'),
        ('responseid','userlanguage'),
        ('q12','q26'),
    )
    for first_col,last_col in column_ranges:
        drop_cols_by_name(cleaned,first_col,last_col)
    return cleaned

In [8]:
# Run the 3c cleaning on the Spring 2019 exit file
data3 = performance_clean_pt_3(
    '2019 Spring- Exit- Behavioral Redirections_August 7, 2019_08.26',
    isExcel=True,
    sheet_name='in',
)

In [9]:
# Preview the first rows of the combined 3a data (both baseline files)
data1.head()

Unnamed: 0,startdate,enddate,status,ipaddress,progress,duration (in seconds),finished,recordeddate,responseid,recipientlastname,...,tot_cu,tot_nb,tot_se,tot_ti,tot_su,score_dc_avg,prop_beh_ack,ti_dc_avg,prop_redirect,su_dc_avg
0,2018-08-05 11:52:20,2018-08-05 12:08:22,IP Address,73.12.28.114,100,962,True,2018-08-05 12:08:22,R_1DG1LFyukbYExE4,,...,186.0,0,0,186.0,186.0,1.0,1.0,1.0,1.0,1.0
1,2018-08-05 12:28:36,2018-08-05 12:30:57,IP Address,73.12.28.114,100,141,True,2018-08-05 12:30:57,R_zbNDFbpkHNBY653,,...,159.0,0,0,159.0,159.0,1.0,1.0,1.0,1.0,1.0
2,2018-08-05 12:50:12,2018-08-05 12:55:15,IP Address,73.12.28.114,100,302,True,2018-08-05 12:55:15,R_1LSRL5gAU8hCSjT,,...,147.0,0,0,147.0,147.0,1.0,1.0,1.0,1.0,1.0
3,2018-08-05 13:06:06,2018-08-05 13:10:16,IP Address,73.12.28.114,100,250,True,2018-08-05 13:10:17,R_21aEOkZyCCNB3UR,,...,102.0,0,0,102.0,102.0,1.0,1.0,1.0,1.0,1.0
4,2018-08-05 13:23:56,2018-08-05 13:26:11,IP Address,73.12.28.114,100,134,True,2018-08-05 13:26:11,R_1kZgh53W7n6CUOi,,...,196.0,0,0,196.0,196.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Preview the first rows of the 3b data (base cleaning merged with coaching and tracker columns)
data2.head()

Unnamed: 0,recordeddate,first_beh,cid,id,b1oc,b2oc,b3oc,b4oc,b5oc,b6oc,...,score_dc_avg,prop_beh_ack,ti_dc_avg,prop_redirect,su_dc_avg,coder_num,codingtype,email,id_coach,id_interactor
0,2019-03-05 06:27:42,Ethan humming,Claire,102,1,1,1,1,1,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,Group Norm,kmh3yj@virginia.edu,Arielle,Carrie
1,2019-03-05 06:27:42,Ethan humming,Claire,102,1,1,1,1,1,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,Norm,kmh3yj@virginia.edu,Arielle,Carrie
2,2019-03-03 13:03:29,Ethan video game,Maggie,102,1,1,1,1,1,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,Norm,kmh3yj@virginia.edu,Arielle,Carrie
3,2019-03-17 16:02:49,Ethan humming,Claire,14,1,1,1,1,1,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,Norm,ajm8gx@virginia.edu,Casedy,Carrie
4,2019-03-27 17:03:29,Ethan video game,Rachel G,14,1,1,1,1,1,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,Norm,ajm8gx@virginia.edu,Casedy,Carrie


In [11]:
# Preview the first rows of the 3c data
data3.head()

Unnamed: 0,recordeddate,first_beh,cid,id,b1oc,b2oc,b3oc,b4oc,b5oc,b6oc,...,tot_cu,tot_nb,tot_se,tot_ti,tot_su,score_dc_avg,prop_beh_ack,ti_dc_avg,prop_redirect,su_dc_avg
0,2019-06-27 13:56:20,Ethan video game,,,,,,,,,...,0.0,0,0,0.0,0.0,,,,,
1,2019-07-01 16:59:28,Ethan humming,Claire,55.0,1.0,1.0,1.0,1.0,1.0,1.0,...,192.0,0,0,192.0,192.0,1.0,1.0,1.0,1.0,1.0
2,2019-07-08 16:53:24,Ethan humming,,,,,,,,,...,0.0,0,0,0.0,0.0,,,,,
3,2019-07-09 09:30:57,Ethan humming,Rachel L,58.0,1.0,1.0,1.0,1.0,1.0,1.0,...,11.0,2,1,11.0,11.0,1.0,1.0,1.0,1.0,1.0
4,2019-07-09 09:50:11,Ethan humming,Rachel L,23.0,1.0,1.0,1.0,1.0,1.0,1.0,...,121.0,0,0,121.0,121.0,1.0,1.0,1.0,1.0,1.0


In [79]:
# Write each cleaned part to its own CSV in the working directory
# (3-1.csv, 3-2.csv, 3-3.csv), leaving out the row index
cleaned_parts = (data1, data2, data3)
for i, frame in enumerate(cleaned_parts):
    frame.to_csv(f'3-{i+1}.csv', index=False)