In [2]:
# we need to decide on the outcome
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# third party R-Python integration module
import statsmodels.api as sm
import statsmodels.formula.api as smf

# custom eda module, abstracted for cleanliness
from eda import *

# set plot style to Seaborn
sns.set_style("white")
sns.set_style("ticks")
sns.set(font_scale=1.5)
sns.set(color_codes=True)

%matplotlib inline

In [3]:
print('Loading and cleaning AWS survey responses')
df_aws = pd.read_csv('w241 Final Project - AWS_July 23, 2019_09.55.csv')
control_aws, treatment_aws = clean_and_split_data(df_aws)

print('Loading and cleaning non-AWS survey responses')
df_qual = pd.read_csv('w241 Final Project_July 30, 2019_15.38.csv')
control_qual, treatment_qual = clean_and_split_data(df_qual)

# test to ensure that there are no NaN left in the data after pre-processing
assert not control_aws.isna().any().any() and not treatment_aws.isna().any().any()
assert not control_qual.isna().any().any() and not treatment_qual.isna().any().any()

outcome_col_names = ['friendly', 'positive', 'sincere', 'comfortable', 'work_with', 
          'situation', 'peers', 'others_above', 'others_below', 'externally', 
          'hardworking', 'knowledgable', 'motivated', 'leadership', 'project']

Loading and cleaning AWS survey responses
Dropping first two header/metadata rows
Dropping null values in ['Progress']: 0
Dropping surveys with less than 100% progress: 25
Dropping surveys that didn't mark attention question correctly: 158
Loading and cleaning non-AWS survey responses
Dropping first two header/metadata rows
Dropping null values in ['Progress']: 0
Dropping surveys with less than 100% progress: 4
Dropping surveys that didn't mark attention question correctly: 3


In [4]:
control_aws.shape[0] + treatment_aws.shape[0] + control_qual.shape[0] + treatment_qual.shape[0]

540

In [16]:
control_outcome_cols = control_aws.filter(like='control_').columns
outcome_cols = [x.replace('control_', '') for x in control_outcome_cols]
rename_control_outcomes = {k:v for k, v in zip(control_outcome_cols, outcome_cols)}
control_aws = control_aws.rename(columns=rename_control_outcomes)
control_aws.head()

Unnamed: 0,pronouns,orientation,age,education,region,friendly,positive,sincere,comfortable,work_with,...,North African,Pacific Islander,White,Employed - Full Time,Employed - Part Time,Retired,Student,Unemployed,survey_source,assignment
15,He/his,Heterosexual,41 - 50,Bachelor's Degree,Southwestern United States,5,5,5,5,5,...,0,0,0,1,0,0,0,0,AWS,Control
16,He/his,Heterosexual,31 - 40,Master's Degree,Southeastern United States,5,5,5,6,5,...,0,0,1,1,0,0,0,0,AWS,Control
19,He/his,Heterosexual,Over 60,Master's Degree,Southeastern United States,6,6,6,6,6,...,0,0,1,0,1,0,0,0,AWS,Control
21,She/hers,Heterosexual,41 - 50,Master's Degree,Northeastern United States,5,6,6,5,5,...,0,0,1,0,0,0,0,0,AWS,Control
31,He/his,Heterosexual,31 - 40,Bachelor's Degree,Not located in United States,6,7,7,6,6,...,0,0,0,1,0,0,0,0,AWS,Control


In [18]:
treatment_outcome_cols = treatment_aws.filter(like='treatment_').columns
outcome_cols = [x.replace('treatment_', '') for x in treatment_outcome_cols]
rename_treatment_outcomes = {k:v for k, v in zip(treatment_outcome_cols, outcome_cols)}
treatment_aws = treatment_aws.rename(columns=rename_treatment_outcomes)
treatment_aws.head()

Unnamed: 0,pronouns,orientation,age,education,region,friendly,positive,sincere,comfortable,work_with,...,North African,Pacific Islander,White,Employed - Full Time,Employed - Part Time,Retired,Student,Unemployed,survey_source,assignment
7,He/his,Heterosexual,31 - 40,Some graduate college,Southeastern United States,4,6,5,5,5,...,0,0,0,1,0,0,0,0,AWS,Treatment
11,He/his,Heterosexual,41 - 50,High School/GED,Southeastern United States,6,6,5,6,5,...,0,0,1,1,0,0,0,0,AWS,Treatment
14,She/hers,Asexual,31 - 40,High School/GED,Not located in United States,3,3,4,1,1,...,0,0,1,0,0,1,0,0,AWS,Treatment
22,He/his,Heterosexual,22 - 30,Bachelor's Degree,Southeastern United States,4,6,5,5,4,...,0,0,0,1,0,0,0,0,AWS,Treatment
23,She/hers,Bisexual,22 - 30,Bachelor's Degree,Southeastern United States,6,7,6,7,7,...,0,0,0,1,0,0,0,0,AWS,Treatment


In [19]:
# Create two columns, survey source for AWS/MIDS and treatment assignment
control_aws['survey_source'] = 'AWS'
control_aws['assignment'] = 'Control'
treatment_aws['survey_source'] = 'AWS'
treatment_aws['assignment'] = 'Treatment'

# remove assignment prefixes from column names
aws_df = pd.concat([control_aws, treatment_aws])

Unnamed: 0,pronouns,orientation,age,education,region,friendly,positive,sincere,comfortable,work_with,...,North African,Pacific Islander,White,Employed - Full Time,Employed - Part Time,Retired,Student,Unemployed,survey_source,assignment
15,He/his,Heterosexual,41 - 50,Bachelor's Degree,Southwestern United States,5,5,5,5,5,...,0,0,0,1,0,0,0,0,AWS,Control
16,He/his,Heterosexual,31 - 40,Master's Degree,Southeastern United States,5,5,5,6,5,...,0,0,1,1,0,0,0,0,AWS,Control
19,He/his,Heterosexual,Over 60,Master's Degree,Southeastern United States,6,6,6,6,6,...,0,0,1,0,1,0,0,0,AWS,Control
21,She/hers,Heterosexual,41 - 50,Master's Degree,Northeastern United States,5,6,6,5,5,...,0,0,1,0,0,0,0,0,AWS,Control
31,He/his,Heterosexual,31 - 40,Bachelor's Degree,Not located in United States,6,7,7,6,6,...,0,0,0,1,0,0,0,0,AWS,Control


In [37]:
control_outcome_cols = control_qual.filter(like='control_').columns
outcome_cols = [x.replace('control_', '') for x in control_outcome_cols]
rename_control_outcomes = {k:v for k, v in zip(control_outcome_cols, outcome_cols)}
control_qual = control_qual.rename(columns=rename_control_outcomes)

treatment_outcome_cols = treatment_qual.filter(like='treatment_').columns
outcome_cols = [x.replace('treatment_', '') for x in treatment_outcome_cols]
rename_treatment_outcomes = {k:v for k, v in zip(treatment_outcome_cols, outcome_cols)}
treatment_qual = treatment_qual.rename(columns=rename_treatment_outcomes)

control_qual['survey_source'] = 'MIDS'
control_qual['assignment'] = 'Control'
treatment_qual['survey_source'] = 'MIDS'
treatment_qual['assignment'] = 'Treatment'

qual_df = pd.concat([control_qual, treatment_qual])

In [43]:
full_data = pd.concat([aws_df, qual_df])
full_data.to_csv('email_surveys.csv')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [None]:
# Let's look at summary statistics for demographic data by group
control_aws.describe()

In [None]:
treatment_aws.describe()

In [None]:
control_qual.describe()

In [None]:
treatment_qual.describe()

In [None]:
# Let's look at summary statistics for outcome variable by group
control_outcome_columns = ['control_' + x for x in outcome_col_names]
treatment_outcome_columns = ['treatment_' + x for x in outcome_col_names]
control_aws.loc[:, control_outcome_columns].describe()

In [None]:
treatment_aws.loc[:, treatment_outcome_columns].describe()

In [None]:
control_qual.loc[:, control_outcome_columns].describe()

In [None]:
treatment_qual.loc[:, treatment_outcome_columns].describe()

In [None]:
def show_distribution(col, var_name):
    control_col = 'control_' + col
    treatment_col = 'treatment_' + col
    # Plot Control Distribution in AWS Group
    sns.countplot(x=control_aws.loc[:, control_col].values).set_title(f"AWS Control {var_name} Likert")
    plt.savefig(f'aws_control_{var_name}_countplot.png')
    plt.close()
    # Plot Treatment Distribution in AWS Group
    sns.countplot(x=treatment_aws.loc[:, treatment_col].values).set_title(f"AWS Treatment {var_name} Likert")
    plt.savefig(f'aws_treatment_{var_name}_countplot.png')
    plt.close()
    
    # Plot Control Distribution in Non-AWS Group
    sns.countplot(x=control_qual.loc[:, control_col].values).set_title(f"Non-AWS Control {var_name} Likert")
    plt.savefig(f'control_{var_name}_countplot.png')
    plt.close()
    
    # Plot Treatment Distribution in Non-AWS Group
    sns.countplot(x=treatment_qual.loc[:, treatment_col].values).set_title(f"Non-AWS Treatment {var_name} Likert")
    plt.savefig(f'treatment_{var_name}_countplot.png')
    plt.close()


In [None]:
# let's do some basic visualizations of our outcome variable
title_variable_names = ['Friendliness', "Positivity", "Sincerity", "Comfort", "Working With",
                       "Situational Appropriateness", "Peer Appropriateness", "Superior Appropriateness", "Subordinate Appropriateness", "External Appropriateness",
                       "Hardworking", "Knowledgable", "Motivated", "Leadership", "Share a Project"]
for col_name, variable_name in zip(outcome_col_names, title_variable_names):
    show_distribution(col_name, variable_name)