# Care Team Engagement Prediction - Data preprocessing

This notebook includes the following steps:
<ul>
<li> Setup </li>
<li> Read data file </li>
<li> Process data file </li>
<li> Remove nulls </li>
<li> Define Y </li>
</ul>

### 0. Setup

In [598]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [599]:
%matplotlib inline

### 1. Read data file (locally or from S3) 

In [600]:
# Read local data file
# Loads the raw CSV from the notebook's working directory into `df1`, the
# frame that every later cell transforms.
df1 = pd.read_csv('data15.csv')

### 2. Process Input file

In [601]:
# (rows, columns) of the frame as loaded.
df1.shape

(73619, 49)

In [602]:
# Column names of the frame as loaded, before any encoding.
df1.columns

Index(['days_to_first_et', 'days_to_coach', 'days_to_kit', 'acceptance_month',
       'indication', 'gender', 'user_age', 'bio_length', 'reasons_length',
       'imagine_free_length', 'reason_limited_time',
       'reason_family_obligations', 'reason_work_obligations', 'reason_other',
       'pain_severity', 'pain_vas', 'pain_description_length', 'weight', 'bmi',
       'gad', 'phq', 'is_sedentary', 'is_overweight', 'is_passive_coping',
       'is_catastrophizing', 'hours_to_complete_aq', 'conversations',
       'user_initiated_conv', 'hh_initiated_conv',
       'inbound_coach_messages_1_week', 'inbound_member_messages_1_week',
       'inbound_coach_messages_length_1_week',
       'inbound_coach_messages_4_weeks', 'conversation_density',
       'week_1_words', 'night_messages', 'morning_messages',
       'afternoon_messages', 'evening_message', 'count_text_msg_from_coach',
       'days_to_first_text_msg_from_user', 'days_to_first_text_msg_from_coach',
       'count_opr_msg_from_coach',

## Categorical Features

In [603]:
# One-hot encode `indication`: one indicator column per distinct value is
# appended after the existing columns and the categorical column is removed.
indication_dummies = pd.get_dummies(df1['indication'])
df1 = pd.concat([df1.drop(columns=['indication']), indication_dummies], axis=1)

In [604]:
# One-hot encode `gender` (one indicator column per distinct value) and drop
# the original column. The dummies are named `gender_dummies`, following the
# `<feature>_dummies` convention of the other categorical cells; the previous
# name `age_dummies` wrongly suggested these were age features.
gender_dummies = pd.get_dummies(df1['gender'])
df1 = pd.concat([df1, gender_dummies], axis=1)
df1 = df1.drop(['gender'], axis=1)

In [605]:
# One-hot encode `region`; rows with a missing region get their own 'None'
# indicator column.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on the `df1['region']` selection is a chained in-place update, which pandas
# warns about and which does not modify `df1` under copy-on-write.
df1['region'] = df1['region'].fillna('None')
region_dummies = pd.get_dummies(df1['region'])
df1 = pd.concat([df1, region_dummies], axis=1)
df1 = df1.drop(['region'], axis=1)

In [606]:
# One-hot encode `client_industry` and remove the categorical column.
# NOTE(review): the column summaries printed later list both 'Conglomerate' and
# 'Conglomerates' as well as 'Unkown'; these look like duplicate or misspelled
# categories - confirm and normalise them in the source data if so.
client_industry = pd.get_dummies(df1['client_industry'])
df1 = pd.concat([df1.drop(columns=['client_industry']), client_industry], axis=1)

### 3. Remove null values

In [607]:
print(df1.isnull().sum()) # per-column null counts; the recorded output shows missing values (e.g. days_to_first_et, days_to_kit)

days_to_first_et                  1125
days_to_coach                        0
days_to_kit                       1132
acceptance_month                     0
user_age                             0
                                  ... 
Retail                               0
Transportation and Warehousing       0
Unkown                               0
Utilities                            0
Wholesale                            0
Length: 85, dtype: int64


In [608]:
# Keep only members that have a `days_to_coach` value.
df1 = df1.dropna(subset=['days_to_coach'])

In [609]:
# Drop members whose `days_to_first_et` is missing.
df1 = df1.dropna(subset=['days_to_first_et'])

In [610]:
# Missing pain / anxiety / depression scores and conversation density are
# treated as 0.
# DataFrame.fillna with a per-column mapping returns a new frame. This replaces
# the chained `df1[col].fillna(..., inplace=True)` calls, which act on a
# column selection of an already-filtered frame and are not guaranteed to
# update `df1` (they are a no-op under pandas copy-on-write).
zero_fill_columns = ['pain_severity', 'pain_vas', 'gad', 'phq', 'conversation_density']
df1 = df1.fillna({column: 0 for column in zero_fill_columns})

In [611]:
# Mean-impute the continuous measurements: each column's missing entries are
# replaced by that same column's mean.
# Fixes a copy-paste bug where `weight` was filled with the mean of
# `hours_to_complete_aq`. Also assigns the result back instead of using
# chained fillna(inplace=True), which may not modify `df1`.
mean_fill_columns = ['bmi', 'hours_to_complete_aq', 'weight']
df1 = df1.fillna({column: df1[column].mean() for column in mean_fill_columns})

In [612]:
# Missing `days_to_kit` is replaced by 99.
# NOTE(review): 99 looks like a "no kit recorded" sentinel - confirm.
# Assigned back rather than chained fillna(inplace=True), which may not
# modify `df1` (no-op under pandas copy-on-write).
df1 = df1.fillna({'days_to_kit': 99})



In [613]:
# Missing time-of-day message counts and first-week word counts become 0.
# One DataFrame.fillna call with a per-column mapping replaces the chained
# fillna(inplace=True) calls, which may not modify `df1`.
message_count_columns = [
    'night_messages',
    'morning_messages',
    'afternoon_messages',
    'evening_message',
    'week_1_words',
]
df1 = df1.fillna({column: 0 for column in message_count_columns})


In [614]:
# Drop members with no `conversations` value.
df1 = df1.dropna(subset=['conversations'])

In [615]:
# Names of the columns that still contain at least one null.
has_null = df1.isna().any()
has_null[has_null].index.tolist()

[]

In [616]:
#print(df1.isnull().sum()) # found no missing values in the data

## refine variables

In [617]:
# Floor `days_to_kit` at 0 so negative values become 0.
df1 = df1.assign(days_to_kit=df1['days_to_kit'].clip(lower=0))

In [618]:

# Regression target: Y is the integer-cast value of
# `inbound_coach_messages_4_weeks`. The source column is removed so it cannot
# also be used as a feature, and Y is appended as the last column.
four_week_messages = df1['inbound_coach_messages_4_weeks'].astype(int)
df1 = df1.drop(columns=['inbound_coach_messages_4_weeks']).assign(Y=four_week_messages)

# Snapshot of the cleaned data with the count-valued target.
df1.to_csv('regressionData1.csv', index=False)


### 4. Define Y

In [619]:
# Classification target: Y becomes 1 when the 4-week count is strictly greater
# than `limit`, else 0.
# NOTE(review): the analysis note attached to this cell cited "9 or more
# messages" as the top-20% cut-off, but `limit = 7` flags 8 or more - confirm
# which threshold is intended.
limit = 7
df1 = df1.assign(Y=df1['Y'].gt(limit).astype(int))

# Save cleaned data for future use
df1.to_csv('classificationData1.csv', index=False)


# ----------------------------------------------------------------

## Make all Variables binary

In [620]:
# Split `user_age` into 10 quantile bins (age_0 .. age_9), one-hot encode the
# bins and drop the numeric column.
age_labels = [f'age_{decile}' for decile in range(10)]
age_dummies = pd.get_dummies(pd.qcut(df1['user_age'], q=10, labels=age_labels))
df1 = pd.concat([df1.drop(columns=['user_age']), age_dummies], axis=1)

In [621]:
# Indicator columns for fixed `bio_length` ranges:
#   bio_length_0: <= 5, bio_length_1..8: (lower, upper] between consecutive
#   edges, bio_length_9: > 50. The numeric column is dropped afterwards.
bio_length = df1['bio_length']
bio_edges = [5, 10, 15, 20, 25, 30, 35, 40, 50]

bio_flags = {'bio_length_0': bio_length.le(bio_edges[0]).astype(int)}
for bin_no, (lower, upper) in enumerate(zip(bio_edges, bio_edges[1:]), start=1):
    bio_flags[f'bio_length_{bin_no}'] = (bio_length.gt(lower) & bio_length.le(upper)).astype(int)
bio_flags[f'bio_length_{len(bio_edges)}'] = bio_length.gt(bio_edges[-1]).astype(int)

df1 = df1.assign(**bio_flags).drop(columns=['bio_length'])


In [622]:
# Indicator columns for fixed `reasons_length` ranges:
#   reasons_length_0: <= 10, then (10, 20], (20, 30], (30, 40], (40, 80],
#   and reasons_length_5: > 80. The numeric column is dropped afterwards.
reasons_length = df1['reasons_length']
reasons_edges = [10, 20, 30, 40, 80]

reasons_flags = {'reasons_length_0': reasons_length.le(reasons_edges[0]).astype(int)}
for bin_no, (lower, upper) in enumerate(zip(reasons_edges, reasons_edges[1:]), start=1):
    reasons_flags[f'reasons_length_{bin_no}'] = (
        reasons_length.gt(lower) & reasons_length.le(upper)
    ).astype(int)
reasons_flags[f'reasons_length_{len(reasons_edges)}'] = reasons_length.gt(reasons_edges[-1]).astype(int)

df1 = df1.assign(**reasons_flags).drop(columns=['reasons_length'])

In [623]:
# Indicator columns for fixed `imagine_free_length` ranges:
#   imagine_length_0: <= 5, then (5, 10], (10, 20], (20, 40], (40, 80],
#   and imagine_length_5: > 80. The numeric column is dropped afterwards.
imagine_length = df1['imagine_free_length']
imagine_edges = [5, 10, 20, 40, 80]

imagine_flags = {'imagine_length_0': imagine_length.le(imagine_edges[0]).astype(int)}
for bin_no, (lower, upper) in enumerate(zip(imagine_edges, imagine_edges[1:]), start=1):
    imagine_flags[f'imagine_length_{bin_no}'] = (
        imagine_length.gt(lower) & imagine_length.le(upper)
    ).astype(int)
imagine_flags[f'imagine_length_{len(imagine_edges)}'] = imagine_length.gt(imagine_edges[-1]).astype(int)

df1 = df1.assign(**imagine_flags).drop(columns=['imagine_free_length'])

In [624]:
# Indicator columns for fixed `pain_description_length` ranges:
#   pain_length_0: <= 10, then (10, 30], (30, 60], (60, 90], (90, 120],
#   and pain_length_5: > 120. The numeric column is dropped afterwards.
pain_desc_length = df1['pain_description_length']
pain_desc_edges = [10, 30, 60, 90, 120]

pain_desc_flags = {'pain_length_0': pain_desc_length.le(pain_desc_edges[0]).astype(int)}
for bin_no, (lower, upper) in enumerate(zip(pain_desc_edges, pain_desc_edges[1:]), start=1):
    pain_desc_flags[f'pain_length_{bin_no}'] = (
        pain_desc_length.gt(lower) & pain_desc_length.le(upper)
    ).astype(int)
pain_desc_flags[f'pain_length_{len(pain_desc_edges)}'] = pain_desc_length.gt(pain_desc_edges[-1]).astype(int)

df1 = df1.assign(**pain_desc_flags).drop(columns=['pain_description_length'])

In [625]:
# Reduce `inbound_coach_messages_length_1_week` to a single flag that is 1
# when the value is exactly 0, then drop the numeric column.
coach_length_is_zero = df1['inbound_coach_messages_length_1_week'].eq(0).astype(int)
df1 = (
    df1.assign(inbound_coach_length_0=coach_length_is_zero)
       .drop(columns=['inbound_coach_messages_length_1_week'])
)

In [626]:
# Reduce `inbound_coach_messages_1_week` to a single flag that is 1 when the
# value is exactly 0, then drop the numeric column.
coach_msgs_is_zero = df1['inbound_coach_messages_1_week'].eq(0).astype(int)
df1 = (
    df1.assign(inbound_coach_msg_0=coach_msgs_is_zero)
       .drop(columns=['inbound_coach_messages_1_week'])
)

In [627]:
# One indicator per exact value 0..11 of `inbound_member_messages_1_week`,
# plus inbound_member_msg_12 for anything above 11. The numeric column is
# dropped afterwards.
member_msgs = df1['inbound_member_messages_1_week']
member_msg_flags = {
    f'inbound_member_msg_{count}': member_msgs.eq(count).astype(int)
    for count in range(12)
}
member_msg_flags['inbound_member_msg_12'] = member_msgs.gt(11).astype(int)

df1 = df1.assign(**member_msg_flags).drop(columns=['inbound_member_messages_1_week'])

In [628]:
# Remove `transferred_to_coach_at` when present; a missing column is ignored.
df1 = df1.drop(columns=['transferred_to_coach_at'], errors='ignore')

In [629]:
# Split `bmi` into 5 quantile bins (bmi_1 .. bmi_5), one-hot encode the bins
# and drop the numeric column.
bmi_quintile = pd.qcut(df1['bmi'], q=5, labels=[f'bmi_{k}' for k in range(1, 6)])
df1 = pd.concat([df1.drop(columns=['bmi']), pd.get_dummies(bmi_quintile)], axis=1)

In [630]:
# Reduce `gad` to a single flag that is 1 when the score is exactly 0 (which
# includes the nulls zero-filled earlier), then drop the numeric column.
df1 = df1.assign(gad_0=df1['gad'].eq(0).astype(int)).drop(columns=['gad'])

In [631]:
# Reduce `phq` to a single flag that is 1 when the score is exactly 0 (which
# includes the nulls zero-filled earlier), then drop the numeric column.
df1 = df1.assign(phq_0=df1['phq'].eq(0).astype(int)).drop(columns=['phq'])

In [632]:
# Keep only a "no user-initiated conversations" flag (1 when the count is 0).
# Both raw conversation-initiator counts are dropped; `hh_initiated_conv` is
# removed without deriving any feature from it.
no_user_initiated = df1['user_initiated_conv'].eq(0).astype(int)
df1 = (
    df1.assign(user_initiated_conv_0=no_user_initiated)
       .drop(columns=['user_initiated_conv'])
       .drop(columns=['hh_initiated_conv'])
)

In [633]:
# Split `pain_vas` into 5 quantile bins (vas_1 .. vas_5), one-hot encode the
# bins and drop the numeric column.
vas_quintile = pd.qcut(df1['pain_vas'], q=5, labels=[f'vas_{k}' for k in range(1, 6)])
df1 = pd.concat([df1.drop(columns=['pain_vas']), pd.get_dummies(vas_quintile)], axis=1)

In [634]:
# Indicator columns for `pain_severity`:
#   pain_0: exactly 0, pain_1: (0, 2], pain_2: (2, 4], pain_3: (4, 7],
#   pain_4: > 7. The numeric column is dropped afterwards.
severity = df1['pain_severity']

severity_masks = {'pain_0': severity.eq(0)}
for bin_no, (lower, upper) in enumerate([(0, 2), (2, 4), (4, 7)], start=1):
    severity_masks[f'pain_{bin_no}'] = severity.gt(lower) & severity.le(upper)
severity_masks['pain_4'] = severity.gt(7)

df1 = (
    df1.assign(**{name: mask.astype(int) for name, mask in severity_masks.items()})
       .drop(columns=['pain_severity'])
)

In [635]:
#temp_bio_length = pd.cut(df1['days_since_acceptance'], bins=5, labels = ['accept_day_1', 'accept_day_2', 'accept_day_3', 'accept_day_4', 'accept_day_5'])
#bio_dummies = pd.get_dummies(temp_bio_length)
#df1 = pd.concat([df1, bio_dummies], axis=1) 
#df1 = df1.drop(['days_since_acceptance'], axis=1)

In [636]:
# Split `days_to_first_et` into 10 quantile bins (first_et_1 .. first_et_10),
# one-hot encode the bins and drop the numeric column.
first_et_labels = [f'first_et_{k}' for k in range(1, 11)]
first_et_decile = pd.qcut(df1['days_to_first_et'], q=10, labels=first_et_labels)
df1 = pd.concat([df1.drop(columns=['days_to_first_et']), pd.get_dummies(first_et_decile)], axis=1)

In [637]:
# Is_sedentary, is_overweight, is_passive_coping, is_catastrophizing
# is_repeat

In [638]:
# Ignore region for now


In [639]:
# One indicator column per calendar month value 1..12 of `acceptance_month`
# (accept_month_1 .. accept_month_12); the numeric column is dropped.
acceptance_month = df1['acceptance_month']
month_flags = {
    f'accept_month_{month}': acceptance_month.eq(month).astype(int)
    for month in range(1, 13)
}
df1 = df1.assign(**month_flags).drop(columns=['acceptance_month'])

In [640]:
# Reduce `hours_to_complete_aq` to a single flag that is 1 when the value is
# exactly 0, then drop the numeric column.
aq_is_zero = df1['hours_to_complete_aq'].eq(0).astype(int)
df1 = df1.assign(hours_to_complete_aq_0=aq_is_zero).drop(columns=['hours_to_complete_aq'])

# 

In [641]:
# Split `days_to_coach` into 10 quantile bins (days_to_coach_1 ..
# days_to_coach_10), one-hot encode the bins and drop the numeric column.
coach_day_labels = [f'days_to_coach_{k}' for k in range(1, 11)]
coach_day_decile = pd.qcut(df1['days_to_coach'], q=10, labels=coach_day_labels)
df1 = pd.concat([df1.drop(columns=['days_to_coach']), pd.get_dummies(coach_day_decile)], axis=1)



In [642]:
# Indicator columns for `conversations`:
#   conversations_0: <= 1, then (1, 2], (2, 4], (4, 8], and
#   conversations_4: > 8. The numeric column is dropped afterwards.
conversations = df1['conversations']
conversation_edges = [1, 2, 4, 8]

conversation_flags = {'conversations_0': conversations.le(conversation_edges[0]).astype(int)}
for bin_no, (lower, upper) in enumerate(zip(conversation_edges, conversation_edges[1:]), start=1):
    conversation_flags[f'conversations_{bin_no}'] = (
        conversations.gt(lower) & conversations.le(upper)
    ).astype(int)
conversation_flags[f'conversations_{len(conversation_edges)}'] = (
    conversations.gt(conversation_edges[-1]).astype(int)
)

df1 = df1.assign(**conversation_flags).drop(columns=['conversations'])

In [643]:
# Column names after the binning steps above.
df1.columns

Index(['days_to_kit', 'reason_limited_time', 'reason_family_obligations',
       'reason_work_obligations', 'reason_other', 'weight', 'is_sedentary',
       'is_overweight', 'is_passive_coping', 'is_catastrophizing',
       ...
       'days_to_coach_6', 'days_to_coach_7', 'days_to_coach_8',
       'days_to_coach_9', 'days_to_coach_10', 'conversations_0',
       'conversations_1', 'conversations_2', 'conversations_3',
       'conversations_4'],
      dtype='object', length=174)

In [644]:
# `count_opr_msg_from_coach` is noted as always being 3, so it is removed.
df1 = df1.drop(columns=['count_opr_msg_from_coach'])

In [645]:
# Turn each time-of-day message count into a 0/1 "any messages" flag, keeping
# the column names and positions unchanged.
time_of_day_columns = ['night_messages', 'morning_messages', 'afternoon_messages', 'evening_message']
df1 = df1.assign(**{
    column: df1[column].gt(0).astype(int)
    for column in time_of_day_columns
})


In [646]:
# Split `conversation_density` into 5 quantile bins (density_1 .. density_5),
# one-hot encode the bins and drop the numeric column.
density_labels = [f'density_{k}' for k in range(1, 6)]
density_quintile = pd.qcut(df1['conversation_density'], q=5, labels=density_labels)
df1 = pd.concat([df1.drop(columns=['conversation_density']), pd.get_dummies(density_quintile)], axis=1)

In [647]:

# Indicator columns for `days_to_first_text_msg_from_user`:
#   _0: <= 4, _1: exactly 5, _2: exactly 6, _3: (6, 8], _4: (8, 10],
#   _5: (10, 14], _6: (14, 998], _9: exactly 999.
# NOTE(review): 999 appears to be a sentinel for "no message from the user" -
# confirm against the data source. Suffixes 7 and 8 are intentionally unused.
first_user_day = df1['days_to_first_text_msg_from_user']

first_user_masks = {
    '1st_usr_msg_0': first_user_day.le(4),
    '1st_usr_msg_1': first_user_day.eq(5),
    '1st_usr_msg_2': first_user_day.eq(6),
}
for suffix, (lower, upper) in zip((3, 4, 5, 6), [(6, 8), (8, 10), (10, 14), (14, 998)]):
    first_user_masks[f'1st_usr_msg_{suffix}'] = first_user_day.gt(lower) & first_user_day.le(upper)
first_user_masks['1st_usr_msg_9'] = first_user_day.eq(999)

df1 = (
    df1.assign(**{name: mask.astype(int) for name, mask in first_user_masks.items()})
       .drop(columns=['days_to_first_text_msg_from_user'])
)


#bio_dummies = pd.get_dummies(days)
#df1 = pd.concat([df1, bio_dummies], axis=1) 
#df1 = df1.drop(['days_to_first_text_msg_from_user'], axis=1)

In [648]:

# Indicator columns for `days_to_first_text_msg_from_coach`:
#   _0: <= 3, _1: (3, 4], _2: (4, 5], _3: (5, 6], _4: > 6.
# The last bin previously started at `> 7`, so values in (6, 7] matched no
# indicator at all; it now starts where the previous bin ends.
first_coach_day = df1['days_to_first_text_msg_from_coach']
df1['1st_coach_msg_0'] = (first_coach_day <= 3).astype(int)
df1['1st_coach_msg_1'] = ((first_coach_day > 3) & (first_coach_day <= 4)).astype(int)
df1['1st_coach_msg_2'] = ((first_coach_day > 4) & (first_coach_day <= 5)).astype(int)
df1['1st_coach_msg_3'] = ((first_coach_day > 5) & (first_coach_day <= 6)).astype(int)
df1['1st_coach_msg_4'] = (first_coach_day > 6).astype(int)
df1 = df1.drop(['days_to_first_text_msg_from_coach'], axis=1)

In [649]:
# Split `count_text_msg_from_coach` into 5 quantile bins, one-hot encode the
# bins and drop the numeric column. The resulting columns are named
# 1st_msg_1 .. 1st_msg_5 although they encode a message count.
coach_text_labels = [f'1st_msg_{k}' for k in range(1, 6)]
coach_text_quintile = pd.qcut(df1['count_text_msg_from_coach'], q=5, labels=coach_text_labels)
df1 = pd.concat(
    [df1.drop(columns=['count_text_msg_from_coach']), pd.get_dummies(coach_text_quintile)],
    axis=1,
)

In [650]:
### Define Y

In [651]:
# Binary target after thresholding (0/1 per member).
df1['Y']

0        0
1        0
2        1
3        0
4        0
        ..
73606    0
73610    0
73615    0
73616    0
73618    0
Name: Y, Length: 72490, dtype: int64

In [652]:
# Class balance of the binary target.
df1['Y'].value_counts()

0    57065
1    15425
Name: Y, dtype: int64

In [653]:
# Save file
# Writes the encoded frame (features plus Y) without the index column.
# NOTE(review): not every column is binary at this point - the column sums
# shown further down still include numeric columns such as days_to_kit,
# weight and week_1_words.
df1.to_csv('binaryData1.csv', index=False)
#df1.to_csv('cleanData1.csv', index=False)

In [654]:
# Number of members flagged in the first bio-length bin.
df1['bio_length_0'].sum()

2668

In [655]:
# Number of members flagged in the second bio-length bin.
df1['bio_length_1'].sum()

9209

In [656]:
# .count() returns the number of non-null entries (the row count here), not
# the number of members in the bin; use .sum() for the latter.
df1['first_et_1'].count()

72490

In [657]:
# Per-column totals; for 0/1 indicator columns this is the number of members
# with the flag set. Later inspection cells index into `t`.
t = df1.sum(axis=0)

In [658]:
# Display the per-column totals.
t

days_to_kit                  362412.0
reason_limited_time           29212.0
reason_family_obligations     13562.0
reason_work_obligations       19140.0
reason_other                   4730.0
                               ...   
1st_msg_1                     24665.0
1st_msg_2                      8273.0
1st_msg_3                     14023.0
1st_msg_4                     11343.0
1st_msg_5                     14186.0
Length: 192, dtype: float64

In [659]:
# Columns whose total is below 5000 (sparse indicators).
t.loc[t.lt(5000)]

reason_other                      4730.0
is_sedentary                         0.0
is_passive_coping                    0.0
is_catastrophizing                   0.0
morning_messages                  1953.0
is_repeat                         1260.0
O                                  507.0
New England                       2827.0
None                               162.0
West North Central                4572.0
Agriculture                         85.0
Conglomerate                       577.0
Conglomerates                     1544.0
Construction                       705.0
Education                         3893.0
Energy                            1087.0
Financial Services                4286.0
Hospitality                       1799.0
Information Technology            3964.0
Insurance                         2649.0
Internet & Telecommunications     3417.0
Labor and Trust                     77.0
Other                              759.0
Professional Services             4364.0
Transportation a

In [660]:
# Member counts for the ten age bins, in bin order.
age = [t[f'age_{decile}'] for decile in range(10)]
age

[8457.0,
 7470.0,
 6788.0,
 7143.0,
 7299.0,
 6414.0,
 8340.0,
 6493.0,
 7641.0,
 6445.0]

In [661]:
# Member count for the pain_length_4 bin.
t['pain_length_4']

14825.0

In [662]:
# Member count for the inbound_member_msg_1 indicator.
t['inbound_member_msg_1']

20.0

In [663]:
# Member counts for the six imagine_length bins, in bin order.
imagine = [t[f'imagine_length_{bin_no}'] for bin_no in range(6)]
imagine

[4202.0, 8910.0, 15555.0, 20588.0, 16691.0, 6544.0]

In [664]:
# Member counts for the six pain_length bins, in bin order.
pain = [t[f'pain_length_{bin_no}'] for bin_no in range(6)]
pain

[6018.0, 10670.0, 17111.0, 14217.0, 14825.0, 9649.0]

In [665]:
# Member counts for the five pain-severity bins, in bin order.
# This rebinds `pain`, which previously held the pain_length counts.
pain = [t[f'pain_{bin_no}'] for bin_no in range(5)]
pain

[16757.0, 9654.0, 32746.0, 11958.0, 1375.0]

In [666]:
# Member counts for the ten bio_length bins, in bin order.
bio = [t[f'bio_length_{bin_no}'] for bin_no in range(10)]
bio

[2668.0,
 9209.0,
 6050.0,
 8705.0,
 8729.0,
 8331.0,
 6671.0,
 5312.0,
 7239.0,
 9576.0]

In [667]:
# Member counts for days_to_coach bins 1-5 only; bins 6-10 are not listed.
days = [t[f'days_to_coach_{bin_no}'] for bin_no in range(1, 6)]
days

[9601.0, 5995.0, 11006.0, 9248.0, 5917.0]

In [668]:
# Member counts for the five conversations bins, in bin order.
[t[f'conversations_{bin_no}'] for bin_no in range(5)]


[4439.0, 18890.0, 28325.0, 14463.0, 6373.0]

In [669]:
# Columns whose total exceeds 20000.
t.loc[t.gt(20000)]

days_to_kit                      362412.0
reason_limited_time               29212.0
weight                         13171129.0
is_overweight                     45520.0
week_1_words                    2357103.0
afternoon_messages                26120.0
first_et_before_coach             73355.0
first_user_msg_before_coach      231123.0
kit_arrival_before_coach         561466.0
back                              30619.0
F                                 42295.0
M                                 29688.0
reasons_length_4                  21020.0
imagine_length_3                  20588.0
inbound_coach_length_0            38231.0
inbound_coach_msg_0               38166.0
gad_0                             32732.0
phq_0                             44637.0
user_initiated_conv_0             60741.0
pain_2                            32746.0
accept_month_3                    20814.0
hours_to_complete_aq_0            64803.0
conversations_2                   28325.0
1st_usr_msg_9                     

In [670]:
# Member counts for inbound_member_msg_0 .. inbound_member_msg_11; the
# "more than 11" bin (inbound_member_msg_12) is not listed.
[t[f'inbound_member_msg_{count}'] for count in range(12)]

[1.0,
 20.0,
 227.0,
 4833.0,
 8202.0,
 16337.0,
 9733.0,
 6457.0,
 5203.0,
 4083.0,
 3301.0,
 2598.0]

In [671]:
# Member counts for the 1st_usr_msg bins that exist (suffixes 7 and 8 were
# never created).
[t[f'1st_usr_msg_{suffix}'] for suffix in (0, 1, 2, 3, 4, 5, 6, 9)]

[6148.0, 4625.0, 6774.0, 4782.0, 2173.0, 2465.0, 3015.0, 42508.0]