In [267]:
#Read data file
import pandas as pd
from sklearn import preprocessing

df1 = pd.read_csv('clone_of_low_tier_retention_classifier-query_5-663ddaee2c8e-2022-06-15-15-52-52 (1).csv')

In [268]:
df1.shape

(53102, 34)

In [269]:
df1.columns

Index(['pathway_id', 'user_id', 'uuid', 'acceptance_date', 'first_et',
       'days_to_onboard', 'transferred_to_coach_at', 'indication',
       'is_gender_female', 'is_gender_male', 'is_gender_other',
       'user_age_years', 'gad', 'phq', 'surgery_1yr', 'pain_severity',
       'pain_vas', 'pain_description_length', 'bmi', 'bio_length',
       'reasons_length', 'imagine_free_length', 'reason_limited_time',
       'reason_family_obligations', 'reason_work_obligations', 'reason_other',
       'inbound_os_messages_all', 'inbound_os_messages',
       'inbound_coach_messages_all', 'inbound_coach_messages',
       'ets_per_week_4_wk', 'week_4_retained', 'length_messages', 'cluster'],
      dtype='object')

In [270]:
# Find memebers with multiple pathways
#tx_frequency = df1.groupby('user_id').acceptance_date.count().reset_index()
#tx_frequency.columns = ['u.uuid','frequency']
#tx_frequency[tx_frequency['frequency'] > 1].shape


In [271]:
df1 = df1.drop(['pathway_id', 'user_id', 'uuid', 'ets_per_week_4_wk', 'week_4_retained', 'cluster'], axis=1)

In [272]:
start_date = '2022-01-01'
# 'acceptance_date' -> 'first_et' -> 'transferred_to_coach_at'

df1['transferred_to_coach_day'] = (pd.to_datetime(df1['transferred_to_coach_at']) - pd.to_datetime(df1['acceptance_date'])).dt.days
df1['first_et_day'] = (pd.to_datetime(df1['first_et']) - pd.to_datetime(df1['acceptance_date'])).dt.days                     
df1['acceptance_days_ago'] = (pd.to_datetime(df1['acceptance_date']) - pd.to_datetime(start_date)).dt.days
                       
df1 = df1.drop(['acceptance_date', 'transferred_to_coach_at', 'first_et'], axis=1)

In [273]:
df1.dtypes

days_to_onboard               float64
indication                     object
is_gender_female                int64
is_gender_male                  int64
is_gender_other                 int64
user_age_years                float64
gad                           float64
phq                           float64
surgery_1yr                     int64
pain_severity                 float64
pain_vas                      float64
pain_description_length         int64
bmi                           float64
bio_length                      int64
reasons_length                  int64
imagine_free_length             int64
reason_limited_time             int64
reason_family_obligations       int64
reason_work_obligations         int64
reason_other                    int64
inbound_os_messages_all         int64
inbound_os_messages             int64
inbound_coach_messages_all      int64
inbound_coach_messages          int64
length_messages                 int64
transferred_to_coach_day      float64
first_et_day

In [274]:
indication_dummies = pd.get_dummies(df1['indication'])
df1 = pd.concat([df1, indication_dummies], axis=1)      
df1 = df1.drop(['indication'], axis=1)

In [275]:
# Define target column
# See analysis below
df1['total_messages'] = df1['inbound_os_messages_all'] + df1['inbound_coach_messages_all']
df1['Y'] = df1['total_messages'] > 9
df1 = df1.drop(['inbound_coach_messages', 'inbound_os_messages', 'total_messages'], axis=1)

In [276]:
df1.head(5)

Unnamed: 0,days_to_onboard,is_gender_female,is_gender_male,is_gender_other,user_age_years,gad,phq,surgery_1yr,pain_severity,pain_vas,...,length_messages,transferred_to_coach_day,first_et_day,acceptance_days_ago,back,hip,knee,neck,shoulder,Y
0,3.0,0,1,0,54.0,0.0,4.0,1,1.0,0.9,...,0,3.0,3,45,1,0,0,0,0,False
1,75.0,1,0,0,37.0,15.0,14.0,0,4.0,3.4,...,161,75.0,75,45,0,0,0,1,0,False
2,13.0,1,0,0,53.0,18.0,0.0,0,4.0,5.5,...,474,21.0,13,45,0,0,0,1,0,False
3,16.0,1,0,0,56.0,5.0,6.0,53,10.0,8.2,...,5008,16.0,16,50,0,0,1,0,0,True
4,5.0,1,0,0,36.0,4.0,0.0,0,1.0,5.1,...,0,5.0,5,46,0,0,0,1,0,False


### Fixing Nulls

In [277]:
print(df1.isnull().sum()) # found no missing values in the data

days_to_onboard                   0
is_gender_female                  0
is_gender_male                    0
is_gender_other                   0
user_age_years                    0
gad                               6
phq                               7
surgery_1yr                       0
pain_severity                 12162
pain_vas                       2175
pain_description_length           0
bmi                            2378
bio_length                        0
reasons_length                    0
imagine_free_length               0
reason_limited_time               0
reason_family_obligations         0
reason_work_obligations           0
reason_other                      0
inbound_os_messages_all           0
inbound_coach_messages_all        0
length_messages                   0
transferred_to_coach_day       1010
first_et_day                      0
acceptance_days_ago               0
back                              0
hip                               0
knee                        

In [278]:
# remove members with transferred_to_coach_day = null
df1 = df1[df1['transferred_to_coach_day'].notna()]

In [279]:
# if pain_severity or pain_vas is null -> 0
df1['pain_severity'].fillna(0, inplace=True)
df1['pain_vas'].fillna(0, inplace=True)
df1['gad'].fillna(0, inplace=True)
df1['phq'].fillna(0, inplace=True)

In [280]:
# but average BMI where BMI is null
df1['bmi'].fillna((df1['bmi'].mean()), inplace=True)

In [281]:
print(df1.isnull().sum()) # found no missing values in the data

days_to_onboard               0
is_gender_female              0
is_gender_male                0
is_gender_other               0
user_age_years                0
gad                           0
phq                           0
surgery_1yr                   0
pain_severity                 0
pain_vas                      0
pain_description_length       0
bmi                           0
bio_length                    0
reasons_length                0
imagine_free_length           0
reason_limited_time           0
reason_family_obligations     0
reason_work_obligations       0
reason_other                  0
inbound_os_messages_all       0
inbound_coach_messages_all    0
length_messages               0
transferred_to_coach_day      0
first_et_day                  0
acceptance_days_ago           0
back                          0
hip                           0
knee                          0
neck                          0
shoulder                      0
Y                             0
dtype: i

### Normalize

In [282]:
x = df1.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df2 = pd.DataFrame(x_scaled, columns=df1.columns)

In [283]:
df2.head()

Unnamed: 0,days_to_onboard,is_gender_female,is_gender_male,is_gender_other,user_age_years,gad,phq,surgery_1yr,pain_severity,pain_vas,...,length_messages,transferred_to_coach_day,first_et_day,acceptance_days_ago,back,hip,knee,neck,shoulder,Y
0,0.034483,0.0,1.0,0.0,0.467532,0.0,0.166667,0.01,0.1,0.09,...,0.0,0.026786,0.034483,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.862069,1.0,0.0,0.0,0.246753,0.714286,0.583333,0.0,0.4,0.34,...,0.002137,0.669643,0.862069,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.149425,1.0,0.0,0.0,0.454545,0.857143,0.0,0.0,0.4,0.55,...,0.006292,0.1875,0.149425,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.183908,1.0,0.0,0.0,0.493506,0.238095,0.25,0.53,1.0,0.82,...,0.066476,0.142857,0.183908,0.054348,0.0,0.0,1.0,0.0,0.0,1.0
4,0.057471,1.0,0.0,0.0,0.233766,0.190476,0.0,0.0,0.1,0.51,...,0.0,0.044643,0.057471,0.01087,0.0,0.0,0.0,1.0,0.0,0.0


In [284]:
df2.columns

Index(['days_to_onboard', 'is_gender_female', 'is_gender_male',
       'is_gender_other', 'user_age_years', 'gad', 'phq', 'surgery_1yr',
       'pain_severity', 'pain_vas', 'pain_description_length', 'bmi',
       'bio_length', 'reasons_length', 'imagine_free_length',
       'reason_limited_time', 'reason_family_obligations',
       'reason_work_obligations', 'reason_other', 'inbound_os_messages_all',
       'inbound_coach_messages_all', 'length_messages',
       'transferred_to_coach_day', 'first_et_day', 'acceptance_days_ago',
       'back', 'hip', 'knee', 'neck', 'shoulder', 'Y'],
      dtype='object')

In [285]:
df2.shape

(52092, 31)

## 

In [286]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

In [287]:
X = df2.iloc[:, 0:-1].values
y = df2.iloc[:, -1].values

In [288]:
X.shape

(52092, 30)

In [289]:
y.shape

(52092,)

In [290]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


In [291]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [292]:
y_pred  =  classifier.predict(X_test)

In [293]:
print("Number of mislabeled points out of a total %d points : %d" %(X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 10419 points : 885


In [294]:
from sklearn.metrics import confusion_matrix,accuracy_score
confusion_matrix(y_test, y_pred)


array([[7243,  588],
       [ 297, 2291]])

In [295]:
accuracy_score(y_test,y_pred)

0.9150590267780018

In [296]:
# how many mid tier were classified correctly / wrongly
results = pd.DataFrame()
results['pred'] = y_pred
results['actual'] = y_test
results.shape

(10419, 2)

In [297]:
results['actual'].value_counts()

0.0    7831
1.0    2588
Name: actual, dtype: int64

In [298]:
results = results[results['actual'] == 1.0]
results.shape

(2588, 2)

In [299]:
results['pred'].value_counts()

1.0    2291
0.0     297
Name: pred, dtype: int64