In [138]:
import pandas as pd
import matplotlib.pyplot as plt

In [72]:
# Load two files into pandas DataFrames

df_users = pd.read_csv('takehome_users.csv', encoding='latin1')
df_user_engagement = pd.read_csv('takehome_user_engagement.csv', encoding='latin1', parse_dates=['time_stamp'])

In [133]:
df_users.head(10)

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0
5,6,2013-12-17 03:37:06,Cunha Eduardo,EduardoPereiraCunha@yahoo.com,GUEST_INVITE,1387424000.0,0,0,197,11241.0
6,7,2012-12-16 13:24:32,Sewell Tyler,TylerSewell@jourrapide.com,SIGNUP,1356010000.0,0,1,37,
7,8,2013-07-31 05:34:02,Hamilton Danielle,DanielleHamilton@yahoo.com,PERSONAL_PROJECTS,,1,1,74,
8,9,2013-11-05 04:04:24,Amsel Paul,PaulAmsel@hotmail.com,PERSONAL_PROJECTS,,0,0,302,
9,10,2013-01-16 22:08:03,Santos Carla,CarlaFerreiraSantos@gustr.com,ORG_INVITE,1401833000.0,1,1,318,4143.0


In [127]:
# Sort DataFrame by user_id and then time_stamp

df_user_engagement = df_user_engagement.sort_values(by = ['user_id', 'time_stamp'])

In [130]:
df_user_engagement.head(20)

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1
5,2013-12-31 03:45:04,2,1
6,2014-01-08 03:45:04,2,1
7,2014-02-03 03:45:04,2,1
8,2014-02-08 03:45:04,2,1
9,2014-02-09 03:45:04,2,1


In [119]:
# Calculate the number of visits each user made in a 7-day window

one_week_visits = df_user_engagement.groupby(['user_id', pd.Grouper(key='time_stamp', freq='7D')]).sum()

In [129]:
one_week_visits.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,visited
user_id,time_stamp,Unnamed: 2_level_1
1,2014-04-17 08:20:06,1
2,2013-11-14 08:20:06,1
2,2013-11-28 08:20:06,1
2,2013-12-05 08:20:06,1
2,2013-12-19 08:20:06,1
2,2013-12-26 08:20:06,1
2,2014-01-02 08:20:06,1
2,2014-01-30 08:20:06,1
2,2014-02-06 08:20:06,3
2,2014-02-13 08:20:06,1


In [121]:
# Initialize DataFrame with user_ids appearing in df_user_engagement table

adopted_users = pd.DataFrame(df_user_engagement['user_id'].unique(), columns=['id'])

In [122]:
adopted_users.head()

Unnamed: 0,id
0,1
1,2
2,3
3,4
4,5


In [131]:
# Create and apply a function that defines a user as "adopted" if there was at least one week in one_week_visits where the 
# number of visits was at least 3

def is_adopted(user):
    
    if one_week_visits.loc[user].max().item() >= 3:
        return 'adopted'
    else:
        return 'not adopted'
    
adopted_users['adopted'] = adopted_users['id'].apply(is_adopted)

adopted_users.head(20)

Unnamed: 0,id,adopted
0,1,not adopted
1,2,adopted
2,3,not adopted
3,4,not adopted
4,5,not adopted
5,6,not adopted
6,7,not adopted
7,10,adopted
8,11,not adopted
9,13,not adopted


In [146]:
# Left merge the df_users table on the "object_id" column with the "id" column of our newly created adopted_users table

df = df_users.merge(adopted_users, how='left', left_on = 'object_id', right_on = 'id')

df.head(10)

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,id,adopted
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,1.0,not adopted
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,2.0,adopted
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,3.0,not adopted
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,4.0,not adopted
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,5.0,not adopted
5,6,2013-12-17 03:37:06,Cunha Eduardo,EduardoPereiraCunha@yahoo.com,GUEST_INVITE,1387424000.0,0,0,197,11241.0,6.0,not adopted
6,7,2012-12-16 13:24:32,Sewell Tyler,TylerSewell@jourrapide.com,SIGNUP,1356010000.0,0,1,37,,7.0,not adopted
7,8,2013-07-31 05:34:02,Hamilton Danielle,DanielleHamilton@yahoo.com,PERSONAL_PROJECTS,,1,1,74,,,
8,9,2013-11-05 04:04:24,Amsel Paul,PaulAmsel@hotmail.com,PERSONAL_PROJECTS,,0,0,302,,,
9,10,2013-01-16 22:08:03,Santos Carla,CarlaFerreiraSantos@gustr.com,ORG_INVITE,1401833000.0,1,1,318,4143.0,10.0,adopted


In [147]:
# Fill in NaN values of "adopted" column with 'not adopted'

df['adopted'] = df['adopted'].fillna('not adopted')

df.head(10)

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,id,adopted
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,1.0,not adopted
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,2.0,adopted
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,3.0,not adopted
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,4.0,not adopted
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,5.0,not adopted
5,6,2013-12-17 03:37:06,Cunha Eduardo,EduardoPereiraCunha@yahoo.com,GUEST_INVITE,1387424000.0,0,0,197,11241.0,6.0,not adopted
6,7,2012-12-16 13:24:32,Sewell Tyler,TylerSewell@jourrapide.com,SIGNUP,1356010000.0,0,1,37,,7.0,not adopted
7,8,2013-07-31 05:34:02,Hamilton Danielle,DanielleHamilton@yahoo.com,PERSONAL_PROJECTS,,1,1,74,,,not adopted
8,9,2013-11-05 04:04:24,Amsel Paul,PaulAmsel@hotmail.com,PERSONAL_PROJECTS,,0,0,302,,,not adopted
9,10,2013-01-16 22:08:03,Santos Carla,CarlaFerreiraSantos@gustr.com,ORG_INVITE,1401833000.0,1,1,318,4143.0,10.0,adopted


In [148]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 0 to 11999
Data columns (total 12 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
id                            8823 non-null float64
adopted                       12000 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 1.2+ MB


## Which factors predict future adoption?

In [159]:
# Create df where all users are adopted users

df_adopted = df[df['adopted'] == 'adopted']

# Examine the percentage of adopted users for the different classes in creation_source, opted_in_to_mailing_list,
# and enabled_for_marketing_drip columns

print(df_adopted.groupby('creation_source')['adopted'].count() / len(df['adopted']) * 100)

print(df_adopted.groupby('opted_in_to_mailing_list')['adopted'].count() / len(df['adopted']) * 100)

print(df_adopted.groupby('enabled_for_marketing_drip')['adopted'].count() / len(df['adopted']) * 100)

creation_source
GUEST_INVITE          2.741667
ORG_INVITE            4.158333
PERSONAL_PROJECTS     1.150000
SIGNUP                2.250000
SIGNUP_GOOGLE_AUTH    1.691667
Name: adopted, dtype: float64
opted_in_to_mailing_list
0    8.916667
1    3.075000
Name: adopted, dtype: float64
enabled_for_marketing_drip
0    10.166667
1     1.825000
Name: adopted, dtype: float64


+ From this printout we see that if a user was invited to an organization, they were almsot twice as likely than any other source to become an adopted user.
+ If the user did not opt into the mailing list, they are more than three times more likely to become an adopted user.
+ If the user did not enable marketing drop, they are more than five times more likely to become an adopted user!
+ We suspect that these three features are strong predictors in future user adoption. We will fit several machine learning algorithms to these features and use the models to predict user adoption.

+ We will use the creation_source, opted_in_to_mailing_list, and enabled_for_marketing_drip columns as our predictive features, and the adopted column as our target variable. First, we must OneHotEncode the creation_source column

In [160]:
# Create dummy variables for the creation_source column

source_dummies = pd.get_dummies(df['creation_source'], prefix = 'source')

df = pd.concat([df, source_dummies], axis=1)

df = df.drop('creation_source', axis=1)

df.head()

Unnamed: 0,object_id,creation_time,name,email,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,id,adopted,source_GUEST_INVITE,source_ORG_INVITE,source_PERSONAL_PROJECTS,source_SIGNUP,source_SIGNUP_GOOGLE_AUTH
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,1398139000.0,1,0,11,10803.0,1.0,not adopted,1,0,0,0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,1396238000.0,0,0,1,316.0,2.0,adopted,0,1,0,0,0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,1363735000.0,0,0,94,1525.0,3.0,not adopted,0,1,0,0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,1369210000.0,0,0,1,5151.0,4.0,not adopted,1,0,0,0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,1358850000.0,0,0,193,5240.0,5.0,not adopted,1,0,0,0,0


+ I will create two classifiers, each using three different forms of resampling data: a Random Forest, and logistic regression. This gives us six models total. I will analyze the accuracy and classification report for each.

In [170]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import ClusterCentroids
import numpy as np

In [171]:
# Create X and y, the feature space and target variable, respectively.
# X will contain the features described above.
# y will denote whether or not the user is adopted, with a 1 corresponding to adopted.

X = df[['opted_in_to_mailing_list', 'enabled_for_marketing_drip', 'source_GUEST_INVITE', 'source_ORG_INVITE', 'source_PERSONAL_PROJECTS', 'source_SIGNUP', 'source_SIGNUP_GOOGLE_AUTH']].values
y = (df['adopted'] == 'adopted').values

In [169]:
df['adopted'].value_counts() / len(df['adopted']) * 100

not adopted    88.008333
adopted        11.991667
Name: adopted, dtype: float64

+ Since there are far less adopted users than unadopted, we will experiment using RandomOverSampling, SMOTE, and ClusterCentroids to attempt to fix the class imbalance.

In [172]:
ros = RandomOverSampler(random_state=0)
X_ros, y_ros = ros.fit_sample(X,y)
X_train_ros, X_test_ros, y_train_ros, y_test_ros = train_test_split(X_ros, y_ros, test_size = 0.3, random_state=42)

sm = SMOTE(random_state=12, ratio=1.0)
X_sm, y_sm = sm.fit_sample(X,y)
X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X_sm, y_sm, test_size = 0.3, random_state=42)

cc = ClusterCentroids(random_state=15)
X_cc, y_cc = cc.fit_sample(X,y)
X_train_cc, X_test_cc, y_train_cc, y_test_cc = train_test_split(X_cc, y_cc, test_size = 0.3, random_state=42)



In [173]:
### Logistic Regression

# Use GridSearch to tune the C parameter, fit to the training data, and predict using the testing data

c_space = np.logspace(-5,8,15)
params = {'C': c_space}

logreg = LogisticRegression()

np.random.seed(42)

logreg_cv = GridSearchCV(logreg, params, cv=5)
logreg_cv.fit(X_train_ros, y_train_ros)

log_pred = logreg_cv.predict(X_test_ros)

# Print the accuracy score and classification report

print('The accuracy score on our test data is:', logreg_cv.score(X_test_ros, y_test_ros))

print(classification_report(y_test_ros, log_pred))

The accuracy score on our test data is: 0.5611488085845037
             precision    recall  f1-score   support

      False       0.57      0.55      0.56      3196
       True       0.56      0.57      0.56      3141

avg / total       0.56      0.56      0.56      6337



In [174]:
### Logistic Regression

# Use GridSearch to tune the C parameter, fit to the training data, and predict using the testing data

c_space = np.logspace(-5,8,15)
params = {'C': c_space}

logreg = LogisticRegression()

np.random.seed(42)

logreg_cv = GridSearchCV(logreg, params, cv=5)
logreg_cv.fit(X_train_sm, y_train_sm)

log_pred = logreg_cv.predict(X_test_sm)

# Print the accuracy score and classification report

print('The accuracy score on our test data is:', logreg_cv.score(X_test_sm, y_test_sm))

print(classification_report(y_test_sm, log_pred))

The accuracy score on our test data is: 0.5567303140287202
             precision    recall  f1-score   support

      False       0.58      0.46      0.51      3196
       True       0.54      0.65      0.59      3141

avg / total       0.56      0.56      0.55      6337



In [175]:
### Logistic Regression

# Use GridSearch to tune the C parameter, fit to the training data, and predict using the testing data

c_space = np.logspace(-5,8,15)
params = {'C': c_space}

logreg = LogisticRegression()

np.random.seed(42)

logreg_cv = GridSearchCV(logreg, params, cv=5)
logreg_cv.fit(X_train_cc, y_train_cc)

log_pred = logreg_cv.predict(X_test_cc)

# Print the accuracy score and classification report

print('The accuracy score on our test data is:', logreg_cv.score(X_test_cc, y_test_cc))

print(classification_report(y_test_cc, log_pred))

The accuracy score on our test data is: 0.5358796296296297
             precision    recall  f1-score   support

      False       0.68      0.19      0.30       446
       True       0.51      0.90      0.65       418

avg / total       0.60      0.54      0.47       864



In [177]:
### Random Forest Classifier

# Use GridSearchCV to tune the n_estimators (number of trees) parameter

num_trees = [10,50,100,200,500]
params = {'n_estimators': num_trees}

forest = RandomForestClassifier()

np.random.seed(42)

forest_cv = GridSearchCV(forest, params, cv=5)
forest_cv.fit(X_train_ros,y_train_ros)

forest_pred = forest_cv.predict(X_test_ros)

# Print the accuracy score and classification report

print('The accuracy score on our test data is:', forest_cv.score(X_test_ros, y_test_ros))

print(classification_report(y_test_ros, forest_pred))

The accuracy score on our test data is: 0.5624112356004418
             precision    recall  f1-score   support

      False       0.57      0.55      0.56      3196
       True       0.56      0.58      0.57      3141

avg / total       0.56      0.56      0.56      6337



In [178]:
### Random Forest Classifier

# Use GridSearchCV to tune the n_estimators (number of trees) parameter

num_trees = [10,50,100,200,500]
params = {'n_estimators': num_trees}

forest = RandomForestClassifier()

np.random.seed(42)

forest_cv = GridSearchCV(forest, params, cv=5)
forest_cv.fit(X_train_sm,y_train_sm)

forest_pred = forest_cv.predict(X_test_sm)

# Print the accuracy score and classification report

print('The accuracy score on our test data is:', forest_cv.score(X_test_sm, y_test_sm))

print(classification_report(y_test_sm, forest_pred))

The accuracy score on our test data is: 0.5589395613066119
             precision    recall  f1-score   support

      False       0.57      0.51      0.54      3196
       True       0.55      0.61      0.58      3141

avg / total       0.56      0.56      0.56      6337



In [179]:
### Random Forest Classifier

# Use GridSearchCV to tune the n_estimators (number of trees) parameter

num_trees = [10,50,100,200,500]
params = {'n_estimators': num_trees}

forest = RandomForestClassifier()

np.random.seed(42)

forest_cv = GridSearchCV(forest, params, cv=5)
forest_cv.fit(X_train_cc,y_train_cc)

forest_pred = forest_cv.predict(X_test_cc)

# Print the accuracy score and classification report

print('The accuracy score on our test data is:', forest_cv.score(X_test_cc, y_test_cc))

print(classification_report(y_test_cc, forest_pred))

The accuracy score on our test data is: 0.5231481481481481
             precision    recall  f1-score   support

      False       0.54      0.47      0.50       446
       True       0.51      0.58      0.54       418

avg / total       0.53      0.52      0.52       864



# Final Analysis

+ Our preliminary analysis showed that the classes contained in creation source, opting into mailing list, and opting into marketing drip seemed to contain significantly different percentages of adopted users. However, because of the sparse nature of the data, we were uncertain if we could use these features as reliable predictors for future user adoption. Even after attempting to rectify the class imbalance using several methods, we still ended up with relatively ineffective classifiers.
+ The best model we obtained was using a Random Forest with RandomOverSampling, but even here we only obtained an accuracy score of 56%, and average F1 scores for the two classes of 0.56. We suspect that if we had more detailed user information metrics (such as demographic info), we might be able to build a more reliable classifier.