In [1]:
# Imports

import numpy as np
import pandas as pd
import pandas_profiling as pp


In [2]:
# Step 1: Read in the tables. Without encoding = "ISO-8859-1" for users, receive unicodeDecodeError of invalid continuation byte

engagement = pd.read_csv('takehome_user_engagement.csv')
users = pd.read_csv('takehome_users.csv', encoding = "ISO-8859-1")


In [3]:
# Step 2: Explore data using pandas profiling. Do these in two separate notebook cells.

pp.ProfileReport(engagement)



In [4]:
pp.ProfileReport(users)



<h4>The challenge asks us to get users who are active 3 or more days in a 7-day period. This list will get users if they were active 3 times in a week, with weeks starting on Sunday. It does not catch "rolling" users who might have logged in only on Saturday, Sunday, and Monday of the next week, for example, and also counts users that may have visited three times in one day as an active user.</h4>

In [5]:
# Change timestamp to pd.datetime from categorical
engagement['time_stamp'] = pd.to_datetime(engagement['time_stamp'])

In [6]:
# Group by both engagement and timestamp to see if user visited 3 or more times in a week.
df = engagement.groupby(['user_id', engagement['time_stamp'].dt.week]).sum() >= 3

# Get user_ids by calling groupby on 'user_id'. Get the indices as a list by calling index and changing it tolist.
active_users = df[df['visited'] == True].groupby('user_id').sum().index.tolist()

In [7]:
# Add a true/false column to users if user is active

users['active_user'] = users.object_id.isin(active_users)

<h4> Data cleaning and feature engineering: </h4>
<ul>
1.) Set the index as object_id. <br>
2.) Change creation_time to datetime object. Get year, month, day of week for features. <br>
3.) Check if there are common (n > 10) last names. <br>
4.) Create email_domain column with email domain (regexed) <br>
5.) Label encode creation_source will be done in sklearn <br>
6.) Last-session-created is going to be dropped. Too related to our target variable. <br>
7.) Opted in mailing list is fine <br>
8.) Enabled for drip is fine<br>
9.) Gets whether the user who referred the user is still active.<br>
</ul>

In [8]:
# Step 1.) Set index as object_id

users = users.set_index('object_id')

In [9]:
# 2.) Change creation_time to datetime object. Get year, month, day of week for features.

users['creation_time'] = pd.to_datetime(users['creation_time'])
users['year'] = users['creation_time'].dt.year
users['month'] = users['creation_time'].dt.month
users['day_of_week'] = users['creation_time'].dt.day_name()

users.head()

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,active_user,year,month,day_of_week
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,False,2014,4,Tuesday
2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,True,2013,11,Friday
3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,False,2013,3,Tuesday
4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,False,2013,5,Tuesday
5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,False,2013,1,Thursday


In [10]:
# 3.) Check to see if common last name. Use common last names as features to check relationships.

users['last_name'] = users['name'].apply(lambda x: x.split(" ")[0])

In [11]:
# This is a list of the 50 most common last names. Interesting to check to see if certain last names are more active.


users.last_name.value_counts()
top50 = users.last_name.value_counts().nlargest(50).index
users['last_name_common'] = users['last_name'].where(users['last_name'].isin(top50), other='Other')

In [12]:
# 4.) Create email_domain column with email domain (regexed)

users['email_domain'] = users['email'].apply(lambda x: x.split('@')[-1])

In [13]:
# Reduces the number of common email domains to six. (Lots of domains with 2-3 users each.)
topsix = users.email_domain.value_counts().nlargest(6).index

users['email_updated'] = users['email_domain'].where(users['email_domain'].isin(topsix), other='Other')
users['email_updated'].value_counts()

# See https://www.dataschool.io/python-pandas-tips-and-tricks/ #77

gmail.com         3562
yahoo.com         2447
jourrapide.com    1259
cuvox.de          1202
Other             1186
gustr.com         1179
hotmail.com       1165
Name: email_updated, dtype: int64

In [14]:
users.head()

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,active_user,year,month,day_of_week,last_name,last_name_common,email_domain,email_updated
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,False,2014,4,Tuesday,Clausen,Clausen,yahoo.com,yahoo.com
2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,True,2013,11,Friday,Poole,Other,gustr.com,gustr.com
3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,False,2013,3,Tuesday,Bottrill,Other,gustr.com,gustr.com
4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,False,2013,5,Tuesday,Clausen,Clausen,yahoo.com,yahoo.com
5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,False,2013,1,Thursday,Raw,Other,yahoo.com,yahoo.com


In [15]:
# 6.) Last-session-created is going to be dropped. Too related to our target variable.
# 7.) Opted in mailing list is fine
# 8.) Enabled for drip is fine

In [16]:
# 9.) Change invited_by_user_id into string and check if that string is an active user.

users['invited_by_user_id'].fillna(0, inplace=True)
users['invited_by_user_id'] = users['invited_by_user_id'].apply(lambda x: str(int(x)))

users.head()

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,active_user,year,month,day_of_week,last_name,last_name_common,email_domain,email_updated
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803,False,2014,4,Tuesday,Clausen,Clausen,yahoo.com,yahoo.com
2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316,True,2013,11,Friday,Poole,Other,gustr.com,gustr.com
3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525,False,2013,3,Tuesday,Bottrill,Other,gustr.com,gustr.com
4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151,False,2013,5,Tuesday,Clausen,Clausen,yahoo.com,yahoo.com
5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240,False,2013,1,Thursday,Raw,Other,yahoo.com,yahoo.com


In [17]:
# 9.) Gets whether the user who referred the user is still active. Common thing to watch out for in retention modeling.

users['invited_by_active_user'] = users['invited_by_user_id'].apply(lambda x: int(x) in active_users)

In [18]:
users['invited'] = users['invited_by_user_id'].apply(lambda x: int(x) > 0)

In [19]:
# Create X and y matrices for machine learning.

X = users[['creation_source', 'opted_in_to_mailing_list', 'enabled_for_marketing_drip', 'year',
                      'month', 'day_of_week', 'last_name_common', 'email_updated', 'invited_by_active_user']]
y = users['active_user']

<h3>Machine Learning and Statistical Modeling</h3>

In [20]:
# Build a pipeline to dummy encode categorical features
from sklearn.preprocessing import OneHotEncoder

# use when different features need different preprocessing
from sklearn.compose import make_column_transformer

ohe = OneHotEncoder(sparse=False)

X = X.astype(str)

In [21]:
# One hot encode all columns

column_trans = make_column_transformer(
    (OneHotEncoder(), ['creation_source', 'opted_in_to_mailing_list', 'enabled_for_marketing_drip', 'year',
                      'month', 'day_of_week', 'last_name_common', 'email_updated', 'invited_by_active_user']),
    remainder='drop')

In [22]:
# Create a matrix with encoded columns. This loses column names which is a pain.

transformed = column_trans.fit_transform(X)

In [23]:
# Apply logistic regression and random forest models. 

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='lbfgs')

In [24]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [25]:
# Create pipeline to encode columns and run predictor for each model. 
# Useful to grid search the pipeline instead of individual models. 

from sklearn.pipeline import make_pipeline
pipe_lr = make_pipeline(column_trans, logreg)
pipe_rf = make_pipeline(column_trans, rf)

In [26]:
# Score each of the models with cross validation. Baseline is random guess based on proportion of active users.

from sklearn.model_selection import cross_val_score
print('Baseline: ', 1 - len(active_users) / len(y))
print('Logistic Regression: ', cross_val_score(pipe_lr, X, y, cv=10, scoring='accuracy').mean())
print('Random Forest: ', cross_val_score(pipe_rf, X, y, cv=10, scoring='accuracy').mean())

Baseline:  0.8795833333333334
Logistic Regression:  0.8795003329863423
Random Forest:  0.8500838542248987


<h3>At this point, we have run two models, logistic regression and random forest. Neither has beaten the baseline of guessing that none of the users will be active. Things to try from here:</h3>
    <ul>
    <li>Other ML models (XGBooxt)<br></li>
    <li>Variable selection processes to prevent overfitting<br></li>
    <li>Improvement on figuring out who is an "active user"<br></li>
    <li>Messing around with hyperparameter tuning</li>
   </ul>
   