### Relax Data Science Challenge
    Varun Nadgir

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# load in both csv
engagement = pd.read_csv('takehome_user_engagement.csv')
users = pd.read_csv('takehome_users.csv', encoding='latin-1')

In [3]:
# turn column into datatime 
engagement['time_stamp'] = pd.to_datetime(engagement['time_stamp'])
engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [4]:
# convert to datetime
users['creation_time'] = pd.to_datetime(users['creation_time'])
# convert to datetime, use seconds as unit
users['last_session_creation_time'] = pd.to_datetime(users['last_session_creation_time'], unit='s')
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240.0


In [5]:
# get a sense for missing data 
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null datetime64[ns]
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null datetime64[ns]
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: datetime64[ns](2), float64(1), int64(4), object(3)
memory usage: 937.6+ KB


In [6]:
# function for iterating through user_id groups, ignoring users with fewer than 3 entries
def find_adopted(x):
    if len(x) >= 3:
        x = [i for i in x] 
        x.sort()
        # get timedelta values for consecutive pairs of days
        x = [x[i+1] - x[i] for i in range(len(x)-2)]
        # if 3 consecutive timedelta values sum to less than 7, then user is adopted user
        x = [1 for i in range(len(x)-2) if x[i] + x[i+1] + x[i+2] <= timedelta(days=7)]
        if 1 in x:
            return 1

# run function and replace NA with 0
adopted = engagement.groupby('user_id').agg(find_adopted)
adopted.fillna(0, inplace=True)
adopted.columns = ['adopted_user']
adopted.head()

Unnamed: 0_level_0,adopted_user
user_id,Unnamed: 1_level_1
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0


In [7]:
# combine dataframes
df = users.join(adopted, how='left')

In [8]:
# some data cleaning
# replace NA with 0 if not invited
df['invited_by_user_id'].fillna(0, inplace=True)

# remove null rows
df.dropna(axis=0, inplace=True)

# calculate membership as difference of last use and signup, drop original columns
df['membership'] = df['last_session_creation_time'] - df['creation_time']
df['membership'] = df['membership'].dt.days

# select columns needed for modeling
columns = ['object_id', 'creation_source', 'opted_in_to_mailing_list', 'enabled_for_marketing_drip', 'org_id', 
           'invited_by_user_id', 'membership', 'adopted_user']
df = df[columns]

# reindex df
df.index = df['object_id']
df = df.drop('object_id', axis=1)

df.head()

Unnamed: 0_level_0,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,membership,adopted_user
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,ORG_INVITE,0,0,1,316.0,136,0.0
3,ORG_INVITE,0,0,94,1525.0,0,0.0
4,GUEST_INVITE,0,0,1,5151.0,1,0.0
5,GUEST_INVITE,0,0,193,5240.0,5,0.0
6,GUEST_INVITE,0,0,197,11241.0,2,0.0


In [9]:
# check unique values for creation source column
df['creation_source'].unique()

array(['ORG_INVITE', 'GUEST_INVITE', 'SIGNUP', 'PERSONAL_PROJECTS',
       'SIGNUP_GOOGLE_AUTH'], dtype=object)

In [10]:
# modeling with original column returns error, map values to numbers instead
creation = {'ORG_INVITE': 1, 'GUEST_INVITE': 2, 'SIGNUP': 3, 'PERSONAL_PROJECTS': 4, 'SIGNUP_GOOGLE_AUTH': 5}
df['creation_source'] = df['creation_source'].map(lambda x: creation[x])

In [11]:
df.head()

Unnamed: 0_level_0,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,membership,adopted_user
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,1,0,0,1,316.0,136,0.0
3,1,0,0,94,1525.0,0,0.0
4,2,0,0,1,5151.0,1,0.0
5,2,0,0,193,5240.0,5,0.0
6,2,0,0,197,11241.0,2,0.0


In [15]:
# create x,y variables for model
x = df.drop(['adopted_user'], axis=1)
y = df['adopted_user']

# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=42)

# initialize, fit, and predict tree
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)

In [21]:
# print feature importances
for i, x in enumerate(tree.feature_importances_):
    print(df.columns[i],": \t", x)

creation_source : 	 0.03378994267503183
opted_in_to_mailing_list : 	 0.043630111941827344
enabled_for_marketing_drip : 	 0.01740291739305573
org_id : 	 0.4622382738564692
invited_by_user_id : 	 0.24246657321309117
membership : 	 0.2004721809205247


In [17]:
accuracy_score(y_test, y_pred)

0.7751698579369981

With an accuracy score of 77.52, the decision tree shows that the most important features for predicting for adopted users are 'org_id', 'invited_by_user_id', and 'membership', in that order. Since we replaced the 'invited_by_user_id' for those who were not invited by anyone, this column may be weighted too heavily. In either case, it seems as though the org