## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier


In [2]:
users = pd.read_csv('takehome_users.csv',  encoding='ISO-8859-1')
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [3]:
users.dtypes

object_id                       int64
creation_time                  object
name                           object
email                          object
creation_source                object
last_session_creation_time    float64
opted_in_to_mailing_list        int64
enabled_for_marketing_drip      int64
org_id                          int64
invited_by_user_id            float64
dtype: object

In [4]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [5]:
# convert the time column to datetime
users['creation_time'] = pd.to_datetime(users['creation_time'])
users['last_session_creation_time'] = pd.to_datetime(users['last_session_creation_time'])

users.dtypes

object_id                              int64
creation_time                 datetime64[ns]
name                                  object
email                                 object
creation_source                       object
last_session_creation_time    datetime64[ns]
opted_in_to_mailing_list               int64
enabled_for_marketing_drip             int64
org_id                                 int64
invited_by_user_id                   float64
dtype: object

In [6]:
# convert the 0,1 value columns to boolean
users['opted_in_to_mailing_list'] = users['opted_in_to_mailing_list'].astype('bool')
users['enabled_for_marketing_drip'] = users['enabled_for_marketing_drip'].astype('bool')

users.dtypes

object_id                              int64
creation_time                 datetime64[ns]
name                                  object
email                                 object
creation_source                       object
last_session_creation_time    datetime64[ns]
opted_in_to_mailing_list                bool
enabled_for_marketing_drip              bool
org_id                                 int64
invited_by_user_id                   float64
dtype: object

In [7]:
# convert creation_source to categorical type
users['creation_source'] = users['creation_source'].astype('category')

users.dtypes

object_id                              int64
creation_time                 datetime64[ns]
name                                  object
email                                 object
creation_source                     category
last_session_creation_time    datetime64[ns]
opted_in_to_mailing_list                bool
enabled_for_marketing_drip              bool
org_id                                 int64
invited_by_user_id                   float64
dtype: object

In [8]:
# rename object_id to user_id
users = users.rename(columns={'object_id':'user_id'})
users.columns

Index(['user_id', 'creation_time', 'name', 'email', 'creation_source',
       'last_session_creation_time', 'opted_in_to_mailing_list',
       'enabled_for_marketing_drip', 'org_id', 'invited_by_user_id'],
      dtype='object')

In [9]:
# keep info if user was invited
users['invited'] = np.where(users['invited_by_user_id'].isnull(), True, False)
users.drop('invited_by_user_id', axis=1, inplace=True)

# also drop columns with personal information
users.drop(['name', 'email'], axis=1, inplace=True)

users.head()

Unnamed: 0,user_id,creation_time,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited
0,1,2014-04-22 03:53:30,GUEST_INVITE,1970-01-01 00:00:01.398138810,True,False,11,False
1,2,2013-11-15 03:45:04,ORG_INVITE,1970-01-01 00:00:01.396237504,False,False,1,False
2,3,2013-03-19 23:14:52,ORG_INVITE,1970-01-01 00:00:01.363734892,False,False,94,False
3,4,2013-05-21 08:09:28,GUEST_INVITE,1970-01-01 00:00:01.369210168,False,False,1,False
4,5,2013-01-17 10:14:20,GUEST_INVITE,1970-01-01 00:00:01.358849660,False,False,193,False


In [10]:
engagement = pd.read_csv('takehome_user_engagement.csv', parse_dates=['time_stamp'])
engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [11]:
engagement.dtypes

time_stamp    datetime64[ns]
user_id                int64
visited                int64
dtype: object

In [12]:
engagement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   time_stamp  207917 non-null  datetime64[ns]
 1   user_id     207917 non-null  int64         
 2   visited     207917 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 4.8 MB


In [13]:
#Drop visited column, its always 1
engagement.drop('visited', axis=1, inplace=True)

engagement.head()

Unnamed: 0,time_stamp,user_id
0,2014-04-22 03:53:30,1
1,2013-11-15 03:45:04,2
2,2013-11-29 03:45:04,2
3,2013-12-09 03:45:04,2
4,2013-12-25 03:45:04,2


In [14]:
users['creation_year'] = users['creation_time'].dt.year
users['creation_month'] = users['creation_time'].dt.month
users['creation_day'] = users['creation_time'].dt.day
users['last_session_year'] = users['last_session_creation_time'].dt.year
users['last_session_month'] = users['last_session_creation_time'].dt.month
users['last_session_day'] = users['last_session_creation_time'].dt.day

#Drop unnecessary columns
users.drop(['creation_time', 'last_session_creation_time', 'user_id'], axis=1, inplace=True)

In [15]:
#Fill null values with something obvious for the model
users['last_session_day'].fillna(0, inplace=True)
users['last_session_month'].fillna(0, inplace=True)
users['last_session_year'].fillna(0, inplace=True)

#Convert creation_source to indicator variables
users = pd.get_dummies(users, drop_first=True) #Drop first dummy column, since it is correlated to the others