### Relax Challenge

In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
users_df = pd.read_csv("takehome_users.csv", encoding = 'ISO-8859-1')
users_df.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [3]:
# encoding - UTF 8is throwing error, used ISO 8859 -1 standard 

In [4]:
users_df.dtypes

object_id                       int64
creation_time                  object
name                           object
email                          object
creation_source                object
last_session_creation_time    float64
opted_in_to_mailing_list        int64
enabled_for_marketing_drip      int64
org_id                          int64
invited_by_user_id            float64
dtype: object

In [5]:
users_df['creation_time'] = pd.to_datetime(users_df['creation_time'])
users_df['last_session_creation_time'] = pd.to_datetime(users_df['last_session_creation_time'])

In [6]:
users_df.tail(3)

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
11997,11998,2014-04-27 12:45:16,Haynes Jake,JakeHaynes@cuvox.de,GUEST_INVITE,1970-01-01 00:00:01.398602716,1,1,83,8074.0
11998,11999,2012-05-31 11:55:59,Faber Annett,mhaerzxp@iuxiw.com,PERSONAL_PROJECTS,1970-01-01 00:00:01.338638159,0,0,6,
11999,12000,2014-01-26 08:57:12,Lima Thaís,ThaisMeloLima@hotmail.com,SIGNUP,1970-01-01 00:00:01.390726632,0,1,0,


In [7]:
users_df.isna().sum()

object_id                        0
creation_time                    0
name                             0
email                            0
creation_source                  0
last_session_creation_time    3177
opted_in_to_mailing_list         0
enabled_for_marketing_drip       0
org_id                           0
invited_by_user_id            5583
dtype: int64

***'last_session_creation_time'*** and ***'invited_by_user_id'*** have null values.
We update ***'invited_by_user_id'*** to 0 in place of NaN values.


In [8]:
users_df['invited_by_user_id'] = users_df['invited_by_user_id'].fillna(0)      # Filling null values with 0

For nulls in ***last_session_creation_time*** (time of last login), we replace it with ***creation_time***

In [9]:
users_df['last_session_creation_time'] = users_df['last_session_creation_time'].fillna(users_df['creation_time'])

### User engagement data

In [10]:
eng_df = pd.read_csv("takehome_user_engagement.csv", encoding = 'ISO-8859-1', parse_dates = ['time_stamp'])
eng_df.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


Converting of 'time_stamp' column to datetime type was not possible using pd.to_datetime method, instead parsed dates 
while reading file into dataframe

### Defining an "adopted user" as a user who has logged into the product on three separate days in at least one seven­day period

In [11]:
def get_rolling_count(grp, freq):
    return grp.rolling(freq, on='time_stamp')['visited'].count()

eng_df['visits_7_days'] = eng_df.groupby('user_id', as_index=False, group_keys=False)\
                                                                        .apply(get_rolling_count, freq='7D')


In [12]:
eng_df.head()

Unnamed: 0,time_stamp,user_id,visited,visits_7_days
0,2014-04-22 03:53:30,1,1,1.0
1,2013-11-15 03:45:04,2,1,1.0
2,2013-11-29 03:45:04,2,1,1.0
3,2013-12-09 03:45:04,2,1,1.0
4,2013-12-25 03:45:04,2,1,1.0


In [13]:
eng_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 4 columns):
time_stamp       207917 non-null datetime64[ns]
user_id          207917 non-null int64
visited          207917 non-null int64
visits_7_days    207917 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 6.3 MB


All are non-null values in the above dataframe

In [14]:
print('The number of users is {}'.format(eng_df['user_id'].nunique()))

The number of users is 8823


In [15]:
eng_df['adopted_user'] = eng_df['visits_7_days'].apply(lambda x: 1 if x>3 else 0).fillna(0)

In [16]:
eng_df.head(3)

Unnamed: 0,time_stamp,user_id,visited,visits_7_days,adopted_user
0,2014-04-22 03:53:30,1,1,1.0,0
1,2013-11-15 03:45:04,2,1,1.0,0
2,2013-11-29 03:45:04,2,1,1.0,0


Merging both the dataframes on user_id (object_id)

In [17]:
user_engagement = pd.merge(users_df, eng_df, how = 'left', left_on = 'object_id', right_on = 'user_id')

In [18]:
user_engagement.shape

(211094, 15)

In [19]:
user_engagement.head(3)

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,time_stamp,user_id,visited,visits_7_days,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1970-01-01 00:00:01.398138810,1,0,11,10803.0,2014-04-22 03:53:30,1.0,1.0,1.0,0.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1970-01-01 00:00:01.396237504,0,0,1,316.0,2013-11-15 03:45:04,2.0,1.0,1.0,0.0
2,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1970-01-01 00:00:01.396237504,0,0,1,316.0,2013-11-29 03:45:04,2.0,1.0,1.0,0.0


In [20]:
user_engagement.dtypes

object_id                              int64
creation_time                 datetime64[ns]
name                                  object
email                                 object
creation_source                       object
last_session_creation_time    datetime64[ns]
opted_in_to_mailing_list               int64
enabled_for_marketing_drip             int64
org_id                                 int64
invited_by_user_id                   float64
time_stamp                    datetime64[ns]
user_id                              float64
visited                              float64
visits_7_days                        float64
adopted_user                         float64
dtype: object

In [21]:
user_engagement.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 211094 entries, 0 to 211093
Data columns (total 15 columns):
object_id                     211094 non-null int64
creation_time                 211094 non-null datetime64[ns]
name                          211094 non-null object
email                         211094 non-null object
creation_source               211094 non-null object
last_session_creation_time    211094 non-null datetime64[ns]
opted_in_to_mailing_list      211094 non-null int64
enabled_for_marketing_drip    211094 non-null int64
org_id                        211094 non-null int64
invited_by_user_id            211094 non-null float64
time_stamp                    207917 non-null datetime64[ns]
user_id                       207917 non-null float64
visited                       207917 non-null float64
visits_7_days                 207917 non-null float64
adopted_user                  207917 non-null float64
dtypes: datetime64[ns](3), float64(5), int64(4), object(3)
memory usa

In [22]:
user_engagement['visits_7_days'].isna().sum()

3177

Filling the NaN values with 0

In [23]:
user_engagement['visits_7_days'] = user_engagement['visits_7_days'].fillna(0)
user_engagement['adopted_user'] = user_engagement['adopted_user'].fillna(0)
user_engagement['visited'] = user_engagement['visited'].fillna(0)

In [24]:
# Converting the data types :

user_engagement['visits_7_days'] = user_engagement['visits_7_days'].astype('int')
user_engagement['adopted_user'] = user_engagement['adopted_user'].astype('int')
user_engagement['invited_by_user_id'] = user_engagement['invited_by_user_id'].astype('int')
user_engagement['visited'] = user_engagement['visited'].astype('int')   

for 'invited_by_user_id', instead of one hot encoding all numericals, we will map existing values to 1 and missing 
values to 0.

In [25]:
invitation = lambda x: 1 if x>0 else 0
user_engagement['invited_by_user_id'] = user_engagement['invited_by_user_id'].apply(invitation)

### Feature Engineering

In [26]:
feature_set = user_engagement[["creation_source", "opted_in_to_mailing_list", "enabled_for_marketing_drip",\
                               "invited_by_user_id", "adopted_user"]]


In [27]:
feature_set.dtypes

creation_source               object
opted_in_to_mailing_list       int64
enabled_for_marketing_drip     int64
invited_by_user_id             int64
adopted_user                   int64
dtype: object

In [28]:
feature_set.creation_source.unique()

array(['GUEST_INVITE', 'ORG_INVITE', 'SIGNUP', 'PERSONAL_PROJECTS',
       'SIGNUP_GOOGLE_AUTH'], dtype=object)

In [29]:
feature_set['creation_source'] = feature_set['creation_source'].astype("category")

model_df = pd.get_dummies(feature_set, drop_first=True)

X = model_df.drop(columns=['adopted_user']).copy(deep = True)
y = model_df[['adopted_user']].copy(deep = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [30]:
X.head(3)

Unnamed: 0,opted_in_to_mailing_list,enabled_for_marketing_drip,invited_by_user_id,creation_source_ORG_INVITE,creation_source_PERSONAL_PROJECTS,creation_source_SIGNUP,creation_source_SIGNUP_GOOGLE_AUTH
0,1,0,1,0,0,0,0
1,0,0,1,1,0,0,0
2,0,0,1,1,0,0,0


In [31]:
y.head(3)

Unnamed: 0,adopted_user
0,0
1,0
2,0


In [32]:
X.shape, y.shape

((211094, 7), (211094, 1))

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.65, stratify = y, random_state = 100)

In [35]:
rf = RandomForestClassifier(n_estimators=1000, max_depth = 15, random_state=100)

model  = rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)


test_accuracy = accuracy_score(y_test, y_pred)

  This is separate from the ipykernel package so we can avoid doing imports until


In [36]:
# test_accuracy, 
accuracy_score(y_test, y_pred)

0.6285018803020144

In [37]:
print(rf.feature_importances_)

[0.09373822 0.3603124  0.07592959 0.13193216 0.14638436 0.04991702
 0.14178627]


In [38]:
for feature in zip(X.columns, rf.feature_importances_):
    print(feature)

('opted_in_to_mailing_list', 0.09373821527764638)
('enabled_for_marketing_drip', 0.36031240028619793)
('invited_by_user_id', 0.07592958586393136)
('creation_source_ORG_INVITE', 0.13193215630306881)
('creation_source_PERSONAL_PROJECTS', 0.14638435763904858)
('creation_source_SIGNUP', 0.04991701753966072)
('creation_source_SIGNUP_GOOGLE_AUTH', 0.14178626709044656)


From the available dataset, we've arrived at the following feature importance:

opted_in_to_mailing_list - whether user has opted into receiving marketing emails <br/>
enabled_for_marketing_drip - whether they are on the regular marketing email drip<br/>
creation_source_XXXXX - How the account was created <br/>
invited_by_user_id - if a user was referred by another user 
