## User Engagement Investigation

#### 1. Importing packages and data

In [1]:
#Importing necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#Loading login information
# parse_dates alone converts time_stamp to datetime64; the
# infer_datetime_format flag is deprecated in pandas 2.x and is not needed.
engagement = pd.read_csv('data/takehome_user_engagement.csv',
                         parse_dates=['time_stamp'])
engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [3]:
# Show column dtypes and non-null counts for the login log.
engagement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null datetime64[ns]
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: datetime64[ns](1), int64(2)
memory usage: 4.8 MB


In [4]:
#loading information about users
# The file is read as latin-1 and its first column becomes the index
# (shown as object_id in the preview).
# parse_dates alone converts creation_time; the infer_datetime_format flag
# is deprecated in pandas 2.x and is not needed.
users = pd.read_csv('data/takehome_users.csv',
                    parse_dates=['creation_time'],
                    encoding='latin-1', index_col=0)

# NOTE(review): this preview prints names and e-mail addresses; clear the
# output before sharing if these are real users.
users.head()

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [5]:
# Show column dtypes and non-null counts for the user table.
users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 1 to 12000
Data columns (total 9 columns):
creation_time                 12000 non-null datetime64[ns]
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(3), object(3)
memory usage: 937.5+ KB


#### 2. Data Cleaning

**Missing Values**

**last_session_creation_time**: Missing for a substantial share of users (3,177 of 12,000, about 26%, according to the `users.info()` output above).

In [8]:
# Fraction of users with no recorded last session.
# This must run before the missing values are filled: users.info() reports
# 8823 non-null of 12000 rows, so a fresh top-to-bottom run gives about 0.265.
# A result of 0.0 means the NaNs had already been filled when the cell ran.
users.last_session_creation_time.isna().sum()/len(users)

0.0

In [6]:
# Replace missing last-session values with 0 (the Unix epoch when the column
# is read as seconds), used here as a "never logged in" sentinel.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on an attribute-accessed Series: that is chained
# in-place mutation, which pandas warns about and which leaves the frame
# unchanged under Copy-on-Write.
users['last_session_creation_time'] = users['last_session_creation_time'].fillna(0)

In [None]:
# Interpret the stored numeric values as seconds since the Unix epoch and
# replace the column with the resulting timestamps.
last_session_seconds = users['last_session_creation_time']
users['last_session_creation_time'] = pd.to_datetime(last_session_seconds, unit='s')

In [None]:
# Users without an inviter get the sentinel id 99999; the users index only
# runs from 1 to 12000, so it cannot collide with a real user id.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on an attribute-accessed Series: that is chained
# in-place mutation, which pandas warns about and which leaves the frame
# unchanged under Copy-on-Write.
users['invited_by_user_id'] = users['invited_by_user_id'].fillna(99999)

In [None]:
# Store the sign-up channel as a categorical column.
users['creation_source'] = users['creation_source'].astype('category')


In [None]:
# Provider name = the text between '@' and the next '.', e.g.
# 'AugustCClausen@yahoo.com' -> 'yahoo'.
# Vectorised string accessors replace the row-wise apply(axis=1), which
# called a Python lambda for each of the 12000 rows.
users['email_domain'] = (
    users['email']
    .str.split('@').str[1]
    .str.split('.').str[0]
)

In [None]:
# Collapse every provider outside the six major ones into 'other'.
# isin() compares whole values; the previous str.contains() was a substring
# search, so a domain that merely contained one of these names (for example
# 'yahoomail') would have been kept as its own level.
major_domains = ['gmail', 'yahoo', 'jourrapide', 'cuvox', 'gustr', 'hotmail']
users.loc[~users['email_domain'].isin(major_domains), 'email_domain'] = 'other'

In [None]:
# This cell repeats the creation_source cast and the rare-domain grouping
# from the cells above, then stores email_domain as a categorical.
users['creation_source'] = users['creation_source'].astype('category')

# isin() compares whole values; the previous str.contains() was a substring
# search and would keep any domain that merely contained a provider name.
major_domains = ['gmail', 'yahoo', 'jourrapide', 'cuvox', 'gustr', 'hotmail']
users.loc[~users['email_domain'].isin(major_domains), 'email_domain'] = 'other'
users['email_domain'] = users['email_domain'].astype('category')

users.head()

In [None]:
# Re-check dtypes and non-null counts after the cleaning steps.
users.info()

In [None]:
# Flag "adopted" users: those with three distinct login days whose first and
# last day are at most ADOPTION_WINDOW_DAYS apart.
# NOTE(review): the previous code tested for a span of exactly 7 days and so
# missed every tighter cluster of logins. If the intended rule is "three days
# inside one seven-day period", tighten the comparison below to `<`.
ADOPTION_WINDOW_DAYS = 7

# One row per (user, calendar day), in chronological order per user.
# `engagement` itself is left untouched so the cell can be re-run.
login_days = (
    engagement
    .assign(login_day=engagement['time_stamp'].dt.floor('d'))
    [['user_id', 'login_day']]
    .drop_duplicates()
    .sort_values(['user_id', 'login_day'])
)

# Days are sorted, so the span of any three consecutive login days is the
# current day minus the day two rows earlier for the same user. The first two
# rows of each user have no such predecessor (NaT) and compare as False.
two_logins_back = login_days.groupby('user_id')['login_day'].shift(2)
span = login_days['login_day'] - two_logins_back
within_window = span <= pd.Timedelta(days=ADOPTION_WINDOW_DAYS)
adopted_ids = login_days.loc[within_window, 'user_id'].unique()

# isin() tolerates engagement rows whose user_id is missing from `users`,
# where label-based .loc assignment would raise a KeyError.
users['adopted'] = users.index.isin(adopted_ids).astype('int64')

In [None]:
# Class balance of the target: number of adopted (1) vs. non-adopted (0) users.
users['adopted'].value_counts()

In [None]:
# One-hot encode the two categorical features, dropping the first level of
# each so the remaining indicator columns are not redundant.
categorical_features = ['creation_source', 'email_domain']
users_ml = pd.get_dummies(users, columns=categorical_features, drop_first=True)

In [None]:
# Inspect the columns produced by the one-hot encoding.
users_ml.info()

In [None]:
# Remove the timestamp and free-text columns from the modelling frame.
unused_columns = ['creation_time', 'name', 'email', 'last_session_creation_time']
users_ml = users_ml.drop(unused_columns, axis=1)

In [None]:
# Confirm which columns remain in the modelling frame after the drop.
users_ml.info()

In [None]:
from sklearn.preprocessing import scale

# Target is the adoption flag; features are every other column of users_ml.
y = users_ml['adopted']
X = users_ml.drop(columns='adopted')

# Standardised copy of the feature matrix, kept under the lowercase name `x`.
x = scale(X)

In [None]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split first and scale afterwards, so the scaler only sees training rows and
# no test-set statistics leak into the model. A fixed random_state makes the
# split reproducible; stratify=y keeps the share of adopted users equal in
# both partitions.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

scaler = StandardScaler().fit(xtrain)
xtrain = scaler.transform(xtrain)
xtest = scaler.transform(xtest)

logreg = LogisticRegression()
logreg.fit(xtrain, ytrain)
y_pred = logreg.predict(xtest)

# ROC curve from the predicted probability of the positive class (column 1).
y_pred_prob = logreg.predict_proba(xtest)[:,1]
fpr, tpr, thresholds = roc_curve(ytest, y_pred_prob)
plt.plot([0,1], [0,1], 'k--')  # chance-level diagonal for reference
plt.plot(fpr, tpr, label='Logistic Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Logistic Regression ROC Curve')
plt.legend()  # without a legend the curve label above is never displayed
plt.show()

In [None]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split

# Same model as the previous cell, fitted on the unscaled feature frame X.
# A fixed random_state makes the split reproducible; stratify=y keeps the
# share of adopted users equal in both partitions.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# Unscaled features span very different ranges (ids vs. 0/1 flags), so the
# solver gets a larger iteration budget than the default to help it converge.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(Xtrain, ytrain)
y_pred = logreg.predict(Xtest)

# ROC curve from the predicted probability of the positive class (column 1).
y_pred_prob = logreg.predict_proba(Xtest)[:,1]
fpr, tpr, thresholds = roc_curve(ytest, y_pred_prob)
plt.plot([0,1], [0,1], 'k--')  # chance-level diagonal for reference
plt.plot(fpr, tpr, label='Logistic Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Logistic Regression ROC Curve')
plt.legend()  # without a legend the curve label above is never displayed
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X = users_ml.drop('adopted', axis=1)
y = users_ml['adopted']
# A fixed random_state makes the split reproducible; stratify=y keeps the
# share of adopted users equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# The estimator has to be created before it is handed to the grid search;
# `knn` was previously never defined, so the cell failed on a fresh kernel.
knn = KNeighborsClassifier()

# 5-fold cross-validated search over k = 1..49.
param_grid = {'n_neighbors': np.arange(1,50)}
knn_cv = GridSearchCV(knn, param_grid, cv=5)

# Fit on the training partition only, so the held-out rows stay unseen.
# The search was previously fitted on all of X, y and the split went unused.
knn_cv.fit(X_train, y_train)
print(knn_cv.best_params_)

print(knn_cv.best_score_)

# Accuracy of the refitted best model on the held-out partition.
print(knn_cv.score(X_test, y_test))
