In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw login events; `time_stamp` is parsed to datetime64 while reading.
# `infer_datetime_format` is no longer passed: it is deprecated since pandas 2.0
# (a strict version of the inference is the default), so it only raised a warning.
engagement = pd.read_csv('data/takehome_user_engagement.csv',
                         parse_dates=['time_stamp'])
engagement.head()

In [None]:
# Print column names, dtypes and non-null counts of the engagement table.
engagement.info()

In [None]:
# Load the user table, indexed by its first column, with `creation_time`
# parsed to datetime64. The file is read as latin-1.
# `infer_datetime_format` is no longer passed: it is deprecated since pandas 2.0
# and redundant with `parse_dates`.
users = pd.read_csv('data/takehome_users.csv',
                    parse_dates=['creation_time'],
                    encoding='latin-1', index_col=0)



In [None]:
# Replace missing last-session values with 0 (the Unix epoch once the column
# is interpreted as seconds).
# Assign the result back instead of calling fillna(inplace=True) on the
# attribute: that form is chained assignment, which warns on pandas 2.2 and
# silently leaves `users` unchanged when Copy-on-Write is enabled.
users['last_session_creation_time'] = users['last_session_creation_time'].fillna(0)

In [None]:
# Interpret the stored numbers as seconds since the Unix epoch and convert the
# column to datetime64.
last_session_epoch = users['last_session_creation_time']
users['last_session_creation_time'] = pd.to_datetime(last_session_epoch, unit='s')

In [None]:
# Replace missing inviter ids with the placeholder 99999
# (presumably users who signed up without an invitation -- TODO confirm).
# Assign the result back instead of calling fillna(inplace=True) on the
# attribute: that form is chained assignment, which warns on pandas 2.2 and
# silently leaves `users` unchanged when Copy-on-Write is enabled.
users['invited_by_user_id'] = users['invited_by_user_id'].fillna(99999)

In [None]:
# Store the sign-up source as a pandas categorical column.
users['creation_source'] = users['creation_source'].astype('category')


In [None]:
def domain_name(row):
    """Return the first label of the e-mail host found in ``row.email``.

    The address is split on '@' and the piece after the first '@' is taken as
    the host; everything before the host's first '.' is returned
    (``'alice@gmail.com'`` -> ``'gmail'``).

    Raises:
        IndexError: if the address contains no '@'.
    """
    address_parts = row.email.split('@')
    host = address_parts[1]
    return host.split('.', 1)[0]

# Apply `domain_name` to every row and store the result as a new column.
email_domains = users.apply(domain_name, axis='columns')
users['email_domain'] = email_domains

In [None]:
# Keep the six named providers and collapse every other domain label into
# 'other'. Exact membership is used rather than str.contains, which is a
# substring/regex match and would keep labels that merely contain one of the
# names (e.g. 'notgmail').
known_domains = ['gmail', 'yahoo', 'jourrapide', 'cuvox', 'gustr', 'hotmail']
users.loc[~users['email_domain'].isin(known_domains), 'email_domain'] = 'other'

In [None]:
# Print column names, dtypes and non-null counts of `users` at this point.
users.info()

In [None]:
# Show the first rows of `users`, including the derived `email_domain` column.
users.head()

In [None]:
# Collapse every domain label outside the six named providers into 'other'.
# Exact membership is used rather than str.contains, which is a substring/regex
# match and would keep labels that merely contain one of the names.
known_domains = ['gmail', 'yahoo', 'jourrapide', 'cuvox', 'gustr', 'hotmail']
users.loc[~users['email_domain'].isin(known_domains), 'email_domain'] = 'other'

# Cast both low-cardinality text columns to categoricals in one step.
users = users.astype({'creation_source': 'category', 'email_domain': 'category'})

users.head()

In [None]:
# Print column names, dtypes and non-null counts of `users` after the casts above.
users.info()

In [None]:
# Flag "adopted" users: at least three logins on distinct calendar days whose
# first and last day are no more than ADOPTION_WINDOW_DAYS apart.
# NOTE(review): if the window is meant as seven inclusive calendar days, the
# span limit should be ADOPTION_WINDOW_DAYS - 1 -- confirm the definition.
ADOPTION_WINDOW_DAYS = 7

# One row per (user, calendar day), ordered in time within each user.
# `engagement` itself is left untouched so this cell can be re-run safely
# (the previous version overwrote `time_stamp` with integers, so a second run
# failed on the `.dt` accessor).
login_days = (
    engagement
    .assign(login_day=engagement['time_stamp'].dt.normalize())
    [['user_id', 'login_day']]
    .drop_duplicates()
    .sort_values(['user_id', 'login_day'])
)

# For every login day, the time elapsed since the same user's login day two
# visits earlier, i.e. the span covered by three consecutive login days.
# The first two rows of each user have no such predecessor and yield NaT.
three_login_span = login_days.groupby('user_id')['login_day'].diff(periods=2)

# `<=` rather than `==`: three logins packed into fewer days than the window
# also qualify. Comparisons against NaT are False, so users with fewer than
# three login days are never selected.
within_window = three_login_span <= pd.Timedelta(days=ADOPTION_WINDOW_DAYS)
adopted_ids = login_days.loc[within_window, 'user_id'].unique()

# isin() tolerates engagement user ids that are absent from `users`, whereas
# label-based .loc assignment would raise KeyError for them.
users['adopted'] = users.index.isin(adopted_ids).astype(int)

In [None]:
# One-hot encode the two categorical columns; the first level of each is
# dropped so the indicator columns are not perfectly collinear.
categorical_columns = ['creation_source', 'email_domain']
users_ml = pd.get_dummies(users, columns=categorical_columns, drop_first=True)

In [None]:
# Print column names, dtypes and non-null counts of the encoded frame.
users_ml.info()

In [None]:
# Remove the free-text and timestamp columns so they are not part of the
# feature matrix built from `users_ml` afterwards.
non_feature_columns = ['creation_time', 'name', 'email', 'last_session_creation_time']
users_ml = users_ml.drop(columns=non_feature_columns)

In [None]:
# Print the remaining columns and dtypes after the drop above.
users_ml.info()

In [None]:
from sklearn.preprocessing import scale

# Target vector and feature matrix (every column except the target).
y = users_ml['adopted']
X = users_ml.drop(columns='adopted')

# Standardised copy of the features as a NumPy array.
# NOTE(review): scale() uses the mean/std of the full data set, i.e. it is
# computed before any train/test split.
x = scale(X)

In [None]:
# Logistic regression on standardised features
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Split the *unscaled* features and let the pipeline fit the scaler on the
# training rows only; scaling the whole data set before splitting leaks
# test-set statistics into training.
# stratify keeps the adopted / not-adopted ratio equal in both parts, and the
# fixed seed makes the split (and therefore the curve) reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

logreg = make_pipeline(StandardScaler(), LogisticRegression())
logreg.fit(xtrain, ytrain)
y_pred = logreg.predict(xtest)

# Probability of the positive class (second column) drives the ROC curve.
y_pred_prob = logreg.predict_proba(xtest)[:, 1]
fpr, tpr, thresholds = roc_curve(ytest, y_pred_prob)
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.plot(fpr, tpr, label='Logistic Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Logistic Regression ROC Curve')
plt.legend()  # the curve label was set but never displayed
plt.show()

In [None]:
# Logistic regression on the raw (unscaled) features
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split

# stratify keeps the adopted / not-adopted ratio equal in both parts, and the
# fixed seed makes the split (and therefore the curve) reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

logreg = LogisticRegression()
logreg.fit(Xtrain, ytrain)
y_pred = logreg.predict(Xtest)

# Probability of the positive class (second column) drives the ROC curve.
y_pred_prob = logreg.predict_proba(Xtest)[:, 1]
fpr, tpr, thresholds = roc_curve(ytest, y_pred_prob)
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.plot(fpr, tpr, label='Logistic Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Logistic Regression ROC Curve')
plt.legend()  # the curve label was set but never displayed
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier

X = users_ml.drop('adopted', axis=1)
y = users_ml['adopted']
# Stratified, seeded split so the class ratio is preserved and results are
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

# `knn` was referenced without ever being created, which raised NameError.
knn = KNeighborsClassifier()

# 5-fold cross-validated search over k = 1..49.
param_grid = {'n_neighbors': np.arange(1, 50)}
knn_cv = GridSearchCV(knn, param_grid, cv=5)
# Tune on the training rows only; fitting on all of X, y left the split
# unused and no untouched data to evaluate the chosen k on.
knn_cv.fit(X_train, y_train)
print(knn_cv.best_params_)

print(knn_cv.best_score_)

# Accuracy of the refitted best model on the held-out rows.
print(knn_cv.score(X_test, y_test))
