In [0]:
import pandas as pd
import numpy as np
import datetime as datetime
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix, recall_score
import seaborn as sns

In [0]:
# Mount Google Drive at /drive; the CSV paths read by later cells live under "/drive/My Drive/".
from google.colab import drive
drive.mount("/drive")

In [0]:
# Read the hit-level timing export and order it by visit, then by hit number within each visit.
hittime_csv = "/drive/My Drive/Googel Analytics - Capstone/Fatema/hittime_fullyear.csv"
df_time = pd.read_csv(hittime_csv).sort_values(by=['visitId', 'hitnum'])

In [0]:
# Preview the hit-level frame after sorting by visitId and hitnum.
df_time

# Create time on home page feature

In [0]:
# Per-visit features on the hit-level frame (rows are already sorted by visitId, hitnum).
hits_by_visit = df_time.groupby("visitId")

# Number of hits recorded for the visit each row belongs to.
df_time["totalVisitHits"] = hits_by_visit["hitnum"].transform('count')

# Time on page = time of the NEXT hit in the same visit minus the time of this hit.
# The shift is done inside each visit so the last hit of a visit is never differenced
# against the first hit of the following visit; it gets NaN instead.
# Assumes hittime is non-decreasing with hitnum within a visit -- TODO confirm.
df_time["timeOnPage"] = hits_by_visit["hittime"].shift(-1) - df_time["hittime"]
df_time

In [0]:
# Replace negative page durations with 0; non-negative values and NaN are left untouched.
page_seconds = df_time["timeOnPage"]
df_time["timeOnPage"] = page_seconds.mask(page_seconds < 0, 0)
df_time

In [0]:
# Keep only hits whose page title (pt) is one of the two home-page titles.
home_titles = ["Home", "Google Online Store"]
df_timeOnHome = df_time[df_time["pt"].isin(home_titles)]

In [0]:
# Restrict to rows where the home page was the first hit of the visit (hitnum == 1).
is_first_hit = df_timeOnHome["hitnum"].eq(1)
df_timeOnHome = df_timeOnHome.loc[is_first_hit]

In [0]:
# Preview the visits whose first hit was on a home-page title.
df_timeOnHome

In [0]:
#df.loc[df.groupby("item")["diff"].idxmin()]

# merge with traffic source data and clean up

In [0]:
# Read the traffic-source export.
traffic_source_csv = "/drive/My Drive/Googel Analytics - Capstone/Fatema/trafficSource_fullyear2.csv"
ts = pd.read_csv(traffic_source_csv)

In [0]:
# Discard the two unnamed columns that came in with the CSV.
leftover_index_columns = ["Unnamed: 0", "Unnamed: 0.1"]
ts = ts.drop(columns=leftover_index_columns)

In [0]:
# Inner-join traffic-source rows to the home-page first-hit rows, then remove the
# duplicated right-hand copies of fullVisitorId and timeOnSite created by the merge.
# NOTE(review): the join key is visitId alone -- confirm visitId is unique across visitors.
merged_visits = pd.merge(ts, df_timeOnHome, on="visitId")
df_ts = merged_visits.drop(columns=["fullVisitorId_y", "timeOnSite_y"])

In [0]:
# Columns removed from the merged frame before modelling.
# networkDomain, isMobile and hitnum are kept (a previous version of this cell dropped them too).
unused_columns = [
    "metro", "city", "region", "timeOnSite_x", "totalVisitHits",
    "fullVisitorId_x", "visitId", "date",
    "transactionRevenue", "totalTransactionRevenue", "sessionQualityDim",
    "continent", "subContinent",
    "pt", "gclId", "pageviews", "hits",
]
df_ts = df_ts.drop(columns=unused_columns)


df_ts

# Add day-of-week (DoW) and hour-of-day (ToD) features

In [0]:
# Parse visitStartTime once (format month/day/2-digit-year hour:minute) instead of
# running strptime twice per row, then derive both calendar features from the result.
visit_start = pd.to_datetime(df_ts["visitStartTime"], format='%m/%d/%y %H:%M')
df_ts['dow'] = visit_start.dt.weekday   # Monday == 0 ... Sunday == 6
df_ts['hour'] = visit_start.dt.hour     # 0-23

# Impute and drop unnecessary columns

In [0]:
# Feature matrix: every column except the target. drop() returns an independent frame,
# so the column assignments made on x_or in later cells do not raise SettingWithCopyWarning
# (which the previous .loc column slice did).
x_or = df_ts.drop(columns=['transactions'])

# Binary target: 1 when the visit has a non-missing transactions value, 0 when it is NaN.
y_or = df_ts['transactions'].notna().astype(int).to_numpy()
y_or

In [0]:
# Table of feature columns that contain at least one missing value:
# 'column' holds the column name, 'Bool' the (always True after filtering) has-NaN flag.
has_missing = x_or.isna().any()
nacol = has_missing.rename_axis('column').reset_index(name='Bool')
nacol = nacol.loc[nacol['Bool']]
nacol

In [0]:
# Impute every column listed in nacol: numeric columns get 0, all other columns get "No".
# The dtype is checked explicitly instead of relying on a bare `except:` around np.isnan,
# which would also have hidden unrelated errors (e.g. a misspelled column name).
for col in nacol["column"]:
    if pd.api.types.is_numeric_dtype(x_or[col]):
        x_or[col] = x_or[col].fillna(0)
    else:
        x_or[col] = x_or[col].fillna("No")

# Sanity check: nothing should be missing after imputation.
assert not x_or.isna().any().any(), "missing values remain after imputation"

In [0]:
# List the feature columns remaining after imputation.
x_or.columns

# Label encode categoricals

In [0]:
# Columns that are label-encoded before modelling, one name per line.
xcat = [
    'channelGrouping',
    'referralPath',
    'campaign',
    'source',
    'medium',
    'keyword',
    'adContent',
    'page',
    'slot',
    'adNetworkType',
    'browser',
    'operatingSystem',
    'isMobile',
    'deviceCategory',
    'country',
    'networkDomain',
    'dow',
    'hour',
]

xcat

In [0]:
# Replace each categorical column by integer codes. The single encoder is refit for every
# column in xcat order, so after this cell `le` holds the mapping of the last column only.
le = preprocessing.LabelEncoder()

encoded_columns = {name: le.fit_transform(x_or[name]) for name in xcat}
x_or = x_or.assign(**encoded_columns)

In [0]:
# The raw timestamp string is no longer needed once dow and hour exist.
x_or = x_or.drop(columns=["visitStartTime"])

In [0]:
# Preview the fully encoded feature matrix.
x_or

In [0]:
# Hold out 30% of the rows for testing; the seed fixes which rows land in each part.
# NOTE(review): the split is not stratified on y_or -- consider stratify=y_or if positives are rare.
test_fraction = 0.3
split_seed = 0
x, x_test, y, y_test = train_test_split(
    x_or,
    y_or,
    test_size=test_fraction,
    random_state=split_seed,
)

# Random Forest Classifier

In [0]:
# Baseline random forest with 7 trees, fitted on the training split.
# random_state is fixed so the scores printed by the following cells are reproducible
# across kernel restarts (the same seed value, 0, is used by train_test_split).
rf = RandomForestClassifier(n_estimators=7, random_state=0)
rf.fit(x, y)

In [0]:
# Hard class predictions of the fitted forest on the held-out test split.
ypred_test = rf.predict(x_test)

In [0]:
# Report test-set accuracy, precision and F1 of the baseline forest, one line per metric.
for label, metric in (
    ('Accuracy Score : ', accuracy_score),
    ('Precision Score : ', precision_score),
    ('F1 Score : ', f1_score),
):
    print(label + str(metric(y_test, ypred_test)))



# nest =10
#Accuracy Score : 0.9873223224005292
#Precision Score : 0.40064102564102566
#F1 Score : 0.17041581458759372

# nest =7
#Accuracy Score : 0.9867493788771349
#Precision Score : 0.39246323529411764
#F1 Score : 0.2513243084167157

In [0]:
# 5-fold cross-validated precision of the forest on the training split.
scores = cross_val_score(estimator=rf, X=x, y=y, scoring="precision", cv=5)
scores

# RF Confusion Matrix

In [0]:
# ypred holds predictions on the TRAINING split; the matrix displayed below is for the test split.
ypred = rf.predict(X=x)
confusion_matrix(y_true=y_test, y_pred=ypred_test)

# RF feature importances

In [0]:
# Feature importances of the baseline forest, largest first.
rf_importances = pd.Series(data=rf.feature_importances_, index=x.columns)
rf_importances.sort_values(ascending=False)

# RF Probabilities

In [0]:
# Predicted probability of class 1 (second column of predict_proba) for each test row.
yproba_rf = rf.predict_proba(x_test)[:, 1]

In [0]:
# Sweep the decision cut-off from 0.0 to 0.9 in steps of 0.1 and print, for each cut-off,
# precision, recall and F1 on the test split.
# The class-1 probabilities do not depend on the cut-off, so they are computed once
# outside the loop instead of once per iteration.
rf_proba_all = rf.predict_proba(x_test)[:, 1]
for pct in range(0, 99, 10):
    i = pct / 100
    # 1.0 where the probability is strictly above the cut-off, else 0.0.
    yproba_rf = (rf_proba_all > i).astype(float)
    print(i, " ",precision_score(y_test,yproba_rf), recall_score(y_test,yproba_rf), f1_score(y_test, yproba_rf))

In [0]:
# Binarize the class-1 probabilities at a cut-off of 0.1 and report precision plus the
# confusion matrix on the test split.
# A single np.where covers every value: the previous `> 0.1` / `< 0.1` pair left a
# probability exactly equal to 0.1 unbinarized, which makes the metric calls fail.
yproba_rf = rf.predict_proba(x_test)[:, 1]
yproba_rf = np.where(yproba_rf > 0.1, 1.0, 0.0)

print(precision_score(y_test,yproba_rf))
a = confusion_matrix(y_test, yproba_rf)
print(a)

# Grid search

In [0]:
# Grid search over forest size and tree depth with 5-fold cross-validation.
# random_state on the estimator makes every candidate fit reproducible; the global NumPy
# seed alone is not reliably shared with the worker processes started by n_jobs=-1.
rf = RandomForestClassifier(random_state=0)

param_grid = {
    'n_estimators': list(range(2,30,10)),
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' is rejected by current scikit-learn.
    'max_features': ['sqrt'],
    'max_depth': list(range(3,15)),
}

np.random.seed(0)
# NOTE(review): no `scoring` is given, so candidates are ranked by the estimator's default
# score, while the boosting search in this notebook uses scoring='f1' -- confirm intended.
grid_search = GridSearchCV(rf,param_grid, cv=5,return_train_score=True, n_jobs = -1, verbose = True)
grid_search.fit(x,y)

In [0]:
# Test-split predictions of the best grid-search forest, and their confusion matrix.
ypred_test = grid_search.predict(X=x_test)
confusion_matrix(y_true=y_test, y_pred=ypred_test)

In [0]:
# Test-set scores of the grid-searched forest.
# These use y_test / ypred_test (the grid-search predictions from the previous cell);
# the earlier version scored y / ypred, i.e. training predictions of the first 7-tree forest.
print('Accuracy Score : ' + str(accuracy_score(y_test,ypred_test)))
print('Precision Score : ' + str(precision_score(y_test,ypred_test)))
print('F1 Score : ' + str(f1_score(y_test,ypred_test)))

In [0]:
# Hyper-parameter combination selected by the random-forest grid search.
grid_search.best_params_

In [0]:
# Sweep the decision cut-off from 0.0 to 0.9 in steps of 0.1 for the grid-searched forest
# and print precision, recall and F1 on the test split for each cut-off.
# predict_proba is independent of the cut-off, so it is evaluated once before the loop.
gs_proba_all = grid_search.predict_proba(x_test)[:, 1]
for pct in range(0, 99, 10):
    i = pct / 100
    # 1.0 where the probability is strictly above the cut-off, else 0.0.
    yproba_rf = (gs_proba_all > i).astype(float)
    print(i, " ",precision_score(y_test,yproba_rf), recall_score(y_test,yproba_rf), f1_score(y_test, yproba_rf))

In [0]:
# Final random-forest metrics: class-1 probabilities of the grid-searched model,
# binarized at a cut-off of 0.2 (strictly greater -> 1, otherwise 0).
rf_cutoff = 0.2
tuned_rf_proba = grid_search.predict_proba(x_test)[:, 1]
yproba_rf = np.where(tuned_rf_proba > rf_cutoff, 1.0, 0.0)

rf_acc, rf_pscore, rf_recall, rf_f1 = (
    metric(y_test, yproba_rf)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [0]:
# Feature importances of the best forest found by the grid search, largest first.
bestrf = grid_search.best_estimator_
rf_fi = pd.Series(data=bestrf.feature_importances_, index=x.columns)
rf_fi = rf_fi.sort_values(ascending=False)
rf_fi

# Boost

In [0]:
# Grid search for gradient boosting, ranked by F1 with 5-fold cross-validation.
# subsample=0.5 makes fitting stochastic, so random_state is fixed on the estimator:
# the global NumPy seed is not reliably shared with the n_jobs=-1 worker processes.
np.random.seed(0)
param_grid = [
    {'subsample':[0.5,1],
    'learning_rate':[0.025,0.05,0.1],
    'max_depth':[2,3,4],
    'n_estimators':list(range(2,30,10))}]
boost = GradientBoostingClassifier(random_state=0)
boostgs = GridSearchCV(boost,param_grid, cv=5, scoring='f1',return_train_score=True, n_jobs=-1)
boostgs.fit(x,y)

In [0]:
# Test-split predictions of the best boosting model, followed by accuracy, precision,
# F1 and the confusion matrix.
ypred_boost_test = boostgs.predict(x_test)

for label, metric in (
    ('Accuracy Score : ', accuracy_score),
    ('Precision Score : ', precision_score),
    ('F1 Score : ', f1_score),
):
    print(label + str(metric(y_test, ypred_boost_test)))

print(confusion_matrix(y_test, ypred_boost_test))

-------

In [0]:
# Sweep the decision cut-off from 0.0 to 0.9 in steps of 0.1 for the boosting model and
# print precision, recall and F1 on the test split for each cut-off.
# predict_proba is independent of the cut-off, so it is evaluated once before the loop.
gb_proba_all = boostgs.predict_proba(x_test)[:, 1]
for pct in range(0, 99, 10):
    i = pct / 100
    # 1.0 where the probability is strictly above the cut-off, else 0.0.
    yproba_gb = (gb_proba_all > i).astype(float)
    print(i, " ", precision_score(y_test,yproba_gb), recall_score(y_test,yproba_gb) ,f1_score(y_test, yproba_gb))

In [0]:
# Final gradient-boosting metrics: class-1 probabilities of the grid-searched model,
# binarized at a cut-off of 0.1 (strictly greater -> 1, otherwise 0).
gb_cutoff = 0.1
tuned_gb_proba = boostgs.predict_proba(x_test)[:, 1]
yproba_gb = np.where(tuned_gb_proba > gb_cutoff, 1.0, 0.0)

gb_acc, gb_pscore, gb_recall, gb_f1 = (
    metric(y_test, yproba_gb)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [0]:
# Test-set precision of the boosting model at the 0.1 cut-off.
gb_pscore

In [0]:
# Feature importances of the best boosting model as a two-column table
# ("Features", "Importances"), sorted largest first, then drawn as a horizontal bar chart.
bestboost = boostgs.best_estimator_
boost_fi = (
    pd.Series(bestboost.feature_importances_, index=x.columns)
    .sort_values(ascending=False)
    .rename_axis("Features")
    .reset_index(name="Importances")
)

sns.barplot(x="Importances", y="Features",  data=boost_fi)

# Scores

In [0]:
# Long-format comparison table: one row per (metric, model) pair with columns
# "Type" (metric name), "Model" and "Score".
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1']
scores_wide = pd.DataFrame({
    'Type': metric_names,
    'Random Forest': [rf_acc, rf_pscore, rf_recall, rf_f1],
    "GB": [gb_acc, gb_pscore, gb_recall, gb_f1],
})
scores = scores_wide.melt(
    id_vars="Type",
    value_vars=['Random Forest', 'GB'],
    var_name="Model",
    value_name="Score",
)
print(scores)

In [0]:
# Grouped horizontal bar chart comparing the two models on each metric.
sns.set(style="whitegrid")
ax = sns.barplot(x="Score", y="Type", hue = "Model", palette = "hls", data=scores)
# despine must run after the axes exist; called before plotting it has nothing to act on.
sns.despine(ax=ax)
