In [116]:
import warnings

# `plt` is used below for rcParams but was never imported, which raises
# NameError on a fresh kernel (Restart & Run All).
import matplotlib.pyplot as plt
import pandas as pd

# Silence every Python warning for the rest of the session. Note that this
# also hides deprecation and SettingWithCopy warnings from later cells.
warnings.filterwarnings("ignore")

# Default figure size (width, height in inches) for plots drawn in this notebook.
plt.rcParams['figure.figsize'] = (7, 5)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,confusion_matrix,auc,roc_auc_score,accuracy_score,roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [117]:
# Load the dataset from a CSV file located in the notebook's working directory.
df = pd.read_csv("online_shoppers_intention.csv")

In [118]:
# Re-type SpecialDay as a generic object column; all other columns keep their dtype.
# NOTE(review): no later cell visible here treats object columns differently —
# confirm this cast is still needed.
df = df.astype({"SpecialDay": "object"})

In [119]:
# Keep only rows whose VisitorType is not "Other".
# `.copy()` makes the result an independent frame: later cells assign columns
# into `df`, and assigning into a filtered slice triggers pandas'
# SettingWithCopyWarning (currently hidden by the global warning filter).
df = df.loc[df["VisitorType"] != "Other"].copy()

In [120]:
# Encode the Weekend flag numerically (True -> 1, False -> 0).
# Revenue is deliberately not remapped here; it stays as loaded.
weekend_codes = {True: 1, False: 0}
df["Weekend"] = df["Weekend"].map(weekend_codes)

In [121]:
from sklearn.preprocessing import LabelEncoder

# Columns that are replaced in place by integer codes.
cat_cols = ["Month","VisitorType"]

# Keep one fitted encoder per column. Previously a single encoder was refit on
# every column, so only the last column's label <-> code mapping survived and
# the Month codes could not be translated back (e.g. when building inputs for
# the saved model).
label_encoders = {}
for var in cat_cols:
    le = LabelEncoder()
    df[var] = le.fit_transform(df[var])
    label_encoders[var] = le
# After the loop `le` is still the encoder fitted on the last column, as before.

In [122]:
# Display (rows, columns) after the filtering and encoding steps above.
df.shape

(12245, 18)

In [123]:
# Preview the first five rows to confirm the encoded columns are numeric.
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0,2,1,1,1,1,1,0,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0,2,2,2,1,2,1,0,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0,2,4,1,9,3,1,0,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0,2,3,2,2,4,1,0,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0,2,3,3,1,4,1,1,False


In [124]:
# Target is the Revenue column; every remaining column is used as a predictor.
y = df["Revenue"]
X = df.drop(columns=["Revenue"])

In [125]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [108]:
# Standardise the features. The scaler is fitted on the training split only,
# so no statistics from the test split leak into training.
# NOTE(review): this cell's saved execution count is lower than that of the
# split cell before it, so the recorded results further down may have been
# produced WITHOUT scaling — confirm before relying on Restart & Run All.
sc = StandardScaler()
sc.fit(X_train)

# Rebuild DataFrames so the column names survive. The new frames get a fresh
# default index rather than the original row labels.
X_train = pd.DataFrame(sc.transform(X_train), columns=X.columns)
X_test = pd.DataFrame(sc.transform(X_test), columns=X.columns)

###### Applying SMOTE to the training data only

In [126]:

from imblearn.over_sampling import SMOTE

# Class balance of the training split before resampling.
print("Before OverSampling, counts of label '1': {}".format((y_train == 1).sum()))
print("Before OverSampling, counts of label '0': {} \n".format((y_train == 0).sum()))

# Oversample the minority class on the training split only; the test split is
# left untouched. The old `ratio` keyword was deprecated and later removed from
# imbalanced-learn, so it raises TypeError on current releases.
# `sampling_strategy=1.0` asks for a minority/majority ratio of 1 after
# resampling, i.e. fully balanced classes.
sm = SMOTE(random_state=2, sampling_strategy=1.0)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Class balance after resampling (both counts should now match).
print("After OverSampling, counts of label '1': {}".format((y_train_res == 1).sum()))
print("After OverSampling, counts of label '0': {}".format((y_train_res == 0).sum()))

Before OverSampling, counts of label '1': 1333
Before OverSampling, counts of label '0': 7238 

After OverSampling, counts of label '1': 7238
After OverSampling, counts of label '0': 7238


In [127]:
# Rewrap the resampled features as a DataFrame labelled with the training
# column names, so later cells can select columns by name
# (e.g. X_train_res[rf_features]).
X_train_res = pd.DataFrame(X_train_res,columns = X_train.columns)

##### Random Forest

In [128]:
# Baseline random forest on all features, trained on the SMOTE-balanced
# training set and evaluated on the untouched test split.
RF = RandomForestClassifier(
    n_estimators=300,
    max_depth=40,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=0,
)
RF.fit(X_train_res, y_train_res)

# Per-class precision / recall / F1 on the held-out data.
y_pred_RF = RF.predict(X_test)
print(classification_report(y_test, y_pred_RF))

# Train vs. test accuracy; a large gap between the two indicates overfitting.
rf_train_accuracy = RF.score(X_train_res, y_train_res)
print("Accuracy on training set: {:.3f}".format(rf_train_accuracy))
rf_test_accuracy = RF.score(X_test, y_test)
print("Accuracy on test set: {:.3f}".format(rf_test_accuracy))

              precision    recall  f1-score   support

       False       0.95      0.92      0.94      3115
        True       0.63      0.74      0.68       559

    accuracy                           0.89      3674
   macro avg       0.79      0.83      0.81      3674
weighted avg       0.90      0.89      0.90      3674

Accuracy on training set: 1.000
Accuracy on test set: 0.894


##### Random Forest Hyperparameter Tuning

In [54]:
# Exhaustive grid search over tree depth and the split/leaf minimums:
# 3 x 3 x 3 = 27 candidates, each scored with 5-fold CV on weighted F1.
# Note that this rebinds `RF` to a fresh, unfitted estimator.
# NOTE(review): X_train_res already contains SMOTE-synthesised rows, so the CV
# folds are not independent of each other and the scores may be optimistic.
RF = RandomForestClassifier(random_state=0)
parameters = {
    'n_estimators': [300],
    'criterion': ['gini'],
    'max_depth': [20, 30, 40],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3],
}
grid_search = GridSearchCV(
    estimator=RF,
    param_grid=parameters,
    scoring='f1_weighted',
    cv=5,
    verbose=3,
)
grid_search.fit(X_train_res, y_train_res)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] criterion=gini, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=300 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  criterion=gini, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=300, score=0.876, total=   3.9s
[CV] criterion=gini, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=300 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.8s remaining:    0.0s


[CV]  criterion=gini, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=300, score=0.893, total=   3.5s
[CV] criterion=gini, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=300 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.4s remaining:    0.0s


[CV]  criterion=gini, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=300, score=0.881, total=   3.4s
[CV] criterion=gini, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=300 
[CV]  criterion=gini, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=300, score=0.941, total=   3.5s
[CV] criterion=gini, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=300 
[CV]  criterion=gini, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=300, score=0.940, total=   3.4s
[CV] criterion=gini, max_depth=20, min_samples_leaf=1, min_samples_split=4, n_estimators=300 
[CV]  criterion=gini, max_depth=20, min_samples_leaf=1, min_samples_split=4, n_estimators=300, score=0.876, total=   3.7s
[CV] criterion=gini, max_depth=20, min_samples_leaf=1, min_samples_split=4, n_estimators=300 
[CV]  criterion=gini, max_depth=20, min_samples_leaf=1, min_samples_split=4, n_estimators=300, score=0.888, total=   3.3s
[CV] criterion

[CV]  criterion=gini, max_depth=20, min_samples_leaf=3, min_samples_split=6, n_estimators=300, score=0.875, total=   3.6s
[CV] criterion=gini, max_depth=20, min_samples_leaf=3, min_samples_split=6, n_estimators=300 
[CV]  criterion=gini, max_depth=20, min_samples_leaf=3, min_samples_split=6, n_estimators=300, score=0.890, total=   3.5s
[CV] criterion=gini, max_depth=20, min_samples_leaf=3, min_samples_split=6, n_estimators=300 
[CV]  criterion=gini, max_depth=20, min_samples_leaf=3, min_samples_split=6, n_estimators=300, score=0.881, total=   3.6s
[CV] criterion=gini, max_depth=20, min_samples_leaf=3, min_samples_split=6, n_estimators=300 
[CV]  criterion=gini, max_depth=20, min_samples_leaf=3, min_samples_split=6, n_estimators=300, score=0.931, total=   4.3s
[CV] criterion=gini, max_depth=20, min_samples_leaf=3, min_samples_split=6, n_estimators=300 
[CV]  criterion=gini, max_depth=20, min_samples_leaf=3, min_samples_split=6, n_estimators=300, score=0.930, total=   3.4s
[CV] criterion

[CV]  criterion=gini, max_depth=30, min_samples_leaf=3, min_samples_split=2, n_estimators=300, score=0.931, total=   3.2s
[CV] criterion=gini, max_depth=30, min_samples_leaf=3, min_samples_split=2, n_estimators=300 
[CV]  criterion=gini, max_depth=30, min_samples_leaf=3, min_samples_split=2, n_estimators=300, score=0.929, total=   3.7s
[CV] criterion=gini, max_depth=30, min_samples_leaf=3, min_samples_split=4, n_estimators=300 
[CV]  criterion=gini, max_depth=30, min_samples_leaf=3, min_samples_split=4, n_estimators=300, score=0.873, total=   3.1s
[CV] criterion=gini, max_depth=30, min_samples_leaf=3, min_samples_split=4, n_estimators=300 
[CV]  criterion=gini, max_depth=30, min_samples_leaf=3, min_samples_split=4, n_estimators=300, score=0.890, total=   3.1s
[CV] criterion=gini, max_depth=30, min_samples_leaf=3, min_samples_split=4, n_estimators=300 
[CV]  criterion=gini, max_depth=30, min_samples_leaf=3, min_samples_split=4, n_estimators=300, score=0.881, total=   3.1s
[CV] criterion

[CV]  criterion=gini, max_depth=40, min_samples_leaf=2, min_samples_split=6, n_estimators=300, score=0.891, total=   3.1s
[CV] criterion=gini, max_depth=40, min_samples_leaf=2, min_samples_split=6, n_estimators=300 
[CV]  criterion=gini, max_depth=40, min_samples_leaf=2, min_samples_split=6, n_estimators=300, score=0.882, total=   3.2s
[CV] criterion=gini, max_depth=40, min_samples_leaf=2, min_samples_split=6, n_estimators=300 
[CV]  criterion=gini, max_depth=40, min_samples_leaf=2, min_samples_split=6, n_estimators=300, score=0.935, total=   3.4s
[CV] criterion=gini, max_depth=40, min_samples_leaf=2, min_samples_split=6, n_estimators=300 
[CV]  criterion=gini, max_depth=40, min_samples_leaf=2, min_samples_split=6, n_estimators=300, score=0.930, total=   3.1s
[CV] criterion=gini, max_depth=40, min_samples_leaf=3, min_samples_split=2, n_estimators=300 
[CV]  criterion=gini, max_depth=40, min_samples_leaf=3, min_samples_split=2, n_estimators=300, score=0.873, total=   3.1s
[CV] criterion

[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed:  7.9min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=0,
                                   

In [55]:
# Hyperparameter combination with the highest mean cross-validated score.
grid_search.best_params_

{'criterion': 'gini',
 'max_depth': 40,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 300}

In [56]:
# Mean cross-validated weighted-F1 of the best combination (measured on the resampled training data).
grid_search.best_score_

0.9064623347797248

In [129]:
# Five-column subset used to train and query the reduced random forest below.
# The order matters: hand-built prediction inputs must list values in this order.
# NOTE(review): presumably picked from feature importances — the selection step is not shown.
rf_features = [
    'PageValues',
    'ExitRates',
    'Administrative',
    'Month',
    'ProductRelated_Duration',
]

In [133]:
# Reduced model: the same forest recipe, restricted to the columns in rf_features.
# NOTE(review): the grid search above reported max_depth=40 as best, while this
# model uses max_depth=20 and a different seed — confirm that is intended.
Random_forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=40,
)

# Select the feature subset once and reuse it for fitting and scoring.
X_train_sel = X_train_res[rf_features]
X_test_sel = X_test[rf_features]

Random_forest.fit(X_train_sel, y_train_res)
y_pred_rf = Random_forest.predict(X_test_sel)
print(classification_report(y_test, y_pred_rf))

# Train vs. test accuracy for the reduced model.
reduced_train_accuracy = Random_forest.score(X_train_sel, y_train_res)
print("Accuracy on training set: {:.3f}".format(reduced_train_accuracy))
reduced_test_accuracy = Random_forest.score(X_test_sel, y_test)
print("Accuracy on test set: {:.3f}".format(reduced_test_accuracy))

              precision    recall  f1-score   support

       False       0.95      0.91      0.93      3115
        True       0.60      0.75      0.67       559

    accuracy                           0.89      3674
   macro avg       0.78      0.83      0.80      3674
weighted avg       0.90      0.89      0.89      3674

Accuracy on training set: 1.000
Accuracy on test set: 0.885


In [153]:
import pickle

# Save the fitted reduced-feature forest to disk. The `with` block guarantees
# the handle is flushed and closed; the original passed a bare open() result
# and never closed it.
with open('model1.pkl', 'wb') as model_file:
    pickle.dump(Random_forest, model_file)

# Reload the model to check that it survives the round trip.
# SECURITY: pickle.load can execute arbitrary code contained in the file —
# only load pickles that you created yourself or otherwise trust.
with open('model1.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Smoke-test prediction on one hand-written sample. Values are positional and
# must follow the column order the model was fitted with (rf_features).
a = model.predict([[999,0,3,7,9999]])

In [154]:
# `a` is the one-element prediction array from the reloaded model;
# print one of two messages depending on whether it is truthy.
print('Hello True' if a else "Hello")

Hello True
