In [1]:
import pandas as pd
import numpy as np
from numpy import meshgrid

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns

dfs = pd.read_excel("Acme.xlsx")

# Here we are predicting for P(buy | click) thus we look at the subset where click is always 1.
dfs_click_copy = dfs.loc[dfs['click'] == 1,].copy()

X = dfs_click_copy[['currently_insured', 'number_of_vehicles', 'number_of_drivers', 'marital_status','rank']]
y = dfs_click_copy['policies sold']


# we take out impression_id since it is just the index + 1.

for i in range(1,6):
    print("rank:", i,", # of people:", dfs_click_copy.loc[dfs_click_copy["rank"]==i,"impression_id"].count())


rank: 1 , # of people: 581
rank: 2 , # of people: 267
rank: 3 , # of people: 375
rank: 4 , # of people: 100
rank: 5 , # of people: 51


In [2]:
cols = ['currently_insured','marital_status','number_of_vehicles','number_of_drivers','rank']

X[cols] = X[cols].astype(str)

#dummies = pd.get_dummies(X[cols])
#ydummies = pd.get_dummies(y)

# NOTE: here, for multiclass and multi-label classification, we DON'T one-hot encode anything.
X_prime = X[cols]

# BUT we do have to make purely string labels into number labels:
X_prime.loc[X_prime.currently_insured == "Y",'currently_insured'] = 1
X_prime.loc[X_prime.currently_insured == "N",'currently_insured'] = 0
X_prime.loc[X_prime.marital_status == "M",'marital_status'] = 1
X_prime.loc[X_prime.marital_status == "S",'marital_status'] = 0

y_prime = y.astype(str)


# Here we combine ranks 2,3 into rank 2 and ranks 4,5 into rank 3: 
# Don't do this?

#X_prime.loc[X_prime['rank'] == "3",'rank'] = "2"
#X_prime.loc[X_prime['rank'] == "2",'rank'] = "2"
#X_prime.loc[X_prime['rank'] == "1",'rank'] = "1"
#X_prime.loc[X_prime['rank'] == "5",'rank'] = "3"         # we have to do it in this order to avoid conflicts 
#X_prime.loc[X_prime['rank'] == "4",'rank'] = "3"         # between the assignments involving "3".

X_prime.head(25)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,currently_insured,marital_status,number_of_vehicles,number_of_drivers,rank
0,0,1,1,1,1
8,0,1,2,1,3
16,0,1,1,2,2
19,1,1,2,1,5
27,0,0,1,2,1
40,0,0,1,2,1
42,0,1,1,2,1
43,0,1,1,1,4
45,0,1,1,2,1
46,0,1,1,2,1


In [3]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_prime,y_prime,test_size = .25,random_state = 614,shuffle = True,stratify = y_prime)


# We are stratifying by y (= policies sold) but we might want to stratify by rank instead...


for i in range(1,6):
    print("rank:", i, ", # of people:", X_train.loc[X_train["rank"]==str(i),"rank"].count())

# So X_train has 436 rank 1 people
# and 200 rank 2 people
# and 281 rank 3 people
# and 75 rank 4 people
# and 38 rank 5 people

rank: 1 , # of people: 447
rank: 2 , # of people: 200
rank: 3 , # of people: 273
rank: 4 , # of people: 77
rank: 5 , # of people: 33


In [4]:
# This is for balancing the training dataset:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()

# this oversamples with respect to policies sold:
X_train,y_train = oversample.fit_resample(X_train,y_train)
y_train.value_counts()

# oversample with rank instead:
#X_train,y_train = oversample.fit_resample(X_train,X_train['rank'])


for i in range(1,6):
    print("rank:", i, ", # of people:", X_train.loc[X_train["rank"]==i,"rank"].count())

# POSSIBLE PROBLEM: Here we are balancing the training set X for POLICIES SOLD, not for RANK!!
# How do we fix this???


Using TensorFlow backend.


rank: 1 , # of people: 548
rank: 2 , # of people: 248
rank: 3 , # of people: 316
rank: 4 , # of people: 91
rank: 5 , # of people: 36


In [5]:
# This takes 15-20 minutes to run, be careful!

from sklearn import metrics
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import ComplementNB

from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

# Here we try a neural network of 5 hidden layers, 65 nodes per layer.
mlp = MLPClassifier(hidden_layer_sizes=(100,5),max_iter=1000000)

svc = SVC(kernel = 'linear', probability = True)                   # linear kernel seems to work best for SVC.
rfc = RandomForestClassifier(max_depth = 10,n_estimators = 500)
lda = LinearDiscriminantAnalysis()
knn = KNeighborsClassifier(n_neighbors = 15)
cnb = ComplementNB()
bag_knn_clf = BaggingClassifier(KNeighborsClassifier(20),
                            n_estimators = 1000,
                            max_samples = 100,
                            bootstrap = True)

# Here we use a bagging neural network (only 10 estimators), as this was said to improve neural nets during class.
bag_mlp_clf = BaggingClassifier(MLPClassifier(hidden_layer_sizes=(100,5),max_iter=1000000),
                            n_estimators = 30,
                            max_samples = 100,
                            bootstrap = True)


paste_knn_clf = BaggingClassifier(KNeighborsClassifier(20),
                            n_estimators = 1000,
                            max_samples = 100,
                            bootstrap = False)

# Here we try AdaBoost with RFC. Maybe some other weak learner algorithm will work better.
ada_clf = AdaBoostClassifier(RandomForestClassifier(max_depth = 10,n_estimators = 500),
                n_estimators = 50,
                algorithm="SAMME.R",
                learning_rate = .3)

voting_clf = VotingClassifier(
                [('lda',lda),
                ('rfc',rfc),
                ('svc',svc),
                ('knn',knn),
                ('mlp',mlp),
                ('ada',ada_clf)],
                voting = "soft")



cv = StratifiedKFold(5, shuffle = True, random_state=614)


# Here 11 is the number of classifiers we're using
finalacc = np.empty(11)
finalprec = np.empty(11)
finalrec = np.empty(11)

# Here we run over all classifiers and then do cross-validation.
k = 0
for name,clf in (["LDA",lda],["RFC",rfc],["SVC",svc],["KNN",knn],["MLP",mlp],["CNB",cnb],["AdaBoost_clf",ada_clf],["Voting_clf",voting_clf],["Bagging_KNN_clf",bag_knn_clf],["Bagging_MLP_clf",bag_mlp_clf],["Pasting_KNN_clf",paste_knn_clf]):
    
    a = np.empty(5)          # 5 for the number of cv-splits.
    p = np.empty(5)
    r = np.empty(5)
    
    j = 0
    for train_idx, test_idx in cv.split(X_train,y_train):

        X_train2 = X_train.iloc[train_idx]
        y_train2 = y_train.iloc[train_idx]
        X_test2 = X_train.iloc[test_idx]
        y_test2 = y_train.iloc[test_idx]
        
        
        clone_clf = clone(clf)
        clone_clf.fit(X_train2,y_train2.ravel())
        
        y_predict = clone_clf.predict(X_test2)
        
        #y_predict = 1*(y_prob >= cutoff/100)       # Just make y_predict the max class...otherwise we end up
                                                    # with some indices being all rank 1,2,3,4, and 5.
        # We basically can't use proba_predict because we have multiple classes of y.    

        a[j] = 100*metrics.accuracy_score(y_test2, y_predict)
        p[j] = 100*metrics.precision_score(y_test2, y_predict, zero_division = 1,average='macro')
        r[j] = 100*metrics.recall_score(y_test2, y_predict, zero_division = 1,average='macro')
        
        # Using macro as the averaging is the same as taking np.mean of the 5 labels' accuracy, precision, and recall.
        
        j = j + 1
        
    # The mean over cross-validations of accuracy, precision, and recall
    finalacc[k] = np.mean(a)
    finalprec[k] = np.mean(p)
    finalrec[k] = np.mean(r)
    
    k = k + 1


In [6]:

print("We predict for y = (policies sold | click == 1,  type = t, rank = x). Oversampled on policies sold.")
print()

k = 0
for name in ["LDA","RFC","SVC","KNN","MLP","CNB","AdaBoost_clf","Voting_clf","Bagging_KNN_clf","Bagging_MLP_clf","Pasting_KNN_clf"]:
    print(name)
    print("accuracy:",np.round(finalacc[k],7),"%")
    print("precision:",np.round(finalprec[k],7),"%")
    print("recall:",np.round(finalrec[k],7),"%")
    print()
    k = k + 1
    

We predict for y = (policies sold | click == 1,  type = t, rank = x). Oversampled on policies sold.

LDA
accuracy: 61.5954139 %
precision: 61.8551775 %
recall: 61.5929032 %

RFC
accuracy: 60.470592 %
precision: 61.2287285 %
recall: 60.4735484 %

SVC
accuracy: 60.0641275 %
precision: 60.3507434 %
recall: 60.0625806 %

KNN
accuracy: 56.9264153 %
precision: 57.6146795 %
recall: 56.9232258 %

MLP
accuracy: 60.387032 %
precision: 61.0685899 %
recall: 60.3883871 %

CNB
accuracy: 60.0641275 %
precision: 60.3507434 %
recall: 60.0625806 %

AdaBoost_clf
accuracy: 60.0670424 %
precision: 60.7982854 %
recall: 60.0696774 %

Voting_clf
accuracy: 60.7092888 %
precision: 60.9611908 %
recall: 60.7096774 %

Bagging_KNN_clf
accuracy: 59.2615624 %
precision: 59.4013641 %
recall: 59.2612903 %

Bagging_MLP_clf
accuracy: 59.5821998 %
precision: 60.3287059 %
recall: 59.5845161 %

Pasting_KNN_clf
accuracy: 59.2602669 %
precision: 59.2981621 %
recall: 59.256129 %

