In [1]:
import pandas as pd
import numpy as np
from numpy import meshgrid

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns

dfs = pd.read_excel("Acme.xlsx")

# Here we are predicting for P(buy | click) thus we look at the subset where click is always 1.
dfs_click_copy = dfs.loc[dfs['click'] == 1,].copy()

X = dfs_click_copy[['currently_insured', 'number_of_vehicles', 'number_of_drivers', 'marital_status','rank']]
y = dfs_click_copy['policies sold']
# should we include rank in X????????????????


# we take out impression_id since it is just the index + 1.

for i in range(1,6):
    print("rank:", i,", # of people:", dfs_click_copy.loc[dfs_click_copy["rank"]==i,"impression_id"].count())

# thus we have 51 rank 5 entries
# 100 rank 4 entries
# 375 rank 3 entries
# 267 rank 2 entries
# and 581 rank 1 entries.

rank: 1 , # of people: 581
rank: 2 , # of people: 267
rank: 3 , # of people: 375
rank: 4 , # of people: 100
rank: 5 , # of people: 51


In [2]:
cols = ['currently_insured','marital_status','number_of_vehicles','number_of_drivers','rank']

X[cols] = X[cols].astype(str)

#dummies = pd.get_dummies(X[cols])
#ydummies = pd.get_dummies(y)

# NOTE: here, for multiclass and multi-label classification, we DON'T one-hot encode anything.
X_prime = X[cols]

# BUT we do have to make purely string labels into number labels:
X_prime.loc[X_prime.currently_insured == "Y",'currently_insured'] = 1
X_prime.loc[X_prime.currently_insured == "N",'currently_insured'] = 0
X_prime.loc[X_prime.marital_status == "M",'marital_status'] = 1
X_prime.loc[X_prime.marital_status == "S",'marital_status'] = 0

#X_prime = pd.concat([X[[c for c in X.columns if c not in cols]],dummies],axis=1, sort=False)
#X_prime = X_prime.astype(int)


# This is for predicting on one rank value at a time.
    #rank = 1
    #y_prime = ydummies.loc[:,rank]


y_prime = y.astype(str)


# Here we combine ranks 2,3 into rank 2 and ranks 4,5 into rank 3: 
# Don't do this?

#X_prime.loc[X_prime['rank'] == "3",'rank'] = "2"
#X_prime.loc[X_prime['rank'] == "2",'rank'] = "2"
#X_prime.loc[X_prime['rank'] == "1",'rank'] = "1"
#X_prime.loc[X_prime['rank'] == "5",'rank'] = "3"         # we have to do it in this order to avoid conflicts 
#X_prime.loc[X_prime['rank'] == "4",'rank'] = "3"         # between the assignments involving "3".

X_prime.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,currently_insured,marital_status,number_of_vehicles,number_of_drivers,rank
0,0,1,1,1,1
8,0,1,2,1,3
16,0,1,1,2,2
19,1,1,2,1,5
27,0,0,1,2,1


In [3]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_prime,y_prime,test_size = .25,random_state = 614,shuffle = True,stratify = X_prime["rank"])


# We are stratifying by y (= policies sold) but we might want to stratify by rank instead...


for i in range(1,6):
    print("rank:", i, ", # of people:", X_train.loc[X_train["rank"]==str(i),"rank"].count())

# So X_train has 447 rank 1 people
# and 473 rank 2 people (original ranks 2 and 3)
# and 110 rank 3 people (original ranks 4 and 5)


rank: 1 , # of people: 436
rank: 2 , # of people: 200
rank: 3 , # of people: 281
rank: 4 , # of people: 75
rank: 5 , # of people: 38


In [4]:
# This is for balancing the training dataset:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()

X_train,y_train = oversample.fit_resample(X_train,y_train)

for i in range(1,6):
    print("rank:", i, ", # of people:", X_train.loc[X_train["rank"]==i,"rank"].count())

# POSSIBLE PROBLEM: Here we are balancing the training set X for POLICIES SOLD, not for RANK!!
# How do we fix this???
    
# So X_train, oversampled, has 528 rank 1 people
# and 237 rank 2 people
# and 336 rank 3 people
# and 88 rank 4 people
# and 52 rank 5 people


Using TensorFlow backend.


rank: 1 , # of people: 528
rank: 2 , # of people: 237
rank: 3 , # of people: 336
rank: 4 , # of people: 88
rank: 5 , # of people: 52


In [5]:
# This takes 15-20 minutes to run, be careful!

from sklearn import metrics
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

# Here we try a neural network of 5 hidden layers, 65 nodes per layer.
mlp = MLPClassifier(hidden_layer_sizes=(65,5),max_iter=10000000)

svc = SVC(kernel = 'linear', probability = True)                   # linear kernel seems to work best for SVC.
rfc = RandomForestClassifier(max_depth = 10,n_estimators = 500)
lda = LinearDiscriminantAnalysis()
knn = KNeighborsClassifier(n_neighbors = 5)
bag_knn_clf = BaggingClassifier(KNeighborsClassifier(20),
                            n_estimators = 1000,
                            max_samples = 100,
                            bootstrap = True)

# Here we use a bagging neural network (only 10 estimators), as this was said to improve neural nets during class.
bag_mlp_clf = BaggingClassifier(MLPClassifier(hidden_layer_sizes=(65,5),max_iter=1000),
                            n_estimators = 10,
                            max_samples = 100,
                            bootstrap = True)


paste_knn_clf = BaggingClassifier(KNeighborsClassifier(20),
                            n_estimators = 1000,
                            max_samples = 100,
                            bootstrap = False)

# Here we try AdaBoost with RFC. Maybe some other weak learner algorithm will work better.
ada_clf = AdaBoostClassifier(RandomForestClassifier(max_depth = 10,n_estimators = 500),
                n_estimators = 50,
                algorithm="SAMME.R",
                learning_rate = 1)

voting_clf = VotingClassifier(
                [('lda',lda),
                ('rfc',rfc),
                ('svc',svc),
                ('knn',knn),
                ('mlp',mlp),
                ('ada',ada_clf)],
                voting = "soft")


cv = StratifiedKFold(5, shuffle = True, random_state=614)



# Here we balance the data using SMOTE (combine with RandomUnderSampler? We'll see later...)
# source: https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
# ????????
X_train,y_train = oversample.fit_resample(X_train,y_train)


# Here 10 is the number of classifiers we're using
finalacc = np.empty(10)
finalprec = np.empty(10)
finalrec = np.empty(10)

# Here we run over all classifiers and then do cross-validation.
k = 0
for name,clf in (["LDA",lda],["RFC",rfc],["SVC",svc],["KNN",knn],["MLP",mlp],["AdaBoost_clf",ada_clf],["Voting_clf",voting_clf],["Bagging_KNN_clf",bag_knn_clf],["Bagging_MLP_clf",bag_mlp_clf],["Pasting_KNN_clf",paste_knn_clf]):
    
    a = np.empty(5)          # 5 for the number of cv-splits.
    p = np.empty(5)
    r = np.empty(5)
    
    j = 0
    for train_idx, test_idx in cv.split(X_train,y_train):

        X_train2 = X_train.iloc[train_idx]
        y_train2 = y_train.iloc[train_idx]
        X_test2 = X_train.iloc[test_idx]
        y_test2 = y_train.iloc[test_idx]
        
        
        clone_clf = clone(clf)
        clone_clf.fit(X_train2,y_train2.ravel())
        
        y_predict = clone_clf.predict(X_test2)
        
        #y_predict = 1*(y_prob >= cutoff/100)       # Just make y_predict the max class...otherwise we end up
                                                    # with some indices being all rank 1,2,3,4, and 5.
        # We basically can't use proba_predict because we have multiple classes of y.    

        a[j] = 100*metrics.accuracy_score(y_test2, y_predict)
        p[j] = 100*metrics.precision_score(y_test2, y_predict, zero_division = 1,average='macro')
        r[j] = 100*metrics.recall_score(y_test2, y_predict, zero_division = 1,average='macro')
        
        # Using macro as the averaging is the same as taking np.mean of the 5 labels' accuracy, precision, and recall.
        
        j = j + 1
        
    # The mean over cross-validations of accuracy, precision, and recall
    finalacc[k] = np.mean(a)
    finalprec[k] = np.mean(p)
    finalrec[k] = np.mean(r)
    
    k = k + 1




In [7]:

print("All ranks are left separate here, and we predict for y = policies sold.")
print("Note also that we have oversampled to balance out the training set. But this oversampling occurs for policies sold, not rank!")
print()

k = 0
for name in ["LDA","RFC","SVC","KNN","MLP","AdaBoost_clf","Voting_clf","Bagging_KNN_clf","Bagging_MLP_clf","Pasting_KNN_clf"]:
    print(name)
    print("accuracy:",np.round(finalacc[k],7),"%")
    print("precision:",np.round(finalprec[k],7),"%")
    print("recall:",np.round(finalrec[k],7),"%")
    print()
    k = k + 1
    

All ranks are left separate here, and we predict for y = policies sold.
Note also that we have oversampled to balance out the training set. But this oversampling occurs for policies sold, not rank!

LDA
accuracy: 61.8563855 %
precision: 62.1367003 %
recall: 61.8554839 %

RFC
accuracy: 59.0464257 %
precision: 59.3854341 %
recall: 59.0406452 %

SVC
accuracy: 61.4554217 %
precision: 61.8959416 %
recall: 61.4548387 %

KNN
accuracy: 55.7683534 %
precision: 55.9746513 %
recall: 55.7709677 %

MLP
accuracy: 59.446747 %
precision: 59.9543125 %
recall: 59.4432258 %

AdaBoost_clf
accuracy: 59.6080321 %
precision: 59.9828169 %
recall: 59.6019355 %

Voting_clf
accuracy: 59.2125301 %
precision: 59.7267633 %
recall: 59.2180645 %

Bagging_KNN_clf
accuracy: 58.0880321 %
precision: 58.2544493 %
recall: 58.0806452 %

Bagging_MLP_clf
accuracy: 61.6954217 %
precision: 62.3160036 %
recall: 61.6922581 %

Pasting_KNN_clf
accuracy: 58.648996 %
precision: 58.7514039 %
recall: 58.6425806 %

