In [1]:
import pandas as pd
import numpy as np
from numpy import meshgrid

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns

dfs = pd.read_excel("Acme.xlsx")

dfs_copy = dfs.copy()

X = dfs_copy[['currently_insured', 'number_of_vehicles', 'number_of_drivers', 'marital_status']]
y = dfs_copy['rank']

# we take out impression_id since it is just the index + 1.


dfs_copy.loc[dfs_copy["rank"]==1,"impression_id"].count()

# thus we have 2531 rank 5 entries
# 2427 rank 4 entries
# 2470 rank 3 entries
# 1440 rank 2 entries
# and 1132 rank 1 entries.

1132

In [2]:
cols = ['currently_insured','marital_status','number_of_vehicles','number_of_drivers']

X[cols] = X[cols].astype(str)

#dummies = pd.get_dummies(X[cols])
#ydummies = pd.get_dummies(y)

# NOTE: here, for multiclass and multi-label classification, we DON'T one-hot encode anything.
X_prime = X[cols]

# BUT we do have to make string labels into number labels:
X_prime.loc[X_prime.currently_insured == "Y",'currently_insured'] = 1
X_prime.loc[X_prime.currently_insured == "N",'currently_insured'] = 0
X_prime.loc[X_prime.marital_status == "M",'marital_status'] = 1
X_prime.loc[X_prime.marital_status == "S",'marital_status'] = 0

#X_prime = pd.concat([X[[c for c in X.columns if c not in cols]],dummies],axis=1, sort=False)
#X_prime = X_prime.astype(int)


# This is for predicting on one rank value at a time.
    #rank = 1
    #y_prime = ydummies.loc[:,rank]

    
# This is for doing multiple-class rank prediction with ALL ranks 1,2,3,4,5
y_prime = y.astype(str)


# Here we combine ranks 2,3 into rank 2 and ranks 4,5 into rank 3: 
y_prime2 = y.astype(str)

y_prime2.mask(y_prime2 == "5", "3", inplace=True)
y_prime2.mask(y_prime2 == "4", "3", inplace=True)
y_prime2.mask(y_prime2 == "3", "2", inplace=True)
y_prime2.mask(y_prime2 == "2", "2", inplace=True)
y_prime2.mask(y_prime2 == "1", "1", inplace=True)

y_prime2.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


0    1
1    2
2    2
3    2
4    2
Name: rank, dtype: object

In [3]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_prime,y_prime2,test_size = .25,random_state = 614,shuffle = True,stratify = y)

X_train.head()

Unnamed: 0,currently_insured,marital_status,number_of_vehicles,number_of_drivers
8184,1,1,1,2
4333,0,1,2,1
9294,0,1,1,1
9810,1,1,1,1
8519,1,1,1,2


In [4]:
# This takes 15-20 minutes to run, be careful!

from sklearn import metrics
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

# Here we try a neural network of 5 hidden layers, 65 nodes per layer.
mlp = MLPClassifier(hidden_layer_sizes=(65,5),max_iter=1000)

svc = SVC(kernel = 'linear', probability = True)                   # linear kernel seems to work best for SVC.
rfc = RandomForestClassifier(max_depth = 10,n_estimators = 500)
lda = LinearDiscriminantAnalysis()
knn = KNeighborsClassifier(n_neighbors = 5)
bag_knn_clf = BaggingClassifier(KNeighborsClassifier(20),
                            n_estimators = 1000,
                            max_samples = 1000,
                            bootstrap = True)

# Here we use a bagging neural network (only 10 estimators), as this was said to improve neural nets during class.
bag_mlp_clf = BaggingClassifier(MLPClassifier(hidden_layer_sizes=(65,5),max_iter=1000),
                            n_estimators = 10,
                            max_samples = 1000,
                            bootstrap = True)


paste_knn_clf = BaggingClassifier(KNeighborsClassifier(20),
                            n_estimators = 1000,
                            max_samples = 1000,
                            bootstrap = False)

# Here we try AdaBoost with RFC. Maybe some other weak learner algorithm will work better.
ada_clf = AdaBoostClassifier(RandomForestClassifier(max_depth = 10,n_estimators = 500),
                n_estimators = 50,
                algorithm="SAMME.R",
                learning_rate = 1)

voting_clf = VotingClassifier(
                [('lda',lda),
                ('rfc',rfc),
                ('svc',svc),
                ('knn',knn),
                ('mlp',mlp),
                ('ada',ada_clf)],
                voting = "soft")


cv = StratifiedKFold(5, shuffle = True, random_state=614)

# Here 10 is the number of classifiers we're using
finalacc = np.empty(10)
finalprec = np.empty(10)
finalrec = np.empty(10)

# Here we run over all classifiers and then do cross-validation.
k = 0
for name,clf in (["LDA",lda],["RFC",rfc],["SVC",svc],["KNN",knn],["MLP",mlp],["AdaBoost_clf",ada_clf],["Voting_clf",voting_clf],["Bagging_KNN_clf",bag_knn_clf],["Bagging_MLP_clf",bag_mlp_clf],["Pasting_KNN_clf",paste_knn_clf]):
    
    a = np.empty(5)          # 5 for the number of cv-splits.
    p = np.empty(5)
    r = np.empty(5)
    
    j = 0
    for train_idx, test_idx in cv.split(X_train,y_train):

        X_train2 = X_train.iloc[train_idx]
        y_train2 = y_train.iloc[train_idx]
        X_test2 = X_train.iloc[test_idx]
        y_test2 = y_train.iloc[test_idx]
        
        clone_clf = clone(clf)
        clone_clf.fit(X_train2,y_train2.ravel())
        
        y_predict = clone_clf.predict(X_test2)
        
        #y_predict = 1*(y_prob >= cutoff/100)       # Just make y_predict the max class...otherwise we end up
                                                    # with some indices being all rank 1,2,3,4, and 5.
        # We basically can't use proba_predict because we have multiple classes of y.    

        a[j] = 100*metrics.accuracy_score(y_test2, y_predict)
        p[j] = 100*metrics.precision_score(y_test2, y_predict, zero_division = 1,average='macro')
        r[j] = 100*metrics.recall_score(y_test2, y_predict, zero_division = 1,average='macro')
        
        # Using macro as the averaging is the same as taking np.mean of the 5 labels' accuracy, precision, and recall.
        
        j = j + 1
        
    # The mean over cross-validations of accuracy, precision, and recall
    finalacc[k] = np.mean(a)
    finalprec[k] = np.mean(p)
    finalrec[k] = np.mean(r)
    
    k = k + 1


In [5]:

print("These results are with ranks 2,3 combined into rank 2 and ranks 4,5 combined into rank 3.")

k = 0
for name in ["LDA","RFC","SVC","KNN","MLP","AdaBoost_clf","Voting_clf","Bagging_KNN_clf","Bagging_MLP_clf","Pasting_KNN_clf"]:
    print(name)
    print("accuracy:",np.round(finalacc[k],5),"%")
    print("precision:",np.round(finalprec[k],5),"%")
    print("recall:",np.round(finalrec[k],5),"%")
    print()
    k = k + 1
    


These results are with ranks 2,3 combined into rank 2 and ranks 4,5 combined into rank 3.
LDA
accuracy: 88.68 %
precision: 94.34 %
recall: 50.0 %

RFC
accuracy: 88.30667 %
precision: 79.59895 %
recall: 60.57694 %

SVC
accuracy: 88.68 %
precision: 94.34 %
recall: 50.0 %

KNN
accuracy: 88.4 %
precision: 79.26956 %
recall: 58.69731 %

MLP
accuracy: 88.30667 %
precision: 79.59895 %
recall: 60.57694 %

AdaBoost_clf
accuracy: 88.30667 %
precision: 79.59895 %
recall: 60.57694 %

Voting_clf
accuracy: 88.4 %
precision: 79.26956 %
recall: 58.69731 %

Bagging_KNN_clf
accuracy: 88.30667 %
precision: 79.59895 %
recall: 60.57694 %

Bagging_MLP_clf
accuracy: 88.61333 %
precision: 89.23484 %
recall: 51.92546 %

Pasting_KNN_clf
accuracy: 88.30667 %
precision: 79.59895 %
recall: 60.57694 %

