In [7]:
import pandas as pd
import numpy as np
from numpy import meshgrid

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns

dfs = pd.read_excel("Acme.xlsx")

dfs_copy = dfs.copy()

X = dfs_copy[['currently_insured', 'number_of_vehicles', 'number_of_drivers', 'marital_status']]
y = dfs_copy['rank']

# we take out impression_id since it is just the index + 1.


dfs_copy.head(20)

Unnamed: 0,impression_id,click,cost,currently_insured,number_of_vehicles,number_of_drivers,marital_status,rank,policies sold
0,1,1,10,N,1,1,M,1,1
1,2,0,10,N,2,1,M,4,0
2,3,0,10,N,1,1,S,2,0
3,4,0,10,Y,2,1,M,5,0
4,5,0,10,Y,2,2,S,4,0
5,6,0,10,N,1,2,M,3,0
6,7,0,10,N,1,2,M,3,0
7,8,0,10,N,1,1,M,4,0
8,9,1,10,N,2,1,M,3,1
9,10,0,10,Y,1,1,M,2,0


In [8]:
cols = ['currently_insured','marital_status','number_of_vehicles','number_of_drivers']

X[cols] = X[cols].astype(str)

#dummies = pd.get_dummies(X[cols])
#ydummies = pd.get_dummies(y)

# NOTE: here, for multiclass and multi-label classification, we DON'T one-hot encode anything.
X_prime = X[cols]

# BUT we do have to make string labels into number labels:
X_prime.loc[X_prime.currently_insured == "Y",'currently_insured'] = 1
X_prime.loc[X_prime.currently_insured == "N",'currently_insured'] = 0
X_prime.loc[X_prime.marital_status == "M",'marital_status'] = 1
X_prime.loc[X_prime.marital_status == "S",'marital_status'] = 0

#X_prime = pd.concat([X[[c for c in X.columns if c not in cols]],dummies],axis=1, sort=False)
#X_prime = X_prime.astype(int)


# This is for predicting on one rank value at a time.
    #rank = 1
    #y_prime = ydummies.loc[:,rank]

# This is for doing multiple-class rank prediction
y_prime = y.astype(str)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_prime,y_prime,test_size = .25,random_state = 614,shuffle = True,stratify = y)

X_train.head()

Unnamed: 0,currently_insured,marital_status,number_of_vehicles,number_of_drivers
8184,1,1,1,2
4333,0,1,2,1
9294,0,1,1,1
9810,1,1,1,1
8519,1,1,1,2


In [12]:
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

svc = SVC(kernel = 'linear', probability = True)
rfc = RandomForestClassifier(max_depth = 10,n_estimators = 500)
lda = LinearDiscriminantAnalysis()
knn = KNeighborsClassifier(n_neighbors = 5)
bag_clf = BaggingClassifier(KNeighborsClassifier(20),
                            n_estimators = 1000,
                            max_samples = 1000,
                            bootstrap = True)

paste_clf = BaggingClassifier(KNeighborsClassifier(20),
                            n_estimators = 1000,
                            max_samples = 1000,
                            bootstrap = False)

voting_clf = VotingClassifier(
                [('lda',lda),
                ('rfc',rfc),
                ('svc',svc),
                ('knn',knn),
                ('ada',ada_clf)],
                voting = "soft")

# Here we try AdaBoost with RFC. Maybe some other weak learner algorithm will work better.
ada_clf = AdaBoostClassifier(RandomForestClassifier(max_depth = 10,n_estimators = 500),
                n_estimators = 50,
                algorithm="SAMME.R",
                learning_rate = 1)


cv = StratifiedKFold(5, shuffle = True, random_state=614)

# Here 8 is the number of classifiers we're using
finalacc = np.empty(8)
finalprec = np.empty(8)
finalrec = np.empty(8)

# Here we run over all classifiers and then do cross-validation.
k = 0
for name,clf in (["LDA",lda],["RFC",rfc],["SVC",svc],["KNN",knn],["AdaBoost_clf",ada_clf],["Voting_clf",voting_clf],["Bagging_clf",bag_clf],["Pasting_clf",paste_clf]):
    
    a = np.empty(5)          # 5 for the number of cv-splits.
    p = np.empty(5)
    r = np.empty(5)
    
    j = 0
    for train_idx, test_idx in cv.split(X_train,y_train):

        X_train2 = X_train.iloc[train_idx]
        y_train2 = y_train.iloc[train_idx]
        X_test2 = X_train.iloc[test_idx]
        y_test2 = y_train.iloc[test_idx]
        
        clone_clf = clone(clf)
        clone_clf.fit(X_train2,y_train2.ravel())
        
        y_predict = clone_clf.predict(X_test2)
        
        #y_predict = 1*(y_prob >= cutoff/100)       # Just make y_predict the max class...otherwise we end up
                                                    # with some indices being all rank 1,2,3,4, and 5.
        # We basically can't use proba_predict because we have multiple classes of y.    

        a[j] = 100*metrics.accuracy_score(y_test2, y_predict)
        p[j] = 100*metrics.precision_score(y_test2, y_predict, zero_division = 1,average='macro')
        r[j] = 100*metrics.recall_score(y_test2, y_predict, zero_division = 1,average='macro')
        
        # Using macro as the averaging is the same as taking np.mean of the 5 labels' accuracy, precision, and recall.
        
        j = j + 1
        
    # The mean over cross-validations of accuracy, precision, and recall
    finalacc[k] = np.mean(a)
    finalprec[k] = np.mean(p)
    finalrec[k] = np.mean(r)
    
    k = k + 1


In [13]:
k = 0
for name in ["LDA","RFC","SVC","KNN","AdaBoost_clf","Voting_clf","Bagging_clf","Pasting_clf"]:
    print(name)
    print("accuracy:",finalacc[k],"%")
    print("precision:",finalprec[k],"%")
    print("recall:",finalrec[k],"%")
    print()
    k = k + 1
    
# Again, bad results here. Best seems to be Pasting Ensemble method, still not very good.

# IDEA: we might need dimensionality reduction, all these algorithms seem to hit the same wall
# at ~40% accuracy/precision/recall. Either this is all we can squeeze out of the data or there is
# some high dimensionality stopping the algorithms.

LDA
accuracy: 40.38666666666667 %
precision: 35.79011434513628 %
recall: 41.45235885977653 %

RFC
accuracy: 40.986666666666665 %
precision: 41.85634689126059 %
recall: 38.78856202366688 %

SVC
accuracy: 40.13333333333333 %
precision: 35.253248762855925 %
recall: 41.28143799074245 %

KNN
accuracy: 33.0 %
precision: 32.53510182710726 %
recall: 34.92932635887548 %

AdaBoost_clf
accuracy: 41.026666666666664 %
precision: 39.00102535913935 %
recall: 39.04486093090918 %

Voting_clf
accuracy: 40.52 %
precision: 36.650424000373825 %
recall: 41.69235479566771 %

Bagging_clf
accuracy: 41.160000000000004 %
precision: 40.11964690464431 %
recall: 39.457448827916096 %

Pasting_clf
accuracy: 41.053333333333335 %
precision: 41.76626198878854 %
recall: 39.16249783296511 %

