Generic imports

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

In [None]:
# Load the semicolon-delimited sample extract and preview its first rows.
data = pd.read_csv('./data/sample.csv', delimiter=';')
data.head()

In [None]:
# Parameters
# history_depth: number of preceding acts used as features (columns X0..X{n-1})
# for each act to predict (column Y) when the act sequences are windowed.
history_depth = 3

In [None]:
# Build one training example per position of each row's act sequence.
# The sequence is left-padded with `history_depth` '0' tokens (no history yet)
# and right-padded with a single '0' (end-of-sequence target). Every window of
# `history_depth + 1` consecutive tokens becomes: history tokens + target token.
new = []

keep = ['BENEFICIAIRE_AGE', 'BENEFICIAIRE_SEXE']
window = history_depth + 1  # history tokens followed by the target token
for _, row in data.iterrows():

    # Static beneficiary features repeated on every example of this row.
    X = [row[c] for c in keep]

    # Pad once instead of prepending inside a loop.
    seq = ['0'] * history_depth + row['LISTE_ACTES_EXT'].split('|') + ['0']

    for x in range(len(seq) - history_depth):
        new.append(X + seq[x:x + window])

In [None]:
# Column layout of each example: static features, history slots X0..X{n-1}, target Y.
history_cols = ['X{}'.format(i) for i in range(history_depth)]
cols = [*keep, *history_cols, 'Y']
df = pd.DataFrame(new, columns=cols)

In [None]:
# Display the assembled table of windowed examples.
df

In [None]:
# Mark the discrete features as pandas categoricals. The history column names
# are derived from history_depth so this cell keeps working when the depth changes.
categorical_cols = ['BENEFICIAIRE_SEXE'] + ['X%i' % i for i in range(history_depth)]
for col in categorical_cols:
    df[col] = df[col].astype('category')
# The target column Y is left unchanged.

FIRST LOOK AT THE DATA

In [None]:
# Preview the first rows of the example table.
df.head()

In [None]:
# Print the column dtypes and non-null counts of the example table.
df.info()

In [None]:
# Summary statistics of the example table (pandas describes numeric columns by default).
df.describe()

In [None]:
# Histograms of the numeric columns; the trailing ';' hides the repr of the returned axes.
df.hist();

In [None]:
# Number of examples per BENEFICIAIRE_SEXE value. Each row of df is one window,
# so rows with longer act sequences contribute more examples to the counts.
sns.catplot(x='BENEFICIAIRE_SEXE',kind="count",data=df)

In [None]:
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    Each selected column is encoded independently with a fresh
    ``sklearn.preprocessing.LabelEncoder``. Nothing is learned in ``fit``:
    the encoders are refitted on whatever frame is passed to ``transform``,
    so the integer codes produced for two different frames are not
    guaranteed to match.

    Parameters
    ----------
    columns : sequence of column labels, optional
        Columns to encode. When ``None`` (default) every column is encoded.
    """

    def __init__(self, columns=None):
        self.columns = columns  # array of column names to encode

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (kept for a fit/transform-style API)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame; it is not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` where each column listed in ``self.columns`` (or
            every column when ``self.columns`` is None) holds integer codes.
        """
        # Imported locally so the class works even when this cell runs before
        # the notebook cell that imports LabelEncoder at module level.
        from sklearn.preprocessing import LabelEncoder

        output = X.copy()
        # Iterate over column labels directly: DataFrame.iteritems() was
        # removed in pandas 2.0.
        if self.columns is not None:
            targets = self.columns
        else:
            targets = list(output.columns)
        for col in targets:
            output[col] = LabelEncoder().fit_transform(output[col])
        return output

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y).transform(X)``."""
        return self.fit(X, y).transform(X)

In [None]:

# Label-encode the sex, every history column and the target. The history column
# names follow history_depth instead of being hard-coded for a depth of 3.
encoded_cols = ['BENEFICIAIRE_SEXE'] + ['X%i' % i for i in range(history_depth)] + ['Y']
df_e = MultiColumnLabelEncoder(columns=encoded_cols).transform(df)


In [None]:
# Inspect the label-encoded copy of df.
df_e

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the explanatory variables.
Y = df['Y']
X = df.drop(columns='Y')

# Hold out 20% of the examples for testing; the seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, Y, test_size=0.2, random_state=37
)

SCALE EXPLANATORY VARIABLES

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the label encoder on the training split only, then encode the sex column
# of a copy so that xtrain itself stays untouched.
encoder = LabelEncoder().fit(xtrain['BENEFICIAIRE_SEXE'])

xtrain_s = xtrain.copy()
xtrain_s['BENEFICIAIRE_SEXE'] = encoder.transform(xtrain['BENEFICIAIRE_SEXE'])


In [None]:
# The scaler cell below is commented out, so the "_s" frames are plain copies of
# the splits. xtest_s is created here too: later cells score and predict on it,
# and no other cell defines it.
xtrain_s = xtrain.copy()
xtest_s = xtest.copy()

# Print the name of the first categorical column of df, if there is one.
# (DataFrame.iteritems() was removed in pandas 2.0, so read the column index.)
for name in df.select_dtypes(include='category').columns[:1]:
    print(name)


In [None]:
# Check the dtypes of the training features that will be fed to the models.
xtrain_s.info()

In [None]:
# from sklearn.preprocessing import StandardScaler

# # Scale x data
# scaler = StandardScaler()
# scaler.fit(xtrain)

# xtrain_s = scaler.transform(xtrain)
# xtest_s = scaler.transform(xtest)

TRY DIFFERENT CLASSIFIERS FOR FIRST EVALUATION

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

# Candidate classifiers with default hyper-parameters; seeds are fixed where
# the estimator accepts one so that the comparison is reproducible.
classifs = [KNeighborsClassifier(),
           DecisionTreeClassifier(random_state=37),
           SVC(),
           RandomForestClassifier(random_state=37),
           AdaBoostClassifier(random_state=37),
           GaussianNB(),
           ]

# Display names, tab-padded so the printed scores line up.
classif_names=['KNN\t\t', 
               'DecisionTree\t', 
               'SVC\t\t',
               'RandomForest\t',
               'AdaBoost\t',
               'Naive Bayes\t',
               ]

# Apply different classifiers.
# Every model is fitted on xtrain and both scored and used for prediction on
# xtest, so that training and evaluation share the same feature representation.
bestscore = 0
bestclf = ''
for name, clf in zip(classif_names, classifs):
    classif = clf.fit(xtrain, ytrain)

    score1 = classif.score(xtest, ytest) * 100
    print("%s Score : %.2f %%" % (name, score1))
    if score1 > bestscore:
        # Keep the predictions of the best model seen so far.
        bestscore = score1
        ypred = classif.predict(xtest)
        bestclf = name

In [None]:
# Report which classifier reached the highest test accuracy in the comparison loop.
print('Best score using: %s' % bestclf)

CROSS VALIDATION EVALUATION

In [None]:
from sklearn.model_selection import cross_val_score

# Baseline random forest, evaluated with 5-fold cross-validated accuracy on the
# training split.
classif = RandomForestClassifier(random_state=37)
model = classif.fit(xtrain_s, ytrain)

accu = cross_val_score(estimator=model, X=xtrain_s, y=ytrain, cv=5, scoring='accuracy')

# Per-fold scores followed by their mean and spread.
for label, value in (
    ("model scores:", accu),
    ("avg:", accu.mean()),
    ("std dev:", accu.std()),
):
    print(label, value)

TRY MODEL IMPROVEMENT BY TUNING PARAMETERS USING GRID SEARCH

In [None]:
from sklearn.model_selection import GridSearchCV

# Integer max_features candidates are capped at the number of feature columns
# and de-duplicated: a value above the column count is either rejected or
# redundant, depending on the scikit-learn version.
n_features = xtrain_s.shape[1]
max_features_options = sorted({min(k, n_features) for k in (3, 5, 7)}) + ['sqrt']

param_grid = [
    {
        'bootstrap': [True, False],
        'n_estimators': [10, 50, 100],
        # 'sqrt' replaces 'auto': for RandomForestClassifier the two were
        # equivalent, and 'auto' was removed in scikit-learn 1.3.
        'max_features': max_features_options,
        'max_depth': [2, 5, 10, None],
    }
]

# Exhaustive search over the grid with 3-fold cross-validation.
classif = RandomForestClassifier(random_state=37)
grid_search = GridSearchCV(classif, param_grid, cv=3, return_train_score=True)
grid_search.fit(xtrain_s, ytrain);

In [None]:
# Hyper-parameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Mean cross-validated score obtained by the selected combination.
grid_search.best_score_

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix

# Out-of-fold predictions of the tuned model on the training split (3 folds),
# summarised as a confusion matrix.
best_model = grid_search.best_estimator_
ytrain_pred = cross_val_predict(best_model, xtrain_s, ytrain, cv=3)
confmatr = confusion_matrix(y_true=ytrain, y_pred=ytrain_pred)

In [None]:
# Print the training confusion matrix, then the per-class precision/recall report.
for report in (
    confusion_matrix(ytrain, ytrain_pred),
    classification_report(ytrain, ytrain_pred),
):
    print(report)

In [None]:
# Grey-scale view of the confusion matrix computed on the training split.
plt.matshow(confmatr,cmap=plt.cm.gray);

In [None]:
# Turn counts into per-true-class rates, then blank the diagonal so that only
# the misclassifications remain visible in the plot.
row_sums = np.sum(confmatr, axis=1, keepdims=True)
norm_conf_matr = np.true_divide(confmatr, row_sums)
np.fill_diagonal(norm_conf_matr, 0)
plt.matshow(norm_conf_matr, cmap=plt.cm.gray);

EVALUATE MODEL ON TEST DATASET

In [None]:
# Evaluate the tuned model on the held-out test split.
best_model = grid_search.best_estimator_
ypred = best_model.predict(xtest_s)

# Accuracy expressed as a percentage.
score = 100 * best_model.score(xtest_s, ytest)

print("Model score: %.2f%% on test dataset (%d obs)" % (score, len(ytest)))


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of the test predictions, kept both as an array and as a DataFrame.
confmatr = confusion_matrix(y_true=ytest, y_pred=ypred)
confm = pd.DataFrame(data=confmatr)

# Print the test confusion matrix, then the per-class precision/recall report.
for report in (
    confusion_matrix(ytest, ypred),
    classification_report(ytest, ypred),
):
    print(report)

In [None]:
# Grey-scale view of the confusion matrix computed on the test split.
plt.matshow(confmatr,cmap=plt.cm.gray);

In [None]:
# Turn counts into per-true-class rates. A class that is predicted but never
# present in ytest has an all-zero row; dividing it would produce NaN, so such
# rows are left at 0 instead.
row_sums = confmatr.sum(axis=1, keepdims=True)
norm_conf_matr = np.divide(
    confmatr,
    row_sums,
    out=np.zeros(confmatr.shape, dtype=float),
    where=row_sums != 0,
)
# Blank the diagonal so that only the misclassifications remain visible.
np.fill_diagonal(norm_conf_matr, 0)
plt.matshow(norm_conf_matr, cmap=plt.cm.gray);