In [1]:
from sklearn.linear_model import SGDClassifier, RidgeClassifier, LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
import matplotlib.pyplot as plt
from scipy.special import expit
from scipy.io import arff
import numpy as np
import pandas as pd
import scipy

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold, cross_val_score, LeaveOneOut



## Veltec - Ranking de Motoristas

In [1]:
df = pd.read_csv("./../../veltec_dados/vigencias_scores.csv",sep=',')
df

NameError: name 'pd' is not defined

#### Encode coluna de ranking

In [3]:
le = preprocessing.LabelEncoder()
df["rank"] = le.fit_transform(df["rank"])
df.head()

Unnamed: 0,id_vei,id_uo_vei,id_motorista,id_uo_motorista,distancia_percorrida_km,qtd_banguela,qtd_curvas,qtd_aceleracoes,qtd_frenagens,qtd_vel_faixa_1,...,qtd_vel_via_faixa_3,tempo_vel_via_faixa_1,tempo_vel_via_faixa_2,tempo_vel_via_faixa_3,qtd_manuseio_celular,qtd_fadiga_motorista,qtd_distracao_motorista,qtd_uso_cigarro,score_geral,rank
0,66486,2855,636779.0,2950.0,234.42,0.0,0.0,0.0,0.0,0.0,...,1.0,208.0,0.0,1456.0,0.0,0.0,0.0,0.0,50.0,4
1,8155,3321,636786.0,2950.0,114.083,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,78.962685,3
2,8577,3327,636592.0,2950.0,183.725,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,1
3,7665,3581,636644.0,2950.0,702.525,0.0,0.0,0.0,3.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,97.437814,1
4,8427,2868,636989.0,2950.0,175.29,0.0,0.0,1.0,3.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,86.308403,0


#### Divide treinamento e teste

In [4]:
X = df.drop(['rank'],axis=1)
y = df['rank']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

### Pipeline para testar classificadores

In [6]:
# pipelines

plr = Pipeline( [ ('scl', StandardScaler()), ('clf', LogisticRegression()) ] )
pknn = Pipeline( [ ('scl', StandardScaler()), ('clf', KNeighborsClassifier(n_neighbors=3)) ] )
pdt = Pipeline( [ ('scl', StandardScaler()), ('clf', DecisionTreeClassifier()) ] )
pnb = Pipeline( [ ('scl', StandardScaler()), ('clf', GaussianNB()) ] )
pr = Pipeline( [ ('scl', StandardScaler()), ('clf', RidgeClassifier()) ] )

pipelines = [plr, pknn, pdt, pnb, pr]

pipe_dict = {0:'Logistic Regression',1:'KNN',2:'Decision Tree',3:'Naive Bayes', 4:'Ridge'}

for pipe in pipelines:
    pipe.fit(X_train, y_train)
    
for idx, val in enumerate(pipelines):
    print('%s pipeline test accuracy: %.3f' % (pipe_dict[idx], val.score(X_test, y_test)))
    
# para cada modelo treinado obtem val score
best_acc = 0.0
best_clf = 0
best_pipe = ''
for idx, val in enumerate(pipelines):
    # Descobre o melhor val.score e armazen em best_clf
    if val.score(X_test, y_test) > best_acc:
        best_acc = val.score(X_test, y_test)
        best_pipe = val
        best_clf = idx
print('Classifier with best accuracy: %s' % pipe_dict[best_clf])



Logistic Regression pipeline test accuracy: 0.903
KNN pipeline test accuracy: 0.928
Decision Tree pipeline test accuracy: 1.000
Naive Bayes pipeline test accuracy: 0.754
Ridge pipeline test accuracy: 0.813
Classifier with best accuracy: Decision Tree


### Identifica melhor classificador 

In [7]:
# Decision Tree (Melhor)

model = pipelines[best_clf]
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

print(cm)
print(classification_report(y_test, y_pred)) 

[[ 303    0    0    0    0]
 [   0 2297    0    0    0]
 [   0    0  184    0    0]
 [   0    0    0  286    0]
 [   0    0    0    0  723]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       303
           1       1.00      1.00      1.00      2297
           2       1.00      1.00      1.00       184
           3       1.00      1.00      1.00       286
           4       1.00      1.00      1.00       723

    accuracy                           1.00      3793
   macro avg       1.00      1.00      1.00      3793
weighted avg       1.00      1.00      1.00      3793



In [8]:
# KNN (Segundo melhor)

model = pipelines[1]
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

print(cm)
print(classification_report(y_test, y_pred)) 

[[ 220   71    0   11    1]
 [  15 2280    0    1    1]
 [   3    0  123    7   51]
 [  64    4    1  207   10]
 [   2    0   12   20  689]]
              precision    recall  f1-score   support

           0       0.72      0.73      0.72       303
           1       0.97      0.99      0.98      2297
           2       0.90      0.67      0.77       184
           3       0.84      0.72      0.78       286
           4       0.92      0.95      0.93       723

    accuracy                           0.93      3793
   macro avg       0.87      0.81      0.84      3793
weighted avg       0.93      0.93      0.93      3793



### Validação cruzada com o melhor classificador

Valor de acurácia baixa 0.5%

In [18]:
# Stratified K-folds

skfold = StratifiedKFold(n_splits=10, random_state=100)
model_skfold = DecisionTreeClassifier()
results_skfold = cross_val_score(model_skfold, X_train, y_train, cv=skfold)

print("Stratified K-folds")
print("scores: ", results_skfold) 
print("Accuracy: %.2f%%" % (results_skfold.mean()*100.0))

# Leave One out Cross-Validator

loocv = LeaveOneOut()
model_loocv = DecisionTreeClassifier()
results_loocv = cross_val_score(model_loocv, X_train, y_train, cv=loocv)

print("\nLeave One out")
print("scores: ", results_loocv) 
print("Accuracy: %.2f%%" % (results_loocv.mean()*100.0))

Stratified K-folds
scores:  [0.9988726  1.         0.99774266 1.         1.         1.
 1.         1.         0.9988675  1.        ]
Accuracy: 99.95%

Leave One out
scores:  [1. 1. 1. ... 1. 1. 1.]
Accuracy: 99.98%


### Busca por hiperparametros e features para tentar melhorar os resultados do classificador Ridge

In [10]:
# Set the parameters by cross-validation
tuned_parameters = [{'alpha': [1, 10, 50, 100, 300, 500, 1000]},]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score,"\n")
    
    clf = GridSearchCV(
        RidgeClassifier(), tuned_parameters, scoring='%s_macro' % score
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:\n")
    print(clf.best_params_,"\n")
    print("Grid scores on development set:\n")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("classification report:\n")
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision 

Best parameters set found on development set:

{'alpha': 500} 

Grid scores on development set:

0.623 (+/-0.023) for {'alpha': 1}
0.623 (+/-0.023) for {'alpha': 10}
0.628 (+/-0.034) for {'alpha': 50}
0.632 (+/-0.034) for {'alpha': 100}
0.632 (+/-0.036) for {'alpha': 300}
0.634 (+/-0.034) for {'alpha': 500}
0.634 (+/-0.041) for {'alpha': 1000}

classification report:

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       303
           1       0.82      1.00      0.90      2297
           2       0.70      0.23      0.35       184
           3       1.00      0.10      0.18       286
           4       0.78      0.99      0.87       723

    accuracy                           0.81      3793
   macro avg       0.66      0.46      0.46      3793
weighted avg       0.76      0.81      0.74      3793


# Tuning hyper-parameters for recall 



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best parameters set found on development set:

{'alpha': 100} 

Grid scores on development set:

0.456 (+/-0.025) for {'alpha': 1}
0.456 (+/-0.025) for {'alpha': 10}
0.457 (+/-0.025) for {'alpha': 50}
0.457 (+/-0.025) for {'alpha': 100}
0.455 (+/-0.024) for {'alpha': 300}
0.456 (+/-0.024) for {'alpha': 500}
0.456 (+/-0.024) for {'alpha': 1000}

classification report:

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       303
           1       0.82      1.00      0.90      2297
           2       0.70      0.24      0.36       184
           3       1.00      0.10      0.18       286
           4       0.78      0.99      0.87       723

    accuracy                           0.81      3793
   macro avg       0.66      0.47      0.46      3793
weighted avg       0.76      0.81      0.74      3793




  'precision', 'predicted', average, warn_for)


In [11]:
# Ridge com optimum alpha value

model = RidgeClassifier(alpha=500).fit(X_train, y_train)
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

print(cm)
print(classification_report(y_test, y_pred)) 

[[   0  303    0    0    0]
 [   0 2297    0    0    0]
 [   0    0   43    0  141]
 [   0  191    9   28   58]
 [   0    0    9    0  714]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       303
           1       0.82      1.00      0.90      2297
           2       0.70      0.23      0.35       184
           3       1.00      0.10      0.18       286
           4       0.78      0.99      0.87       723

    accuracy                           0.81      3793
   macro avg       0.66      0.46      0.46      3793
weighted avg       0.76      0.81      0.74      3793



  'precision', 'predicted', average, warn_for)


### Combinando Pipeline e Grid Search

In [21]:
pipetree = Pipeline([('scl', StandardScaler()), ('clf', DecisionTreeClassifier())])
pipe = [pipetree]

param_range = [1, 2, 3, 4, 5]

# grid search params
#grid_params = [{'clf__criterion': ['gini', 'entropy'],
#               'clf__presort': [True, False]}]
grid_params = [{'clf__criterion': ['gini', 'entropy'],
    'clf__min_samples_leaf': param_range,
    'clf__max_depth': param_range,
    'clf__min_samples_split': param_range[1:],
    'clf__presort': [True, False]}]

# Construct grid search
gs = GridSearchCV(estimator=pipetree,
    param_grid=grid_params,
    scoring='accuracy')

# Fit using grid search
gs.fit(X_train, y_train)

# Best accuracy
print('Best accuracy: %.3f' % gs.best_score_)

# Best params
print('\nBest params:\n', gs.best_params_)



Best accuracy: 1.000

Best params:
 {'clf__criterion': 'gini', 'clf__max_depth': 3, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'clf__presort': True}
