In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn

# from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
#from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import roc_auc_score, average_precision_score

import matplotlib
matplotlib.use('nbagg')
import matplotlib.pyplot as plt

import seaborn as sns



In [2]:
# Load the train / validation / test partitions, one CSV file per split.
# Every file is read with the same options: comma separator, first row as
# header, and low_memory=False so each file is parsed in a single pass.
train_set, val_set, test_set = [
    pd.read_csv(csv_path, sep=',', header=0, low_memory=False)
    for csv_path in (
        "data/treino_total.csv",
        "data/val_total.csv",
        "data/teste_total.csv",
    )
]

In [3]:
# Seed for the row shuffles below, so a Restart & Run All reproduces the
# same row order.
SHUFFLE_SEED = 42


def _prepare_split(split):
    """Return a copy of `split` with a single target column and category dtypes.

    Steps:
      1. Drop 'IND_BOM_1_2' and rename 'IND_BOM_1_1' to 'y', leaving a single
         target column.
         NOTE(review): presumably the two columns are complementary
         indicators of the same label, so dropping one loses no
         information -- confirm against the data dictionary.
      2. Cast every column whose median is exactly 0.0 or 1.0 to the pandas
         'category' dtype (heuristic for binary / dummy variables).

    `axis=1` is passed by keyword: the positional form `drop(label, 1)` is
    rejected by recent pandas releases.
    """
    prepared = split.drop('IND_BOM_1_2', axis=1)
    prepared = prepared.rename(columns={'IND_BOM_1_1': 'y'})

    for col in prepared.columns:
        # Series.quantile() with no argument is the 0.5 quantile (median).
        median = prepared[col].quantile()
        if median == 1.0 or median == 0.0:
            prepared[col] = prepared[col].astype('category')
    return prepared


# Each split is prepared from ITSELF. Deriving val_set from train_set here
# would silently replace the validation data with the training data.
train_set = _prepare_split(train_set)
val_set = _prepare_split(val_set)
test_set = _prepare_split(test_set)

# Shuffle the training and validation rows (the test split keeps its order).
train_set = train_set.sample(frac=1, random_state=SHUFFLE_SEED)
val_set = val_set.sample(frac=1, random_state=SHUFFLE_SEED)

In [4]:
# Separate each split into a feature frame (every column except 'y') and a
# one-column target frame (only 'y'), using boolean masks over the column
# index so both halves stay DataFrames.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = [
    (split.loc[:, split.columns != 'y'], split.loc[:, split.columns == 'y'])
    for split in (train_set, val_set, test_set)
]

In [5]:
from matplotlib.ticker import FormatStrFormatter

# Open a single-axes figure and print its y-axis tick labels with two
# decimal places.
fig, ax = plt.subplots(nrows=1, ncols=1)
ax.yaxis.set_major_formatter(FormatStrFormatter(fmt='%.2f'))

In [6]:
from sklearn import svm

Primeiro, usando Kernel SVM com kernel do tipo RBF.

In [7]:
# Try an SVM with the RBF kernel.
# Training: fit on the raw feature matrix; the one-column target frame is
# flattened to 1-D with ravel() before being passed as the labels.
clf = svm.SVC(kernel='rbf', C=100., gamma=0.001)
clf.fit(X=X_train.values, y=y_train.values.ravel())

SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [8]:
# Predict labels for the test features with the RBF model and report the
# fraction of correct predictions (formatted to 3 significant digits).
predited = clf.predict(X=X_test.values)
accuracy = accuracy_score(y_true=y_test.values, y_pred=predited)
print("Mean accuracy score: {:.3}".format(accuracy))

Mean accuracy score: 0.656


In [9]:
# Confusion matrix of the RBF model on the test split
# (rows: true label, columns: predicted label), drawn as an annotated heatmap.
cm = pd.DataFrame(confusion_matrix(y_test, predited))
fig = plt.figure(figsize=(10,7))
# fmt='d' prints each cell count as a plain integer. seaborn's default
# annotation format ('.2g') would render counts of three or more digits in
# scientific notation (e.g. 3324 as 3.3e+03).
heatmap = sns.heatmap(cm, annot=True, fmt='d')
plt.ylabel('True label')
plt.xlabel('Predicted label')
# Last expression: show the raw counts as a table under the figure.
cm

Unnamed: 0,0,1
0,8,3324
1,23,6374


Os kernels dos tipos linear e polinomial demoram mais de 12 horas para terminar a execução. Os códigos correspondentes estão a seguir.

In [None]:
# Linear kernel.
# NOTE: gamma is kept for parity with the other SVC cells; per the
# scikit-learn documentation it only parameterises the rbf / poly / sigmoid
# kernels, so it is not expected to influence a linear fit.
linear_clf = svm.SVC(kernel='linear', C=100., gamma=0.001)
linear_clf.fit(X=X_train.values, y=y_train.values.ravel())

In [None]:
# Predict labels for the test features with the linear-kernel model and
# report the fraction of correct predictions (3 significant digits).
linear_predited = linear_clf.predict(X=X_test.values)
linear_accuracy = accuracy_score(y_true=y_test.values, y_pred=linear_predited)
print("Mean accuracy score: {:.3}".format(linear_accuracy))

In [None]:
# Confusion matrix of the linear-kernel model on the test split
# (rows: true label, columns: predicted label), drawn as an annotated heatmap.
linear_cm = pd.DataFrame(confusion_matrix(y_test, linear_predited))
linear_fig = plt.figure(figsize=(10,7))
# fmt='d' prints each cell count as a plain integer. seaborn's default
# annotation format ('.2g') would render counts of three or more digits in
# scientific notation.
linear_heatmap = sns.heatmap(linear_cm, annot=True, fmt='d')
plt.ylabel('True label')
plt.xlabel('Predicted label')
# Last expression: show the raw counts as a table under the figure.
linear_cm

In [None]:
# Polynomial kernel (degree and coef0 are left at the SVC defaults).
poly_clf = svm.SVC(kernel='poly', C=100., gamma=0.001)
poly_clf.fit(X=X_train.values, y=y_train.values.ravel())

In [None]:
# Predict labels for the test features with the polynomial-kernel model and
# report the fraction of correct predictions (3 significant digits).
poly_predited = poly_clf.predict(X=X_test.values)
poly_accuracy = accuracy_score(y_true=y_test.values, y_pred=poly_predited)
print("Mean accuracy score: {:.3}".format(poly_accuracy))

In [None]:
# Confusion matrix of the polynomial-kernel model on the test split
# (rows: true label, columns: predicted label), drawn as an annotated heatmap.
poly_cm = pd.DataFrame(confusion_matrix(y_test, poly_predited))
poly_fig = plt.figure(figsize=(10,7))
# fmt='d' prints each cell count as a plain integer. seaborn's default
# annotation format ('.2g') would render counts of three or more digits in
# scientific notation.
poly_heatmap = sns.heatmap(poly_cm, annot=True, fmt='d')
plt.ylabel('True label')
plt.xlabel('Predicted label')
# Last expression: show the raw counts as a table under the figure.
poly_cm