# Support Vector Machine - SVM - Diabetes
###  Disponível em https://www.kaggle.com/uciml/pima-indians-diabetes-database

Attributes:

Pregnancies: Number of times pregnant - Gravidez

Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test - Glicose

BloodPressure: Diastolic blood pressure (mm Hg) - Pressão Arterial

SkinThickness: Triceps skin fold thickness (mm)  - Espessura do tríceps

Insulin: 2-Hour serum insulin (mu U/ml) - Insulina

BMI: Body mass index (weight in kg/(height in m)^2) - IMC

DiabetesPedigreeFunction: Diabetes pedigree function - Função que leva em conta doenças na família

Age: Age (years)

Outcome: Class variable (0 or 1) - 0 : Não tem Diabetes, 1: Possui Diabetes

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("diabetes.csv")
df.rename(columns={"Outcome": "Class"} , inplace=True)
df.head()

In [None]:
df.describe().T

## Preparação dos dados

In [None]:
# particionar os conjuntos de treino e teste
from sklearn.model_selection import train_test_split

df2 = df.copy()

diabetes_data = df2.loc[:,["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin",
                       "BMI", "DiabetesPedigreeFunction", "Age"]]
diabetes_target = df2["Class"]

In [None]:
diabetes_data[:3]

In [None]:
diabetes_target[:3]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    diabetes_data, diabetes_target, test_size=0.33, random_state=42)

X_train[:3]

In [None]:
print("# dados de treino = ", len(X_train))
print("# dados de teste = ", len(X_test))    

In [None]:
diabetes_data[:3]

In [None]:
diabetes = diabetes_data
diabetes["class"] = diabetes_target
diabetes.head()

In [None]:
# correlação
diabetes.corr()

In [None]:
diabetes.corr().loc["class"].sort_values()

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn import svm

In [None]:
classifier = svm.SVC(kernel='linear')

In [None]:
classifier.fit(X_train, y_train)

In [None]:
prediction_SVM = classifier.predict(X_test)

In [None]:
#kernel_svm.fit(data_train, targets_train)
#kernel_svm_score = kernel_svm.score(data_test, targets_test)

print("Accuracy on test set (SVM): {:.3f}".format(classifier.score(X_test, y_test)))

In [None]:
cm = confusion_matrix(y_test, prediction_SVM)
cm

## Comparação com árvore de decisão

In [None]:
from sklearn.tree import DecisionTreeClassifier 
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
#print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set (Arvore de Decisão): {:.3f}".format(tree.score(X_test, y_test)))

In [None]:
import sklearn.metrics as metrics

metrics.confusion_matrix(y_test, tree.predict(X_test))

# mudar os parametros do svm

O parâmetro C é um trade-off (escolha) entre a classificação incorreta de exemplos de treinamento e a simplicidade da superfície de decisão. Um C baixo torna a superfície de decisão suave, enquanto um C alto visa classificar todos os exemplos de treinamento corretamente, dando ao modelo liberdade para selecionar mais amostras como vetores de suporte.
O parâmetro gamma define qual é a influência de um único exemplo de treinamento. É um coeficiente de kernel para 'rbf', 'poly' e 'sigmoid'. Se gamma for definido como 'auto', então 1/n_features será usado. Valores baixos significam 'alta variância' e maior influência do vetor de suporte, e valores altos significam 'baixa variância', de modo que os vetores de suporte não possuem grande influência no processo de classificação. O parâmetro gamma pode ser visto como o inverso do raio de influência das amostras selecionadas pelo modelo como vetores de suporte.



In [None]:
def testar_kernels(kernels, C=10.0, gamma=0.001):
    """Fit one SVC per kernel and print its confusion matrix and test score.

    The data is read from the notebook-level variables ``X_train``,
    ``X_test``, ``y_train`` and ``y_test``, so each call evaluates whatever
    split those names hold at the time of the call.

    Parameters
    ----------
    kernels : iterable of str
        Kernel names passed to ``svm.SVC(kernel=...)``.
    C : float, default 10.0
        Value of ``C`` used for every kernel other than ``'linear'``.
    gamma : float, default 0.001
        Value of ``gamma`` used for every kernel other than ``'linear'``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    for kernel in kernels:
        # The linear kernel is deliberately fitted with the SVC defaults;
        # only the other kernels receive the C / gamma overrides.
        if kernel == 'linear':
            classifier = svm.SVC(kernel=kernel)
        else:
            classifier = svm.SVC(kernel=kernel, C=C, gamma=gamma)
        classifier.fit(X_train, y_train)
        prediction_SVM = classifier.predict(X_test)
        cm = confusion_matrix(y_test, prediction_SVM)
        print("kernel",kernel)
        print("confusion matrix:\n",cm)
        print("score:", classifier.score(X_test, y_test))
        print("\n")
# Kernels to compare. The full list of candidates considered was
# ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']; the polynomial kernel is
# left out because its processing time is high compared with the others.
kernels = ['linear', 'rbf', 'sigmoid']
testar_kernels(kernels)

## Normalizar os dados 
A normalização costuma aumentar a acurácia do modelo.

Os valores são transpostos para o intervalo 0-1.

In [None]:
#diabetes = diabetes_data
#diabetes["class"] = diabetes_target
#diabetes.head()
diabetes_data = diabetes_data.loc[:, ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin",
                       "BMI", "DiabetesPedigreeFunction", "Age"] ]
diabetes_data.head()

In [None]:
# normalize the data attributes
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(diabetes_data)
df_normalized = pd.DataFrame(np_scaled)
df_normalized.columns = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin",
                       "BMI", "DiabetesPedigreeFunction", "Age"]
df_normalized.head()

In [None]:
diabetes_target[:3]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    df_normalized, diabetes_target, test_size=0.33, random_state=42)
X_train[:3]

In [None]:
classifier = svm.SVC(kernel='linear', C=10.0, gamma=0.001)

In [None]:
classifier.fit(X_train, y_train)

In [None]:
prediction_SVM = classifier.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, prediction_SVM)
cm

In [None]:
classifier.score(X_test, y_test)

In [None]:
kernels = ['linear', 'rbf', 'sigmoid']
testar_kernels(kernels)

## SVM - Cancer

In [None]:
# Load the breast-cancer data set bundled with scikit-learn.
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()

# Class names, shown as a plain list.
[name for name in data.target_names]

'malignant': 0, 'benign':1

Breast Cancer Wisconsin (Diagnostic) Database
=============================================

Notes
-----
Data Set Characteristics:
    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, field
        13 is Radius SE, field 23 is Worst Radius.

        - class:
                - WDBC-Malignant (0)
                - WDBC-Benign (1)

In [None]:
# Feature frame labelled with the names shipped in the dataset object
# (`data.feature_names`, displayed in a later cell) rather than a hand-copied
# list that could drift from the data, plus the target as a "class" column.
df = pd.DataFrame(data.data, columns=data.feature_names)
df["class"] = data.target
df.head()

In [None]:
len(data.data)

In [None]:
data.feature_names

In [None]:
data.data

In [None]:
data.target_names

In [None]:
data.target[:20]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.33, random_state=42)

In [None]:
classifier = svm.SVC(kernel='linear', C=10.0, gamma=0.001)

In [None]:
classifier.fit(X_train, y_train)

In [None]:
prediction_SVM = classifier.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, prediction_SVM)
cm

In [None]:
classifier.score(X_test, y_test)

In [None]:
testar_kernels(kernels)

### normalizar

In [None]:
np_scaled = min_max_scaler.fit_transform(data.data)

In [None]:
data.data

In [None]:
np_scaled

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    np_scaled, data.target, test_size=0.33, random_state=42)

In [None]:
classifier.fit(X_train, y_train)

In [None]:
classifier.score(X_test, y_test)

In [None]:
testar_kernels(kernels)

# SVM - Carros

In [None]:
df = pd.read_csv("car.data")
df.columns = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "Class"]
df.head()

In [None]:
len(df)

In [None]:
df.Class.value_counts()

## Codificar os dados categóricos para inteiros

In [None]:
# codifica todo o dataframe para numérico
from sklearn.preprocessing import LabelEncoder
def codificar_dataframe(df):
    """Encode the categorical (object-dtype) columns of ``df`` as integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    df2 : pandas.DataFrame
        Frame with the same index and column order as ``df``. Object-dtype
        columns are replaced by their ``LabelEncoder`` codes; every other
        column is copied unchanged, so no column is dropped.
    dict_scalar : dict
        ``{column: {category: code}}``. Columns that were not encoded map to
        an empty dict.
    dict_to_string : dict
        ``{column: {code: category}}``, the inverse of ``dict_scalar``.
    """
    # Start from the original index so the encoded rows stay aligned with df.
    df2 = pd.DataFrame(index=df.index)
    dict_scalar = {}
    dict_to_string = {}

    for col in df.columns:
        if df[col].dtype == 'object':
            # One encoder per column; it yields both the codes for the frame
            # and (via classes_) the category <-> code dictionaries, so no
            # second encoding pass is needed.
            le = LabelEncoder()
            df2[col] = le.fit_transform(df[col].values)
            # classes_[i] is the category encoded as i.
            dict_scalar[col] = {categoria: codigo
                                for codigo, categoria in enumerate(le.classes_)}
            dict_to_string[col] = {codigo: categoria
                                   for codigo, categoria in enumerate(le.classes_)}
        else:
            # Not categorical: pass the column through untouched.
            df2[col] = df[col]
            dict_scalar[col] = {}
            dict_to_string[col] = {}

    return (df2, dict_scalar, dict_to_string)

In [None]:
df_cod, dict_nomes, dict_int = codificar_dataframe(df)
df_cod.head()

In [None]:
len(df_cod)

## Separar os dados X e Y

In [None]:
data_x = df_cod.loc[:,["buying", "maint", "doors", "persons", "lug_boot", "safety"]]
data_x.head()

In [None]:
target_y = df_cod.loc[:,"Class"]
len(target_y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data_x, target_y, test_size=0.33, random_state=42)

In [None]:
classifier = svm.SVC(kernel='linear', C=10.0, gamma=0.001)

In [None]:
classifier.fit(X_train, y_train)

In [None]:
prediction_SVM = classifier.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, prediction_SVM)
cm

In [None]:
classifier.score(X_test, y_test)

In [None]:
print("Vetores de suporte: ", len(classifier.support_vectors_))
classifier.support_vectors_

In [None]:
import matplotlib.pyplot as plt
clf = classifier
#plt.clf()
#plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1]) #, s=80, facecolors='none', zorder=10)
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=80,
                facecolors='none', zorder=10, edgecolors='k')
#plt.scatter(clf.support_, clf.support_)
#plt.scatter(X[:, 0], X[:, 1], c=Y, zorder=10, cmap=plt.cm.Paired)
plt.show()

In [None]:
clf.support_[:4]

In [None]:
plt.scatter(clf.support_, clf.support_) #, s=80, facecolors='none', zorder=10)
#plt.scatter(X[:, 0], X[:, 1], c=Y, zorder=10, cmap=plt.cm.Paired)
plt.show()

In [None]:
testar_kernels(kernels)

In [None]:
## Normalizar

In [None]:
np_scaled = min_max_scaler.fit_transform(data_x)

In [None]:
np_scaled[:3]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(np_scaled, target_y, test_size=0.33, random_state=42)

In [None]:
testar_kernels(kernels)

# Kernels - Plots

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm


# Features and target variable: 16 two-dimensional points, the first 8
# labelled 0 and the last 8 labelled 1.
X = np.c_[(.4, -.7),
          (-1.5, -1),
          (-1.4, -.9),
          (-1.3, -1.2),
          (-1.1, -.2),
          (-1.2, -.4),
          (-.5, 1.2),
          (-1.5, 2.1),
          (1, 1),
          # --
          (1.3, .8),
          (1.2, .5),
          (.2, -2),
          (.5, -2.4),
          (.2, -2.3),
          (0, -2.7),
          (1.3, 2.1)].T
Y = [0] * 8 + [1] * 8

fignum = 1

# Plot window, identical for every kernel (hoisted out of the loop).
x_min = -3
x_max = 3
y_min = -3
y_max = 3

# Fit one model per kernel and draw each in its own figure.
for kernel in ('linear', 'poly', 'rbf'):
    clf = svm.SVC(kernel = kernel, gamma = 2)
    clf.fit(X, Y)

    print("kernel: ", kernel)
    plt.figure(fignum, figsize=(4, 3))
    plt.clf()

    # Support vectors as hollow circles. An explicit edge colour is required:
    # with facecolors='none' and the default edge colour (which follows the
    # face colour) the markers would not be visible.
    plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=80,
                facecolors='none', zorder=10, edgecolors='k')
    # All data points, coloured by class.
    plt.scatter(X[:, 0], X[:, 1], c=Y, zorder=10, cmap=plt.cm.Paired)

    plt.axis('tight')

    # Evaluate the decision function on a 200 x 200 grid over the plot window.
    XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
    Z = clf.decision_function(np.c_[XX.ravel(), YY.ravel()])
    Z = Z.reshape(XX.shape)

    # Colour the two sides of the decision boundary and draw the contours
    # at -0.5, 0 and 0.5.
    plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
    plt.contour(XX, YY, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'], levels=[-.5, 0, .5])

    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)

    plt.xticks(())
    plt.yticks(())
    fignum = fignum + 1

plt.show()

## Hyperparameter Tuning

A escolha de C e de gamma é importante para a performance das SVMs.
A otimização (tuning) dos hiperparâmetros é uma boa prática para encontrar bons parâmetros.


In [None]:
# dataset diabetes
X = diabetes_data.values
Y = diabetes_target.values
X_train, X_test, y_train, y_test = train_test_split(
    diabetes_data, diabetes_target, test_size=0.33, random_state=42)

In [None]:
kernel = 'linear'
#classifier = svm.SVC(kernel=kernel, C = 10.0, gamma = 0.001)
classifier = svm.SVC(kernel=kernel)
classifier.fit(X_train, y_train)
prediction_SVM = classifier.predict(X_test)
cm = confusion_matrix(y_test, prediction_SVM)
#cm = confusion_matrix(y_test, prediction_SVM)
print("kernel",kernel)
print("confusion matrix:\n",cm)
print("score:", classifier.score(X_test, y_test))
print("\n")

In [None]:
kernels = ['linear', 'rbf', 'sigmoid']
kernel = 'rbf'
#classifier = svm.SVC(kernel=kernel, C = 10.0, gamma = 0.001)
classifier = svm.SVC(kernel=kernel,C = 10.0, gamma = 0.001)
classifier.fit(X_train, y_train)
prediction_SVM = classifier.predict(X_test)
cm = confusion_matrix(y_test, prediction_SVM)
#cm = confusion_matrix(y_test, prediction_SVM)
print("kernel",kernel)
print("confusion matrix:\n",cm)
print("score:", classifier.score(X_test, y_test))
print("\n")

In [None]:
import numpy as np

# Print arrays with one decimal place and without scientific notation.
np.set_printoptions(suppress=True, precision=1)

# Candidate values on a base-2 logarithmic grid:
# gamma = 2^-15, 2^-13, ..., 2^3 and C = 2^-5, 2^-3, ..., 2^13.
g_range = np.power(2.0, np.arange(-15, 5, 2))
C_range = np.power(2.0, np.arange(-5, 15, 2))
g_range

In [None]:
C_range

In [None]:
#from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
#import sklearn.cross_validation as cv
#  gamma and C (Cost)  hyperparametros
g_range = 2. ** np.arange(-15, 5, step=2)
C_range = 2. ** np.arange(-5, 15, step=2)

grid = [{'gamma': g_range, 'C': C_range}]

gridcv = GridSearchCV(svm.SVC(kernel='rbf'), param_grid=grid, cv= 5) #cv.KFold(n=X_train.shape[0], n_folds=5))
gridcv.fit(X_train, y_train)

bestGamma =gridcv.best_params_['gamma']
bestC = gridcv.best_params_['C']

print ("Os melhores paRâmetros: gamma=", bestGamma, " and Cost=", bestC)

In [None]:
# Refit an RBF SVM with the best hyper-parameters found by the grid search,
# instead of hard-coding the values printed by one particular run.
classifier = svm.SVC(kernel='rbf', C=bestC, gamma=bestGamma)
classifier.fit(X_train, y_train)
prediction_SVM = classifier.predict(X_test)
cm = confusion_matrix(y_test, prediction_SVM)
# Report the kernel of this very model rather than a leftover global variable.
print("kernel", classifier.kernel)
print("confusion matrix:\n",cm)
print("score:", classifier.score(X_test, y_test))
print("\n")

In [None]:
gridcv

In [None]:
gridcv.cv_results_

In [None]:
scores = gridcv.cv_results_['mean_test_score']

In [None]:
# plot the scores of the grid
# grid_scores_ contains parameter settings and scores
scores = np.array(scores).reshape(len(C_range), len(g_range))

# Make a heatmap with the performance
plt.figure(figsize=(10, 6))
plt.subplots_adjust(left=0.15, right=0.95, bottom=0.15, top=0.95)
plt.imshow(scores, interpolation='nearest', origin='higher', cmap=plt.cm.get_cmap('jet_r'))
plt.xlabel('gamma (log2)')
plt.ylabel('C - Cost (log2)')
plt.xticks(np.arange(len(g_range)), np.round(g_range,3) )
plt.yticks(np.arange(len(C_range)), np.log2((C_range)))

cbar = plt.colorbar()
cbar.set_label('Classification Accuracy', rotation=270, labelpad=20)

plt.show()