In [32]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.datasets import load_iris, load_wine
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler

from utilities import plot_class_regions_for_classifier_subplot

In [134]:
# Load the bundled scikit-learn toy datasets (iris is the one used below)
iris, wine = load_iris(), load_wine()

In [135]:
# Feature matrix (X) and class labels (y) are taken from the iris dataset
X, y = iris['data'], iris['target']

In [136]:
# Hold out 30% of the samples as the test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=5,
)

In [137]:
# Standardize the features: the scaler is fitted on the training split only,
# then the same fitted transformation is applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [160]:
# Alternative model: logistic regression (uncomment to use it instead of the SVC below)
# model1 = LogisticRegression()
# param_grid ={'C': c_logspace,
#             'penalty': ['l1', 'l2']}

# SVM with a linear kernel (linear SVM), an RBF (radial basis function) kernel or a poly (polynomial) kernel.
# With SVC constructed as below (probability left at its default) predicted probabilities are not available.
# The poly kernel additionally takes a `degree` parameter.
# gamma controls how far the influence of a single training example reaches, which affects how tightly the
# decision boundaries end up surrounding points in the input space. A small gamma means a larger similarity
# radius, so points further apart are considered similar; this results in more points being grouped together
# and smoother decision boundaries. A large gamma results in more complex decision boundaries.

# Candidate values for C: 30 values evenly spaced on a log scale from 1e-2 to 1e2.
# Defined in this cell so the cell runs on a fresh kernel and does not rely on
# a later cell having been executed first.
c_logspace = np.logspace(-2, 2, 30)

this_gamma = c_logspace  # alternative gamma candidates (not used in the grid below)
that_gamma = [.0001, .001, .01, .1, 1, 10]
model2 = SVC(kernel = 'linear')
# The 'kernel' entry in the grid overrides the kernel passed to the constructor above.
param_grid ={'C': c_logspace,
            'kernel': ['linear', 'rbf', 'poly'],
             'gamma': that_gamma
            }

In [1]:
# np.logspace(start, stop, num) returns `num` values evenly spaced on a log scale,
# from 10**start to 10**stop (here: 30 values between 0.01 and 100)
c_logspace = np.logspace(start=-2, stop=2, num=30)
# Optional visual check of the spacing:
#plt.scatter(range(len(c_logspace)),c_logspace)
#c_linspace = (1, 200)
#plt.plot(c_linspace)

In [162]:
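# C controls the amount of regularization. Higher values of C = less regularization & a more complex model
# (lower values of C = stronger regularization & a simpler model)
# penalty selects the type of regularization (logistic regression grid): l1 - Lasso, l2 - Ridge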

In [163]:
# Exhaustive search over param_grid, scoring every candidate with 5-fold cross-validation
model_cv = GridSearchCV(
    estimator=model2,
    param_grid=param_grid,
    cv=5,
)

In [164]:
# Run the grid search on the scaled training data; the fitted search object is the cell output
model_cv.fit(X=X_train_scaled, y=y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([1.00000e-02, 1.37382e-02, 1.88739e-02, 2.59294e-02, 3.56225e-02,
       4.89390e-02, 6.72336e-02, 9.23671e-02, 1.26896e-01, 1.74333e-01,
       2.39503e-01, 3.29034e-01, 4.52035e-01, 6.21017e-01, 8.53168e-01,
       1.17210e+00, 1.61026e+00, 2.21222e+00, 3.03920e+00, 4.17532e...+01, 1.00000e+02]), 'kernel': ['linear', 'rbf', 'poly'], 'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [165]:
# Report the score of the refit best estimator on the training and held-out test splits.
# The label is derived from the fitted estimator so the message names the model that was
# actually tuned (the text previously always said "Logistic regression", even for the SVC).
model_name = type(model_cv.best_estimator_).__name__
print('Accuracy of {} classifier on training set: {:.2f}'
     .format(model_name, model_cv.score(X_train_scaled, y_train)))
print('Accuracy of {} classifier on test set: {:.2f}'
     .format(model_name, model_cv.score(X_test_scaled, y_test)))

Accuracy of Logistic regression classifier on training set: 0.98
Accuracy of Logistic regression classifier on test set: 0.96


In [166]:
# Show the winning parameter combination and the score stored for it by the grid search
print(
    "Tuned Model Parameter: {}".format(model_cv.best_params_),
    "Tuned Model Accuracy: {}".format(model_cv.best_score_),
    sep="\n",
)

Tuned Model Parameter: {'C': 0.32903445623126676, 'gamma': 0.0001, 'kernel': 'linear'}
Tuned Model Accuracy: 0.9809523809523809


In [167]:
# Predict class labels for the scaled test split with the tuned model
y_pred = model_cv.predict(X=X_test_scaled)

In [168]:
# Cross-tabulate the test labels against the predictions (first argument is the true labels)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[15,  0,  0],
       [ 0, 15,  1],
       [ 0,  1, 13]], dtype=int64)

In [169]:
# Per-class precision / recall / f1 on the test split, printed as plain text
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        15
          1       0.94      0.94      0.94        16
          2       0.93      0.93      0.93        14

avg / total       0.96      0.96      0.96        45



In [148]:
# No predicted probabilities here: SVC only provides predict_proba when constructed with probability=True
#y_pred_prob = model_cv.predict_proba(X_test_scaled)[:,1]

In [149]:
# No ROC curve here: roc_curve only handles binary targets, and this is a three-class problem