**Hyper-Parameter Tuning**
---

It's a method used to find the optimal parameters for a machine learning algorithm.
It's important to search what parameters the algorithm offers and I can tune.

Best results:


*   Decision Tree: 0.95%
*   Random Forest: 0.96%
*   KNN: 0.93%
*   Logistic Regression: 0.95%
*   SVM: 0.95%
*   MLP: 0.98% with scaled data


In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import pandas as pd

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler

In [3]:
data, target = load_breast_cancer(return_X_y=True)

In [4]:
scaler = StandardScaler()
data = scaler.fit_transform(data)

Decision Tree
---


In [None]:
parameters = {'criterion': ['gini', 'entropy'],
              'splitter': ['best', 'random'],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 5, 10]}

In [None]:
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=parameters)
grid_search.fit(data, target)
best_parameters = grid_search.best_params_
best_result = grid_search.best_score_

In [None]:
best_parameters, best_result

({'criterion': 'entropy',
  'min_samples_leaf': 1,
  'min_samples_split': 5,
  'splitter': 'random'},
 0.9507374631268437)

Random Forest
---

In [None]:
parameters = {'criterion': ['gini', 'entropy'],
              'n_estimators': [10, 15, 20],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 5, 10]}
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=parameters)
grid_search.fit(data, target)
best_parameters = grid_search.best_params_
best_result = grid_search.best_score_

In [None]:
best_parameters, best_result

({'criterion': 'entropy',
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 10},
 0.9648501785437045)

KNN
---

In [None]:
parameters = {'n_neighbors': [7, 9, 11],
              'p': [1, 2]}
grid_search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=parameters)
grid_search.fit(data, target)
best_parameters = grid_search.best_params_
best_result = grid_search.best_score_

In [None]:
best_parameters, best_result

({'n_neighbors': 9, 'p': 1}, 0.9385188635305077)

Logistic Regression
---

In [None]:
parameters = {'solver': ['liblinear', 'lbfgs', 'sag'],
              'tol': [0.001, 0.0001, 0.00001],
              'C': [1.0, 1.5, 2.0]}
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=parameters)
grid_search.fit(data, target)
best_parameters = grid_search.best_params_
best_result = grid_search.best_score_

In [None]:
best_parameters, best_result

({'C': 1.5, 'solver': 'liblinear', 'tol': 0.0001}, 0.9543238627542309)

Support Vector Machine
---

In [27]:
parameters = {'C': [1.5, 2.0],
              'kernel': ['rbf', 'linear', 'poly'],
              'tol': [0.001, 0.0001]}
grid_search = GridSearchCV(estimator=SVC(), param_grid=parameters)
grid_search.fit(data, target)
best_parameters = grid_search.best_params_
best_result = grid_search.best_score_

In [28]:
best_parameters, best_result

({'C': 1.5, 'kernel': 'rbf', 'tol': 0.001}, 0.9771619313771154)

MLP
---

In [None]:
parameters = {'activation': ['relu', 'logistic', 'tanh'],
              'solver': ['adam', 'sgd'],
              'batch_size': [10, 56]}
grid_search = GridSearchCV(estimator=MLPClassifier(), param_grid=parameters)
grid_search.fit(data, target)
best_parameters = grid_search.best_params_
best_result = grid_search.best_score_

In [None]:
best_parameters, best_result

({'activation': 'logistic', 'batch_size': 10, 'solver': 'adam'},
 0.9243595714951095)

Cross Validation
---


In [5]:
from sklearn.model_selection import cross_val_score, KFold

In [20]:
tree_results = []
randomf_results = []
mlp_results = []
svm_results = []

In [21]:
for i in range(10):  # 300 tests with 10 splits and 30 random states is better
  kfold = KFold(n_splits=10, shuffle=True, random_state=i)
  tree = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=1, 
                                min_samples_split=5, splitter='random')
  score = cross_val_score(tree, data, target, cv=kfold)
  tree_results.append(score.mean())

In [22]:
for i in range(10):
  kfold = KFold(n_splits=10, shuffle=True, random_state=i)
  randomf = RandomForestClassifier(criterion='entropy', min_samples_leaf=1, 
                                   min_samples_split=2, n_estimators=10)
  score = cross_val_score(randomf, data, target, cv=kfold)
  randomf_results.append(score.mean())

In [29]:
for i in range(10):
  kfold = KFold(n_splits=10, shuffle=True, random_state=i)
  svm_cls = SVC(C=1.5, kernel='rbf', tol=0.001)
  score = cross_val_score(svm_cls, data, target, cv=kfold)
  svm_results.append(score.mean())

In [None]:
for i in range(10):
  kfold = KFold(n_splits=10, shuffle=True, random_state=i)
  mlp_cls = MLPClassifier(activation='logistic', batch_size=10, solver='adam')
  score = cross_val_score(mlp_cls, data, target, cv=kfold)
  mlp_results.append(score.mean())

In [30]:
svm_results # each result is the mean of 10 tests with different combinations of slices

[0.9789473684210526,
 0.9736215538847116,
 0.9753759398496239,
 0.9753759398496239,
 0.9771303258145363,
 0.9735902255639097,
 0.9806704260651629,
 0.9718671679197994,
 0.9718984962406015,
 0.975407268170426]

Results Analysis
---

In [31]:
results = pd.DataFrame({'Decision tree': tree_results,
                        'SVM': svm_results,
                        'Random forest': randomf_results,
                        'Neural Network': mlp_results})
results # this structure could be used to analyse the precision and recall too

Unnamed: 0,Decision tree,SVM,Random forest,Neural Network
0,0.947306,0.978947,0.957832,0.973684
1,0.94552,0.973622,0.956078,0.971867
2,0.943797,0.975376,0.954323,0.970144
3,0.92265,0.975376,0.950815,0.970144
4,0.922682,0.97713,0.957832,0.968327
5,0.927851,0.97359,0.955984,0.97005
6,0.935025,0.98067,0.954355,0.973653
7,0.940226,0.971867,0.957832,0.970113
8,0.927882,0.971898,0.970081,0.966573
9,0.924499,0.975407,0.957895,0.971867


In [32]:
results.describe() # best std = MLP and SVM
# the best algorithm here is SVM due to its score and std

Unnamed: 0,Decision tree,SVM,Random forest,Neural Network
count,10.0,10.0,10.0,10.0
mean,0.933744,0.975388,0.957303,0.970642
std,0.009817,0.002878,0.005027,0.00222
min,0.92265,0.971867,0.950815,0.966573
25%,0.925337,0.973598,0.954762,0.970066
50%,0.931454,0.975376,0.956955,0.970144
75%,0.942904,0.9767,0.957832,0.971867
max,0.947306,0.98067,0.970081,0.973684


In [33]:
results.std() / results.mean() * 100 # coefficient of variation

Decision tree     1.051333
SVM               0.295040
Random forest     0.525070
Neural Network    0.228744
dtype: float64