## Data

In [None]:
from sklearn.datasets import load_wine
import pandas as pd

In [None]:
# Load the wine dataset as a (feature matrix, class labels) pair.
data, target = load_wine(return_X_y=True)

In [None]:
# Wrap the feature matrix in a DataFrame whose columns carry the dataset's
# feature names, so it can be inspected with pandas.
data_frame = pd.DataFrame(data, columns=load_wine()["feature_names"])

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every feature column.
data_frame.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


In [None]:
from collections import Counter
# Number of samples per class label, to see how balanced the classes are.
Counter(target)

Counter({0: 59, 1: 71, 2: 48})

## Hyperparameter tuning

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

### Decision Tree

In [None]:
# Exhaustive grid search over the decision tree's split criterion, split
# strategy and minimum leaf / split sizes. No `cv` argument is passed, so
# GridSearchCV falls back to its default cross-validation scheme.
parameters = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 3, 4]
}
# Seed the estimator: tree construction is randomised (most visibly with
# splitter='random'), so an unseeded search can report a different "best"
# configuration on every run.
grid_search = GridSearchCV(param_grid=parameters,
                           estimator=DecisionTreeClassifier(random_state=0))
grid_search.fit(data, target)
best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

In [None]:
# Show the winning hyperparameters and their mean cross-validated score.
best_parameters, best_score

({'criterion': 'gini',
  'min_samples_leaf': 2,
  'min_samples_split': 2,
  'splitter': 'random'},
 0.9331746031746032)

### Random Forest

In [None]:
# Exhaustive grid search over forest size, split criterion and the minimum
# number of samples required to split a node.
parameters = {
    'n_estimators': [10, 25, 40, 50],
    'criterion': ['entropy', 'gini'],
    'min_samples_split': [2, 3, 4, 5]
}
# Seed the forest: bootstrapping and feature sub-sampling are random, so an
# unseeded search can report a different "best" configuration on every run.
grid_search = GridSearchCV(param_grid=parameters,
                           estimator=RandomForestClassifier(random_state=0))
grid_search.fit(data, target)
best_parameters = grid_search.best_params_
best_score = grid_search.best_score_
# Display the winning hyperparameters and their mean cross-validated score.
best_parameters, best_score

({'criterion': 'gini', 'min_samples_split': 2, 'n_estimators': 50},
 0.9888888888888889)

### KNN

In [None]:
# Grid for k-nearest neighbours: neighbourhood size, vote weighting and the
# Minkowski power parameter `p`.
parameters = dict(
    n_neighbors=[3, 5, 9],
    weights=['uniform', 'distance'],
    p=[1, 2],
)
# `fit` returns the fitted search object, so construction and fitting chain.
grid_search = GridSearchCV(KNeighborsClassifier(), parameters).fit(data, target)
best_parameters, best_score = grid_search.best_params_, grid_search.best_score_
# Display the winning hyperparameters and their mean cross-validated score.
best_parameters, best_score

({'n_neighbors': 3, 'p': 1, 'weights': 'distance'}, 0.7814285714285714)

### SVM

In [None]:
# Grid for the support vector classifier. `degree` is only read by the
# polynomial kernel, so it is searched for 'poly' alone; crossing it with the
# other kernels would refit identical models once per degree value.
# As a consequence, `best_params_` contains a 'degree' entry only when the
# polynomial kernel wins.
parameters = [
    {'kernel': ['linear', 'rbf', 'sigmoid'], 'C': [1, 1.5, 2]},
    {'kernel': ['poly'], 'C': [1, 1.5, 2], 'degree': [2, 3, 4, 5]},
]
grid_search = GridSearchCV(param_grid=parameters,
                           estimator=SVC())
grid_search.fit(data, target)
best_parameters = grid_search.best_params_
best_score = grid_search.best_score_
# Display the winning hyperparameters and their mean cross-validated score.
best_parameters, best_score

({'C': 1.5, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'},
 0.9666666666666668)

### Neural Network

In [None]:
# Grid for the multi-layer perceptron: width of the single hidden layer,
# activation function and optimiser.
parameters = {
    'hidden_layer_sizes': [300, 400],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam', 'sgd', 'lbfgs'],
}
# Seed the network: weight initialisation (and batch shuffling for the
# stochastic solvers) is random, so an unseeded search can report a different
# "best" configuration on every run.
grid_search = GridSearchCV(param_grid=parameters,
                           estimator=MLPClassifier(random_state=0))
grid_search.fit(data, target)
best_parameters = grid_search.best_params_
best_score = grid_search.best_score_
# Previously recorded result:
#   {'activation': 'logistic', 'hidden_layer_sizes': 200, 'solver': 'adam'}
# NOTE(review): hidden_layer_sizes=200 is not part of the grid above, so this
# record presumably comes from a run with a different grid -- confirm before
# reusing it.
best_parameters, best_score

## Cross Validation


In [None]:
from sklearn.model_selection import cross_val_score, KFold

In [None]:
# One independent, initially empty score list per model; the cross-validation
# loops below append one mean score per repetition.
tree_results, rf_results, knn_results, svm_results, mlp_results = (
    [] for _ in range(5)
)

In [None]:
# 30 repetitions of shuffled 10-fold cross-validation for the decision tree;
# each repetition contributes its mean fold score.
# The list is rebuilt here so that re-running this cell does not append a
# second batch of 30 scores to the previous one.
tree_results = []
for i in range(30):
  kfold = KFold(n_splits=10, shuffle=True, random_state=i)
  # Seed the tree together with the folds: with splitter='random' an unseeded
  # tree yields different scores on every run even for identical folds.
  tree = DecisionTreeClassifier(criterion='gini', min_samples_leaf=2,
                                min_samples_split=2, splitter='random',
                                random_state=i)
  score = cross_val_score(tree, data, target, cv=kfold)
  tree_results.append(score.mean())

In [None]:
# 30 repetitions of shuffled 10-fold cross-validation for the random forest;
# each repetition contributes its mean fold score.
# The list is rebuilt here so that re-running this cell does not append a
# second batch of 30 scores to the previous one.
rf_results = []
for i in range(30):
  kfold = KFold(n_splits=10, shuffle=True, random_state=i)
  # Seed the forest together with the folds so that each repetition is
  # reproducible (bootstrapping and feature sub-sampling are random).
  rf = RandomForestClassifier(criterion='gini', min_samples_split=2,
                              n_estimators=50, random_state=i)
  score = cross_val_score(rf, data, target, cv=kfold)
  rf_results.append(score.mean())

In [None]:
# 30 repetitions of shuffled 10-fold cross-validation for k-nearest
# neighbours; each repetition contributes its mean fold score.
# The list is rebuilt here so that re-running this cell does not append a
# second batch of 30 scores to the previous one.
knn_results = []
for i in range(30):
  kfold = KFold(n_splits=10, shuffle=True, random_state=i)
  knn = KNeighborsClassifier(n_neighbors=3, p=1, weights='distance')
  score = cross_val_score(knn, data, target, cv=kfold)
  knn_results.append(score.mean())

In [None]:
# 30 repetitions of shuffled 10-fold cross-validation for the linear SVM;
# each repetition contributes its mean fold score.
# The list is rebuilt here so that re-running this cell does not append a
# second batch of 30 scores to the previous one.
svm_results = []
for i in range(30):
  kfold = KFold(n_splits=10, shuffle=True, random_state=i)
  svm = SVC(C=1.5, gamma='scale', kernel='linear')
  score = cross_val_score(svm, data, target, cv=kfold)
  svm_results.append(score.mean())

In [None]:
# 30 repetitions of shuffled 10-fold cross-validation for the multi-layer
# perceptron; each repetition contributes its mean fold score.
# The list is rebuilt here so that re-running this cell does not append a
# second batch of 30 scores to the previous one.
mlp_results = []
for i in range(30):
  kfold = KFold(n_splits=10, shuffle=True, random_state=i)
  # Seed the network together with the folds so that each repetition is
  # reproducible (weight initialisation and batch shuffling are random).
  mlp = MLPClassifier(activation='logistic', hidden_layer_sizes=200,
                      solver='adam', random_state=i)
  score = cross_val_score(mlp, data, target, cv=kfold)
  mlp_results.append(score.mean())