# Задание к занятию «Проверка точности модели, переобучение, регуляризация»

Определите модель с наилучшим показателем метрики accuracy, используя различные модели и варьируя их параметры на датасете load_digits.

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pylab
from sklearn.datasets import load_digits

In [56]:
# Silence all Python warnings (the original note refers to Anaconda warnings)
import warnings
warnings.simplefilter('ignore')

# Render plots inline in the Jupyter notebook; per the cell output this also
# populates the interactive namespace from numpy and matplotlib
%pylab inline
# SVG plots look sharper
%config InlineBackend.figure_format = 'svg' 
# Retina alternative (disabled)
# %config InlineBackend.figure_format = 'retina' 

# Increase the default figure size
from pylab import rcParams
rcParams['figure.figsize'] = 14,9

Populating the interactive namespace from numpy and matplotlib


## Загрузка данных

In [57]:
# Load the digits dataset via sklearn.datasets.load_digits
data = load_digits()

In [58]:
# Wrap the raw arrays in pandas containers: features as a DataFrame,
# target labels as a Series.
X = pd.DataFrame(data.data)
y = pd.Series(data.target)

## Выбор модели

In [230]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

def tune_models(models_params, X, y, n_splits=3, random_state=0):
    """Tune several estimators on one shared train/test split and report each.

    The data is split once into a training and a hold-out part; every
    estimator is then grid-searched on the training part with the same
    stratified, shuffled K-fold splitter and reported via ``tune_model``.

    Args:
        models_params: Iterable of ``(estimator, param_grid)`` pairs. Each
            estimator is modified in place: its ``random_state`` parameter is
            set when the estimator has one.
        X: Feature matrix.
        y: Target labels.
        n_splits: Number of folds for the cross-validation splitter.
        random_state: Seed used for the hold-out split, the fold shuffling
            and every estimator that accepts ``random_state``.
    """
    # Seed the hold-out split as well; otherwise the reported test scores
    # change from run to run even though a seed was supplied.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state
    )
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for model, params in models_params:
        try:
            model.set_params(random_state=random_state)
        except ValueError:
            # set_params rejects unknown parameter names with ValueError:
            # this estimator simply has no random_state (e.g. k-NN).
            pass
        tune_model(model, params, X_train, X_test, y_train, y_test, cv)

def tune_model(model, params, X_train, X_test, y_train, y_test, cv):
    """Grid-search a single estimator and print a short accuracy report.

    Args:
        model: Estimator to tune.
        params: Parameter grid handed to ``GridSearchCV``.
        X_train: Training features used to fit the search.
        X_test: Hold-out features used for the final score.
        y_train: Training labels.
        y_test: Hold-out labels.
        cv: Cross-validation splitter passed to ``GridSearchCV``.

    Prints a separator, the estimator class name, the best cross-validation
    score, the score on the hold-out data and the best parameter set.
    Returns nothing.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring='accuracy',
        cv=cv,
    )
    search.fit(X_train, y_train)

    separator = "-" * 20
    print("\n" + separator)
    print(model.__class__.__name__)
    print("Cross val score:", search.best_score_)
    holdout_accuracy = search.score(X_test, y_test)
    print("Test score:", holdout_accuracy)
    print("Params:", search.best_params_)

In [231]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Candidate estimators, each paired with the hyper-parameter grid to search.
tune_models([
    # The solver is pinned because the grid contains the 'l1' penalty: the
    # default solver of scikit-learn >= 0.22 (lbfgs) does not support l1, so
    # those candidates would fail, hidden by the global warnings filter.
    # NOTE(review): recent scikit-learn releases reportedly warn about
    # liblinear on multiclass targets - confirm against the installed version.
    (LogisticRegression(tol=0.01, class_weight='balanced', solver='liblinear'), {
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1, 1],
    }),
    (DecisionTreeClassifier(), {
        'max_depth': range(3, 7),
        'min_samples_split': range(2, 5),
        'min_samples_leaf': range(1, 5),
    }),
    (RandomForestClassifier(), {
        'n_estimators': range(30, 120, 10),
    }),
    (KNeighborsClassifier(), {
        'n_neighbors': range(3, 20),
    }),
    # Accuracy scoring only needs predict(), so probability estimates (which
    # add an internal cross-validation to every fit) are left disabled.
    # gamma has no effect on the linear kernel, hence the separate sub-grids:
    # this avoids fitting identical linear models once per gamma value.
    (SVC(), [
        {
            'kernel': ['rbf'],
            'gamma': [1e-3, 1e-4],
            'C': [1, 10, 100, 1000],
        },
        {
            'kernel': ['linear'],
            'C': [1, 10, 100, 1000],
        },
    ]),
], X, y, n_splits=10, random_state=123456)


--------------------
LogisticRegression
Cross val score: 0.957683741648
Test score: 0.98
Params: {'C': 0.1, 'penalty': 'l1'}

--------------------
DecisionTreeClassifier
Cross val score: 0.7713437268
Test score: 0.751111111111
Params: {'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 2}

--------------------
RandomForestClassifier
Cross val score: 0.974016332591
Test score: 0.982222222222
Params: {'n_estimators': 110}

--------------------
KNeighborsClassifier
Cross val score: 0.989606533036
Test score: 0.986666666667
Params: {'n_neighbors': 3}

--------------------
SVC
Cross val score: 0.993318485523
Test score: 0.991111111111
Params: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}


## Предыдущие результаты

```
--------------------
LogisticRegression
Cross val score: 0.968819599109
Test score: 0.964444444444
Params: {'C': 0.1, 'penalty': 'l1'}

--------------------
DecisionTreeClassifier
Cross val score: 0.766146993318
Test score: 0.76
Params: {'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 3}

--------------------
RandomForestClassifier
Cross val score: 0.968819599109
Test score: 0.984444444444
Params: {'n_estimators': 70}

--------------------
KNeighborsClassifier
Cross val score: 0.984409799555
Test score: 0.986666666667
Params: {'n_neighbors': 4}

--------------------
SVC
Cross val score: 0.988121752042
Test score: 0.988888888889
Params: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
```