### `scikit-learn` imports

In [1]:
# modules
from sklearn import (datasets, 
                     metrics, 
                     model_selection as skms,
                     neighbors,
                     linear_model,
                     svm)
# other friends
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from IPython.display import display

# inline graphics
%matplotlib inline

In [2]:
import warnings
# Blanket filter: suppresses every warning category for the rest of the
# session, not only scikit-learn's. Keeps the outputs below uncluttered, but
# it also hides deprecation notices, so drop this line when debugging.
warnings.filterwarnings('ignore') # force sklearn to be quiet

In [3]:
#linear_model
#linear_model.LinearRegression

# basic procedure overview

In [4]:
# Load the bundled diabetes regression data and wrap it in a DataFrame:
# one column per feature, with the regression target appended as the last column.
diabetes = datasets.load_diabetes()
diabetes_df = (
    pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
      .assign(target=diabetes.target)
)
# preview the first five rows
diabetes_df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


In [5]:
# Hold out part of the data for testing. No test_size is given, so the
# library default applies (the shapes printed below show 111 test rows
# against 331 training rows), and no random_state is given, so the split
# changes on every run.
# For classification, train_test_split can keep class proportions equal in
# both parts via stratify=<array of class labels> (it takes the labels
# themselves, not True).
(diabetes_train_ftrs, diabetes_test_ftrs,
 diabetes_train_tgt , diabetes_test_tgt) = skms.train_test_split(diabetes.data, 
                                                                 diabetes.target)

In [6]:
# report (rows, columns) for the test split, then the training split
print(f"{diabetes_test_ftrs.shape}\n{diabetes_train_ftrs.shape}")

(111, 10)
(331, 10)


In [7]:
# build the estimator and train it on the training split in one step;
# fit() hands back the estimator itself, so the name is bound to the fitted model
linreg_model = linear_model.LinearRegression().fit(diabetes_train_ftrs,
                                                   diabetes_train_tgt)
linreg_model  # echo the fitted model as the cell output

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [8]:
# predict on the test data (rows the model did not see while fitting)
preds = linreg_model.predict(diabetes_test_ftrs)
print(preds[:10])  # peek at the first ten predictions only

[ 95.97626784  91.80412751 161.79908223  67.26257636 161.91974663
 179.90837875  71.66290905 156.27919369  50.5885396  186.87418706]


In [9]:
# compute root-mean-squared-error (RMSE) between predictions and reality:
# mean_squared_error returns the MSE; its square root is the RMSE
mse = metrics.mean_squared_error(diabetes_test_tgt, preds)
print(np.sqrt(mse)) # sqrt is back in units of the target

56.814342668655556


# advanced procedure overview

In [10]:
# data, model, fit&score
# cross_val_score fits and scores the model once per fold and returns the
# per-fold scores; with no cv/scoring arguments the library defaults apply
# (the recorded output has three scores, i.e. three folds in that run).
iris = datasets.load_iris()
model = linear_model.LogisticRegression()
skms.cross_val_score(model, iris.data, iris.target)

array([0.96078431, 0.92156863, 0.95833333])

In [11]:
# control fit & score
# sklearn metrics come in two flavors:
#   * losses (lower is better) 
#   * scores (bigger is better)
# sklearn scorers follow "bigger is better", so losses are negated
# (hence the "neg_" prefix), which can make for weird signs sometimes
# lots of possibilities:  see metrics.SCORERS.keys()
#   (metrics.get_scorer_names() in newer scikit-learn releases)
skms.cross_val_score(model, 
                     iris.data, iris.target, 
                     cv=5, 
                     scoring='neg_mean_absolute_error') 

array([-0.        , -0.03333333, -0.06666667, -0.1       , -0.        ])

In [12]:
# even more custom control of scoring
def neg_RMSE(mod, ftrs, tgt):
    """Custom scorer: negated root-mean-squared error of a fitted model.

    Uses the (estimator, features, targets) call signature, which is how it
    is passed as ``scoring=`` to cross_val_score in this notebook.

    mod  : fitted model exposing ``predict``
    ftrs : features to predict on
    tgt  : true target values for ``ftrs``

    Returns ``-sqrt(MSE)``; the sign is flipped so that bigger is better.
    """
    squared_err = metrics.mean_squared_error(tgt, mod.predict(ftrs))
    return -np.sqrt(squared_err)

# run the custom scorer under an explicit CV splitter; shuffle=True with no
# random_state means the fold assignment changes from run to run
skms.cross_val_score(model, 
                     iris.data, iris.target, 
                     cv=skms.KFold(5, shuffle=True),  # custom CV: 5 shuffled folds
                     scoring=neg_RMSE)

array([-0.        , -0.25819889, -0.        , -0.25819889, -0.25819889])

In [13]:
# very often, we don't -know- the best parameters/model
# so we see who does the best on the data
# in this case, let's find the best n_neighbors for k-nearest neighbors

# barebones
# model = neighbors.KNeighborsClassifier(10) # if we knew it
knn = neighbors.KNeighborsClassifier()
# try every n_neighbors from 1 through 10 (np.arange excludes the stop value)
mod = skms.GridSearchCV(knn, {"n_neighbors":np.arange(1,11)})
results = mod.fit(iris.data, iris.target)
results.best_params_  # the winning parameter setting

{'n_neighbors': 5}

In [14]:
knn = neighbors.KNeighborsClassifier()

# same search as before, but with an explicit 10-fold shuffled CV splitter
params = {"n_neighbors":np.arange(1,11)}
kfold = skms.KFold(n_splits=10, shuffle=True)
gscv = skms.GridSearchCV(knn, params, cv=kfold)

f = gscv.fit(iris.data, iris.target)
gscv
# refit with best parameters on the whole dataset 
# to use in "production" on brand new, different examples
# by default GridSearchCV does the refit for us (refit=True in the
# repr shown below), so this is ready to use
# best_knn = f.best_estimator_
# and now we profit!  or save the world ...

GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=True),
             error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [15]:
# what were the good parameters?
# how sensitive are the results to a parameter?
# pure python: (mean score, n_neighbors) pairs, best score first
print(sorted(zip(f.cv_results_['mean_test_score'], 
                 f.cv_results_['param_n_neighbors']), reverse=True))

# pandas style kung-fu: same ranking as a table
df = pd.DataFrame(f.cv_results_)
show=['mean_test_score','params']
df.sort_values(by=['mean_test_score'], ascending=False)[show]

[(0.9733333333333334, 10), (0.9733333333333334, 9), (0.9733333333333334, 7), (0.9666666666666667, 8), (0.9666666666666667, 5), (0.96, 6), (0.96, 4), (0.96, 3), (0.96, 1), (0.9466666666666667, 2)]


Unnamed: 0,mean_test_score,params
6,0.973333,{'n_neighbors': 7}
8,0.973333,{'n_neighbors': 9}
9,0.973333,{'n_neighbors': 10}
4,0.966667,{'n_neighbors': 5}
7,0.966667,{'n_neighbors': 8}
0,0.96,{'n_neighbors': 1}
2,0.96,{'n_neighbors': 3}
3,0.96,{'n_neighbors': 4}
5,0.96,{'n_neighbors': 6}
1,0.946667,{'n_neighbors': 2}
