In [1]:
# dataset loader
from sklearn import datasets

# model training and evalutation utilities 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold # this is one way to generate folds
from sklearn.model_selection import KFold

# models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

# toy data
X, y = datasets.load_iris(return_X_y=True)
X.shape, y.shape

((150, 4), (150,))

# What you should learn/be aware of based on this lecture

Key sklearn functions:

- train_test_split
- cross_validate
- Fold generators: KFold and StratifiedKFold
- Scoring functions per last lecture and how to pass to cross_validate
- How to compare different models by looping over them with cross_validate, GridSearchCV, or RandomizedSearchCV

Not covered today but you should check out:

- confusion_matrix and classification_report (helpful to evaluate models)

# A simple "split, train, evaluate" example

In [2]:
# split the data with 50% in each set
X1, X2, y1, y2 = train_test_split(X, y, random_state=0,
                                  train_size=0.5)

# fit the model on one set of data
# ignore the model I choose here, its not important what
model = KNeighborsClassifier(n_neighbors=1)
model.fit(X1, y1) # fit on the "training data" X1 and  y1

# evaluate the model on the second set of data
y2_model = model.predict(X2) # using X2 (out-of-sample data), predict y2
accuracy_score(y2, y2_model) # see how close y2 is to prediction (fraction of all pred that are exactly right)

0.9066666666666666

# Want to do k-fold? It's like repeating the above. In pseudo code, it looks like:¶

1. Break the X and y data into $k$ subsamples
2. For each subsample, fit the model, predict OOS, score predictions, and save those

Ok?

# K-Fold in Python: The explicit way, and the wrapped way

Watch me do the explicit way

In [9]:
# you can take quick notes here, but I'm not going to write this code slow enough to copy

accuracy= []

# the point here is to illustrate

from train_index, test_index in StratifiedKFold(n_splits=5).split(X,y):
    
    #.split() yields the indices in train/test sets, use those to get 
    # x/y vars for each separated out: 
    
    x_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    #fit/estimate, predict OOS, evaluate and store 
    model.fit (X_train, y_train)
    y_predict = model.predict(X_test)
    accuracy.append(accuracy_score(y_test, y_predict))
    
accuracy #print 
# import numpy as np 
#np.mean 

SyntaxError: invalid syntax (<ipython-input-9-eec6a95e9e92>, line 7)

In [12]:
# try the function heere 

cross_validate(model, X, y)



{'fit_time': array([0.        , 0.00099945, 0.00099921]),
 'score_time': array([0.00199294, 0.00252581, 0.00250363]),
 'test_score': array([0.98039216, 0.92156863, 1.        ])}

In [11]:
# try here with diff scores 
cross_validate (model, X,  y, scoring = ['accuracy', 'r2', 'precision_macro'])



{'fit_time': array([0., 0., 0.]),
 'score_time': array([0.00897384, 0.00797796, 0.00698233]),
 'test_accuracy': array([0.98039216, 0.92156863, 1.        ]),
 'test_r2': array([0.97058824, 0.88235294, 1.        ]),
 'test_precision_macro': array([0.98148148, 0.9251462 , 1.        ])}

Now try the wrapper below! We are going to see how to use that function to:

- try multiple models
- try different sets of X variables
- try different ways to specific folds

In [3]:
# try the function here

In [4]:
# try here with diff scores

All the metrics it can compute out of the box are here: https://scikit-learn.org/stable/modules/model_evaluation.html

Notice that many of these were discussed in our last lecture!

Warning/Note: the metric names on that link and what you put in the scoring dictionary don't seem to match up.

# Question

In [5]:
# using 5 folds, what is the average (across the folds) out-of-sampling (training) F1? 

In [13]:
cross_validate(model,X,y, scoring = 'f1_macro') ['test_score'].mean()



0.9672238255571589

# Exploring the cross_validate parameters 

# The Model Parameter

In [14]:
# change the model 

# yb changing the model parameter, you can adj the type of model and models parameters 

cross_validate (SVC (gamma='auto'),X,y,scoring='f1_macro')
cross_validate (SVC (C=5), X,y, scoring ='f1_macro')



{'fit_time': array([0.0029912 , 0.00099754, 0.0009973 ]),
 'score_time': array([0.00099826, 0.00099325, 0.00099754]),
 'test_score': array([0.98037518, 0.96064815, 1.        ])}

# Question 

try to use a regression model, (you can't use f1 on  this, so evaluate on r2)

In [19]:
# answer here 
cross_validate (LinearRegression(),X,y, scoring= 'r2') ['test_score'].mean()



0.0

linear_model submodule contains lots of useful alternate options 

In [None]:
# for example
linear_model.Lasso
linear_model.Ridge
linear_model.LogisticRegression

linear_model.LassoCV() # Returns a Lasso (L1 Regularization) linear model with picking the best model by cross validation
linear_model.RidgeCV() # Returns a Ridge (L2 Regularization) linear model with picking the best model by cross validation
linear_model.LogisticRegressionCV() # return best logit model by CV

looping over models 

In [20]:
# set up models to try
models = []
models.append(('svc_1', SVC(gamma='auto') ))
models.append(('svc_2', SVC(C=5) ))
models.append(('neighbor1',  KNeighborsClassifier(n_neighbors=1)))
models[0][1]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [21]:
# set up models to try
models = []
models.append(('svc_1', SVC(gamma='auto') ))
models.append(('svc_2', SVC(C=5, gamma='scale') ))
models.append(('neighbor',  KNeighborsClassifier(n_neighbors=1)))

# loop and print
for name, model in models:
    scores = cross_validate(model, X, y, scoring='accuracy')
    print('%s: %.3f (%.3f)' % (name.ljust(10), 
                                   scores['test_score'].mean(), 
                                   scores['test_score'].std()
                                   )
         )

svc_1     : 0.973 (0.009)
svc_2     : 0.980 (0.001)
neighbor  : 0.967 (0.033)




In [None]:
# grid search CV 
# randomsizedsearch CV

# The X factor

You can loop over X's

In [None]:
# define a smaller X and a bigger X
X_small = X[:,:2] # just first two columns

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, include_bias=False)
X3 = poly.fit_transform(X)

# set up Xs to try
right here!

# loop and print
right here!

# Xs and Models

# Cv Parameter and Folds

Just watch 

# Links, resoruces, and next week

Only two resources needed

- sklearn docs are GREAT https://scikit-learn.org/stable/user_guide.html
- Python Data Science Handbook (note some module calls are obsolete, so you might need to update code) - - https://jakevdp.github.io/PythonDataScienceHandbook/index.html

Next week:

- preprocessing
- data transformations
- feasture selection