# sklearn

scikit-learn: Machine Learning in Python
* Simple and efficient tools for data mining and data analysis
* Accessible to everybody, and reusable in various contexts
* Built on NumPy, SciPy, and matplotlib
* Open source, commercially usable - BSD license

# Outline

* Datasets
* Train a logistic regression estimator
* Predict
* Evaluate a model
* Split datasets
* Cross-validation and hyper-parameter tuning
* Save and load models

# Datasets

1. Packaged sets
2. Hosted on the web
3. Auto-generated

In [1]:
from sklearn import datasets

## Packaged

In [None]:
datasets.load_

## Hosted

After you fetch one of these datasets, it is cached locally so later calls do not download it again.

In [None]:
datasets.fetch_

The cache location can be found by running:

In [2]:
from sklearn.datasets import get_data_home
get_data_home()

'/Users/redaal-bahrani/scikit_learn_data'

## Load the iris dataset

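sklearn datasets are returned as a `Bunch`: a dictionary-like object whose keys (listed below) can also be read as attributes, e.g. `iris.data`.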

In [3]:
iris = datasets.load_iris()

In [4]:
iris.keys()

['target_names', 'data', 'target', 'DESCR', 'feature_names']

In [5]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], 
      dtype='|S10')

In [6]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [11]:
len(iris.data)

150

In [10]:
len(iris.target)

150

# Training a model

## Import estimator

In [12]:
from sklearn import linear_model

## Instantiate a logistic regression estimator

In [21]:
logreg = linear_model.LogisticRegression(C=1e5)

In [22]:
X = iris.data
y = iris.target

## Train an estimator

In [23]:
logreg.fit(X, y)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

## Predict

In [24]:
predictions = logreg.predict(X)

In [25]:
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Evaluate a model

To read more, see the [metrics](http://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics) reference.

In [26]:
from sklearn.metrics import accuracy_score

In [27]:
accuracy_score(predictions, y)

0.97999999999999998

# Splitting the dataset

[Cross Validation](http://scikit-learn.org/stable/modules/classes.html#module-sklearn.cross_validation)

In [28]:
from sklearn.cross_validation import StratifiedShuffleSplit

In [29]:
# Build a stratified shuffle splitter that holds out 10% of the samples for
# testing; random_state is fixed so the same split is produced on every run.
# NOTE(review): the positional `1` is presumably the number of splits in this
# legacy sklearn.cross_validation API -- confirm against the installed version.
sss = StratifiedShuffleSplit(y, 1, test_size=0.10, random_state=123456)
for train_index, test_index in sss:
    # Index features and labels with the same index arrays so rows stay aligned.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [30]:
len(X), len(X_train), len(X_test)

(150, 135, 15)

In [31]:
y_train

array([2, 1, 1, 1, 2, 2, 1, 1, 0, 0, 0, 1, 0, 2, 0, 2, 2, 0, 0, 0, 2, 0, 1,
       2, 1, 0, 1, 1, 2, 2, 2, 0, 2, 2, 2, 0, 2, 1, 1, 0, 0, 2, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 2, 2, 2, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 2, 2, 2, 1, 0, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 0, 0, 1, 0, 2, 1,
       2, 2, 2, 0, 1, 0, 1, 0, 2, 1, 2, 2, 0, 0, 2, 2, 0, 1, 1, 2, 2, 0, 1,
       0, 0, 1, 1, 2, 0, 2, 1, 0, 2, 0, 1, 2, 1, 2, 1, 2, 2, 0, 2])

In [32]:
logreg.fit(X_train, y_train)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [33]:
predictions = logreg.predict(X_test)

In [34]:
accuracy_score(predictions, y_test)

0.8666666666666667

# Cross-validation and Hyper-parameter Tuning

In [35]:
from sklearn.cross_validation import StratifiedKFold
from sklearn import grid_search

## How to split your data

In [36]:
kf = StratifiedKFold(y_train, n_folds=3)

In [37]:
logreg??

## How to define your hyper-parameter values

In [38]:
# Hyper-parameter grid: each key is a LogisticRegression constructor argument
# and each list holds the candidate values to try for that argument.
parameters = {'C':[10.0, 1e3, 1e5], 
              'max_iter':[100, 1000, 100000], 
              'solver':['liblinear' ,'newton-cg'], 
              'tol': [0.0001, 0.001, 0.01]}

Use *n_jobs=-1* to utilize all cores on your machine 

In [39]:
logreg_tune = linear_model.LogisticRegression(n_jobs=-1)

## How to include that in a gridsearch

In [40]:
clf = grid_search.GridSearchCV(logreg_tune, parameters, cv=kf)

In [41]:
clf.fit(X_train, y_train)

GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[2 1 ..., 0 2], n_folds=3, shuffle=False, random_state=None),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [10.0, 1000.0, 100000.0], 'max_iter': [100, 1000, 100000], 'tol': [0.0001, 0.001, 0.01], 'solver': ['liblinear', 'newton-cg']},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

## How to select the best estimator

In [45]:
clf.best_score_

0.97777777777777775

In [46]:
clf_best = clf.best_estimator_

In [47]:
clf_best.fit(X_train,y_train)

LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [48]:
predictions = clf_best.predict(X_test)

In [49]:
accuracy_score(predictions, y_test)

0.93333333333333335

# Saving and Loading models

## Save a model

### Method 1

In [52]:
import pickle
# Serialize the fitted estimator to an in-memory byte string...
s = pickle.dumps(clf_best)
# ...and rebuild an equivalent estimator from it.
# NOTE: only unpickle data from a trusted source; pickle.loads can execute
# arbitrary code embedded in the payload.
c = pickle.loads(s)

In [53]:
c.predict(X_train)

array([2, 1, 1, 1, 2, 2, 1, 1, 0, 0, 0, 1, 0, 2, 0, 2, 2, 0, 0, 0, 2, 0, 1,
       2, 1, 0, 1, 1, 2, 2, 2, 0, 2, 2, 2, 0, 2, 1, 1, 0, 0, 2, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 2, 2, 2, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 2, 1,
       1, 1, 2, 2, 2, 1, 0, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 0, 0, 1, 0, 2, 1,
       2, 2, 2, 0, 1, 0, 1, 0, 2, 1, 2, 1, 0, 0, 2, 2, 0, 2, 1, 2, 2, 0, 1,
       0, 0, 1, 1, 2, 0, 2, 1, 0, 2, 0, 1, 2, 1, 2, 1, 2, 2, 0, 2])

### Method 2

In [59]:
from sklearn.externals import joblib
# Persist the fitted estimator to 'logreg1.pkl' with compression enabled.
# The call returns the list of file names it wrote.
joblib.dump(clf_best, 'logreg1.pkl', compress=True)

['logreg1.pkl']

In [57]:
joblib.dump??

In [None]:
from sklearn.externals import joblib

## Load a model

In [60]:
# Reload the estimator persisted by the joblib.dump call earlier in this
# notebook. The file name must match the one that was written ('logreg1.pkl');
# loading 'logreg.pkl' only works if a stale file from a previous session exists.
clf_load = joblib.load('logreg1.pkl')
# Check that the reloaded model works by predicting on the held-out test set.
clf_load.predict(X_test)

array([1, 0, 1, 2, 1, 0, 0, 2, 1, 0, 1, 1, 2, 2, 0])