In [1]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib notebook

In [2]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data,
                                                    digits.target)

In [3]:
X_train.shape

(1347, 64)

In [4]:
np.bincount(y_train)

array([134, 135, 137, 133, 128, 150, 131, 143, 130, 126])

Really Simple API
-------------------
0) Import your model class

In [5]:
from sklearn.svm import LinearSVC

1) Instantiate an object and set the parameters

In [6]:
svm = LinearSVC(C=0.1)

2) Fit the model

In [7]:
svm.fit(X_train, y_train)

LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

3) Apply / evaluate

In [8]:
print(svm.predict(X_train))
print(y_train)

[1 4 4 ..., 0 9 7]
[1 4 4 ..., 0 9 7]


In [9]:
svm.score(X_train, y_train)

0.99703043801039348

In [10]:
svm.score(X_test, y_test)

0.93999999999999995

And again
---------

In [11]:
from sklearn.ensemble import RandomForestClassifier

In [12]:
rf = RandomForestClassifier(n_estimators=50)

In [13]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [14]:
rf.score(X_test, y_test)

0.95999999999999996

# Exercises
Load the iris dataset from the ``sklearn.datasets`` module using the ``load_iris`` function.

Split it into training and test set using ``train_test_split``.


Then train an evaluate a classifier of your choice. Try ``sklearn.neighbors.KNeighborsClassifier`` for example.


In [15]:
from sklearn.datasets import load_iris
iris = load_iris()

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target)

In [17]:
from sklearn.neighbors import KNeighborsClassifier

kn = KNeighborsClassifier(n_neighbors=5)
kn.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [18]:
kn.score(X_train, y_train)

0.9821428571428571

In [19]:
kn.score(X_test, y_test)

0.94736842105263153

In [20]:
print(kn.predict(X_test))
print(y_test)

[2 2 0 1 2 0 0 1 2 2 1 2 0 0 0 0 2 2 0 0 2 2 0 2 2 2 2 1 0 0 2 2 1 2 0 0 0
 1]
[2 2 0 1 2 0 0 1 2 2 1 2 0 0 0 0 2 2 0 0 2 2 0 2 2 2 2 1 0 0 1 2 1 1 0 0 0
 1]


In [20]:
# %load solutions/train_iris.py