In [1]:
from IPython.display import Image

# Show a photo of the flower (Iris virginica, per the file name) as the cell output,
# referenced by URL and rendered at 200x100.
IRIS_PHOTO_URL = "https://upload.wikimedia.org/wikipedia/commons/thumb/9/9f/Iris_virginica.jpg/1024px-Iris_virginica.jpg"
Image(url=IRIS_PHOTO_URL, width=200, height=100)

In [2]:
import numpy as np
from sklearn.datasets import load_iris

# Load the iris dataset bundled with scikit-learn and summarise it:
# shape of the feature matrix, the feature (column) names, and the class names.
iris = load_iris()
np.shape(iris.data), iris.feature_names, iris.target_names

((150, 4),
 ['sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'],
 array(['setosa', 'versicolor', 'virginica'], 
       dtype='<U10'))

In [3]:
# Peek at rows 1 through 4 of the feature matrix (slice end is exclusive), all columns.
iris["data"][1:5]

array([[ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2]])

In [4]:
# Peek at the integer class labels for rows 1 through 3 (slice end is exclusive).
iris["target"][1:4]

array([0, 0, 0])

In [6]:
from sklearn.model_selection import train_test_split

# Feature matrix and label vector, followed by a seeded train/test split.
# No test_size is given, so the library's default proportion applies.
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=123,
)

In [7]:
from sklearn.tree import DecisionTreeClassifier

# Fit a seeded decision tree on the training partition; `fit` returns the
# estimator itself, so `clf` is the fitted model. The bare name on the last
# line displays its configuration.
clf = DecisionTreeClassifier(random_state=242).fit(X_train, y_train)
clf

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=242, splitter='best')

In [16]:
# Classify two hand-written samples: an arbitrary point and a row that also
# appears in the dataset.
samples = [
    [1, 2, 3, 4],
    [4.7, 3.2, 1.3, 0.2],
]
clf.predict(samples)

array([1, 0])

In [17]:
# Score the fitted classifier on the held-out test partition.
clf.score(X=X_test, y=y_test)

0.92105263157894735

In [18]:
# Per-feature importance of the fitted classifier; entries are presumably in the
# same order as iris.feature_names, since the model was fit on iris.data columns.
clf.feature_importances_

array([ 0.01344861,  0.01344861,  0.06159065,  0.91151214])

In [19]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the current classifier on the full dataset (X, y),
# not only the training partition; the cell displays the mean of the fold scores.
cvs = cross_val_score(estimator=clf, X=X, y=y, cv=5)
np.mean(cvs)

0.96000000000000019

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Rebind `clf` to a seeded random forest of 100 trees, fitted on the training
# partition, and report its score on the held-out test partition.
clf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=242,
).fit(X_train, y_train)

clf.score(X_test, y_test)

0.94736842105263153

In [22]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over 3 x 3 x 4 = 36 hyper-parameter combinations for the
# current `clf`, evaluated on the training partition only. The number of
# cross-validation folds is left at the library default.
gparams = dict(
    min_samples_split=[2, 3, 5],
    min_samples_leaf=[1, 5, 10],
    max_depth=[1, 10, 50, 100],
)
gs = GridSearchCV(estimator=clf, param_grid=gparams, n_jobs=-1).fit(X_train, y_train)
gs.best_score_, gs.best_params_

(0.9285714285714286,
 {'max_depth': 1, 'min_samples_leaf': 1, 'min_samples_split': 2})

In [24]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyper-parameter search for the current `clf` on the training
# partition. Only a sample of the 2 x 2 x 7 = 28 combinations is tried
# (n_iter is left at the library default).
# NOTE(review): min_samples_leaf=100 looks close to the size of the whole
# training partition — confirm this candidate value is intended.
rparams = {"min_samples_split": [2, 10],
           "min_samples_leaf": [1, 100],
           "max_depth": [1, 2, 3, 5, 10, 20, 100]}
# random_state pins which candidates are sampled; without it the set of tried
# combinations (and therefore best_params_) can differ from run to run. The seed
# matches the one used for the estimators elsewhere in this notebook.
rs = RandomizedSearchCV(clf, rparams, n_jobs=-1, random_state=242)
rs.fit(X_train, y_train)
(rs.best_score_, rs.best_params_)

(0.9285714285714286,
 {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2})

In [25]:
import pickle

filename = "iris-predictor.bin"

# Serialise the current fitted `clf` and write the bytes to disk.
with open(filename, "wb") as f:
    f.write(pickle.dumps(clf))

# Read the bytes back and rebuild the model as `clf2`. Unpickling is only
# appropriate here because the file was just written by this notebook.
with open(filename, "rb") as f:
    clf2 = pickle.loads(f.read())

# Predict with the restored model on the full dataset.
clf2.predict(X)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [26]:
from sklearn.naive_bayes import GaussianNB

# Rebind `clf` to a Gaussian naive Bayes model fitted on the training partition,
# then report its score on the held-out test partition.
clf = GaussianNB().fit(X_train, y_train)
clf.score(X_test, y_test)


0.94736842105263153