# Supervised learning - Classification

In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

In [3]:
from sklearn.datasets import load_iris
iris = load_iris()

print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [4]:
data = iris.data
label = iris.target
columns = iris.feature_names

data = pd.DataFrame(data, columns=columns)
print(data.shape)
print(data.head())

(150, 4)
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, label, train_size=0.8, random_state=1994)

In [6]:
# Logistic regression classification

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
from sklearn.metrics import accuracy_score

print('Logistic regression classification, Accuracy: {:.4f}'.format(accuracy_score(y_test, y_pred)))

Logistic regression classification, Accuracy: 0.9667


In [11]:
# Support vector machine classifier
from sklearn.svm import SVC
svc = SVC(C=10)

svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

print('Support vector machine classification, Accuracy: {:.4f}'.format(accuracy_score(y_test, y_pred)))

Support vector machine classification, Accuracy: 1.0000


In [12]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=5)

tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)

print('Decision tree classification, Accuracy: {:.4f}'.format(accuracy_score(y_test, y_pred)))

Decision tree classification, Accuracy: 0.9333


In [14]:
# Random forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=5)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print('Random forest classification, Accuracy: {:.4f}'.format(accuracy_score(y_test, y_pred)))

Random forest classification, Accuracy: 0.9333
