### Upload librerie

In [77]:
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [5]:
# Importo il dataset iris
iris = datasets.load_iris()

In [7]:
print(iris['DESCR'])

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

:Number of Instances: 150 (50 in each of three classes)
:Number of Attributes: 4 numeric, predictive attributes and the class
:Attribute Information:
    - sepal length in cm
    - sepal width in cm
    - petal length in cm
    - petal width in cm
    - class:
            - Iris-Setosa
            - Iris-Versicolour
            - Iris-Virginica

:Summary Statistics:

                Min  Max   Mean    SD   Class Correlation
sepal length:   4.3  7.9   5.84   0.83    0.7826
sepal width:    2.0  4.4   3.05   0.43   -0.4194
petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

:Missing Attribute Values: None
:Class Distribution: 33.3% for each of 3 classes.
:Creator: R.A. Fisher
:Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
:Date: July, 1988

The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken
from Fis

In [9]:
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [11]:
X = iris['data']
Y = iris['target']

In [15]:
for i, x in enumerate(X[:10]):
    print(f'feature: {x} label: {Y[i]}')

feature: [5.1 3.5 1.4 0.2] label: 0
feature: [4.9 3.  1.4 0.2] label: 0
feature: [4.7 3.2 1.3 0.2] label: 0
feature: [4.6 3.1 1.5 0.2] label: 0
feature: [5.  3.6 1.4 0.2] label: 0
feature: [5.4 3.9 1.7 0.4] label: 0
feature: [4.6 3.4 1.4 0.3] label: 0
feature: [5.  3.4 1.5 0.2] label: 0
feature: [4.4 2.9 1.4 0.2] label: 0
feature: [4.9 3.1 1.5 0.1] label: 0


### Feature Scaling

In [27]:
X.shape

(150, 4)

In [47]:
import numpy as np
min = np.min(X, axis=0)
max = np.max(X, axis=0)
mean = np.mean(X, axis=0)
print(f'Sepal len - min: {min[0]} max: {max[0]} mean: {mean[0]:.1f}')
print(f'Sepal wid - min: {min[1]} max: {max[1]} mean: {mean[1]:.1f}')
print(f'Petal len - min: {min[2]} max: {max[2]} mean: {mean[2]:.1f}')
print(f'Petal wid - min: {min[3]} max: {max[3]} mean: {mean[3]:.1f}')

Sepal len - min: 4.3 max: 7.9 mean: 5.8
Sepal wid - min: 2.0 max: 4.4 mean: 3.1
Petal len - min: 1.0 max: 6.9 mean: 3.8
Petal wid - min: 0.1 max: 2.5 mean: 1.2


In [51]:
scaler = MinMaxScaler()
scaler.feature_range

(0, 1)

In [53]:
X = scaler.fit_transform(X)
min = np.min(X, axis=0)
max = np.max(X, axis=0)
mean = np.mean(X, axis=0)
print(f'Sepal len - min: {min[0]} max: {max[0]} mean: {mean[0]:.1f}')
print(f'Sepal wid - min: {min[1]} max: {max[1]} mean: {mean[1]:.1f}')
print(f'Petal len - min: {min[2]} max: {max[2]} mean: {mean[2]:.1f}')
print(f'Petal wid - min: {min[3]} max: {max[3]} mean: {mean[3]:.1f}')

Sepal len - min: 0.0 max: 1.0 mean: 0.4
Sepal wid - min: 0.0 max: 1.0 mean: 0.4
Petal len - min: 0.0 max: 1.0 mean: 0.5
Petal wid - min: 0.0 max: 1.0 mean: 0.5


### Training set e test set

In [112]:
from sklearn.model_selection import train_test_split
# Primo split: 80% training + validation, 25% test
X_train_val, X_test, y_train_val, y_test = train_test_split(X, Y, test_size=0.25, random_state=0, stratify=Y, shuffle=True)

# Secondo split: 80% training, 20% validation (dai dati di train+validation)
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y, shuffle=True)

## K Nearest Neighbors

### Addestramento

In [116]:
for i in [2,3,5,10]:
    model = KNN(n_neighbors=i)
    model.fit(X_train, y_train)
    prediction = model.predict(X_val)
    val_accuracy = accuracy_score(y_val, prediction)
    print(f'k={i} - accuracy: {val_accuracy:.2f}')

k=2 - accuracy: 0.93
k=3 - accuracy: 0.97
k=5 - accuracy: 0.97
k=10 - accuracy: 0.97


In [118]:
best_model = KNN(n_neighbors=3)
best_model.fit(X_train, y_train)
prediction = best_model.predict(X_test)

print('Test accuracy:', accuracy_score(y_test, prediction))

Test accuracy: 0.9736842105263158


## Decision Tree

### Addestramento

In [122]:
from sklearn.tree import DecisionTreeClassifier as DT
for i in range(1,30):
    model = DT(max_depth=i)
    model.fit(X_train, y_train)
    prediction = model.predict(X_val)
    val_accuracy = accuracy_score(y_val, prediction)
    print(f'k={i} - accuracy: {val_accuracy:.2f}')

k=1 - accuracy: 0.67
k=2 - accuracy: 0.93
k=3 - accuracy: 0.97
k=4 - accuracy: 0.97
k=5 - accuracy: 0.97
k=6 - accuracy: 0.97
k=7 - accuracy: 0.97
k=8 - accuracy: 0.97
k=9 - accuracy: 0.97
k=10 - accuracy: 0.93
k=11 - accuracy: 0.93
k=12 - accuracy: 0.97
k=13 - accuracy: 0.93
k=14 - accuracy: 0.93
k=15 - accuracy: 0.93
k=16 - accuracy: 0.93
k=17 - accuracy: 0.93
k=18 - accuracy: 0.93
k=19 - accuracy: 0.93
k=20 - accuracy: 0.97
k=21 - accuracy: 0.97
k=22 - accuracy: 0.93
k=23 - accuracy: 0.97
k=24 - accuracy: 0.93
k=25 - accuracy: 0.93
k=26 - accuracy: 0.97
k=27 - accuracy: 0.97
k=28 - accuracy: 0.93
k=29 - accuracy: 0.97


In [124]:
best_model = DT(max_depth=3)
best_model.fit(X_train, y_train)
prediction = best_model.predict(X_test)
accuracy = accuracy_score(y_test, prediction)

print('Accuracy on test set: {:.2f}'.format(accuracy))

Accuracy on test set: 1.00
