# Machine Learning 1 - Nearest Neighbors and Decision Trees

## Lab objectives

* Classification with decision trees and random forests.
* Cross-validation and evaluation.

In [1]:
from lab_tools import CIFAR10, get_hog_image

dataset = CIFAR10('./CIFAR10/')

Pre-loading training data
Pre-loading test data


# 1. Nearest Neighbor

The following example uses the Nearest Neighbor algorithm on the Histogram of Gradient decriptors in the dataset.

In [2]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(dataset.train['hog'], dataset.train['labels'])

* What is the **descriptive performance** of this classifier ?
* Modify the code to estimate the **predictive performance**.
* Use cross-validation to find the best hyper-parameters for this method.

In [6]:
# -- Your code here -- #
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

pred = clf.predict(dataset.train['hog'])
score = accuracy_score(dataset.train['labels'], pred)
print(f"Descriptive {score}") #Descriptive based on the training data

split_val = 0.1
len_dataset = int(split_val*len(dataset.train["hog"]))
train_X = dataset.train["hog"][:-len_dataset]
train_Y = dataset.train["labels"][:-len_dataset]
val_X = dataset.train["hog"][-len_dataset:]
val_Y = dataset.train["labels"][-len_dataset:]

clf2 = KNeighborsClassifier(n_neighbors=1)
clf2.fit(train_X, train_Y)
pred2 = clf2.predict(val_X)
score2 = accuracy_score(val_Y, pred2)
print(f"Predictive {score2}") #Predictive based on the testing/validation data

Descriptive 1.0
Predictive 0.692


In [None]:
#Checking different parameters on the raw dataset train (distance metric, number of neighorbs + weighted distance)
#Check l'algo en plus ??
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

X, y = dataset.train["hog"], dataset.train["labels"]
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

number_neighbour = 1
distance_list = ["cityblock", "cosine", "euclidean", "haversine", "l1", "l2", "manhattan", "nan_euclidean"] #Pas haversine (only 2D)
weighted_distance_list = ["uniform", "distance"] 
best_mean = 0
best_distance = None
best_weight = None 
best_number_neighbour = 99999


while number_neighbour < 20:
    for distance in distance_list:
        for weight in weighted_distance_list:
            clf_test = KNeighborsClassifier(n_neighbors=number_neighbour, weights=weight, metric=distance)
            score_cross = cross_val_score(clf_test, X, y, cv=skf)
            mean = np.mean(score_cross)
            if mean > best_mean:
                best_mean = mean 
                best_distance = distance
                best_weight = weight 
                best_number_neighbour = number_neighbour

    number_neighbour += 1          

In [9]:
print(f"Best model - Mean: {best_mean} Distance metric: {best_distance} Weight: {best_weight} Number of neighours: {best_number_neighbour}")

Best model - Mean: 0.7804666666666666 Distance metric: cosine Weight: distance Number of neighours: 11


In [13]:
from sklearn.model_selection import GridSearchCV

X, y = dataset.train["hog"], dataset.train["labels"]
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

param_grid = {
    'n_neighbors': np.arange(1, 20),
    'metric': ["cityblock", "cosine", "euclidean", "l1", "l2", "manhattan", "nan_euclidean"],
    'weights': ["uniform", "distance"]
}

knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=skf, scoring='accuracy', verbose=1)

# Exécution de la recherche sur grille
grid_search.fit(X, y)

# Meilleurs résultats
print("Meilleure précision obtenue :", grid_search.best_score_)
print("Meilleurs paramètres :", grid_search.best_params_)


Fitting 5 folds for each of 266 candidates, totalling 1330 fits
Meilleure précision obtenue : 0.7804666666666666
Meilleurs paramètres : {'metric': 'cosine', 'n_neighbors': 11, 'weights': 'distance'}


In [14]:
#Checking different parameters after removing the extra dimension
from sklearn.decomposition import PCA

pca = PCA(n_components=0.80)
X_pca = pca.fit_transform(dataset.train["hog"])

print("Number of principal components:", pca.n_components_)
print("Variance explained :", np.sum(pca.explained_variance_ratio_))

Number of principal components: 104
Variance explained : 0.8012945313137937


In [15]:
X, y = dataset.train["hog"], dataset.train["labels"]
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

param_grid = {
    'n_neighbors': [1, 3, 5, 11, 15],
    'metric': ["cityblock", "cosine", "euclidean", "manhattan"],
    'weights': ["uniform", "distance"]
}

knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=skf, scoring='accuracy', verbose=1)

# Exécution de la recherche sur grille
grid_search.fit(X_pca, y)

# Meilleurs résultats
print("Meilleure précision obtenue :", grid_search.best_score_)
print("Meilleurs paramètres :", grid_search.best_params_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
Meilleure précision obtenue : 0.7765333333333333
Meilleurs paramètres : {'metric': 'cosine', 'n_neighbors': 15, 'weights': 'distance'}


In [25]:
#Comparison of results based on the two hyper-parameters found
clf_model1 = KNeighborsClassifier(n_neighbors=11, weights="distance", metric="cosine")
clf_model1.fit(dataset.train['hog'], dataset.train['labels'])
pred_model1 = clf_model1.predict(dataset.test["hog"])
score_model1 = accuracy_score(dataset.test["labels"], pred_model1)
print(f"Predictive best parameters (raw data): {score_model1}") #Predictive based on the testing/validation data
cm_model1 = confusion_matrix(dataset.test["labels"], pred_model1)
print(cm_model1)

pca = PCA(n_components=0.80)  
X_train_pca = pca.fit_transform(dataset.train['hog'])
X_test_pca = pca.transform(dataset.test['hog'])
clf_model2 = KNeighborsClassifier(n_neighbors=15, weights="distance", metric="cosine")
clf_model2.fit(X_train_pca, dataset.train['labels'])
pred_model2 = clf_model2.predict(X_test_pca)
score_model2 = accuracy_score(dataset.test["labels"], pred_model2)
print(f"Predictive best parameters (after PCA): {score_model2}") #Predictive based on the testing/validation data
cm_model2 = confusion_matrix(dataset.test["labels"], pred_model2)
print(cm_model2)

Predictive best parameters (raw data): 0.7876666666666666
[[755 173  72]
 [ 81 772 147]
 [ 24 140 836]]
Predictive best parameters (after PCA): 0.802
[[847 120  33]
 [154 741 105]
 [ 60 122 818]]


In [29]:
#Standardiser les données ? Ou pas ? (soustraction moyenne et division par l'ecart-type pour chaque feature)
from sklearn.preprocessing import StandardScaler

X, y = dataset.train["hog"], dataset.train["labels"]
scaler = StandardScaler()
X_standard = scaler.fit_transform(X)

skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

param_grid = {
    'n_neighbors': [11, 15, 17, 19],
    'metric': ["cityblock", "cosine", "euclidean", "manhattan"],
    'weights': ["uniform", "distance"]
}

knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=skf, scoring='accuracy', verbose=1)

# Exécution de la recherche sur grille
grid_search.fit(X_standard, y)

# Meilleurs résultats
print("Best results standard: ", grid_search.best_score_)
print("Best parameters:", grid_search.best_params_)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best results standard:  0.7872
Best parameters: {'metric': 'cityblock', 'n_neighbors': 17, 'weights': 'uniform'}


In [30]:
from sklearn.preprocessing import StandardScaler

X, y = dataset.train["hog"], dataset.train["labels"]
scaler = StandardScaler()
X_standard = scaler.fit_transform(X)
X_pca = pca.fit_transform(X_standard)
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

param_grid = {
    'n_neighbors': [11, 15, 17, 19],
    'metric': ["cityblock", "cosine", "euclidean", "manhattan"],
    'weights': ["uniform", "distance"]
}

knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=skf, scoring='accuracy', verbose=1)

# Exécution de la recherche sur grille
grid_search.fit(X_pca, y)

# Meilleurs résultats
print("Best results standard + PCA: ", grid_search.best_score_)
print("Best parameters:", grid_search.best_params_)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best results standard + PCA:  0.7769333333333333
Best parameters: {'metric': 'cosine', 'n_neighbors': 15, 'weights': 'distance'}


In [31]:
scaler = StandardScaler()
X_train = scaler.fit_transform(dataset.train['hog'])
X_test = scaler.fit_transform(dataset.test['hog'])

clf_model_std = KNeighborsClassifier(n_neighbors=17, weights="uniform", metric="cityblock")
clf_model_std.fit(X_train, dataset.train['labels'])
pred_model_std = clf_model_std.predict(X_test)
score_model_std = accuracy_score(dataset.test["labels"], pred_model_std)
print(f"Predictive best parameters standard: {score_model_std}") #Predictive based on the testing/validation data
cm_model_std = confusion_matrix(dataset.test["labels"], pred_model_std)
print(cm_model_std)

Predictive best parameters standard: 0.785
[[758 188  54]
 [ 91 792 117]
 [ 23 172 805]]


In [47]:
scaler = StandardScaler()
pca = PCA(n_components=0.80)  

X_train = scaler.fit_transform(dataset.train['hog'])
X_train= pca.fit_transform(X_train)
X_test = scaler.fit_transform(dataset.test['hog'])
X_test = pca.transform(X_test)

clf_model_std = KNeighborsClassifier(n_neighbors=15, weights="distance", metric="cosine")
clf_model_std.fit(X_train, dataset.train['labels'])
pred_model_std = clf_model_std.predict(X_test)
score_model_std = accuracy_score(dataset.test["labels"], pred_model_std)
print(f"Predictive best parameters standard: {score_model_std}") #Predictive based on the testing/validation data
cm_model_std = confusion_matrix(dataset.test["labels"], pred_model_std)
print(cm_model_std)

Predictive best parameters standard: 0.7956666666666666
[[851 119  30]
 [162 731 107]
 [ 56 139 805]]


## 2. Decision Trees

[Decision Trees](http://scikit-learn.org/stable/modules/tree.html#tree) classify the data by splitting the feature space according to simple, single-feature rules. Scikit-learn uses the [CART](https://en.wikipedia.org/wiki/Predictive_analytics#Classification_and_regression_trees_.28CART.29) algorithm for [its implementation](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html) of the classifier. 

* **Create a simple Decision Tree classifier** using scikit-learn and train it on the HoG training set.
* Use cross-validation to find the best hyper-paramters for this method.

In [26]:
from sklearn import tree

# --- Your code here --- #
clf_tree = tree.DecisionTreeClassifier()
clf_tree.fit(train_X, train_Y)
pred_tree = clf_tree.predict(val_X)
score_tree = accuracy_score(val_Y, pred_tree)
print(f"Predictive: {score_tree}") #Predict


Predictive: 0.576


In [39]:
#Finding parameters: slip criterion, depth, features to consider per split, pruning + pre-processing (PCA)

X, y = dataset.train["hog"], dataset.train["labels"]
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

param_grid = {
    'criterion': ["gini", "“entropy”", "log_loss"],
    'splitter': ["best", "random"],
    'max_depth': [10, 15, 20, 25],
    "max_features" : [int, float, "sqrt", "log2", None]
}

clf_tree = tree.DecisionTreeClassifier()
grid_search = GridSearchCV(clf_tree, param_grid, cv=skf, scoring='accuracy', verbose=1)

# Exécution de la recherche sur grille
grid_search.fit(X, y)

# Meilleurs résultats
print("Best result tree: ", grid_search.best_score_)
print("Best parameters tree: ", grid_search.best_params_)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


360 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/xavierdekeme/miniconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/xavierdekeme/miniconda3/lib/python3.11/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Users/xavierdekeme/miniconda3/lib/python3.11/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/xavierdekeme/miniconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py",

Best result tree:  0.5958666666666667
Best parameters tree:  {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'splitter': 'best'}


In [40]:
pca = PCA(n_components=0.80)  
X, y = dataset.train["hog"], dataset.train["labels"]
X_pca = pca.fit_transform(X)

skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

param_grid = {
    'criterion': ["gini", "“entropy”", "log_loss"],
    'splitter': ["best", "random"],
    'max_depth': [10, 12, 15, 20],
    "max_features" : [int, float, "sqrt", "log2", None]
}

clf_tree = tree.DecisionTreeClassifier()
grid_search = GridSearchCV(clf_tree, param_grid, cv=skf, scoring='accuracy', verbose=1)

# Exécution de la recherche sur grille
grid_search.fit(X_pca, y)

# Meilleurs résultats
print("Best result tree + PCA: ", grid_search.best_score_)
print("Best parameters tree: ", grid_search.best_params_)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


360 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/xavierdekeme/miniconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/xavierdekeme/miniconda3/lib/python3.11/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Users/xavierdekeme/miniconda3/lib/python3.11/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/xavierdekeme/miniconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py",

Best result tree + PCA:  0.6587333333333334
Best parameters tree:  {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'splitter': 'best'}


In [37]:
#Comparison of results based on the two hyper-parameters found
clf_model_tree1 = tree.DecisionTreeClassifier(criterion="gini", max_depth=10, splitter="best")
clf_model_tree1.fit(dataset.train['hog'], dataset.train['labels'])
pred_model_tree1 = clf_model_tree1.predict(dataset.test["hog"])
score_model_tree1 = accuracy_score(dataset.test["labels"], pred_model_tree1)
print(f"Predictive best parameters tree (raw data): {score_model_tree1}") #Predictive based on the testing/validation data
cm_model_tree1 = confusion_matrix(dataset.test["labels"], pred_model_tree1)
print(cm_model_tree1)

pca = PCA(n_components=0.80)  
X_train_pca = pca.fit_transform(dataset.train['hog'])
X_test_pca = pca.transform(dataset.test['hog'])
clf_model_tree2 = tree.DecisionTreeClassifier(criterion="gini", max_depth=10, splitter="best")
clf_model_tree2.fit(X_train_pca, dataset.train['labels'])
pred_model_tree2 = clf_model_tree2.predict(X_test_pca)
score_model_tree2 = accuracy_score(dataset.test["labels"], pred_model_tree2)
print(f"Predictive best parameters tree (after PCA): {score_model_tree2}") #Predictive based on the testing/validation data
cm_model_tree2 = confusion_matrix(dataset.test["labels"], pred_model_tree2)
print(cm_model_tree2)

Predictive best parameters tree (raw data): 0.605
[[603 251 146]
 [153 630 217]
 [118 300 582]]
Predictive best parameters tree (after PCA): 0.652
[[645 240 115]
 [154 625 221]
 [ 75 239 686]]


## 3. Random Forests

[Random Forest](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) classifiers use multiple decision trees trained on "weaker" datasets (less data and/or less features), averaging the results so as to reduce over-fitting.

* Use scikit-learn to **create a Random Forest classifier** on the CIFAR data. 
* Use cross-validation to find the best hyper-paramters for this method.

In [41]:
from sklearn import ensemble

# --- Your code here --- #
clf_forest = ensemble.RandomForestClassifier()
clf_forest.fit(train_X, train_Y)
pred_forest = clf_forest.predict(val_X)
score_forest = accuracy_score(val_Y, pred_forest)
print(f"Predictive forest: {score_forest}") #Predict

Predictive forest: 0.7713333333333333


In [50]:
X, y = dataset.train["hog"], dataset.train["labels"]
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

param_grid = {
    'n_estimators': [10, 20, 50, 75, 100]
}

clf_forest = ensemble.RandomForestClassifier(criterion="gini", max_depth=10, max_features=None)
grid_search = GridSearchCV(clf_forest, param_grid, cv=skf, scoring='accuracy', verbose=1)

# Exécution de la recherche sur grille
grid_search.fit(X, y)

# Meilleurs résultats
print("Best result forest: ", grid_search.best_score_)
print("Best parameters forest: ", grid_search.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best result forest:  0.7375999999999999
Best parameters forest:  {'n_estimators': 50}


In [51]:
X, y = dataset.train["hog"], dataset.train["labels"]
X_train_pca = pca.fit_transform(dataset.train['hog'])
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

param_grid = {
    'n_estimators': [20, 50, 75]
}

clf_forest = ensemble.RandomForestClassifier(criterion="gini", max_depth=10, max_features=None)
grid_search = GridSearchCV(clf_forest, param_grid, cv=skf, scoring='accuracy', verbose=1)

# Exécution de la recherche sur grille
grid_search.fit(X_train_pca, y)

# Meilleurs résultats
print("Best result forest: ", grid_search.best_score_)
print("Best parameters forest: ", grid_search.best_params_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best result forest:  0.7248
Best parameters forest:  {'n_estimators': 75}


In [52]:
#Comparison of results based on the two hyper-parameters found
clf_model_tree1 = ensemble.RandomForestClassifier(criterion="gini", max_depth=10, max_features=None, n_estimators=50)
clf_model_tree1.fit(dataset.train['hog'], dataset.train['labels'])
pred_model_tree1 = clf_model_tree1.predict(dataset.test["hog"])
score_model_tree1 = accuracy_score(dataset.test["labels"], pred_model_tree1)
print(f"Predictive best parameters tree (raw data): {score_model_tree1}") #Predictive based on the testing/validation data
cm_model_tree1 = confusion_matrix(dataset.test["labels"], pred_model_tree1)
print(cm_model_tree1)

pca = PCA(n_components=0.80)  
X_train_pca = pca.fit_transform(dataset.train['hog'])
X_test_pca = pca.transform(dataset.test['hog'])
clf_model_tree2 = ensemble.RandomForestClassifier(criterion="gini", max_depth=10, max_features=None, n_estimators=75)
clf_model_tree2.fit(X_train_pca, dataset.train['labels'])
pred_model_tree2 = clf_model_tree2.predict(X_test_pca)
score_model_tree2 = accuracy_score(dataset.test["labels"], pred_model_tree2)
print(f"Predictive best parameters tree (after PCA): {score_model_tree2}") #Predictive based on the testing/validation data
cm_model_tree2 = confusion_matrix(dataset.test["labels"], pred_model_tree2)
print(cm_model_tree2)

Predictive best parameters tree (raw data): 0.732
[[755 188  57]
 [127 717 156]
 [ 75 201 724]]
Predictive best parameters tree (after PCA): 0.7113333333333334
[[723 194  83]
 [134 671 195]
 [ 55 205 740]]
