# BanknoteAuthentication Dataset

Responsibility : Mohamed Khaled

## Imports

In [24]:
# For data manipulation and analysis
import pandas as pd
import numpy as np

# For data preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# For implementing ensemble models
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)

# For hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# For model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns

## Dataset Selection

In [None]:
df = pd.read_csv(
    "./AMLAss1Datasets/data_banknote_authentication.csv"
)  # Suitable for classification

##  Preprocessing
* Dataset has no nulls
* All columns will be used in our project
* Scaling the numerical features 
    ```python
    StandardScaler() #Scaling data to fit a standard normal distribution
    ```
* Removing outliers using `IQR` method

In [None]:
# Renaming the columns
df.columns = [
    "variance_wavelet", "skewness_wavelet", "curtosis_wavelet", "image_entropy", "class"
]

In [None]:
# Scaling the numerical features
scaler = StandardScaler()
df[["variance_wavelet", "skewness_wavelet", "curtosis_wavelet", "image_entropy"]] = scaler.fit_transform(
    df[["variance_wavelet", "skewness_wavelet", "curtosis_wavelet", "image_entropy"]]
)

In [None]:
df.head()

# Models

In [None]:
# Train test split
X = df.drop("class", axis=1)
y = df["class"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Random Forest

In [None]:
# Extra Imports
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

In [37]:
rf_1 = RandomForestClassifier()

### Hyperparametre Tuning
- Defined a range of estimators, features and bootstrap state
- These are then used to figure out the best parametres using GridSearchCV()
- The model trains on the dataset then predictions are done
- Evaluations are calculated and shown

#### Choosing the correct number of trees

In [38]:
# Having a range of parameters to test
n = [64, 100, 128, 200]
max_features = ["sqrt", "log2"]
bootstrap = [True, False]

In [39]:
param_grid = {
    "n_estimators": n,
    "max_features": max_features,
    "bootstrap": bootstrap,
}

clf = GridSearchCV(rf_1, param_grid, cv=10, scoring="accuracy")
clf.fit(X_train, y_train)
print("Best Parameters:", clf.best_params_)
print("Best Accuracy:", clf.best_score_)

Best Parameters: {'bootstrap': True, 'max_features': 'sqrt', 'n_estimators': 100}
Best Accuracy: 0.9927189324437032


In [40]:
# Training the model with the best parameters
rf = RandomForestClassifier(
    n_estimators=clf.best_params_["n_estimators"],
    max_features=clf.best_params_["max_features"],
    bootstrap=clf.best_params_["bootstrap"],
)

rf.fit(X_train, y_train)

In [71]:
# Predictions
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Displaying the prediction evals
print("Random Forest Classifier")
print("Accuracy:", accuracy)
print("Confusion Matrix:", cm)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Random Forest Classifier
Accuracy: 0.9927272727272727
Confusion Matrix: [[148   0]
 [  2 125]]
Precision: 1.0
Recall: 0.984251968503937
F1 Score: 0.9920634920634921


## AdaBoost

In [65]:
# AdaBoost
ada = AdaBoostClassifier()

### Hyperparametre Tuning
- Defined a range of estimators, learning rates, random values and the boosting algortithm to use
- These are then used to figure out the best parametres using GridSearchCV()
- The model trains on the dataset then predictions are done
- Evaluations are calculated and shown

In [66]:
# Having a range of parameters to test
n_ada = [64, 100, 200, 500, 700, 1000]
learning_rate = [0.1, 0.5, 1, 1.5, 2.0]
random = [None, 42]
algorithm = ["SAMME"]

In [67]:
param_grid = {
    "n_estimators": n_ada,
    "learning_rate": learning_rate,
    "random_state": random,
    "algorithm": algorithm,
}

clf = GridSearchCV(ada, param_grid, cv=10, scoring="accuracy")
clf.fit(X_train, y_train)
print("Best Parameters:", clf.best_params_)
print("Best Accuracy:", clf.best_score_)

Best Parameters: {'algorithm': 'SAMME', 'learning_rate': 0.5, 'n_estimators': 500, 'random_state': None}
Best Accuracy: 0.9981734778982485


In [68]:
# Training the model with the best parameters
ada = AdaBoostClassifier(
    n_estimators=clf.best_params_["n_estimators"],
    learning_rate=clf.best_params_["learning_rate"],
    random_state=clf.best_params_["random_state"],
    algorithm=clf.best_params_["algorithm"],
)

ada.fit(X_train, y_train)

In [72]:
# Predictions
y_pred = ada.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Displaying the prediction evals
print("AdaBoost Classifier")
print("Accuracy:", accuracy)
print("Confusion Matrix:", cm)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

AdaBoost Classifier
Accuracy: 0.9963636363636363
Confusion Matrix: [[148   0]
 [  1 126]]
Precision: 1.0
Recall: 0.9921259842519685
F1 Score: 0.9960474308300395


## Gradient Boost

In [59]:
# Gradient Boosting
gb = GradientBoostingClassifier()

### Hyperparametre Tuning

In [62]:
# Having a range of parameters to test
n_gb = [64, 100, 200, 500, 700, 1000]
learning_rate = [0.1, 0.5, 1, 1.5, 2.0]
loss = ["log_loss", "exponential"]

In [73]:
param_grid = {
    "n_estimators": n_gb,
    "learning_rate": learning_rate,
    "loss": loss,
}

clf = GridSearchCV(gb, param_grid, cv=10, scoring="accuracy")
clf.fit(X_train, y_train)
print("Best Parameters:", clf.best_params_)
print("Best Accuracy:", clf.best_score_)

In [None]:
# Training the model with the best parameters
gb = GradientBoostingClassifier(
    n_estimators=clf.best_params_["n_estimators"],
    learning_rate=clf.best_params_["learning_rate"],
    subsample=clf.best_params_["subsample"],
    loss=clf.best_params_["loss"],
)

gb.fit(X_train, y_train)

In [None]:
# Predictions
y_pred = gb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Displaying the prediction evals
print("Gradient Boosting Classifier")
print("Accuracy:", accuracy)
print("Confusion Matrix:", cm)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

## Model Implementation & Hyperparameter Turning

References :
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
* https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
* https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html