# GlassTypePrediction
Responsibility : Mariam Amr

## Imports

In [2]:
# For data manipulation and analysis
import pandas as pd
import numpy as np

# For data preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# For implementing ensemble models
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)

# For hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# For model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns

## Dataset Selection

In [3]:
df = pd.read_csv(
    "./AMLAss1Datasets/glasstypePrediction.csv"
)  # Suitable for classification

## Preprocessing
* Dataset has no nulls
* All columns will be used in our project
* Scaling the numerical features 
    ```python
    StandardScaler() #Scaling data to fit a standard normal distribution
    ```
* Removing outliers using `IQR` method

In [4]:
# Renaming the columns
df.columns = [
    "ri", "na", "mg", "al", "si", "k", "ca", "ba", "fe", "type"
]

In [5]:
# Scaling the numerical features
scaler = StandardScaler()
df[["ri", "na", "mg", "al", "si", "k", "ca", "ba", "fe"]] = scaler.fit_transform(
    df[["ri", "na", "mg", "al", "si", "k", "ca", "ba", "fe"]]
)

In [7]:
df.head()

Unnamed: 0,ri,na,mg,al,si,k,ca,ba,fe,type
0,0.872868,0.284953,1.254639,-0.692442,-1.127082,-0.671705,-0.145766,-0.352877,-0.586451,1
1,-0.249333,0.591817,0.636168,-0.17046,0.102319,-0.026213,-0.793734,-0.352877,-0.586451,1
2,-0.721318,0.149933,0.601422,0.190912,0.438787,-0.164533,-0.828949,-0.352877,-0.586451,1
3,-0.232831,-0.242853,0.69871,-0.310994,-0.052974,0.112107,-0.519052,-0.352877,-0.586451,1
4,-0.312045,-0.169205,0.650066,-0.411375,0.555256,0.081369,-0.624699,-0.352877,-0.586451,1


## Model Implementation & Hyperparameter Turning

References :
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
* https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
* https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

In [6]:
#Train Test Split
y = df["type"]
X = df.drop("type", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#random_state here is used to randomly split the data into train and test sets (randomize data before split)

Random Forest Classifier

In [7]:
#random_state here and in the other models introduces randomness when splitting and selecting the feature subsets but also when comparing models, setting random_state to the same value will make the comparison fair (instead of using internal randomness)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

#Performance
accuracy_rf = accuracy_score(y_test, y_pred)
precision_rf = precision_score(y_test, y_pred, average="macro")
recall_rf = recall_score(y_test, y_pred, average="macro")
f1_rf = f1_score(y_test, y_pred, average="macro")

print("Random Forest Classifier default parameters:")
print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1 Score:", f1_rf)

Random Forest Classifier default parameters:
Accuracy: 0.8372093023255814
Precision: 0.9127314814814816
Recall: 0.8432539682539683
F1 Score: 0.8605223570909845


In [9]:
#Hyperparameter Tuning with RandomSearchCV
params={
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5,10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2']
}

rf_random = RandomizedSearchCV(rf_model, param_distributions=params, n_iter=100, cv=5)

rf_random.fit(X_train, y_train)

print("Best parameters for Random Forest Classifier:")
print(rf_random.best_params_)

Best parameters for Random Forest Classifier:
{'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10, 'criterion': 'entropy'}


In [10]:
#Hyperparameter Tuning with GridSearchCV
rf_grid = GridSearchCV(rf_model, param_grid=params, cv=5, n_jobs=-1)
rf_grid.fit(X_train, y_train)

print("Best parameters for Random Forest Classifier from Grid Search:")
print(rf_grid.best_params_)

Best parameters for Random Forest Classifier from Grid Search:
{'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}


In [None]:
#Final Comparison between initial and tuned models

#Tuned model
rf_model_tuned = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
)
rf_model_tuned.fit(X_train, y_train)
y_pred_tuned = rf_model_tuned.predict(X_test)

accuracy_rf_tuned = accuracy_score(y_test, y_pred_tuned)
precision_rf_tuned = precision_score(y_test, y_pred_tuned, average="macro")
recall_rf_tuned = recall_score(y_test, y_pred_tuned, average="macro")
f1_rf_tuned = f1_score(y_test, y_pred_tuned, average="macro")

print("Random Forest Classifier tuned parameters:")
print("Accuracy:", accuracy_rf_tuned)
print("Precision:", precision_rf_tuned)
print("Recall:", recall_rf_tuned)
print("F1 Score:", f1_rf_tuned)

#Initial model
print("Random Forest Classifier default parameters:")
print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1 Score:", f1_rf)

AdaBoost Classifier

In [None]:
ab_model = AdaBoostClassifier(random_state=42)
ab_model.fit(X_train, y_train)
y_pred = ab_model.predict(X_test)

#Performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
f1 = f1_score(y_test, y_pred, average="macro")

print("AdaBoost Classifier default parameters performance:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

In [None]:
#Hyperparameter Tuning with RandomSearchCV
params={
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.1, 1, 10]
}

ab_random = RandomizedSearchCV(ab_model, param_distributions=params, n_iter=100, cv=5)
ab_random.fit(X_train, y_train)

print("Best parameters for AdaBoost Classifier with Random Search:")
print(ab_random.best_params_)

In [None]:
#Hyperparameter Tuning with GridSearchCV
ab_grid = GridSearchCV(ab_model, param_grid=params, cv=5)
ab_grid.fit(X_train, y_train)

print("Best parameters for AdaBoost Classifier from Grid Search:")
print(ab_grid.best_params_)

In [None]:
#Final Comparison between initial and tuned models

Gradient Boost Classifier

In [None]:
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)

#Performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
f1 = f1_score(y_test, y_pred, average="macro")

print("Gradient Boosting Classifier default parameters performance:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

In [None]:
#Hyperparameter Tuning with RandomSearchCV
params={
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.1, 1,1.5],
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2']
}

gb_random = RandomizedSearchCV(gb_model, param_distributions=params, n_iter=100, cv=5)
gb_random.fit(X_train, y_train)

print("Best parameters for Gradient Boosting Classifier with Random Search:")
print(gb_random.best_params_)

In [None]:
#Hyperparameter Tuning with GridSearchCV
gb_grid = GridSearchCV(gb_model, param_grid=params, cv=5)
gb_grid.fit(X_train, y_train)

print("Best parameters for Gradient Boosting Classifier from Grid Search:")
print(gb_grid.best_params_)

In [None]:
#Final Comparison between initial and tuned models