
**ΟΜΑΔΑ 33**

*   Κιζιρίδης Δημήτριος ΑΕΜ: 10539
*   Μπίλλας Θωμάς Αχιλλέας ΑΕΜ: 10366




Φόρτωση των δεδομένων

In [10]:
# Cell 1 - Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, silhouette_score, davies_bouldin_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from lightgbm import LGBMClassifier

# Φόρτωση των δεδομένων από τα αρχεία CSV
train_data = pd.read_csv('datasetTV.csv', header=None)
test_data = pd.read_csv('datasetTest.csv', header=None)

print(f"Training data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")


Training data shape: (8743, 225)
Test data shape: (6955, 224)


Προεπεξεργασία των δεδομένων

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Διαχωρισμός των χαρακτηριστικών και των ετικετών στο training set
x = train_data.iloc[:, :-1].values
y = train_data.iloc[:, -1].values

X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

X_test = test_data.values



Καλύτερο Μοντέλο

In [17]:
from sklearn.svm import SVC
#best model = svm
#best params = {'C': 9, 'gamma': 0.025, 'kernel': 'rbf'}

def fine_tune_SVM(model, param_grid, X_train, y_train, n_jobs=-1, cv=10):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=n_jobs, cv=cv)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    return best_model, best_params

# Χρήση του βέλτιστου SVM μοντέλου για πρόβλεψη
best_params = {'C': [9], 'gamma': [0.025], 'kernel': ['rbf']}
SVM = SVC(random_state=42)
best_svm, bet_params_svm = fine_tune_SVM(SVM, best_params, X_train, y_train)

y_best = best_svm.predict(X_val)
accuracy_best_svm = accuracy_score(y_val, y_best)
print(f"Best Validation Accuracy for SVM: {accuracy_best_svm}")
print(classification_report(y_val, y_best))
print(f"Best parameters for SVM: {best_params}")

labels_33 = best_svm.predict(X_test)
print(f"Test data size: {test_data.shape[0]}")
print(f"Predictions size: {labels_33.shape[0]}")
print(f"Unique labels in predictions: {np.unique(labels_33)}")

# Αποθήκευση των προβλέψεων σε ένα numpy αρχείο
np.save('labels33.npy', labels_33)

# Φόρτωση και επαλήθευση
saved_predictions = np.load('labels33.npy')
print(f"\nSaved predictions size: {saved_predictions.shape[0]}")
print(f"First 10 predictions: {saved_predictions[:10]}")



Best Validation Accuracy for SVM: 0.873642081189251
              precision    recall  f1-score   support

           1       0.95      0.97      0.96       377
           2       0.74      0.76      0.75       332
           3       0.93      0.94      0.93       349
           4       0.93      0.90      0.92       325
           5       0.81      0.78      0.80       366

    accuracy                           0.87      1749
   macro avg       0.87      0.87      0.87      1749
weighted avg       0.87      0.87      0.87      1749

Best parameters for SVM: {'C': [9], 'gamma': [0.025], 'kernel': ['rbf']}
Test data size: 6955
Predictions size: 6955
Unique labels in predictions: [1 2 3 4 5]

Saved predictions size: 6955
First 10 predictions: [1 4 5 2 2 4 5 5 1 3]


================================================================================================================


Συνάρτηση για έλεγχο scaling ή normalization

In [8]:
import pandas as pd
import numpy as np

def check_scaling_or_normalization(data):
    """
    Determines if scaling or normalization is required for a dataset.
    Output:
    - Recommendation: Scaling, Normalization, or None.
    """

    # Display dataset statistics
    print("Dataset Loaded Successfully!")
    print("\nBasic Statistics:")
    print(data.describe())

    # Drop non-numeric columns
    numeric_data = data.select_dtypes(include=[np.number])
    if numeric_data.shape[1] == 0:
        print("No numeric features found. Scaling/Normalization is not applicable.")
        return

    # Check for scaling/normalization needs
    print("\nAnalyzing Feature Ranges...")

    feature_ranges = numeric_data.max() - numeric_data.min()
    feature_std = numeric_data.std()

    range_ratio = feature_ranges.max() / feature_ranges.min()
    std_ratio = feature_std.max() / feature_std.min()

    print(f"Range Ratio (Max/Min): {range_ratio:.2f}")
    print(f"Standard Deviation Ratio (Max/Min): {std_ratio:.2f}")

    # Recommendations
    if range_ratio > 10:
        print("\nRecommendation: Normalization is recommended (MinMaxScaler).")
        print("Reason: Large differences in feature ranges detected.")
    elif std_ratio > 10:
        print("\nRecommendation: Scaling is recommended (StandardScaler).")
        print("Reason: Large differences in feature variances detected.")
    else:
        print("\nRecommendation: No scaling or normalization required.")
        print("Reason: Features are already on a similar scale.")

# Example Usage
print("Training data:")
check_scaling_or_normalization(train_data)
print("Testing data:")
check_scaling_or_normalization(test_data)


Training data:
Dataset Loaded Successfully!

Basic Statistics:
               0            1            2            3            4    \
count  8743.000000  8743.000000  8743.000000  8743.000000  8743.000000   
mean      0.360086     0.001636     0.343860     0.177411     0.199805   
std       0.545784     0.321264     0.537970     0.479141     0.469726   
min      -1.052900    -1.492000    -1.168500    -1.154900    -1.238800   
25%      -0.056065    -0.212760    -0.059468    -0.154845    -0.121140   
50%       0.273600     0.004490     0.254900     0.102000     0.126890   
75%       0.772390     0.212595     0.739950     0.428225     0.449460   
max       2.035000     1.582200     2.075400     1.928000     1.934200   

               5            6            7            8            9    ...  \
count  8743.000000  8743.000000  8743.000000  8743.000000  8743.000000  ...   
mean      0.522868     0.374971     0.515616     0.041620    -0.002033  ...   
std       0.559579     0.552721  

================================================================================================================

Μοντέλα που δοκιμάστηκαν

Συνάρτηση για fine tuning μέσω gridSearch

In [None]:
# Ορισμός της συνάρτησης fine_tune_model
def fine_tune_model(model, param_grid, X_train, y_train, cv=10, n_jobs=-1, verbose=2):
    """
    Βελτιστοποιεί το δοθέν μοντέλο χρησιμοποιώντας GridSearchCV.

    Παράμετροι:
    - model: Το μοντέλο μηχανικής μάθησης που θα βελτιστοποιηθεί.
    - param_grid: Το πλέγμα παραμέτρων για το GridSearchCV.
    - X_train: Χαρακτηριστικά εκπαίδευσης.
    - y_train: Ετικέτες εκπαίδευσης.
    - cv: Αριθμός πτυχών διασταυρούμενης επικύρωσης (προεπιλογή είναι 3).
    - n_jobs: Αριθμός εργασιών που θα εκτελεστούν παράλληλα (προεπιλογή είναι -1).
    - verbose: Επίπεδο λεπτομέρειας (προεπιλογή είναι 2).

    Επιστρέφει:
    - best_model: Το καλύτερο μοντέλο που βρέθηκε από το GridSearchCV.
    - best_params: Οι καλύτερες παράμετροι που βρέθηκαν από το GridSearchCV.
    """
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, n_jobs=n_jobs, verbose=verbose)
    grid_search.fit(X_train, y_train)
    # Εκτύπωση της ακρίβειας μετά από κάθε επανάληψη
    means = grid_search.cv_results_['mean_test_score']
    stds = grid_search.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
        print(f"Mean accuracy: {mean:.3f} (+/-{std * 2:.3f}) for params: {params}")

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    return best_model, best_params

Fine - Tune στο random forest

In [None]:
# Cell 5 - Fine-Tune Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 250, 300, 350], # Number of trees in the forest
    'max_depth': [None, 1, 2, 5, 7, 10, 20, 25, 30],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10], # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4, 10]
}

# Initialize the RF model
rf_notune = RandomForestClassifier(bootstrap=True,random_state=42)

# Train the initial RF model
rf_notune.fit(X_train, y_train)

# Predict on validation set
y_val_pred = rf_notune.predict(X_val)
# Compute accuracy
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy (Initial): {accuracy}")
print(classification_report(y_val, y_val_pred))
# silhouette = silhouette_score(X_test, rf_notune.predict(X_test))
# print(f"Silhouette Score(initial): {silhouette}")
# db_index = davies_bouldin_score(X_test, rf_notune.predict(X_test))
# print(f"Davies-Bouldin Index(initial): {db_index}")

rf = RandomForestClassifier(bootstrap=True,random_state=42)
# Fine-tune the RF model
best_rf, best_params_rf = fine_tune_model(rf, param_grid_rf, X_train, y_train)
print(f"Best parameters for KNN: {best_params_rf}")

# Predict with the best model
y_val_pred_best = best_rf.predict(X_val)
accuracy_best = accuracy_score(y_val, y_val_pred_best)
print(f"Validation Accuracy (Tuned): {accuracy_best}")
print(classification_report(y_val, y_val_pred_best))

Validation Accuracy: 0.8061749571183533
              precision    recall  f1-score   support

           1       0.93      0.95      0.94       377
           2       0.67      0.64      0.65       332
           3       0.86      0.90      0.88       349
           4       0.87      0.85      0.86       325
           5       0.70      0.68      0.69       366

    accuracy                           0.81      1749
   macro avg       0.80      0.80      0.80      1749
weighted avg       0.80      0.81      0.80      1749

Best Validation Accuracy for Random Forest: 0.819325328759291
              precision    recall  f1-score   support

           1       0.93      0.95      0.94       377
           2       0.69      0.66      0.68       332
           3       0.87      0.92      0.90       349
           4       0.88      0.86      0.87       325
           5       0.71      0.70      0.70       366

    accuracy                           0.82      1749
   macro avg       0.82      

Fine - Tune στο SVM

In [None]:
# Cell 6 - Fine-Tune Support Vector Machine
param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf']
}

svm = SVC(random_state=42)
best_svm, best_params_svm = fine_tune_model(svm, param_grid_svm, X_train, y_train)
print(f"Best parameters for SVM: {best_params_svm}")

# Πρόβλεψη και αξιολόγηση στο validation set με το βελτιστοποιημένο μοντέλο
y_val_pred_best_svm = best_svm.predict(X_val)
accuracy_best_svm = accuracy_score(y_val, y_val_pred_best_svm)
print(f"Validation Accuracy with Best SVM Model: {accuracy_best_svm}")
print(classification_report(y_val, y_val_pred_best_svm))

Fitting 3 folds for each of 32 candidates, totalling 96 fits
Mean accuracy: 0.782 (+/-0.006) for params: {'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
Mean accuracy: 0.203 (+/-0.000) for params: {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}
Mean accuracy: 0.782 (+/-0.006) for params: {'C': 0.1, 'gamma': 0.1, 'kernel': 'linear'}
Mean accuracy: 0.203 (+/-0.000) for params: {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}
Mean accuracy: 0.782 (+/-0.006) for params: {'C': 0.1, 'gamma': 0.01, 'kernel': 'linear'}
Mean accuracy: 0.784 (+/-0.006) for params: {'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}
Mean accuracy: 0.782 (+/-0.006) for params: {'C': 0.1, 'gamma': 0.001, 'kernel': 'linear'}
Mean accuracy: 0.702 (+/-0.012) for params: {'C': 0.1, 'gamma': 0.001, 'kernel': 'rbf'}
Mean accuracy: 0.751 (+/-0.011) for params: {'C': 1, 'gamma': 1, 'kernel': 'linear'}
Mean accuracy: 0.203 (+/-0.000) for params: {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
Mean accuracy: 0.751 (+/-0.011) for params: {'C': 1, 'gamma': 0.1, 'ke

Fine - Tune στο AdaBoost

In [None]:
# Cell - Fine-Tune AdaBoost
from sklearn.ensemble import AdaBoostClassifier

param_grid_ab = {
    'n_estimators': [50, 100, 200, 300],
    'learning_rate': [0.01, 0.1, 1, 10]
}

ab = AdaBoostClassifier(random_state=42)
ab.fit(X_train, y_train)

# Step 4: Evaluate the Model
# Predict on validation set
y_val_pred_ab = ab.predict(X_val)

# Compute accuracy
accuracy_ab = accuracy_score(y_val, y_val_pred_ab)
print(f"Validation Accuracy for AdaBoost: {accuracy_ab}")
print(classification_report(y_val, y_val_pred_ab))

# Perform grid search
grid_search_ab = GridSearchCV(estimator=ab, param_grid=param_grid_ab, cv=5, scoring='accuracy')
grid_search_ab.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params_ab = grid_search_ab.best_params_
best_ab = grid_search_ab.best_estimator_

# Evaluate the best model on the validation set
y_val_pred_best_ab = best_ab.predict(X_val)
accuracy_best_ab = accuracy_score(y_val, y_val_pred_best_ab)
print(f"Best Validation Accuracy for AdaBoost: {accuracy_best_ab}")
print(classification_report(y_val, y_val_pred_best_ab))
print(f"Best parameters for AdaBoost: {best_params_ab}")



Validation Accuracy for AdaBoost: 0.6380789022298456
              precision    recall  f1-score   support

           1       0.89      0.89      0.89       377
           2       0.50      0.54      0.52       332
           3       0.73      0.68      0.70       349
           4       0.52      0.79      0.63       325
           5       0.54      0.30      0.38       366

    accuracy                           0.64      1749
   macro avg       0.64      0.64      0.62      1749
weighted avg       0.64      0.64      0.63      1749





Best Validation Accuracy for AdaBoost: 0.6958261863922242
              precision    recall  f1-score   support

           1       0.91      0.92      0.91       377
           2       0.54      0.58      0.56       332
           3       0.74      0.74      0.74       349
           4       0.63      0.74      0.68       325
           5       0.64      0.49      0.55       366

    accuracy                           0.70      1749
   macro avg       0.69      0.69      0.69      1749
weighted avg       0.70      0.70      0.69      1749

Best parameters for AdaBoost: {'learning_rate': 0.1, 'n_estimators': 300}


Περαιτέρω εξερεύνηση για το SVM στην περιοχή των καλύτερων παραμέτρων


In [None]:
# Define the extended parameter grid
param_grid_svm = {
    'C': [8, 10, 12, 15],
    'gamma': [0.015, 0.02, 0.025, 0.03],
    'kernel': ['rbf']
}

# Initialize the SVM classifier
svm = SVC(random_state=42)

# Perform grid search
grid_search_svm = GridSearchCV(estimator=svm, param_grid=param_grid_svm, cv=3, scoring='accuracy')
grid_search_svm.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params_svm = grid_search_svm.best_params_
best_svm = grid_search_svm.best_estimator_

# Evaluate the best model on the validation set
y_val_pred_best_svm = best_svm.predict(X_val)
accuracy_best_svm = accuracy_score(y_val, y_val_pred_best_svm)
print(f"Best Validation Accuracy for SVM: {accuracy_best_svm}")
print(classification_report(y_val, y_val_pred_best_svm))
print(f"Best parameters for SVM: {best_params_svm}")

Best Validation Accuracy for SVM: 0.8713550600343053
              precision    recall  f1-score   support

           1       0.95      0.97      0.96       377
           2       0.74      0.76      0.75       332
           3       0.92      0.95      0.93       349
           4       0.94      0.90      0.92       325
           5       0.80      0.77      0.79       366

    accuracy                           0.87      1749
   macro avg       0.87      0.87      0.87      1749
weighted avg       0.87      0.87      0.87      1749

Best parameters for SVM: {'C': 8, 'gamma': 0.03, 'kernel': 'rbf'}


Περαιτέρω εξερεύνηση για SVM στην περιοχή των καλύτερων παραμέτρων


In [None]:
# Define the extended parameter grid
param_grid_svm = {
    'C': [9, 10, 11, 12],
    'gamma': [0.018, 0.02, 0.022, 0.025],
    'kernel': ['rbf']
}

# Initialize the SVM classifier
svm = SVC(random_state=42)

# Perform grid search
grid_search_svm = GridSearchCV(estimator=svm, param_grid=param_grid_svm, cv=3, scoring='accuracy')
grid_search_svm.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params_svm = grid_search_svm.best_params_
best_svm = grid_search_svm.best_estimator_

# Evaluate the best model on the validation set
y_val_pred_best_svm = best_svm.predict(X_val)
accuracy_best_svm = accuracy_score(y_val, y_val_pred_best_svm)
print(f"Best Validation Accuracy for SVM: {accuracy_best_svm}")
print(classification_report(y_val, y_val_pred_best_svm))
print(f"Best parameters for SVM: {best_params_svm}")

Best Validation Accuracy for SVM: 0.873642081189251
              precision    recall  f1-score   support

           1       0.95      0.97      0.96       377
           2       0.74      0.76      0.75       332
           3       0.93      0.94      0.93       349
           4       0.93      0.90      0.92       325
           5       0.81      0.78      0.80       366

    accuracy                           0.87      1749
   macro avg       0.87      0.87      0.87      1749
weighted avg       0.87      0.87      0.87      1749

Best parameters for SVM: {'C': 9, 'gamma': 0.025, 'kernel': 'rbf'}


Περαιτέρω εξερεύνηση για SVM στην περιοχή των καλύτερων παραμέτρων


In [None]:
# Define the extended parameter grid
param_grid_svm = {
    'C': [8.5, 9, 9.5, 10],
    'gamma': [0.022, 0.025, 0.028, 0.03],
    'kernel': ['rbf']
}

# Initialize the SVM classifier
svm = SVC(random_state=42)

# Perform grid search
grid_search_svm = GridSearchCV(estimator=svm, param_grid=param_grid_svm, cv=3, scoring='accuracy')
grid_search_svm.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params_svm = grid_search_svm.best_params_
best_svm = grid_search_svm.best_estimator_

# Evaluate the best model on the validation set
y_val_pred_best_svm = best_svm.predict(X_val)
accuracy_best_svm = accuracy_score(y_val, y_val_pred_best_svm)
print(f"Best Validation Accuracy for SVM: {accuracy_best_svm}")
print(classification_report(y_val, y_val_pred_best_svm))
print(f"Best parameters for SVM: {best_params_svm}")

Best Validation Accuracy for SVM: 0.8713550600343053
              precision    recall  f1-score   support

           1       0.95      0.97      0.96       377
           2       0.74      0.76      0.75       332
           3       0.92      0.95      0.93       349
           4       0.94      0.90      0.92       325
           5       0.80      0.77      0.79       366

    accuracy                           0.87      1749
   macro avg       0.87      0.87      0.87      1749
weighted avg       0.87      0.87      0.87      1749

Best parameters for SVM: {'C': 8.5, 'gamma': 0.03, 'kernel': 'rbf'}


Fine - Tune στο XGBoost

In [None]:
# Adjust the labels to start from 0
y_train_adjusted = y_train - 1
y_val_adjusted = y_val - 1

# Cell - Fine-Tune XGBoost
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Define the parameter grid
param_grid_xgb = {
    'n_estimators': [50, 100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}

# Initialize the XGBoost classifier
xgb = XGBClassifier(random_state=42)

# Perform grid search
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=3, scoring='accuracy')
grid_search_xgb.fit(X_train, y_train_adjusted)

# Get the best parameters and best estimator
best_params_xgb = grid_search_xgb.best_params_
best_xgb = grid_search_xgb.best_estimator_

# Evaluate the best model on the validation set
y_val_pred_best_xgb = best_xgb.predict(X_val)
accuracy_best_xgb = accuracy_score(y_val_adjusted, y_val_pred_best_xgb)
print(f"Best Validation Accuracy for XGBoost: {accuracy_best_xgb}")
print(classification_report(y_val_adjusted, y_val_pred_best_xgb))
print(f"Best parameters for XGBoost: {best_params_xgb}")

Best Validation Accuracy for XGBoost: 0.8502001143510578
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       377
           1       0.73      0.71      0.72       332
           2       0.92      0.92      0.92       349
           3       0.89      0.89      0.89       325
           4       0.75      0.77      0.76       366

    accuracy                           0.85      1749
   macro avg       0.85      0.85      0.85      1749
weighted avg       0.85      0.85      0.85      1749

Best parameters for XGBoost: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.7}


Περαιτέρω εξερεύνηση για το XGBoost στην περιοχή των καλύτερων παραμέτρων


In [None]:
# Cell - Fine-Tune XGBoost
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Define the narrowed parameter grid
param_grid_xgb = {
    'n_estimators': [250, 300, 350],
    'learning_rate': [0.05, 0.1, 0.15],
    'max_depth': [6, 7, 8],
    'subsample': [0.65, 0.7, 0.75],
    'colsample_bytree': [0.65, 0.7, 0.75]
}

# Initialize the XGBoost classifier
xgb = XGBClassifier(random_state=42)

# Perform grid search with 3-fold cross-validation
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=3, scoring='accuracy')
grid_search_xgb.fit(X_train, y_train_adjusted)

# Get the best parameters and best estimator
best_params_xgb = grid_search_xgb.best_params_
best_xgb = grid_search_xgb.best_estimator_

# Evaluate the best model on the validation set
y_val_pred_best_xgb = best_xgb.predict(X_val)
accuracy_best_xgb = accuracy_score(y_val_adjusted, y_val_pred_best_xgb)
print(f"Best Validation Accuracy for XGBoost: {accuracy_best_xgb}")
print(classification_report(y_val_adjusted, y_val_pred_best_xgb))
print(f"Best parameters for XGBoost: {best_params_xgb}")

  _data = np.array(data, dtype=dtype, copy=copy,


Best Validation Accuracy for XGBoost: 0.8502001143510578
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       377
           1       0.73      0.71      0.72       332
           2       0.92      0.92      0.92       349
           3       0.89      0.89      0.89       325
           4       0.75      0.77      0.76       366

    accuracy                           0.85      1749
   macro avg       0.85      0.85      0.85      1749
weighted avg       0.85      0.85      0.85      1749

Best parameters for XGBoost: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.7}


Tune using LightGBM

In [None]:
param_grid_lgbm = {
    'num_leaves': [31, 50, 70, 100, 150, 300],         # Number of leaves in one tree
    'max_depth': [-1, 10, 20],         # Maximum depth of the tree (-1 means no limit)
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2], # Learning rate (shrinkage)
    'n_estimators': [100, 200, 300],   # Number of boosting iterations (trees)
    'min_child_samples': [10, 20, 30], # Minimum number of samples per leaf
    'subsample': [0.8, 1.0],           # Subsample ratio of training instances
 }

# Initialize the LGBM model
lgbm_notune = LGBMClassifier()

# Train the initial LGBM model
lgbm_notune.fit(X_train, y_train)

# Predict on validation set
y_val_pred = lgbm_notune.predict(X_val)

# Compute accuracy
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy (Initial): {accuracy}")
print(classification_report(y_val, y_val_pred))
# silhouette = silhouette_score(X_test, rf_notune.predict(X_test))
# print(f"Silhouette Score(initial): {silhouette}")
# db_index = davies_bouldin_score(X_test, rf_notune.predict(X_test))
# print(f"Davies-Bouldin Index(initial): {db_index}")

lgbm = LGBMClassifier()
# Fine-tune the MLP model
best_lgbm, best_params_lgbm = fine_tune_model(lgbm, param_grid_lgbm, X_train, y_train)
print(f"Best parameters for LGBM: {best_params_lgbm}")

# Predict with the best model
y_val_pred_best = best_lgbm.predict(X_val)
accuracy_best = accuracy_score(y_val, y_val_pred_best)
print(f"Validation Accuracy (Tuned): {accuracy_best}")
print(classification_report(y_val, y_val_pred_best))

Tune using KNN

In [None]:
# Parameter grid for KNN
param_grid_knn = {
    'n_neighbors': [5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
}
# Initialize the KNN model
knn_notune = KNeighborsClassifier()

# Train the initial KNN model
knn_notune.fit(X_train, y_train)

# Predict on validation set
y_val_pred = knn_notune.predict(X_val)

# Compute accuracy
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy (Initial): {accuracy}")
print(classification_report(y_val, y_val_pred))
silhouette = silhouette_score(X_test, knn_notune.predict(X_test))
print(f"Silhouette Score(initial): {silhouette}")
db_index = davies_bouldin_score(X_test, knn_notune.predict(X_test))
print(f"Davies-Bouldin Index(initial): {db_index}")

knn= KNeighborsClassifier()
# Fine-tune the KNN model
best_knn, best_params_knn = fine_tune_model(knn, param_grid_knn, X_train, y_train)
print(f"Best parameters for KNN: {best_params_knn}")

# Predict with the best model
y_val_pred_best = best_knn.predict(X_val)
accuracy_best = accuracy_score(y_val, y_val_pred_best)
print(f"Validation Accuracy (Tuned): {accuracy_best}")
print(classification_report(y_val, y_val_pred_best))


# Example: Evaluate clustering
silhouette = silhouette_score(X_test, best_knn.predict(X_test))
print(f"Silhouette Score: {silhouette}")
db_index = davies_bouldin_score(X_test, best_knn.predict(X_test))
print(f"Davies-Bouldin Index: {db_index}")


Tune using MLP

In [None]:
from sklearn.neural_network import MLPClassifier
# Parameter grid for MLP
param_grid_mlp =
{   'hidden_layer_sizes': [(10), (100), (50,50), (100,50), (100,100), (50,50,50), (400, 40), (400,38), (400,42)],
    'activation': ['tanh', 'relu'],                    # Activation functions
    'solver': ['sgd', 'adam'],                         # Optimizers
    'alpha': [0.0001, 0.001, 0.01],                    # Regularization parameter
    'learning_rate': ['constant', 'adaptive'],         # Learning rate schedule
    'learning_rate_init': [0.001, 0.01],               # Initial learning rate
}

# Initialize the mlp model
mlp_notune = MLPClassifier(random_state=42)

# Train the initial MLP model
mlp_notune.fit(X_train, y_train)

# Predict on validation set
y_val_pred = mlp_notune.predict(X_val)

# Compute accuracy
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy (Initial): {accuracy}")
print(classification_report(y_val, y_val_pred))
# silhouette = silhouette_score(X_test, rf_notune.predict(X_test))
# print(f"Silhouette Score(initial): {silhouette}")
# db_index = davies_bouldin_score(X_test, rf_notune.predict(X_test))
# print(f"Davies-Bouldin Index(initial): {db_index}")

mlp = MLPClassifier(random_state=42)
# Fine-tune the MLP model
best_mlp, best_params_mlp = fine_tune_model(mlp, param_grid_mlp, X_train, y_train)
print(f"Best parameters for MLP: {best_params_mlp}")

# Predict with the best model
y_val_pred_best = best_mlp.predict(X_val)
accuracy_best = accuracy_score(y_val, y_val_pred_best)
print(f"Validation Accuracy (Tuned): {accuracy_best}")
print(classification_report(y_val, y_val_pred_best))
