In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.decomposition import PCA
import numpy as np
from scipy.stats import uniform, randint

# Load the Excel data
file_path = r"C:\Users\btech\Desktop\ml_cse22257\bert_embeddings (1).xlsx"
data = pd.read_excel(file_path)

# Define features and target
X = data.drop(columns=['Class'])  # Adjust to the actual feature columns
y = data['Class']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models and parameter distributions for RandomizedSearchCV
models = {
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {
            'model__C': uniform(1e-4, 1e4),
            'model__penalty': ['l2'],
            'model__solver': ['liblinear']
        }
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'params': {
            'model__n_estimators': randint(50, 200),
            'model__max_depth': [None, 10, 20, 30],
            'model__min_samples_split': randint(2, 15),
            'model__min_samples_leaf': randint(1, 10)
        }
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'model__C': uniform(1e-3, 1e3),
            'model__gamma': ['scale', 'auto'],
            'model__kernel': ['linear', 'rbf']
        }
    },
    'GradientBoostingClassifier': {
        'model': GradientBoostingClassifier(),
        'params': {
            'model__n_estimators': randint(50, 200),
            'model__learning_rate': uniform(0.01, 0.2),
            'model__max_depth': [3, 5, 7]
        }
    },
    'KNeighborsClassifier': {
        'model': KNeighborsClassifier(),
        'params': {
            'model__n_neighbors': randint(3, 20),
            'model__weights': ['uniform', 'distance'],
            'model__p': [1, 2]
        }
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(),
        'params': {
            'model__max_depth': [None, 10, 20, 30],
            'model__min_samples_split': randint(2, 15),
            'model__min_samples_leaf': randint(1, 10)
        }
    },
    'GaussianNB': {
        'model': GaussianNB(),
        'params': {}  # No hyperparameters for GaussianNB
    },
    'XGBClassifier': {
        'model': XGBClassifier(eval_metric='logloss', use_label_encoder=False),
        'params': {
            'model__n_estimators': randint(50, 200),
            'model__learning_rate': uniform(0.01, 0.2),
            'model__max_depth': randint(3, 10)
        }
    },
    'AdaBoostClassifier': {
        'model': AdaBoostClassifier(),
        'params': {
            'model__n_estimators': randint(50, 200),
            'model__learning_rate': uniform(0.01, 1.0)
        }
    },
    'CatBoostClassifier': {
        'model': CatBoostClassifier(verbose=0),
        'params': {
            'model__iterations': randint(50, 200),
            'model__learning_rate': uniform(0.01, 0.2),
            'model__depth': randint(4, 10)
        }
    }
}

# Perform RandomizedSearchCV for each model
results = []
best_models = {}
for model_name, model_info in models.items():
    print(f"Tuning {model_name}...")
    pipe = Pipeline([
        ('scaler', StandardScaler()),  # Applies scaling
        ('model', model_info['model'])
    ])

    search = RandomizedSearchCV(pipe, model_info['params'], n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, verbose=1, random_state=42)
    search.fit(X_train, y_train)

    best_models[model_name] = search.best_estimator_
    train_scores = cross_val_score(search.best_estimator_, X_train, y_train, cv=3, scoring='accuracy')
    train_mean = np.mean(train_scores)
    train_std = np.std(train_scores)

    results.append({
        'Model': model_name,
        'Best Parameters': search.best_params_,
        'Best CV Score': search.best_score_,
        'Train Mean Accuracy': train_mean,
        'Train Std Dev': train_std
    })

    print(f"Best parameters for {model_name}: {search.best_params_}")
    print(f"Best cross-validation score for {model_name}: {search.best_score_:.4f}")
    print(f"Train mean accuracy: {train_mean:.4f}, Train std deviation: {train_std:.4f}\n")

# Evaluate best models on the test set
print("Evaluating models on test data...")
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f"Classification report for {model_name}:\n{classification_report(y_test, y_pred)}\n")

# Apply PCA to the data
pca = PCA(n_components=0.9999)  # Retain 99.99% of the variance
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print(f"Original number of features: {X_train.shape[1]}")
print(f"Reduced number of features after PCA: {X_train_pca.shape[1]}")

# Re-evaluate models on PCA-transformed data
pca_results = []
for model_name, model_info in models.items():
    print(f"Re-training {model_name} on PCA-transformed data...")
    model = model_info['model']
    model.fit(X_train_pca, y_train)
    y_pred_pca = model.predict(X_test_pca)
    acc_pca = accuracy_score(y_test, y_pred_pca)
    pca_results.append({
        'Model': model_name,
        'Test Accuracy with PCA': acc_pca
    })
    print(f"Test accuracy for {model_name} with PCA: {acc_pca:.4f}")

# Display results after PCA
pca_results_df = pd.DataFrame(pca_results)
print("\nResults after PCA:")
print(pca_results_df)

Tuning LogisticRegression...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for LogisticRegression: {'model__C': 1559.9453033620264, 'model__penalty': 'l2', 'model__solver': 'liblinear'}
Best cross-validation score for LogisticRegression: 0.5208
Train mean accuracy: 0.5208, Train std deviation: 0.0331

Tuning RandomForestClassifier...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for RandomForestClassifier: {'model__max_depth': 30, 'model__min_samples_leaf': 8, 'model__min_samples_split': 4, 'model__n_estimators': 199}
Best cross-validation score for RandomForestClassifier: 0.5476
Train mean accuracy: 0.5402, Train std deviation: 0.0018

Tuning SVC...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for SVC: {'model__C': 20.585494295802448, 'model__gamma': 'auto', 'model__kernel': 'rbf'}
Best cross-validation score for SVC: 0.5677
Train mean accuracy: 0.5677, Train std deviation: 0.0295

Tuning Gra



Best parameters for GaussianNB: {}
Best cross-validation score for GaussianNB: 0.4881
Train mean accuracy: 0.4881, Train std deviation: 0.0312

Tuning XGBClassifier...
Fitting 3 folds for each of 10 candidates, totalling 30 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Best parameters for XGBClassifier: {'model__learning_rate': 0.13349630192554332, 'model__max_depth': 4, 'model__n_estimators': 71}
Best cross-validation score for XGBClassifier: 0.5536
Train mean accuracy: 0.5536, Train std deviation: 0.0330

Tuning AdaBoostClassifier...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for AdaBoostClassifier: {'model__learning_rate': 0.3845401188473625, 'model__n_estimators': 142}
Best cross-validation score for AdaBoostClassifier: 0.5312
Train mean accuracy: 0.5312, Train std deviation: 0.0301

Tuning CatBoostClassifier...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for CatBoostClassifier: {'model__depth': 7, 'model__iterations': 142, 'model__learning_rate': 0.04668695797323276}
Best cross-validation score for CatBoostClassifier: 0.5536
Train mean accuracy: 0.5536, Train std deviation: 0.0128

Evaluating models on test data...
Classification report for LogisticRegression:
              preci

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Test accuracy for LogisticRegression with PCA: 0.5982
Re-training RandomForestClassifier on PCA-transformed data...
Test accuracy for RandomForestClassifier with PCA: 0.5238
Re-training SVC on PCA-transformed data...
Test accuracy for SVC with PCA: 0.6042
Re-training GradientBoostingClassifier on PCA-transformed data...
Test accuracy for GradientBoostingClassifier with PCA: 0.5387
Re-training KNeighborsClassifier on PCA-transformed data...
Test accuracy for KNeighborsClassifier with PCA: 0.5327
Re-training DecisionTreeClassifier on PCA-transformed data...
Test accuracy for DecisionTreeClassifier with PCA: 0.5149
Re-training GaussianNB on PCA-transformed data...
Test accuracy for GaussianNB with PCA: 0.3780
Re-training XGBClassifier on PCA-transformed data...


Parameters: { "use_label_encoder" } are not used.



Test accuracy for XGBClassifier with PCA: 0.5268
Re-training AdaBoostClassifier on PCA-transformed data...
Test accuracy for AdaBoostClassifier with PCA: 0.5060
Re-training CatBoostClassifier on PCA-transformed data...
Test accuracy for CatBoostClassifier with PCA: 0.5893

Results after PCA:
                        Model  Test Accuracy with PCA
0          LogisticRegression                0.598214
1      RandomForestClassifier                0.523810
2                         SVC                0.604167
3  GradientBoostingClassifier                0.538690
4        KNeighborsClassifier                0.532738
5      DecisionTreeClassifier                0.514881
6                  GaussianNB                0.377976
7               XGBClassifier                0.526786
8          AdaBoostClassifier                0.505952
9          CatBoostClassifier                0.589286
