In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.decomposition import PCA
import numpy as np
from scipy.stats import uniform, randint

# Load the Excel data
file_path = r"/content/roberta_embeddings exal (1).xlsx"
data = pd.read_excel(file_path)

# Display column names to confirm structure
print("Column names:", data.columns)

# Define features and target
X = data.drop(columns=['Class'])  # Adjust to the actual feature columns
y = data['Class']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models and parameter distributions for RandomizedSearchCV
models = {
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {
            'model__C': uniform(1e-4, 1e4),
            'model__penalty': ['l2'],
            'model__solver': ['liblinear']
        }
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'params': {
            'model__n_estimators': randint(50, 200),
            'model__max_depth': [None, 10, 20, 30],
            'model__min_samples_split': randint(2, 15),
            'model__min_samples_leaf': randint(1, 10)
        }
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'model__C': uniform(1e-3, 1e3),
            'model__gamma': ['scale', 'auto'],
            'model__kernel': ['linear', 'rbf']
        }
    },
    'GradientBoostingClassifier': {
        'model': GradientBoostingClassifier(),
        'params': {
            'model__n_estimators': randint(50, 200),
            'model__learning_rate': uniform(0.01, 0.2),
            'model__max_depth': [3, 5, 7]
        }
    },
    'KNeighborsClassifier': {
        'model': KNeighborsClassifier(),
        'params': {
            'model__n_neighbors': randint(3, 20),
            'model__weights': ['uniform', 'distance'],
            'model__p': [1, 2]
        }
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(),
        'params': {
            'model__max_depth': [None, 10, 20, 30],
            'model__min_samples_split': randint(2, 15),
            'model__min_samples_leaf': randint(1, 10)
        }
    },
    'GaussianNB': {
        'model': GaussianNB(),
        'params': {}  # No hyperparameters for GaussianNB
    },
    'XGBClassifier': {
        'model': XGBClassifier(eval_metric='logloss', use_label_encoder=False),
        'params': {
            'model__n_estimators': randint(50, 200),
            'model__learning_rate': uniform(0.01, 0.2),
            'model__max_depth': randint(3, 10)
        }
    },
    'AdaBoostClassifier': {
        'model': AdaBoostClassifier(),
        'params': {
            'model__n_estimators': randint(50, 200),
            'model__learning_rate': uniform(0.01, 1.0)
        }
    },
    'CatBoostClassifier': {
        'model': CatBoostClassifier(verbose=0),
        'params': {
            'model__iterations': randint(50, 200),
            'model__learning_rate': uniform(0.01, 0.2),
            'model__depth': randint(4, 10)
        }
    }
}

# Perform RandomizedSearchCV with PCA
best_models_pca = {}
results_pca = []

for model_name, model_info in models.items():
    print(f"Tuning {model_name} with PCA...")

    # Define the pipeline with PCA
    pipe = Pipeline([
        ('scaler', StandardScaler()),  # Scaling for PCA
        ('pca', PCA(n_components=0.9999)),  # Keep 99.99% variance
        ('model', model_info['model'])
    ])

    # Perform RandomizedSearchCV
    search = RandomizedSearchCV(pipe, model_info['params'], n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, verbose=1, random_state=42)
    search.fit(X_train, y_train)

    # Store the best model and cross-validation results
    best_models_pca[model_name] = search.best_estimator_

    # Compute train mean accuracy and standard deviation
    train_scores = cross_val_score(search.best_estimator_, X_train, y_train, cv=3, scoring='accuracy')
    train_mean = np.mean(train_scores)
    train_std = np.std(train_scores)

    results_pca.append({
        'Model': model_name,
        'Best Parameters': search.best_params_,
        'Best CV Score': search.best_score_,
        'Train Mean Accuracy': train_mean,
        'Train Std Dev': train_std
    })

    print(f"Best parameters for {model_name} with PCA: {search.best_params_}")
    print(f"Best cross-validation score for {model_name} with PCA: {search.best_score_:.4f}")
    print(f"Train mean accuracy: {train_mean:.4f}, Train std deviation: {train_std:.4f}\n")

# Evaluate best models on the test set
for model_name, model in best_models_pca.items():
    print(f"Evaluating {model_name} with PCA on the test set...")
    y_pred = model.predict(X_test)
    print(f"Classification report for {model_name} with PCA:\n{classification_report(y_test, y_pred)}\n")

# Convert results to DataFrame for better visualization
results_pca_df = pd.DataFrame(results_pca)
print(results_pca_df)


Column names: Index([      0,       1,       2,       3,       4,       5,       6,       7,
             8,       9,
       ...
           759,     760,     761,     762,     763,     764,     765,     766,
           767, 'Class'],
      dtype='object', length=769)
Tuning LogisticRegression with PCA...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for LogisticRegression with PCA: {'model__C': 580.8362216819946, 'model__penalty': 'l2', 'model__solver': 'liblinear'}
Best cross-validation score for LogisticRegression with PCA: 0.5126
Train mean accuracy: 0.5119, Train std deviation: 0.0428

Tuning RandomForestClassifier with PCA...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for RandomForestClassifier with PCA: {'model__max_depth': 20, 'model__min_samples_leaf': 8, 'model__min_samples_split': 14, 'model__n_estimators': 70}
Best cross-validation score for RandomForestClassifier with PCA: 0.5179
Train mean accuracy: 0.5045, T



Best parameters for GaussianNB with PCA: {}
Best cross-validation score for GaussianNB with PCA: 0.3802
Train mean accuracy: 0.3802, Train std deviation: 0.0376

Tuning XGBClassifier with PCA...
Fitting 3 folds for each of 10 candidates, totalling 30 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Best parameters for XGBClassifier with PCA: {'model__learning_rate': 0.038573363584388155, 'model__max_depth': 5, 'model__n_estimators': 199}
Best cross-validation score for XGBClassifier with PCA: 0.5134
Train mean accuracy: 0.5134, Train std deviation: 0.0346

Tuning AdaBoostClassifier with PCA...
Fitting 3 folds for each of 10 candidates, totalling 30 fits




Best parameters for AdaBoostClassifier with PCA: {'model__learning_rate': 0.7319987722668247, 'model__n_estimators': 87}
Best cross-validation score for AdaBoostClassifier with PCA: 0.4836
Train mean accuracy: 0.4836, Train std deviation: 0.0028

Tuning CatBoostClassifier with PCA...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for CatBoostClassifier with PCA: {'model__depth': 4, 'model__iterations': 98, 'model__learning_rate': 0.11495493205167782}
Best cross-validation score for CatBoostClassifier with PCA: 0.5372
Train mean accuracy: 0.5372, Train std deviation: 0.0148

Evaluating LogisticRegression with PCA on the test set...
Classification report for LogisticRegression with PCA:
              precision    recall  f1-score   support

           0       0.66      0.55      0.60       125
           1       0.44      0.55      0.49       124
           2       0.55      0.48      0.52        87

    accuracy                           0.53       336
   m

In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7
