In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
import numpy as np

# Load the dataset
file_path = r"/content/training_with_glove embeddings_split.xlsx"
data = pd.read_excel(file_path, engine='openpyxl')

# Define features and target
X = data.drop(columns=['input', 'Class'])
y = data['Class']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models and parameter grids
models = {
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {
            'model__C': np.logspace(-4, 4, 10),
            'model__penalty': ['l1', 'l2'],
            'model__solver': ['liblinear', 'saga']
        }
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'params': {
            'model__n_estimators': [50, 100, 200],
            'model__max_depth': [None, 10, 20, 30],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 4]
        }
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'model__C': np.logspace(-3, 3, 7),
            'model__gamma': ['scale', 'auto'],
            'model__kernel': ['linear', 'rbf']
        }
    },
    'KNeighborsClassifier': {
        'model': KNeighborsClassifier(),
        'params': {
            'model__n_neighbors': range(3, 15),
            'model__weights': ['uniform', 'distance'],
            'model__metric': ['euclidean', 'manhattan']
        }
    },
    'GradientBoostingClassifier': {
        'model': GradientBoostingClassifier(),
        'params': {
            'model__n_estimators': [50, 100, 200],
            'model__learning_rate': [0.01, 0.1, 0.2],
            'model__max_depth': [3, 5, 7]
        }
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(),
        'params': {
            'model__max_depth': [None, 10, 20, 30],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 4]
        }
    },
    'XGBClassifier': {
        'model': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        'params': {
            'model__n_estimators': [50, 100, 200],
            'model__learning_rate': [0.01, 0.1, 0.2],
            'model__max_depth': [3, 5, 7]
        }
    },
    'CatBoostClassifier': {
        'model': CatBoostClassifier(verbose=0),
        'params': {
            'model__iterations': [100, 200, 300],
            'model__learning_rate': [0.01, 0.1, 0.2],
            'model__depth': [3, 5, 7]
        }
    },
    'AdaBoostClassifier': {
        'model': AdaBoostClassifier(),
        'params': {
            'model__n_estimators': [50, 100, 200],
            'model__learning_rate': [0.01, 0.1, 0.2]
        }
    },
    'GaussianNB': {
        'model': GaussianNB(),
        'params': {}
    }
}

# Perform RandomizedSearchCV with PCA
results_with_pca = []
best_models = {}

for model_name, model_info in models.items():
    # Define the pipeline with PCA
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=5)),  # Adjust the number of components as needed
        ('model', model_info['model'])
    ])

    # Perform hyperparameter search if params are available
    if model_info['params']:
        search = RandomizedSearchCV(pipe, model_info['params'], n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
        search.fit(X_train, y_train)
        best_models[model_name] = search.best_estimator_
        train_scores = cross_val_score(search.best_estimator_, X_train, y_train, cv=5, scoring='accuracy')
        train_mean = np.mean(train_scores)
        train_std = np.std(train_scores)
        results_with_pca.append({
            'Model': model_name,
            'Best Parameters': search.best_params_,
            'Best CV Score': search.best_score_,
            'Train Mean Accuracy': train_mean,
            'Train Std Dev': train_std
        })
        print(f"Best parameters for {model_name} with PCA: {search.best_params_}")
        print(f"Best CV score for {model_name} with PCA: {search.best_score_:.4f}")
    else:
        pipe.fit(X_train, y_train)
        best_models[model_name] = pipe
        train_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy')
        train_mean = np.mean(train_scores)
        train_std = np.std(train_scores)
        results_with_pca.append({
            'Model': model_name,
            'Best Parameters': "None",
            'Best CV Score': train_mean,
            'Train Mean Accuracy': train_mean,
            'Train Std Dev': train_std
        })

# Evaluate models with PCA on the test set
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f"Classification report for {model_name} with PCA:\n{classification_report(y_test, y_pred)}\n")

# Convert results with PCA to DataFrame
results_pca_df = pd.DataFrame(results_with_pca)
print(results_pca_df)

Best parameters for LogisticRegression with PCA: {'model__solver': 'liblinear', 'model__penalty': 'l1', 'model__C': 0.3593813663804626}
Best CV score for LogisticRegression with PCA: 0.4315
Best parameters for RandomForestClassifier with PCA: {'model__n_estimators': 200, 'model__min_samples_split': 2, 'model__min_samples_leaf': 1, 'model__max_depth': 30}
Best CV score for RandomForestClassifier with PCA: 0.4933
Best parameters for SVC with PCA: {'model__kernel': 'rbf', 'model__gamma': 'scale', 'model__C': 10.0}
Best CV score for SVC with PCA: 0.4755
Best parameters for KNeighborsClassifier with PCA: {'model__weights': 'distance', 'model__n_neighbors': 9, 'model__metric': 'manhattan'}
Best CV score for KNeighborsClassifier with PCA: 0.4762
Best parameters for GradientBoostingClassifier with PCA: {'model__n_estimators': 50, 'model__max_depth': 5, 'model__learning_rate': 0.1}
Best CV score for GradientBoostingClassifier with PCA: 0.4799
Best parameters for DecisionTreeClassifier with PCA:

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Best parameters for XGBClassifier with PCA: {'model__n_estimators': 100, 'model__max_depth': 7, 'model__learning_rate': 0.1}
Best CV score for XGBClassifier with PCA: 0.4806
Best parameters for CatBoostClassifier with PCA: {'model__learning_rate': 0.2, 'model__iterations': 300, 'model__depth': 5}
Best CV score for CatBoostClassifier with PCA: 0.4881




Best parameters for AdaBoostClassifier with PCA: {'model__n_estimators': 200, 'model__learning_rate': 0.1}
Best CV score for AdaBoostClassifier with PCA: 0.4524
Classification report for LogisticRegression with PCA:
              precision    recall  f1-score   support

           0       0.63      0.34      0.45       125
           1       0.40      0.78      0.53       124
           2       0.23      0.07      0.11        87

    accuracy                           0.43       336
   macro avg       0.42      0.40      0.36       336
weighted avg       0.44      0.43      0.39       336


Classification report for RandomForestClassifier with PCA:
              precision    recall  f1-score   support

           0       0.54      0.45      0.49       125
           1       0.41      0.50      0.45       124
           2       0.56      0.51      0.53        87

    accuracy                           0.48       336
   macro avg       0.50      0.48      0.49       336
weighted avg     

DON'T TOUCH THE COMPUTER