In [6]:
!pip install catboost




In [7]:
pip install pandas numpy scikit-learn xgboost catboost shap matplotlib




In [8]:
!pip install lime




In [9]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings

# Suppress warnings for clean output
warnings.filterwarnings("ignore")


In [10]:
# Load dataset
file_path = "/content/training_with_glove embeddings_split.xlsx"  # Replace with your dataset path
data = pd.read_excel(file_path)

# Preprocessing
X = data.drop(["input", "Class"], axis=1)  # Drop 'input' and target
y = data["Class"]

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [11]:
# Apply SMOTE to balance the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Check the class distribution after SMOTE
balanced_class_distribution = Counter(y_train_smote)
print("Class distribution after SMOTE:", balanced_class_distribution)


Class distribution after SMOTE: Counter({0: 539, 1: 539, 2: 539})


In [12]:
# Logistic Regression Model
from sklearn.linear_model import LogisticRegression

logistic_regression_model = LogisticRegression(max_iter=1000)

logistic_regression_grid = {
    "C": [0.1, 1, 10],
    "solver": ["liblinear", "lbfgs"]
}


In [13]:
# Random Forest Model
from sklearn.ensemble import RandomForestClassifier

random_forest_model = RandomForestClassifier()

random_forest_grid = {
    "n_estimators": [100, 200, 500, 1000],
    "max_depth": [None, 10, 20, 50],
    "max_features": ["sqrt", "log2", None],
    "min_samples_split": [2, 5, 10]
}


In [14]:
# Support Vector Classifier (SVC) Model
from sklearn.svm import SVC

svc_model = SVC(probability=True)

svc_grid = {
    "C": [0.1, 1, 10, 100],
    "kernel": ["linear", "rbf"],
    "gamma": ["scale", "auto"]
}


In [15]:
# K-Nearest Neighbors Model
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()

knn_grid = {
    "n_neighbors": [3, 5, 7, 10],
    "weights": ["uniform", "distance"],
    "p": [1, 2]
}


In [16]:
# Gradient Boosting Classifier Model
from sklearn.ensemble import GradientBoostingClassifier

gradient_boosting_model = GradientBoostingClassifier()

gradient_boosting_grid = {
    "n_estimators": [100, 200, 500],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [3, 5, 10],
    "subsample": [0.8, 1.0]
}


In [17]:
# Decision Tree Model
from sklearn.tree import DecisionTreeClassifier

decision_tree_model = DecisionTreeClassifier()

decision_tree_grid = {
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5]
}


In [18]:
# XGBoost Classifier Model
from xgboost import XGBClassifier

xgboost_model = XGBClassifier(eval_metric="logloss", use_label_encoder=False)

xgboost_grid = {
    "n_estimators": [100, 200, 500],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [3, 5, 10],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
}


In [19]:
# AdaBoost Classifier Model
from sklearn.ensemble import AdaBoostClassifier

adaboost_model = AdaBoostClassifier()

adaboost_grid = {
    "n_estimators": [50, 100],
    "learning_rate": [0.5, 1]
}


In [20]:
# Gaussian Naive Bayes Model
from sklearn.naive_bayes import GaussianNB

gaussian_nb_model = GaussianNB()

gaussian_nb_grid = {}  # No hyperparameters to tune


In [21]:
# CatBoost Classifier Model
from catboost import CatBoostClassifier

catboost_model = CatBoostClassifier(verbose=0)

catboost_grid = {
    "depth": [4, 6, 8, 10],
    "learning_rate": [0.01, 0.03, 0.1],
    "iterations": [100, 200, 500]
}


In [None]:
# Store results
results = []

# Train, optimize, and evaluate each model
for model_name, model in models.items():
    print(f"Optimizing {model_name}...")

    # Use extended hyperparameters if available
    if extended_param_grids.get(model_name):
        random_search = RandomizedSearchCV(
            model,
            param_distributions=extended_param_grids[model_name],
            n_iter=20,  # Number of parameter settings to sample
            cv=5,
            scoring="accuracy",
            random_state=42,
            n_jobs=-1,
        )
        random_search.fit(X_train_smote, y_train_smote)
        best_model = random_search.best_estimator_
    else:
        best_model = model
        best_model.fit(X_train_smote, y_train_smote)

    # Cross-validation
    cv_scores = cross_val_score(best_model, X_train_smote, y_train_smote, cv=5, scoring="accuracy")
    train_mean_accuracy = np.mean(cv_scores)
    train_std_dev = np.std(cv_scores)

    # Test set evaluation
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")

    print(f"Model: {model_name}")
    print(f"Train Mean Accuracy: {train_mean_accuracy:.4f}, Train Std Dev: {train_std_dev:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}\n")

    # Store results
    results.append({
        "Model": model_name,
        "Train Mean Accuracy": train_mean_accuracy,
        "Train Std Dev": train_std_dev,
        "Test Accuracy": test_accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
    })


Optimizing LogisticRegression...
Model: LogisticRegression
Train Mean Accuracy: 0.5628, Train Std Dev: 0.0518
Test Accuracy: 0.4702, Precision: 0.4756, Recall: 0.4702, F1 Score: 0.4719

Optimizing RandomForestClassifier...
Model: RandomForestClassifier
Train Mean Accuracy: 0.5919, Train Std Dev: 0.0579
Test Accuracy: 0.4940, Precision: 0.4980, Recall: 0.4940, F1 Score: 0.4952

Optimizing SVC...
Model: SVC
Train Mean Accuracy: 0.6303, Train Std Dev: 0.0689
Test Accuracy: 0.5179, Precision: 0.5260, Recall: 0.5179, F1 Score: 0.5199

Optimizing KNeighborsClassifier...
Model: KNeighborsClassifier
Train Mean Accuracy: 0.5808, Train Std Dev: 0.0631
Test Accuracy: 0.5149, Precision: 0.5229, Recall: 0.5149, F1 Score: 0.5154

Optimizing GradientBoostingClassifier...


In [None]:
# Convert results to a DataFrame for easy comparison
results_df = pd.DataFrame(results)

# Save results to Excel
results_df.to_excel("optimized_model_results.xlsx", index=False)

# Print final results
print("Final Model Evaluation Results:")
print(results_df)
