In [1]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [2]:
pip install pandas numpy scikit-learn xgboost catboost shap matplotlib




In [3]:
!pip install lime


Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m266.2/275.7 kB[0m [31m7.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=aed95b93319db597ce7829697f4f171d7a71ddd04c8902fb09e7bd8d36707dba
  Stored in directory: /root/.cache/pip/wheels/fd/a2/af/9ac0a1a85a27f314a06b39e1f492bee1547d52549a4606ed89
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, HistGradientBoostingClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings

# Suppress warnings for clean output
warnings.filterwarnings("ignore")

# Load dataset
file_path = "/content/training_with_glove embeddings_split.xlsx"  # Replace with your dataset path
data = pd.read_excel(file_path)

# Preprocessing
X = data.drop(["input", "Class"], axis=1)  # Drop 'input' and target
y = data["Class"]

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Apply SMOTE to balance the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Check the class distribution after SMOTE
balanced_class_distribution = Counter(y_train_smote)
print("Class distribution after SMOTE:", balanced_class_distribution)

# Define base learners for StackingClassifier
base_learners = [
    ('decision_tree', DecisionTreeClassifier(max_depth=5)),
    ('svc', SVC(probability=True)),
]

# Define the meta-model
stacking_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(max_iter=1000)
)

# Define 6 models
models = {
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "StackingClassifier": stacking_model,
    "AdaBoostClassifier": AdaBoostClassifier(),
    "GaussianNB": GaussianNB(),
    "CatBoostClassifier": CatBoostClassifier(verbose=0),
    "HistGradientBoostingClassifier": HistGradientBoostingClassifier(),
}

# Define hyperparameter grids for each model
extended_param_grids = {
    "DecisionTreeClassifier": {"max_depth": [None, 10, 20], "min_samples_split": [2, 5]},
    "StackingClassifier": {},  # No hyperparameters for stacking as defined here
    "AdaBoostClassifier": {"n_estimators": [50, 100], "learning_rate": [0.5, 1]},
    "GaussianNB": {},  # No hyperparameters to tune
    "CatBoostClassifier": {"depth": [4, 6, 8, 10], "learning_rate": [0.01, 0.03, 0.1], "iterations": [100, 200, 500]},
    "HistGradientBoostingClassifier": {
        "learning_rate": [0.01, 0.05, 0.1],
        "max_iter": [100, 200, 500],
        "max_depth": [None, 10, 20],
        "l2_regularization": [0.0, 0.1, 1.0],
    },
}

# Store results
results = []

# Train, optimize, and evaluate each model
for model_name, model in models.items():
    print(f"Optimizing {model_name}...")

    # Use extended hyperparameters if available
    if extended_param_grids.get(model_name):
        random_search = RandomizedSearchCV(
            model,
            param_distributions=extended_param_grids[model_name],
            n_iter=20,  # Number of parameter settings to sample
            cv=5,
            scoring="accuracy",
            random_state=42,
            n_jobs=-1,
        )
        random_search.fit(X_train_smote, y_train_smote)
        best_model = random_search.best_estimator_
    else:
        best_model = model
        best_model.fit(X_train_smote, y_train_smote)

    # Cross-validation
    cv_scores = cross_val_score(best_model, X_train_smote, y_train_smote, cv=5, scoring="accuracy")
    train_mean_accuracy = np.mean(cv_scores)
    train_std_dev = np.std(cv_scores)

    # Test set evaluation
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")

    print(f"Model: {model_name}")
    print(f"Train Mean Accuracy: {train_mean_accuracy:.4f}, Train Std Dev: {train_std_dev:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}\n")

    # Store results
    results.append({
        "Model": model_name,
        "Train Mean Accuracy": train_mean_accuracy,
        "Train Std Dev": train_std_dev,
        "Test Accuracy": test_accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
    })

# Convert results to a DataFrame for easy comparison
results_df = pd.DataFrame(results)

# Save results to Excel
results_df.to_excel("optimized_model_results.xlsx", index=False)

# Print final results
print("Final Model Evaluation Results:")
print(results_df)


Class distribution after SMOTE: Counter({0: 539, 1: 539, 2: 539})
Optimizing DecisionTreeClassifier...
Model: DecisionTreeClassifier
Train Mean Accuracy: 0.5443, Train Std Dev: 0.0347
Test Accuracy: 0.4821, Precision: 0.4832, Recall: 0.4821, F1 Score: 0.4825

Optimizing StackingClassifier...
Model: StackingClassifier
Train Mean Accuracy: 0.5944, Train Std Dev: 0.0497
Test Accuracy: 0.5268, Precision: 0.5277, Recall: 0.5268, F1 Score: 0.5271

Optimizing AdaBoostClassifier...
Model: AdaBoostClassifier
Train Mean Accuracy: 0.5486, Train Std Dev: 0.0503
Test Accuracy: 0.4940, Precision: 0.5040, Recall: 0.4940, F1 Score: 0.4953

Optimizing GaussianNB...
Model: GaussianNB
Train Mean Accuracy: 0.4596, Train Std Dev: 0.0528
Test Accuracy: 0.4077, Precision: 0.4386, Recall: 0.4077, F1 Score: 0.4054

Optimizing CatBoostClassifier...


In [1]:
pip install -U scikit-learn

