In [None]:
# Colab environment setup: upgrade scikit-learn and install the extra packages
# this notebook relies on (openpyxl is the Excel writer engine used when saving
# results; imbalanced-learn provides SMOTE; xgboost provides XGBClassifier).
!pip install --upgrade scikit-learn
!pip install openpyxl imbalanced-learn xgboost --quiet
# Mount Google Drive so files under /content/drive/MyDrive are accessible.
from google.colab import drive
drive.mount('/content/drive')
# Input Excel workbook; it is loaded with pd.read_excel further down.
file_path = "/content/drive/MyDrive/best_mathbert_finetuned_embeddings.xlsx"

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from collections import defaultdict

# Models
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Load file
data = pd.read_excel(file_path)
# Keep the raw feature matrix here. Imputation and scaling are fitted inside
# each CV fold on the training split only; fitting them on the full dataset
# would leak test-fold statistics into training and inflate the scores.
X = data.drop(columns=['Class']).to_numpy()
y = data['Class']

# Define models
# NOTE: `use_label_encoder` was dropped from the XGBoost arguments; current
# XGBoost releases ignore it and emit a "Parameters ... are not used" warning
# on every fit.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=5, class_weight='balanced', random_state=42),
    "SVM": SVC(C=1, kernel='rbf', gamma='scale', probability=True, class_weight='balanced', random_state=42),
    "XGBoost": XGBClassifier(learning_rate=0.05, max_depth=8, n_estimators=150, subsample=0.8, colsample_bytree=0.8, eval_metric='mlogloss', random_state=42),
    "Logistic Regression": LogisticRegression(C=1.0, penalty='l2', solver='liblinear', class_weight='balanced', max_iter=1000, random_state=42),
    "Naive Bayes": GaussianNB(var_smoothing=1e-9),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_split=5, class_weight='balanced', random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=7, metric='minkowski', p=2),
    "MLP": MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', alpha=0.0001, max_iter=300, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
}

# Result containers
results_cv = {}
results_precision_recall = {}
per_class_report = defaultdict(dict)

# Stratified 10-fold CV
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Prepare every fold once. The split, SMOTE and the feature selector all use
# fixed seeds, so the prepared data is the same for every model; building it
# once avoids refitting the 850-tree selector for each of the models.
fold_data = []
for train_idx, test_idx in skf.split(X, y):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Imputation + scaling: fit on the training split, apply to both splits
    imputer = SimpleImputer(strategy='mean')
    scaler = StandardScaler()
    X_train = scaler.fit_transform(imputer.fit_transform(X_train))
    X_test = scaler.transform(imputer.transform(X_test))

    # SMOTE (training split only; the test split keeps its real distribution)
    sm = SMOTE(random_state=42)
    X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

    # Tree-based Feature Selection
    tree_selector = SelectFromModel(
        RandomForestClassifier(
            n_estimators=850,         # More trees = better averaging of importance
            max_depth=30,             # Deep (but still capped) trees for finer feature importance
            min_samples_split=7,      # A node needs at least 7 samples to be split
            max_features='sqrt',      # Common practice for RF, good with high-dim data
            class_weight='balanced',  # Adjust for imbalanced classes
            random_state=42,
            n_jobs=-1                 # Use all CPU cores
        ),
        threshold=0.0005,             # Keep features whose importance is >= 0.0005 (absolute cut-off)
        prefit=False
    )
    tree_selector.fit(X_resampled, y_resampled)

    fold_data.append((
        tree_selector.transform(X_resampled),
        y_resampled,
        tree_selector.transform(X_test),
        y_test,
    ))

# classification_report(output_dict=True) keys its per-class entries by str(label)
all_labels = [str(label) for label in y.unique()]
# (key in the classification report, suffix used in the output column name)
per_class_metrics = (('precision', 'Precision'), ('recall', 'Recall'), ('f1-score', 'F1'))

for model_name, model in models.items():
    print(f"\nEvaluating: {model_name}")
    accs, f1s, precs, recs = [], [], [], []
    classwise_reports = []

    for X_train_sel, y_train_sel, X_test_sel, y_test in fold_data:
        # Train and evaluate (fit() re-initialises the estimator on every call)
        model.fit(X_train_sel, y_train_sel)
        y_pred = model.predict(X_test_sel)

        accs.append(accuracy_score(y_test, y_pred))
        f1s.append(f1_score(y_test, y_pred, average='weighted', zero_division=0))
        precs.append(precision_score(y_test, y_pred, average='weighted', zero_division=0))
        recs.append(recall_score(y_test, y_pred, average='weighted', zero_division=0))

        report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        classwise_reports.append(report_dict)

    # Aggregate metrics
    results_cv[model_name] = {
        "Mean Accuracy": np.mean(accs),
        "Std Accuracy": np.std(accs),
        "Mean F1-Score": np.mean(f1s),
        "Std F1-Score": np.std(f1s),
    }

    results_precision_recall[model_name] = {
        "Mean Precision": np.mean(precs),
        "Std Precision": np.std(precs),
        "Mean Recall": np.mean(recs),
        "Std Recall": np.std(recs),
    }

    # Per-class averages (only over folds whose report contains the class)
    for label in all_labels:
        label_rows = [fold[label] for fold in classwise_reports if label in fold]
        for metric_key, column_suffix in per_class_metrics:
            per_class_report[model_name][f"Class {label} {column_suffix}"] = np.mean(
                [row[metric_key] for row in label_rows]
            )

# Final DataFrames: one row per model. Only the accuracy/F1 table is ranked,
# best mean F1-score first.
cv_df = pd.DataFrame(results_cv).transpose()
cv_df = cv_df.sort_values(by="Mean F1-Score", ascending=False)
prec_rec_df = pd.DataFrame(results_precision_recall).transpose()
per_class_df = pd.DataFrame(per_class_report).transpose()

# Save to Drive: each table goes to its own sheet of a single workbook
output_file = "/content/drive/MyDrive/fs_fine_mathbert_eval_results_tree(2).xlsx"
sheets = {
    'Accuracy_F1': cv_df,
    'Precision_Recall': prec_rec_df,
    'Per_Class_Report': per_class_df,
}
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name)

print(f"\nAll results saved to: {output_file}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Evaluating: Random Forest

Evaluating: SVM

Evaluating: XGBoost


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




Evaluating: Logistic Regression

Evaluating: Naive Bayes

Evaluating: Decision Tree

Evaluating: KNN

Evaluating: MLP

Evaluating: AdaBoost

✅ All results saved to: /content/drive/MyDrive/fs_fine_mathbert_eval_results_tree(2).xlsx
