In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings

# Suppress warnings for clean output
warnings.filterwarnings("ignore")

# Load dataset
file_path = "/content/training_with_glove embeddings_split.xlsx"  # Replace with your dataset path
data = pd.read_excel(file_path)

# Preprocessing
X = data.drop(["input", "Class"], axis=1)  # Drop 'input' and target
y = data["Class"]

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Apply SMOTE to balance the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Check the class distribution after SMOTE
balanced_class_distribution = Counter(y_train_smote)
print("Class distribution after SMOTE:", balanced_class_distribution)

# Define 2 models
models = {
    "ExtraTreesClassifier": ExtraTreesClassifier(),
    "MLPClassifier": MLPClassifier(),
}

# Define hyperparameter grids for each model
extended_param_grids = {
    "ExtraTreesClassifier": {
        "n_estimators": [100, 200, 500],
        "max_depth": [None, 10, 20],
        "max_features": ["sqrt", "log2"],
        "min_samples_split": [2, 5, 10],
    },
    "MLPClassifier": {
        "hidden_layer_sizes": [(50,), (100,), (50, 50)],
        "activation": ["relu", "tanh", "logistic"],
        "solver": ["adam", "sgd"],
        "learning_rate_init": [0.001, 0.01, 0.1],
        "max_iter": [200, 500, 1000],
    },
}

# Store results
results = []

# Train, optimize, and evaluate each model
for model_name, model in models.items():
    print(f"Optimizing {model_name}...")

    # Use extended hyperparameters if available
    if extended_param_grids.get(model_name):
        random_search = RandomizedSearchCV(
            model,
            param_distributions=extended_param_grids[model_name],
            n_iter=20,  # Number of parameter settings to sample
            cv=5,
            scoring="accuracy",
            random_state=42,
            n_jobs=-1,
        )
        random_search.fit(X_train_smote, y_train_smote)
        best_model = random_search.best_estimator_
    else:
        best_model = model
        best_model.fit(X_train_smote, y_train_smote)

    # Cross-validation
    cv_scores = cross_val_score(best_model, X_train_smote, y_train_smote, cv=5, scoring="accuracy")
    train_mean_accuracy = np.mean(cv_scores)
    train_std_dev = np.std(cv_scores)

    # Test set evaluation
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")

    print(f"Model: {model_name}")
    print(f"Train Mean Accuracy: {train_mean_accuracy:.4f}, Train Std Dev: {train_std_dev:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}\n")

    # Store results
    results.append({
        "Model": model_name,
        "Train Mean Accuracy": train_mean_accuracy,
        "Train Std Dev": train_std_dev,
        "Test Accuracy": test_accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
    })

# Convert results to a DataFrame for easy comparison
results_df = pd.DataFrame(results)

# Save results to Excel
results_df.to_excel("optimized_model_results.xlsx", index=False)

# Print final results
print("Final Model Evaluation Results:")
print(results_df)


Class distribution after SMOTE: Counter({0: 539, 1: 539, 2: 539})
Optimizing ExtraTreesClassifier...
Model: ExtraTreesClassifier
Train Mean Accuracy: 0.6074, Train Std Dev: 0.0661
Test Accuracy: 0.5357, Precision: 0.5532, Recall: 0.5357, F1 Score: 0.5385

Optimizing MLPClassifier...
Model: MLPClassifier
Train Mean Accuracy: 0.6092, Train Std Dev: 0.0691
Test Accuracy: 0.5000, Precision: 0.4988, Recall: 0.5000, F1 Score: 0.4993

Final Model Evaluation Results:
                  Model  Train Mean Accuracy  Train Std Dev  Test Accuracy  \
0  ExtraTreesClassifier             0.607373       0.066061       0.535714   
1         MLPClassifier             0.609229       0.069114       0.500000   

   Precision    Recall  F1 Score  
0   0.553219  0.535714  0.538510  
1   0.498784  0.500000  0.499301  


In [None]:
!pip install lime


In [None]:
pip install scikit-learn imbalanced-learn
