In [15]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Load dataset from Excel file
def load_data(file_path, sheet_name=0):
    data = pd.read_excel(file_path, sheet_name=sheet_name)
    print(f"Data loaded successfully with shape: {data.shape}")
    return data

# Hyperparameter tuning for Perceptron
def tune_perceptron(X_train, y_train):
    perceptron = Perceptron()
    param_dist = {
        'penalty': ['l2', None],  # Valid options for regularization
        'eta0': np.logspace(-4, 1, 10),  # Learning rate values
        'max_iter': [1000, 2000, 3000]
    }
    random_search = RandomizedSearchCV(perceptron, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42)
    random_search.fit(X_train, y_train)
    return random_search

# Hyperparameter tuning for MLP
def tune_mlp(X_train, y_train):
    mlp = MLPClassifier()
    param_dist = {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['tanh', 'relu'],
        'solver': ['adam', 'sgd'],
        'alpha': np.logspace(-4, 1, 10),
        'learning_rate': ['constant', 'adaptive']
    }
    random_search = RandomizedSearchCV(mlp, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42)
    random_search.fit(X_train, y_train)
    return random_search

# Load data
data = pd.read_excel('training_mathbert 3.xlsx')
X = data.iloc[:, :-1].values  # Features
y = data.iloc[:, -1].values   # Target

# Ensure target labels are integers
y = y.astype(int)

# Check for missing values
print(f"Missing values in X: {np.isnan(X).sum()}")
print(f"Missing values in y: {np.isnan(y).sum()}")

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Debugging: Check shape and initial values
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_train:", X_train[:1000])
print("y_train:", y_train[:1000])

# Tune Perceptron
perceptron_search = tune_perceptron(X_train, y_train)
print(f"Best Perceptron Params: {perceptron_search.best_params_}")
print(f"Perceptron Test Score: {perceptron_search.score(X_test, y_test)}")




Missing values in X: 0
Missing values in y: 0
X_train shape: (900, 384)
y_train shape: (900,)
X_train: [[ 0.43065925  0.31165501 -1.17599463 ... -0.0034474  -1.24896036
  -0.46903949]
 [-0.42325022 -1.61802092  1.47134977 ... -0.83380362  2.60301726
   0.16690703]
 [ 0.41468977  0.41584676  0.75013947 ... -0.96415061 -0.31073781
  -0.14098914]
 ...
 [ 0.38083368  0.31547139  0.92295139 ... -0.6160675  -0.413935
   0.99635437]
 [ 0.54531332 -1.01878535 -0.41128906 ... -0.34846709  0.40775728
  -0.97471566]
 [-0.77235995 -0.54224716  0.24817542 ... -0.42302967  1.2995227
  -1.22055318]]
y_train: [4 3 3 3 3 2 3 4 3 5 2 5 3 4 4 4 4 3 3 2 2 2 2 2 0 0 3 5 4 2 2 2 3 2 5 3 2
 4 2 3 4 4 2 2 0 2 1 1 1 3 2 1 5 2 1 3 2 4 2 3 4 2 3 4 4 3 4 3 4 2 1 3 1 3
 4 4 2 3 2 2 2 3 4 1 2 3 3 3 2 4 2 2 3 3 4 1 3 3 4 3 4 4 0 2 2 2 4 2 2 4 3
 2 4 4 3 1 2 3 5 2 2 4 2 2 2 2 2 3 2 2 4 5 2 3 3 1 1 3 0 5 2 2 3 4 4 2 1 1
 2 1 5 3 5 4 4 3 3 2 3 0 2 4 3 1 1 4 4 2 3 3 1 2 4 2 3 2 3 2 4 3 3 1 3 3 2
 5 2 3 0 3 3 2 3 2 1 2 3

In [16]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


# Fit and evaluate a classifier
def evaluate_classifier(clf, X_train, y_train, X_test, y_test):
    """Fit the classifier and evaluate its performance."""
    print(f"Training {clf.__class__.__name__}...")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Calculate performance metrics
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred, average='weighted')
    }

    # Try to calculate ROC AUC if classifier supports predict_proba or decision_function
    if hasattr(clf, "predict_proba"):
        try:
            y_prob = clf.predict_proba(X_test)
            metrics['ROC AUC'] = roc_auc_score(y_test, y_prob, multi_class='ovr')
        except ValueError:
            metrics['ROC AUC'] = "N/A - issue with probabilities"
    else:
        metrics['ROC AUC'] = "N/A - no predict_proba"

    return metrics

# Compare different classifiers
def compare_classifiers(X_train, y_train, X_test, y_test):
    """Compare multiple classifiers and tabulate their results."""
    classifiers = {
        'SVM': SVC(probability=True),  # Ensure SVM has probability enabled
        'Decision Tree': DecisionTreeClassifier(),
        'Random Forest': RandomForestClassifier(),
        'AdaBoost': AdaBoostClassifier(),
        'Naive Bayes': GaussianNB(),
        'GradientBoosting': GradientBoostingClassifier()
    }

    # Evaluate each classifier and store results
    results = {}
    for name, clf in classifiers.items():
        results[name] = evaluate_classifier(clf, X_train, y_train, X_test, y_test)

    # Print the results
    for name, metrics in results.items():
        print(f"Classifier: {name}")
        for metric, value in metrics.items():
            print(f"{metric}: {value}")
        print("\n")

# Main function for A3 (Classifier Comparison)
# Load data
data = pd.read_excel('training_mathbert 3.xlsx')
X = data.iloc[:, :-1].values  # Features
y = data.iloc[:, -1].values  # Target

# Debugging: Check shapes and first few rows of data
print("X shape:", X.shape)
print("y shape:", y.shape)
print("X:", X[:1000])
print("y:", y[:1000])

# Ensure target labels are integers
y = y.astype(int)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Debugging: Check for missing values
print(f"Missing values in X_train: {np.isnan(X_train).sum()}")
print(f"Missing values in y_train: {np.isnan(y_train).sum()}")

# Compare classifiers
compare_classifiers(X_train, y_train, X_test, y_test)


X shape: (1126, 384)
y shape: (1126,)
X: [[-0.08992592  0.34387389  0.17638168 ...  0.10559644  0.1964383
   0.11719853]
 [ 0.30326107  0.08492954  0.04736905 ... -0.02969474  0.33597666
  -0.19753899]
 [-0.27429068  0.21680066  0.02911038 ... -0.08576754  0.5129559
   0.02333383]
 ...
 [-0.29696792 -0.20464683  0.07746858 ...  0.11653966  0.19947569
  -0.09766734]
 [-0.07764415  0.16842389 -0.18984014 ... -0.16399319  0.06891123
   0.0273101 ]
 [-0.22067161  0.30338642  0.34436339 ...  0.18375197  0.11251207
  -0.08536078]]
y: [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.25       0.25
 0.375      0.5        0.5        0.5        0.5        0.5
 0.5        0.5        0.5        0.5        0.5        0.5
 0.5        0.5        0.5        0.5        0.5        0.5
 0.5        0.5        0.625      0.75       0.75       0.75
 0.75       0.875      0.875      1.         1.         1.
 1.         1.         1.         1.         1



Training GaussianNB...
Training GradientBoostingClassifier...
Classifier: SVM
Accuracy: 0.40707964601769914
F1 Score: 0.3753058367758543
ROC AUC: 0.7877274390467807


Classifier: Decision Tree
Accuracy: 0.36283185840707965
F1 Score: 0.3599368564191973
ROC AUC: 0.5766398781435266


Classifier: Random Forest
Accuracy: 0.45132743362831856
F1 Score: 0.43971866521945435
ROC AUC: 0.7596088258615284


Classifier: AdaBoost
Accuracy: 0.28761061946902655
F1 Score: 0.28114902316435814
ROC AUC: 0.573377318573513


Classifier: Naive Bayes
Accuracy: 0.35398230088495575
F1 Score: 0.36538420243380343
ROC AUC: 0.6601524990452755


Classifier: GradientBoosting
Accuracy: 0.4557522123893805
F1 Score: 0.44370854888729433
ROC AUC: 0.7087535847701139


