<a href="https://colab.research.google.com/github/washwin/RamSpec_Bacterial_Classification/blob/main/classical_machine_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [None]:
# Mount Google Drive so files under MyDrive can be read from this Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Read the preprocessed dataset from the mounted Drive.
# NOTE(review): an earlier variant read '/content/drive/MyDrive/preprocessed_ds.csv'
# and dropped only 'label'; presumably that file has no 'Class' column -- confirm
# before switching back.
df = pd.read_csv('/content/drive/MyDrive/preprocessed_ds used.csv')

# 'label' is the prediction target; every column except 'label' and 'Class' is a feature.
labels = df['label']
features = df.drop(columns=['label', 'Class'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Function to compute metrics
def compute_metrics(y_true, y_pred, model_name):
    """Score one model's predictions, print the result, and return it as a row.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
        model_name: Display name stored as the first element of the row.

    Returns:
        list: ``[model_name, accuracy, precision, recall, f1]`` where precision,
        recall and F1 use ``average='weighted'``.
    """
    # zero_division=0 yields the same 0.0 score scikit-learn's default ("warn")
    # produces for a class with no predicted (or no true) samples, but without
    # emitting an UndefinedMetricWarning for every weak model.
    weighted = {'average': 'weighted', 'zero_division': 0}
    row = [
        model_name,
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, **weighted),
        recall_score(y_true, y_pred, **weighted),
        f1_score(y_true, y_pred, **weighted),
    ]
    print(row)
    return row


# One row per evaluated model is appended here by the model cells.
metrics = []

In [None]:
# Shared settings reused by the model cells
n_estimators: int = 100   # number of estimators for Gradient Boosting and AdaBoost
max_iter: int = 2000      # iteration cap handed to MLPClassifier
random_state: int = 42    # seed handed to the estimators and the randomized search

In [None]:
# 1. Random Forest, with hyperparameters picked by a randomized search
from sklearn.model_selection import RandomizedSearchCV

# Candidate values the search samples from.
param_dist = dict(
    n_estimators=[50, 100],
    max_depth=[10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 'log2'],
)

# 20 sampled configurations, each scored by 5-fold cross-validated accuracy,
# evaluated in parallel on all available CPU cores.
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=random_state),
    param_dist,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    random_state=random_state,
)
random_search.fit(X_train, y_train)

# Score the best configuration found on the held-out test set.
best_rf = random_search.best_estimator_
rf_pred = best_rf.predict(X_test)
metrics.append(compute_metrics(y_test, rf_pred, 'Random Forest'))

In [None]:
# 2. Gradient Boosting
gb_clf = GradientBoostingClassifier(
    random_state=random_state,
    n_estimators=n_estimators,
).fit(X_train, y_train)  # fit() returns the fitted estimator itself

# Evaluate on the held-out test set and record the row.
gb_pred = gb_clf.predict(X_test)
gb_row = compute_metrics(y_test, gb_pred, 'Gradient Boosting')
metrics.append(gb_row)

['Gradient Boosting', 0.7759562841530054, 0.7884021985747613, 0.7759562841530054, 0.7771711627634387]


In [None]:
# 3. AdaBoost
ada_clf = AdaBoostClassifier(
    random_state=random_state,
    n_estimators=n_estimators,
).fit(X_train, y_train)  # fit() returns the fitted estimator itself

# Evaluate on the held-out test set and record the row.
ada_pred = ada_clf.predict(X_test)
ada_row = compute_metrics(y_test, ada_pred, 'AdaBoost')
metrics.append(ada_row)

['AdaBoost', 0.3005464480874317, 0.419008154314032, 0.3005464480874317, 0.28759171384398224]


In [None]:
# 4. Voting Classifier (Hard Voting) over a decision tree, an MLP and an RBF-kernel SVM
dt = DecisionTreeClassifier(random_state=random_state)
nn = MLPClassifier(
    hidden_layer_sizes=(64,),
    random_state=random_state,
    max_iter=max_iter,
)
svm = SVC(random_state=random_state, kernel='rbf')

base_estimators = [('dt', dt), ('nn', nn), ('svm', svm)]
voting_clf = VotingClassifier(estimators=base_estimators, voting='hard')
voting_clf.fit(X_train, y_train)

# Evaluate the ensemble on the held-out test set and record the row.
voting_pred = voting_clf.predict(X_test)
metrics.append(compute_metrics(y_test, voting_pred, 'Voting Classifier'))

['Voting Classifier', 0.7540983606557377, 0.8139391825257116, 0.7540983606557377, 0.7526929657644327]


In [None]:
# 5. SVM: RBF kernel with C and gamma tuned by an exhaustive grid search
from sklearn.model_selection import GridSearchCV

# 4 x 4 grid of (C, gamma) values, RBF kernel only.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf'],
)
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5, refit=True)
grid.fit(X_train, y_train)

# With refit=True the best configuration is retrained on the full training set.
svm_clf = grid.best_estimator_
svm_pred = svm_clf.predict(X_test)
metrics.append(compute_metrics(y_test, svm_pred, 'SVM'))

['SVM', 0.9398907103825137, 0.9470041369752072, 0.9398907103825137, 0.9407548385801341]


In [None]:
import pandas as pd
import plotly.graph_objects as go

# Hand-curated scores for the final comparison chart. They live under their own
# name so the `metrics` list accumulated by the model cells is not overwritten.
# NOTE(review): the CNN and NN-Tree Ensemble rows are not computed in this
# notebook; presumably they come from separate experiments -- confirm the source.
summary_rows = [
    ['RF', 0.7650, 0.7740, 0.7650, 0.7645],
    ['SVM', 0.9399, 0.9470, 0.9399, 0.9408],
    ['CNN', 0.94, 0.94, 0.94, 0.94],
    ['NN-Tree Ensemble', 0.9609, 0.97, 0.96, 0.96]
]

# One row per model, indexed by model name, one column per metric.
metrics_df = pd.DataFrame(
    summary_rows,
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
).set_index('Model')

# Create the Plotly figure
fig = go.Figure()

# One color per metric, in column order
colors = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA"]

# Add a grouped bar trace for each metric, labelled with the exact values
for color, metric in zip(colors, metrics_df.columns):
    scores = metrics_df[metric]
    fig.add_trace(go.Bar(
        x=metrics_df.index,
        y=scores,
        name=metric,
        marker=dict(color=color),
        width=0.2,
        text=[f"{val:.4f}" for val in scores],  # Exact values
        textposition="outside"  # Position labels above bars
    ))

# Models to call out with an arrow, mapped to the short label used in the callout.
highlight_labels = {
    "SVM": "SVM",
    "CNN": "CNN",
    "NN-Tree Ensemble": "NN",
}

# Build the callouts from the table so the quoted percentage always matches the data.
annotations = []
for model, label in highlight_labels.items():
    accuracy = metrics_df.loc[model, "Accuracy"]
    annotations.append(dict(
        x=model,
        y=accuracy + 0.005,  # Slightly above the bar
        text=f"{label} (≈{accuracy:.0%})",
        showarrow=True, arrowhead=2, ax=0, ay=-30
    ))

# Update layout
fig.update_layout(
    title="Model Performance Metrics",
    xaxis_title="Model",
    yaxis_title="Score",
    barmode="group",
    bargap=0.3,
    bargroupgap=0.15,
    legend_title="Metrics",
    template="plotly",
    annotations=annotations  # Add annotations
)

# Show the plot
fig.show()
