### Student information:
- Name: Vineet Kumar
- Roll No.: 2024AC05100
- Assignment-1: Video Classification

### Objective 1: (Classical Models)
Perform comparative analysis of classical machine learning models for video action classification. 

The goal is to evaluate and contrast different approaches in terms of predictive performance, computational efficiency, and interpretability.

Classical Models compared:
- Support Vector Machine (SVM)
- Random Forest
- k-Nearest Neighbors (k-NN)
- Logistic Regression
- Gradient Boosting



### Environment Setup

> Installing required modules

In [None]:
import sys
import subprocess

def install(pkg):
    """Install *pkg* with pip, using the interpreter that runs this notebook.

    Raises:
        subprocess.CalledProcessError: if the pip command exits non-zero.
    """
    pip_command = [sys.executable, "-m", "pip", "install", pkg]
    subprocess.check_call(pip_command)


# pip distribution names this notebook needs.
packages = ["pandas", "umap-learn"]

for package_name in packages:
    # The importable module name is taken to be the part of the pip name
    # before the first hyphen (e.g. "umap-learn" -> "umap").
    module_name = package_name.partition("-")[0]
    try:
        __import__(module_name)
    except ImportError:
        print("Installing package:", package_name)
        install(package_name)

> Importing modules

In [None]:
import numpy as np;
import pandas as pd;
import matplotlib.pyplot as plt;
import os;
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc;
from sklearn.preprocessing import label_binarize;
from sklearn.metrics import ConfusionMatrixDisplay;

from sklearn.model_selection import train_test_split;
from sklearn.base import clone;
import joblib;
from tqdm import tqdm

from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
import umap
import warnings
import tracemalloc





Additional configuration for cleaner output and reproducible results

In [None]:
# Hide warnings of category UserWarning so cell output stays readable.
warnings.filterwarnings(action="ignore", category=UserWarning)

# Fix the seed of NumPy's global random generator.
np.random.seed(seed=42)

Creating required directories

In [None]:
# Folders this notebook writes to or reads models from; created up front
# (existing folders are left untouched).
for output_dir in (
    "../results/performance_plots",
    "../results/saved_models",
    "../results/feature_visualizations/tsne_umap/",
):
    os.makedirs(output_dir, exist_ok=True)

> Load saved results from Part-A (classical models)

In [None]:
# ===================================================
# Load saved results from Part-A (classical models)
# ===================================================

# Test labels stored next to the saved predictions. A second copy is loaded
# with the feature matrices below; the two are cross-checked after loading.
y_test_saved = np.load("../results/y_test.npy")

# Test-set predictions, one array per model.
svm_test_pred = np.load("../results/svm_test_pred.npy")
rf_test_pred = np.load("../results/rf_test_pred.npy")
knn_test_pred = np.load("../results/knn_test_pred.npy")
logreg_test_pred = np.load("../results/logreg_test_pred.npy")
gb_test_pred = np.load("../results/gb_test_pred.npy")

# Training times (reported in seconds further down).
svm_train_time = np.load("../results/svm_train_time.npy")
rf_train_time = np.load("../results/rf_train_time.npy")
knn_train_time = np.load("../results/knn_train_time.npy")
logreg_train_time = np.load("../results/logreg_train_time.npy")
gb_train_time = np.load("../results/gb_train_time.npy")

# Total test-set prediction times (later divided by the number of test videos).
svm_test_time = np.load("../results/svm_test_time.npy")
rf_test_time = np.load("../results/rf_test_time.npy")
knn_test_time = np.load("../results/knn_test_time.npy")
logreg_test_time = np.load("../results/logreg_test_time.npy")
gb_test_time = np.load("../results/gb_test_time.npy")

# Saved accuracy values.
svm_acc = np.load("../results/svm_accuracy.npy")
rf_acc = np.load("../results/rf_accuracy.npy")
knn_acc = np.load("../results/knn_accuracy.npy")
logreg_acc = np.load("../results/logreg_accuracy.npy")
gb_acc = np.load("../results/gb_accuracy.npy")

# Saved F1 values.
svm_f1 = np.load("../results/svm_f1.npy")
rf_f1 = np.load("../results/rf_f1.npy")
knn_f1 = np.load("../results/knn_f1.npy")
logreg_f1 = np.load("../results/logreg_f1.npy")
gb_f1 = np.load("../results/gb_f1.npy")

# Fitted estimators persisted with joblib.
best_svm = joblib.load("../results/saved_models/svm_trained_model.joblib")
best_rf = joblib.load("../results/saved_models/rf_trained_model.joblib")
best_knn = joblib.load("../results/saved_models/knn_trained_model.joblib")
logreg = joblib.load("../results/saved_models/logreg_trained_model.joblib")
gb = joblib.load("../results/saved_models/gb_trained_model.joblib")

# Feature matrices and labels for the train / validation / test splits.
X_train = np.load("../results/saved_feature_matrices/X_train.npy")
X_val = np.load("../results/saved_feature_matrices/X_val.npy")
X_test = np.load("../results/saved_feature_matrices/X_test.npy")

y_train = np.load("../results/saved_feature_matrices/y_train.npy")
y_val = np.load("../results/saved_feature_matrices/y_val.npy")
y_test = np.load("../results/saved_feature_matrices/y_test.npy")

# The test labels live in two files. The saved predictions are scored against
# `y_test`, so a mismatch between the two copies would silently corrupt every
# metric below -- fail loudly instead of just keeping the last one loaded.
if not np.array_equal(y_test_saved, y_test):
    raise ValueError(
        "Test labels in ../results/y_test.npy and "
        "../results/saved_feature_matrices/y_test.npy differ; "
        "the saved predictions cannot be scored reliably."
    )

# Test feature matrix used when predicting with the Random Forest model.
X_test_rf = np.load("../results/X_test_rf.npy")

### Classical models: Evaluation Metrics Comparison

1. Classical models: Highlighting the best model

In [None]:
# Display names and the matching saved test-set predictions (same order).
models = ["SVM", "Random Forest", "k-NN", "Logistic Regression", "Gradient Boosting"]
preds = [svm_test_pred, rf_test_pred, knn_test_pred, logreg_test_pred, gb_test_pred]

# One score per model; precision, recall and F1 are macro-averaged.
acc_list = [accuracy_score(y_test, y_hat) for y_hat in preds]
prec_list = [precision_score(y_test, y_hat, average="macro") for y_hat in preds]
rec_list = [recall_score(y_test, y_hat, average="macro") for y_hat in preds]
f1_list = [f1_score(y_test, y_hat, average="macro") for y_hat in preds]

# Comparison table: one row per model, one column per metric.
metric_columns = {
    "Model": models,
    "Accuracy": acc_list,
    "Precision": prec_list,
    "Recall": rec_list,
    "F1-score": f1_list,
}
perf_df = pd.DataFrame(metric_columns)


perf_df

# Styler callback used to mark the best value of each metric column.
def highlight_best(s):
    """Return one CSS string per entry of *s*, flagging the maximum.

    Entries equal to ``s.max()`` get ``"background-color: red"``; every other
    entry gets an empty string. Ties are all highlighted.
    """
    top_value = s.max()
    return ["background-color: red" if value == top_value else "" for value in s]

# Apply the highlighter column by column to the metric columns only.
highlighted_columns = ["Accuracy", "Precision", "Recall", "F1-score"]
perf_df.style.apply(highlight_best, subset=highlighted_columns)

2. Classical models: Accuracy Comparison Plot

In [None]:
# Bar chart: accuracy of each model, saved to disk and shown inline.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(perf_df["Model"], perf_df["Accuracy"])
plt.setp(ax.get_xticklabels(), rotation=30)
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy Comparison of Classical Models")
fig.tight_layout()
fig.savefig("../results/performance_plots/accuracy_comparison.png", dpi=300)
plt.show()



3. Classical models: F1-Score Comparison Plot

In [None]:
# Bar chart: macro F1-score of each model, saved to disk and shown inline.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(perf_df["Model"], perf_df["F1-score"])
plt.setp(ax.get_xticklabels(), rotation=30)
ax.set_ylabel("F1-score")
ax.set_title("F1-score Comparison of Classical Models")
fig.tight_layout()
fig.savefig("../results/performance_plots/f1_score_comparison.png", dpi=300)
plt.show()


4. Classical models: Precision & Recall grouped plot

In [None]:
# Grouped bars: precision (left bar) and recall (right bar) for every model.
bar_positions = np.arange(len(perf_df))
bar_width = 0.35
half_width = bar_width / 2

fig, ax = plt.subplots(figsize=(9, 5))
ax.bar(bar_positions - half_width, perf_df["Precision"], bar_width, label="Precision")
ax.bar(bar_positions + half_width, perf_df["Recall"], bar_width, label="Recall")

# One tick per model, centred between its two bars.
ax.set_xticks(bar_positions)
ax.set_xticklabels(perf_df["Model"], rotation=30)
ax.set_ylabel("Score")
ax.set_title("Precision and Recall Comparison")
ax.legend()
fig.tight_layout()
fig.savefig("../results/performance_plots/precision_recall_comparison.png", dpi=300)
plt.show()


5. Classical models: ROC multiclass comparison

In [None]:


# Class labels present in the test set (the notebook expects 0, 1, 2).
classes = np.unique(y_test)
n_classes = len(classes)

# One-vs-rest indicator matrix of the true labels.
y_test_bin = label_binarize(y_test, classes=classes)

models = ["SVM", "Random Forest", "k-NN", "Logistic Regression", "Gradient Boosting"]
preds = [svm_test_pred, rf_test_pred, knn_test_pred, logreg_test_pred, gb_test_pred]

# Fitted estimators in the same order as `models`, each paired with the
# feature matrix it is scored on.
# NOTE(review): Random Forest uses X_test_rf because that is the matrix this
# notebook passes to it when predicting -- confirm it is row-aligned with y_test.
estimators = [best_svm, best_rf, best_knn, logreg, gb]
score_inputs = [X_test, X_test_rf, X_test, X_test, X_test]


def _class_scores(estimator, X, y_pred):
    """Return an (n_samples, n_classes) score matrix for ROC analysis.

    A ROC curve needs continuous scores: thresholding hard labels yields a
    single operating point per class, so the resulting "AUC" is not a real
    ROC AUC. Probabilities are preferred, then decision-function margins.
    If the estimator offers neither, or the output does not have one column
    per test-set class, the binarized hard predictions *y_pred* are used.
    """
    for method_name in ("predict_proba", "decision_function"):
        method = getattr(estimator, method_name, None)
        if method is None:
            continue
        scores = np.asarray(method(X))
        if scores.shape == y_test_bin.shape:
            return scores
    return label_binarize(y_pred, classes=classes)


os.makedirs("../results/performance_plots", exist_ok=True)

plt.figure(figsize=(9, 7))

for model_name, estimator, X_eval, y_pred in zip(models, estimators, score_inputs, preds):

    y_score = _class_scores(estimator, X_eval, y_pred)

    # One-vs-rest ROC curve per class.
    fpr = {}
    tpr = {}
    for i in range(n_classes):
        fpr[i], tpr[i] = roc_curve(y_test_bin[:, i], y_score[:, i])[:2]

    # Macro-average: interpolate every class curve onto the union of all
    # false-positive rates, then average the true-positive rates.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)

    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    mean_tpr /= n_classes
    macro_auc = auc(all_fpr, mean_tpr)

    plt.plot(all_fpr, mean_tpr, lw=2,
             label=f"{model_name} (AUC = {macro_auc:.3f})")

# Random guess line
plt.plot([0, 1], [0, 1], "k--", lw=1)

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Multi-class ROC Curve Comparison (Macro-average)")
plt.legend(loc="lower right")
plt.grid(True)
plt.tight_layout()
plt.savefig("../results/performance_plots/roc_auc_comparison.png", dpi=300)
plt.show()


6. Classical models: Confusion matrix comparison

In [None]:


# Display names and the matching saved test-set predictions (same order).
models = ["SVM", "Random Forest", "k-NN", "Logistic Regression", "Gradient Boosting"]
preds = [svm_test_pred, rf_test_pred, knn_test_pred, logreg_test_pred, gb_test_pred]

os.makedirs("../results/performance_plots", exist_ok=True)

# 2 x 3 grid flattened to a 1-D sequence of axes; five panels are used.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 9))
axes = axes.ravel()

for panel, name, y_pred in zip(axes, models, preds):
    ConfusionMatrixDisplay.from_predictions(
        y_test,
        y_pred,
        ax=panel,
        cmap="Blues",
        colorbar=False,
    )
    panel.set_title(name)

# Hide the sixth, unused panel.
axes[-1].axis("off")

fig.suptitle("Confusion Matrix Comparison – Classical Models", fontsize=16)
fig.tight_layout()
# Leave headroom for the figure-level title.
fig.subplots_adjust(top=0.92)

fig.savefig("../results/performance_plots/confusion_matrix_comparison.png", dpi=300)
plt.show()


### Classical models: Comparative Analysis

1. Classical models: Training time comparison

In [None]:
# ===================================================
# Computational Efficiency: Training Time Comparison
# ===================================================

# Display names and the loaded training times, in matching order.
models = ["SVM", "Random Forest", "k-NN", "Logistic Regression", "Gradient Boosting"]
train_times = [svm_train_time, rf_train_time, knn_train_time, logreg_train_time, gb_train_time]

# Table with one row per model.
train_time_columns = {
    "Model": models,
    "Training Time (seconds)": train_times,
}
train_time_df = pd.DataFrame(train_time_columns)

train_time_df

- Plot: Training time comparison (Classical Models)

In [None]:
# Bar chart: training time of each model, saved to disk and shown inline.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(models, train_times)
plt.setp(ax.get_xticklabels(), rotation=30)
ax.set_ylabel("Training Time (seconds)")
ax.set_title("Training Time Comparison of Classical Models")
fig.tight_layout()
fig.savefig("../results/performance_plots/training_time_comparison.png", dpi=300)
plt.show()


2. Classical models: Inference time per video

In [None]:
# ============================================
# Inference Time per Video (seconds)
# ============================================

# Number of test samples; each loaded test time is divided by this count.
num_test_videos = len(y_test)

total_test_times = (
    svm_test_time,
    rf_test_time,
    knn_test_time,
    logreg_test_time,
    gb_test_time,
)
inf_times = [total / num_test_videos for total in total_test_times]

models = ["SVM", "Random Forest", "k-NN", "Logistic Regression", "Gradient Boosting"]

# Table with one row per model.
inf_columns = {
    "Model": models,
    "Inference Time per Video (s)": inf_times,
}
inf_df = pd.DataFrame(inf_columns)

inf_df


- Plot: Inference time per video (Classical Models)

In [None]:
# ============================================
# Plot: Inference time per video
# ============================================

os.makedirs("../results/performance_plots", exist_ok=True)

# Bar chart: per-video inference time of each model.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(inf_df["Model"], inf_df["Inference Time per Video (s)"])
plt.setp(ax.get_xticklabels(), rotation=30)
ax.set_ylabel("Seconds per video")
ax.set_title("Inference Time per Video – Classical Models")
fig.tight_layout()
fig.savefig("../results/performance_plots/inference_time_comparison.png", dpi=300)
plt.show()


3. Classical models: Model Size Comparison

In [None]:
# ============================================
# Model Size (Disk footprint in MB)
# ============================================

# Display name -> path of the persisted model file.
model_files = {
    "SVM": "../results/saved_models/svm_trained_model.joblib",
    "Random Forest": "../results/saved_models/rf_trained_model.joblib",
    "k-NN": "../results/saved_models/knn_trained_model.joblib",
    "Logistic Regression": "../results/saved_models/logreg_trained_model.joblib",
    "Gradient Boosting": "../results/saved_models/gb_trained_model.joblib",
}

# [name, size in MB] rows; a missing file is recorded as NaN.
sizes = [
    [
        label,
        os.path.getsize(file_path) / (1024 * 1024)
        if os.path.exists(file_path)
        else np.nan,
    ]
    for label, file_path in model_files.items()
]

size_df = pd.DataFrame(sizes, columns=["Model", "Model Size (MB)"])
size_df


- Plot: Model size comparison (Classical Models)

In [None]:
# ============================================
# Plot: Model size comparison
# ============================================

os.makedirs("../results/performance_plots", exist_ok=True)

# Bar chart: on-disk size of each persisted model.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(size_df["Model"], size_df["Model Size (MB)"])
plt.setp(ax.get_xticklabels(), rotation=30)
ax.set_ylabel("Model size (MB)")
ax.set_title("Model Size Comparison – Classical Models")
fig.tight_layout()
fig.savefig("../results/performance_plots/model_size_comparison.png", dpi=300)
plt.show()


4. Classical models: Data Efficiency Analysis

In [None]:
# Fractions of the combined train+validation pool used to fit each model.
# A value of 1.0 (the whole pool) is also handled by the loop below.
fractions = [0.1, 0.25, 0.5, 0.75, 0.9]


# Pool the training and validation splits into one set to subsample from.
X_full = np.vstack([X_train, X_val])
y_full = np.hstack([y_train, y_val])


models = {
    "SVM": best_svm,
    "Random Forest": best_rf,
    "k-NN": best_knn,
    "Logistic Regression": logreg,
    "Gradient Boosting": gb,
}

# Learning curves: model name -> test accuracy at each fraction, in order.
learning_results = {name: [] for name in models}

for frac in fractions:

    # round() rather than int(): e.g. 0.57 * 100 is 56.99999999999999 in
    # floating point, which int() would truncate to 56.
    percent = round(frac * 100)
    print(f"\n========== Training with {percent}% data ==========")

    if frac >= 1.0:
        # train_test_split rejects a float train_size of 1.0, so the full
        # pool is used directly.
        X_sub, y_sub = X_full, y_full
    else:
        # Stratified subsample so every class keeps its share of the data.
        split = train_test_split(
            X_full, y_full,
            train_size=frac,
            stratify=y_full,
            random_state=42,
        )
        X_sub, y_sub = split[0], split[2]

    for name, model in tqdm(models.items(),
                            desc=f"Models @ {percent}%",
                            position=0,
                            leave=True):

        # Refit an unfitted copy with the same hyper-parameters so the
        # loaded, fully trained estimator is left untouched.
        temp_model = clone(model)
        temp_model.fit(X_sub, y_sub)

        # A dedicated name keeps the notebook-level `preds` list of saved
        # predictions from being overwritten by this array.
        subset_test_pred = temp_model.predict(X_test)
        acc = accuracy_score(y_test, subset_test_pred)

        learning_results[name].append(acc)



- Plot: Learning curves comparison (Classical Models)

In [None]:
# Learning curves: test accuracy versus share of training data, per model.

os.makedirs("../results/performance_plots", exist_ok=True)

fig, ax = plt.subplots(figsize=(9, 6))

# X positions: training fractions expressed as whole percentages.
percent_ticks = [int(f*100) for f in fractions]

for name, accs in learning_results.items():
    ax.plot(percent_ticks, accs, marker="o", label=name)

ax.set_xlabel("Training data size (%)")
ax.set_ylabel("Test Accuracy")
ax.set_title("Learning Curves – Data Efficiency Comparison")
ax.legend()
ax.grid(True)
fig.tight_layout()
fig.savefig("../results/performance_plots/learning_curves.png", dpi=300)
plt.show()





5. Classical models: Memory requirements

In [None]:
# [model name, peak traced memory in MB] rows, filled by the loop below.
memory_results = []

models = {
    "SVM": best_svm,
    "Random Forest": best_rf,
    "k-NN": best_knn,
    "Logistic Regression": logreg,
    "Gradient Boosting": gb,
}

for name, model in models.items():
    # Random Forest is given its own test matrix; every other model uses X_test.
    X_eval = X_test_rf if name == "Random Forest" else X_test

    # Start a fresh trace per model so each peak covers one predict call.
    tracemalloc.start()
    try:
        model.predict(X_eval)
        peak = tracemalloc.get_traced_memory()[1]
    finally:
        # Always stop tracing, even if predict() raises; otherwise tracing
        # would stay enabled for the rest of the session.
        tracemalloc.stop()

    memory_results.append([name, peak/(1024*1024)])  # bytes -> MB

memory_df = pd.DataFrame(memory_results, columns=["Model", "Peak Memory Usage (MB)"])
memory_df


- Plot: Memory requirements (Classical Models)

In [None]:
# Bar chart: peak memory traced during inference, per model.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(memory_df["Model"], memory_df["Peak Memory Usage (MB)"])
plt.setp(ax.get_xticklabels(), rotation=30)
ax.set_ylabel("Peak RAM Usage (MB)")
ax.set_title("Memory Requirement Comparison (Inference)")
fig.tight_layout()
fig.savefig("../results/performance_plots/memory_usage_comparison.png", dpi=300)
plt.show()


6. Classical models: t-SNE Visualization (Feature Analysis)

In [None]:


# -------- Normalize features (important for t-SNE) --------
# Standardize every feature of the test matrix before embedding.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_test)

# -------- Run t-SNE --------
# scikit-learn requires perplexity < n_samples, so the usual value of 30 is
# reduced when the test set has 30 or fewer samples (unchanged otherwise).
tsne_perplexity = min(30, X_scaled.shape[0] - 1)

tsne = TSNE(
    n_components=2,
    perplexity=tsne_perplexity,
    learning_rate=200,
    max_iter=1500,
    random_state=42,
)

X_tsne = tsne.fit_transform(X_scaled)

# -------- Plot --------
plt.figure(figsize=(8, 6))

# One scatter series per class label so each class gets its own legend entry.
for label in np.unique(y_test):
    idx = y_test == label
    plt.scatter(X_tsne[idx, 0], X_tsne[idx, 1], s=30, label=f"Class {label}")

plt.title("t-SNE visualization of handcrafted video features")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.legend()
plt.tight_layout()

os.makedirs("../results/feature_visualizations/tsne_umap", exist_ok=True)
plt.savefig("../results/feature_visualizations/tsne_umap/tsne_classical.png", dpi=300)
plt.show()
plt.close()


7. Classical models: UMAP Visualization (Feature Analysis)

In [None]:


# -------- Run UMAP --------
# 2-D embedding of the standardized test features (X_scaled).
umap_settings = {
    "n_neighbors": 15,
    "min_dist": 0.1,
    "n_components": 2,
    "random_state": 42,
}
umap_model = umap.UMAP(**umap_settings)

X_umap = umap_model.fit_transform(X_scaled)

# -------- Plot --------
fig, ax = plt.subplots(figsize=(8, 6))

# One scatter series per class label so each class gets its own legend entry.
for label in np.unique(y_test):
    idx = y_test == label
    ax.scatter(X_umap[idx, 0], X_umap[idx, 1], s=30, label=f"Class {label}")

ax.set_title("UMAP visualization of handcrafted video features")
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.legend()
fig.tight_layout()

fig.savefig("../results/feature_visualizations/tsne_umap/umap_classical.png", dpi=300)
plt.show()
plt.close()
