1. Recipe Reviews Dataset (SVM Pipeline)

In [10]:
import pandas as pd
import numpy as np
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 1: Load and preprocess the dataset
# NOTE(review): 0-star rows are kept and end up labelled negative. If 0 means
# "no rating given" in this dataset they should probably be excluded - confirm.
df = (
    pd.read_csv("Recipe Reviews and User Feedback Dataset.csv")
    # Drop rows that have no review text.
    .dropna(subset=["text"])
    # Keep ratings 0-2 and 4-5; any other value (e.g. 3 stars) is filtered out.
    .loc[lambda frame: frame["stars"].isin([0, 1, 2, 4, 5])]
    # Vectorised binary target: 1 for ratings of 4 or more, otherwise 0.
    # assign() returns a new frame, so no column is written into a filtered one.
    .assign(label=lambda frame: (frame["stars"] >= 4).astype(int))
)
texts = df["text"].values
labels = df["label"].values

# Step 2: Define evaluation function
def evaluate_svm(X, y, c_value, sample_ratio=1.0):
    """Fit a LinearSVC on a stratified 80/20 split and print its metrics.

    Args:
        X: Feature matrix with one row per sample; ``X.shape[1]`` is printed
            as the TF-IDF feature count.
        y: Class labels aligned with the rows of ``X``.
        c_value: Value passed to ``LinearSVC`` as ``C``.
        sample_ratio: When below 1.0, a stratified subsample of this fraction
            is drawn first, and both the hold-out split and the
            cross-validation run on that subsample.

    Returns:
        dict: ``accuracy``, ``f1_cv``, ``train_time``, ``confusion_matrix``
        and ``report`` for this run. The same values are printed, so callers
        may ignore the return value.
    """
    if sample_ratio < 1.0:
        X, _, y, _ = train_test_split(
            X, y, train_size=sample_ratio, stratify=y, random_state=42
        )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    # Seed the estimator as well as the splits so repeated runs are comparable.
    model = LinearSVC(C=c_value, max_iter=10000, random_state=42)

    # perf_counter() is the clock intended for measuring short durations.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - start_time

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # 5-fold CV over the whole (sub)sample, i.e. including the hold-out rows.
    f1_cv = cross_val_score(model, X, y, cv=5, scoring='f1_weighted').mean()
    # zero_division=0 reports 0.0 for classes that are never predicted instead
    # of emitting an undefined-metric warning for each of them.
    report = classification_report(y_test, y_pred, zero_division=0)
    matrix = confusion_matrix(y_test, y_pred)

    print(f"\n C={c_value}, Sample Ratio={sample_ratio}, TF-IDF={X.shape[1]} features")
    print(f"Accuracy: {acc:.4f}")
    print(f"CV F1 Score: {f1_cv:.4f}")
    print(f"Train Time: {train_time:.2f}s")
    print("\nConfusion Matrix:\n", matrix)
    print("\nClassification Report:\n", report)

    return {
        "accuracy": acc,
        "f1_cv": f1_cv,
        "train_time": train_time,
        "confusion_matrix": matrix,
        "report": report,
    }

# Step 3: Run experiments
# NOTE(review): the vectorizer is fitted on every review before evaluate_svm
# makes its train/test split, so hold-out documents contribute to the
# vocabulary and the IDF weights.
for tfidf_size in (3000, 5000):
    vectorizer = TfidfVectorizer(max_features=tfidf_size, stop_words="english")
    X_all = vectorizer.fit_transform(texts)

    # Flattened grid, same order as two nested loops: every sample ratio is
    # paired with each C value in turn.
    for sample_ratio, C in [
        (ratio, c_value) for ratio in (1.0, 0.5, 0.25) for c_value in (0.1, 1.0, 10.0)
    ]:
        evaluate_svm(X_all, labels, C, sample_ratio)



 C=0.1, Sample Ratio=1.0, TF-IDF=3000 features
Accuracy: 0.8796
CV F1 Score: 0.8347
Train Time: 0.04s

Confusion Matrix:
 [[  27  415]
 [  11 3085]]

Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.06      0.11       442
           1       0.88      1.00      0.94      3096

    accuracy                           0.88      3538
   macro avg       0.80      0.53      0.52      3538
weighted avg       0.86      0.88      0.83      3538


 C=1.0, Sample Ratio=1.0, TF-IDF=3000 features
Accuracy: 0.8827
CV F1 Score: 0.8514
Train Time: 0.07s

Confusion Matrix:
 [[  95  347]
 [  68 3028]]

Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.21      0.31       442
           1       0.90      0.98      0.94      3096

    accuracy                           0.88      3538
   macro avg       0.74      0.60      0.62      3538
weighted avg       0.86      0.88      0.86      3538

2. Mice Protein Expression Dataset (SVM Pipeline)

In [11]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Load and prepare the dataset
# Read the table, discard the columns that are not used as features, then
# remove every row that still contains a missing value.
df = pd.read_csv("mice_protein_expression.csv")
df = df.drop(columns=["MouseID", "Genotype", "Treatment", "Behavior"])
df = df.dropna()

# Separate the target column from the remaining feature columns
y_raw = df["class"]
X_raw = df.drop(columns=["class"])

# Map the class labels to integer codes
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)

# Keep the raw matrix alongside a standardized copy of it.
# NOTE(review): the scaler is fitted on all rows here, i.e. before
# evaluate_svm makes its train/test split.
X_unscaled = X_raw.to_numpy()
X_scaled = StandardScaler().fit(X_unscaled).transform(X_unscaled)

# Step 2: Define evaluation function
def evaluate_svm(X, y, c_value, sample_ratio=1.0, scaled=True):
    """Fit a LinearSVC on a stratified 80/20 split and print its metrics.

    Args:
        X: Feature matrix with one row per sample.
        y: Integer class labels aligned with the rows of ``X``.
        c_value: Value passed to ``LinearSVC`` as ``C``.
        sample_ratio: When below 1.0, a stratified subsample of this fraction
            is drawn first, and both the hold-out split and the
            cross-validation run on that subsample.
        scaled: Only used to label the printed header; it does not change
            how ``X`` is processed.

    Returns:
        dict: ``accuracy``, ``f1_cv``, ``train_time``, ``confusion_matrix``
        and ``report`` for this run. The same values are printed, so callers
        may ignore the return value.

    NOTE(review): the recorded output shows hold-out accuracy 1.0000 next to
    CV F1 0.8150 for the same configuration. Presumably rows from the same
    mouse land on both sides of the random split - confirm, and consider a
    group-aware split if so.
    """
    if sample_ratio < 1.0:
        X, _, y, _ = train_test_split(
            X, y, train_size=sample_ratio, stratify=y, random_state=42
        )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    # Seed the estimator as well as the splits so repeated runs are comparable.
    model = LinearSVC(C=c_value, max_iter=10000, random_state=42)

    # perf_counter() is the clock intended for measuring short durations.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - start_time

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # 5-fold CV over the whole (sub)sample, i.e. including the hold-out rows.
    f1_cv = cross_val_score(model, X, y, cv=5, scoring="f1_weighted").mean()
    # Small subsamples leave some classes without any prediction;
    # zero_division=0 reports 0.0 for them instead of warning each time.
    report = classification_report(y_test, y_pred, zero_division=0)
    matrix = confusion_matrix(y_test, y_pred)

    print(f"\n SVM | C={c_value} | Sample={sample_ratio} | Scaled={scaled}")
    print(f"Accuracy: {acc:.4f}")
    print(f"CV F1 Score: {f1_cv:.4f}")
    print(f"Training Time: {train_time:.2f}s")
    print("\nConfusion Matrix:\n", matrix)
    print("\nClassification Report:\n", report)

    return {
        "accuracy": acc,
        "f1_cv": f1_cv,
        "train_time": train_time,
        "confusion_matrix": matrix,
        "report": report,
    }

# Step 3: Run experiments
# Evaluate the standardized matrix first, then the raw one; `label` is the
# flag shown as "Scaled=" in the printed header.
for X_data, label in ((X_scaled, True), (X_unscaled, False)):
    # Flattened grid, same order as two nested loops: every sample ratio is
    # paired with each C value in turn.
    for ratio, C in [
        (fraction, c_value)
        for fraction in (1.0, 0.5, 0.25)
        for c_value in (0.1, 1.0, 10.0)
    ]:
        evaluate_svm(X_data, y, c_value=C, sample_ratio=ratio, scaled=label)



 SVM | C=0.1 | Sample=1.0 | Scaled=True
Accuracy: 1.0000
CV F1 Score: 0.8150
Training Time: 0.03s

Confusion Matrix:
 [[ 9  0  0  0  0  0  0  0]
 [ 0 15  0  0  0  0  0  0]
 [ 0  0 12  0  0  0  0  0]
 [ 0  0  0 15  0  0  0  0]
 [ 0  0  0  0 18  0  0  0]
 [ 0  0  0  0  0 15  0  0]
 [ 0  0  0  0  0  0 12  0]
 [ 0  0  0  0  0  0  0 15]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00        15
           2       1.00      1.00      1.00        12
           3       1.00      1.00      1.00        15
           4       1.00      1.00      1.00        18
           5       1.00      1.00      1.00        15
           6       1.00      1.00      1.00        12
           7       1.00      1.00      1.00        15

    accuracy                           1.00       111
   macro avg       1.00      1.00      1.00       111
weighted avg       1.00      1.00      1.00       111

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



 SVM | C=0.1 | Sample=0.25 | Scaled=False
Accuracy: 0.7857
CV F1 Score: 0.7929
Training Time: 0.01s

Confusion Matrix:
 [[0 0 0 0 0 0 0 2]
 [0 3 0 1 0 0 0 0]
 [0 0 3 0 0 0 0 0]
 [0 0 0 4 0 0 0 0]
 [0 1 0 0 3 0 0 0]
 [0 0 0 0 0 4 0 0]
 [0 0 2 0 0 0 1 0]
 [0 0 0 0 0 0 0 4]]

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.75      0.75      0.75         4
           2       0.60      1.00      0.75         3
           3       0.80      1.00      0.89         4
           4       1.00      0.75      0.86         4
           5       1.00      1.00      1.00         4
           6       1.00      0.33      0.50         3
           7       0.67      1.00      0.80         4

    accuracy                           0.79        28
   macro avg       0.73      0.73      0.69        28
weighted avg       0.77      0.79      0.75        28


 SVM | C=1.0 | Sample=0.25 | Scaled=False
Accurac

3. Smartphone Specs Dataset (SVM Pipeline)

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Load and prepare the data
# Read both CSV files and stack them into a single frame with a fresh index.
train_df, test_df = pd.read_csv("train.csv"), pd.read_csv("test.csv")
df = pd.concat([train_df, test_df], ignore_index=True)
# Only rows with a known price_range are kept.
# NOTE(review): presumably this removes every row that came from test.csv -
# verify, since loading that file would then have no effect.
df = df.dropna(subset=["price_range"])
# Remove the "id" column when the stacked frame has one.
df = df.drop(columns=["id"], errors="ignore")

y = df["price_range"].astype(int).to_numpy()
X_raw = df.drop(columns=["price_range"]).to_numpy()

# Step 2: Standardize features
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split made inside evaluate_svm_holdout_cv.
X_scaled = StandardScaler().fit(X_raw).transform(X_raw)

# Step 3: Define evaluation function for Holdout vs. CV
def evaluate_svm_holdout_cv(X, y, c_value, sample_ratio=1.0):
    """Compare a hold-out evaluation of LinearSVC with cross-validated F1.

    Three numbers are produced for one ``C`` value:

    * accuracy on a stratified 20% hold-out split,
    * mean weighted F1 from 5-fold CV on the training part of that split
      (printed as "F1 Score (Holdout CV)"; no F1 is computed on the hold-out
      rows themselves),
    * mean weighted F1 from 5-fold CV on the whole (sub)sample.

    Args:
        X: Feature matrix with one row per sample.
        y: Class labels aligned with the rows of ``X``.
        c_value: Value passed to ``LinearSVC`` as ``C``.
        sample_ratio: When below 1.0, a stratified subsample of this fraction
            is drawn first and every evaluation runs on that subsample.

    Returns:
        dict: ``accuracy``, ``f1_train_cv``, ``f1_cv_full``, ``train_time``,
        ``confusion_matrix`` and ``report`` for this run. The same values are
        printed, so callers may ignore the return value.
    """
    if sample_ratio < 1.0:
        X, _, y, _ = train_test_split(
            X, y, train_size=sample_ratio, stratify=y, random_state=42
        )

    # Holdout
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    # Seed the estimator as well as the splits so repeated runs are comparable.
    model = LinearSVC(C=c_value, max_iter=10000, random_state=42)

    # perf_counter() is the clock intended for measuring short durations.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - start_time

    y_pred = model.predict(X_test)
    acc_holdout = accuracy_score(y_test, y_pred)
    # Cross-validation restricted to the training portion of the split.
    f1_train_cv = cross_val_score(
        model, X_train, y_train, cv=5, scoring="f1_weighted"
    ).mean()

    # Full cross-validation
    f1_cv_full = cross_val_score(model, X, y, cv=5, scoring="f1_weighted").mean()

    print(f"\n SVM | C={c_value} | Sample={sample_ratio}")
    print(f"Holdout Accuracy: {acc_holdout:.4f}")
    print(f"F1 Score (Holdout CV): {f1_train_cv:.4f}")
    print(f"F1 Score (Full CV): {f1_cv_full:.4f}")
    print(f"Training Time: {train_time:.2f}s")

    matrix = confusion_matrix(y_test, y_pred)
    # zero_division=0 reports 0.0 for classes that are never predicted instead
    # of emitting an undefined-metric warning for each of them.
    report = classification_report(y_test, y_pred, zero_division=0)
    print("\nConfusion Matrix:\n", matrix)
    print("\nClassification Report:\n", report)

    return {
        "accuracy": acc_holdout,
        "f1_train_cv": f1_train_cv,
        "f1_cv_full": f1_cv_full,
        "train_time": train_time,
        "confusion_matrix": matrix,
        "report": report,
    }

# Step 4: Run evaluations
# Flattened grid, same order as two nested loops: every sample ratio is
# paired with each C value in turn.
for sample_ratio, C in [
    (ratio, c_value) for ratio in (1.0, 0.5, 0.25) for c_value in (0.1, 1.0, 10.0)
]:
    evaluate_svm_holdout_cv(X_scaled, y, c_value=C, sample_ratio=sample_ratio)



 SVM | C=0.1 | Sample=1.0
Holdout Accuracy: 0.8025
F1 Score (Holdout CV): 0.7982
F1 Score (Full CV): 0.8153
Training Time: 0.01s

Confusion Matrix:
 [[98  2  0  0]
 [10 61 29  0]
 [ 0 23 63 14]
 [ 0  0  1 99]]

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.98      0.94       100
           1       0.71      0.61      0.66       100
           2       0.68      0.63      0.65       100
           3       0.88      0.99      0.93       100

    accuracy                           0.80       400
   macro avg       0.79      0.80      0.80       400
weighted avg       0.79      0.80      0.80       400


 SVM | C=1.0 | Sample=1.0
Holdout Accuracy: 0.8175
F1 Score (Holdout CV): 0.8378
F1 Score (Full CV): 0.8510
Training Time: 0.01s

Confusion Matrix:
 [[99  1  0  0]
 [ 5 64 31  0]
 [ 0 26 65  9]
 [ 0  0  1 99]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99     

4. Brain Tumor Dataset (SVM Pipeline)

In [1]:
import numpy as np
import time
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Load pre-extracted ResNet50 features
# Feature matrix and label vector are read from their .npy files in one go.
X, y = (np.load(path) for path in ("X_brain_features.npy", "y_brain_labels.npy"))
# NOTE(review): allow_pickle=True unpickles the file contents, which can run
# arbitrary code - only load class_names.npy from a trusted source.
class_names = np.load("class_names.npy", allow_pickle=True)

# Step 2: Define evaluation function
def evaluate_svm(X, y, c_value, sample_ratio=1.0):
    """Fit a LinearSVC on a stratified 80/20 split and print its metrics.

    Args:
        X: Feature matrix with one row per sample.
        y: Class labels aligned with the rows of ``X``.
        c_value: Value passed to ``LinearSVC`` as ``C``.
        sample_ratio: When below 1.0, a stratified subsample of this fraction
            is drawn first, and both the hold-out split and the
            cross-validation run on that subsample.

    Returns:
        dict: ``accuracy``, ``f1_cv``, ``train_time``, ``confusion_matrix``
        and ``report`` for this run. The same values are printed, so callers
        may ignore the return value.
    """
    if sample_ratio < 1.0:
        X, _, y, _ = train_test_split(
            X, y, train_size=sample_ratio, stratify=y, random_state=42
        )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    # Seed the estimator as well as the splits so repeated runs are comparable.
    model = LinearSVC(C=c_value, max_iter=10000, random_state=42)

    # perf_counter() is the clock intended for measuring durations.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - start_time

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # 5-fold CV over the whole (sub)sample, i.e. including the hold-out rows.
    f1_cv = cross_val_score(model, X, y, cv=5, scoring='f1_weighted').mean()
    # zero_division=0 reports 0.0 for classes that are never predicted instead
    # of emitting an undefined-metric warning for each of them.
    report = classification_report(y_test, y_pred, zero_division=0)
    matrix = confusion_matrix(y_test, y_pred)

    print(f"\n SVM | C={c_value} | Sample={sample_ratio}")
    print(f"Accuracy: {acc:.4f}")
    print(f"CV F1 Score: {f1_cv:.4f}")
    print(f"Training Time: {train_time:.2f}s")
    print("\nConfusion Matrix:\n", matrix)
    print("\nClassification Report:\n", report)

    return {
        "accuracy": acc,
        "f1_cv": f1_cv,
        "train_time": train_time,
        "confusion_matrix": matrix,
        "report": report,
    }

# Step 3: Run experiments
# Flattened grid, same order as two nested loops: every sample ratio is
# paired with each C value in turn.
for sample_ratio, C in [
    (ratio, c_value) for ratio in (1.0, 0.5, 0.25) for c_value in (0.1, 1.0, 10.0)
]:
    evaluate_svm(X, y, c_value=C, sample_ratio=sample_ratio)



 SVM | C=0.1 | Sample=1.0
Accuracy: 0.9007
CV F1 Score: 0.9050
Training Time: 7.78s

Confusion Matrix:
 [[149  16   0   0]
 [ 14 136   5  10]
 [  0   4  74   1]
 [  2   4   1 158]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.90      0.90       165
           1       0.85      0.82      0.84       165
           2       0.93      0.94      0.93        79
           3       0.93      0.96      0.95       165

    accuracy                           0.90       574
   macro avg       0.90      0.91      0.90       574
weighted avg       0.90      0.90      0.90       574


 SVM | C=1.0 | Sample=1.0
Accuracy: 0.8920
CV F1 Score: 0.9006
Training Time: 8.66s

Confusion Matrix:
 [[147  18   0   0]
 [ 15 135   4  11]
 [  0   5  73   1]
 [  2   4   2 157]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.89      0.89       165
           1       0.83      0.82      0.8




 SVM | C=1.0 | Sample=0.5
Accuracy: 0.8815
CV F1 Score: 0.8801
Training Time: 46.88s

Confusion Matrix:
 [[71 11  1  0]
 [ 6 68  5  3]
 [ 0  4 35  0]
 [ 0  4  0 79]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.86      0.89        83
           1       0.78      0.83      0.80        82
           2       0.85      0.90      0.88        39
           3       0.96      0.95      0.96        83

    accuracy                           0.88       287
   macro avg       0.88      0.88      0.88       287
weighted avg       0.88      0.88      0.88       287






 SVM | C=10.0 | Sample=0.5
Accuracy: 0.8780
CV F1 Score: 0.8795
Training Time: 42.47s

Confusion Matrix:
 [[71 11  1  0]
 [ 6 67  5  4]
 [ 0  4 35  0]
 [ 0  4  0 79]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.86      0.89        83
           1       0.78      0.82      0.80        82
           2       0.85      0.90      0.88        39
           3       0.95      0.95      0.95        83

    accuracy                           0.88       287
   macro avg       0.88      0.88      0.88       287
weighted avg       0.88      0.88      0.88       287


 SVM | C=0.1 | Sample=0.25
Accuracy: 0.8681
CV F1 Score: 0.8551
Training Time: 5.50s

Confusion Matrix:
 [[37  4  0  0]
 [ 4 33  2  2]
 [ 0  5 14  1]
 [ 1  0  0 41]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.90      0.89        41
           1       0.79      0.80      0.80        41
           2     




 SVM | C=10.0 | Sample=0.25
Accuracy: 0.8542
CV F1 Score: 0.8553
Training Time: 15.43s

Confusion Matrix:
 [[37  4  0  0]
 [ 4 32  2  3]
 [ 0  5 14  1]
 [ 1  1  0 40]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.90      0.89        41
           1       0.76      0.78      0.77        41
           2       0.88      0.70      0.78        20
           3       0.91      0.95      0.93        42

    accuracy                           0.85       144
   macro avg       0.86      0.83      0.84       144
weighted avg       0.85      0.85      0.85       144

