In [None]:
import time
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# -------------------------------
# Load dataset
# -------------------------------
# Read the local CSV through the generic 'csv' loader; the loaded rows are
# accessed later as raw_dataset['train'].
raw_dataset = load_dataset(
    'csv',
    data_files='dataset/gpt_multi_label_16000.csv',
)

# Convert label columns into multi-label format
def process_labels(example):
    """Collect the four per-category columns of a row into one 'label' list.

    The list order is fixed: feature request, bug report, rating,
    user experience. Anything that names the label columns later (for
    example report headers) has to use this same order.

    Args:
        example: mapping for a single row; must contain the four category keys.

    Returns:
        The same ``example`` object, with a 'label' entry added in place.

    Raises:
        KeyError: if one of the four category keys is missing.
    """
    label_columns = ('feature request', 'bug report', 'rating', 'user experience')
    example['label'] = [example[column] for column in label_columns]
    return example

# Attach the combined 'label' list to every row.
raw_dataset = raw_dataset.map(process_labels)

# -------------------------------
# Split into train/val/test
# -------------------------------
# Both shuffles are seeded so that "Restart Kernel -> Run All" reproduces the
# same partition. Without a seed every run tunes and evaluates on different
# rows, and the reported scores cannot be compared between runs.
# 42 matches the random_state already used for the classifier.
split = raw_dataset['train'].train_test_split(test_size=0.2, shuffle=True, seed=42)
train_dataset = split['train']

# The 20% hold-out is halved into a validation part and a test part.
test_val_split = split['test'].train_test_split(test_size=0.5, shuffle=True, seed=42)
val_dataset = test_val_split['train']
test_dataset = test_val_split['test']

# -------------------------------
# Extract texts and labels
# -------------------------------
# The 'review' strings are the model input; the per-row 'label' lists are
# gathered into one NumPy array per split.
train_texts = [row['review'] for row in train_dataset]
val_texts = [row['review'] for row in val_dataset]

train_labels = np.array([row['label'] for row in train_dataset])
val_labels = np.array([row['label'] for row in val_dataset])

# -------------------------------
# TF-IDF Vectorization
# -------------------------------
# The vectorizer (capped at 5000 features) is fitted on the training texts
# only; the validation texts are transformed with that already-fitted
# vectorizer.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_val_tfidf = tfidf_vectorizer.transform(val_texts)

# -------------------------------
# Hyperparameter Tuning
# -------------------------------
# Candidate values for the LinearSVC that sits inside the one-vs-rest wrapper.
param_grid = {
    'C': [0.01, 0.1, 0.5, 1],
    'tol': [1e-5, 1e-4, 1e-3],
    'max_iter': [5000, 10000, 20000],
    'class_weight': [None, 'balanced']
}

# The search runs over the wrapper, so every parameter name needs the
# 'estimator__' prefix to reach the wrapped LinearSVC.
ovr_param_grid = {
    f'estimator__{name}': candidates for name, candidates in param_grid.items()
}
ovr_svm = OneVsRestClassifier(LinearSVC(dual=False, random_state=42))

grid_search = GridSearchCV(
    estimator=ovr_svm,
    param_grid=ovr_param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# -------------------------------
# Train with timing
# -------------------------------
# start_time / end_time bracket the whole search; the evaluation cell reads
# them to print the total time.
start_time = time.time()
grid_search.fit(X_train_tfidf, train_labels)
end_time = time.time()

print("Best Parameters:", grid_search.best_params_)

In [None]:
# -------------------------------
# Evaluate on validation set
# -------------------------------
val_preds = grid_search.predict(X_val_tfidf)

# target_names must follow the column order of the label vectors built by
# process_labels (feature request, bug report, rating, user experience).
# Listing 'bug report' first attributes the first two per-class rows of the
# report to the wrong labels.
report_dict = classification_report(
    val_labels,
    val_preds,
    target_names=['feature request', 'bug report', 'rating', 'user experience'],
    output_dict=True,
)
# One row per class / average, one column per metric, rounded for display.
report_df = pd.DataFrame(report_dict).transpose()
report_df = report_df.round(4)

print("\nClassification Report:\n", report_df)

macro_f1 = report_dict["macro avg"]["f1-score"]
print(f"\nMacro F1 Score: {macro_f1:.4f}")
# start_time / end_time were recorded around grid_search.fit in the training cell.
print("Total time: {:.2f}s".format(end_time - start_time))

In [None]:
import datetime
import os
# Save the classification report
# The timestamp in the file name keeps earlier reports from being overwritten.
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
report_path = os.path.join('models/svm', f"classification_report_{timestamp}.csv")
# Create the output directory on first use; to_csv does not create missing
# parent directories and would raise OSError on a fresh checkout.
os.makedirs(os.path.dirname(report_path), exist_ok=True)
report_df.to_csv(report_path, float_format='%.4f')
print(f"Classification report saved to {report_path}")