# Traditional ML Text Classification
## Logistic Regression vs SVM with Multiple Embeddings

### 1. Imports & Setup

In [None]:

import numpy as np
import pandas as pd
import re
import string

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

from gensim.models import Word2Vec

# Resources needed by the preprocessing step: the Punkt tokenizer models used by
# word_tokenize and the English stop-word list.
# Newer NLTK releases load Punkt from the 'punkt_tab' package instead of the
# pickled 'punkt' one, so both are fetched to keep the notebook runnable on
# old and new installs alike.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


### 2. Dataset Loading

In [None]:

# Location of the labelled corpus; change this one constant to point elsewhere.
DATA_PATH = "/mnt/data/labeled_data.csv"

# Read the CSV into a DataFrame and preview its first rows.
data = pd.read_csv(DATA_PATH)
data.head()


### 3. Exploratory Data Analysis

In [None]:

# Class balance: one bar per distinct value of the 'label' column.
plt.figure()
class_ax = sns.countplot(data=data, x='label')
class_ax.set_title("Class Distribution")
plt.show()


def _word_count(raw):
    """Number of whitespace-separated words in one cell (stringified first)."""
    return len(str(raw).split())


# Document length in words, stored on the frame and shown as a histogram.
data['text_length'] = data['text'].apply(_word_count)
plt.figure()
length_ax = sns.histplot(data['text_length'], bins=40)
length_ax.set_title("Text Length Distribution")
plt.show()


### 4. Preprocessing

In [None]:

# English stop words, kept in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Deletion table for ASCII punctuation, built once rather than on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Normalise one raw document into a list of content tokens.

    Steps: lowercase, remove digit runs, strip ASCII punctuation, tokenize
    with NLTK's ``word_tokenize``, then drop English stop words.

    Args:
        text: Raw document. Missing values (``None``/``NaN``) produce an empty
            token list; any other non-string value is converted with ``str``.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as NaN (a float), on which .lower() would
        # raise AttributeError; treat them as documents with no tokens.
        if pd.isna(text):
            return []
        text = str(text)
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(_PUNCT_TABLE)
    tokens = word_tokenize(text)
    return [t for t in tokens if t not in stop_words]

# Token lists per document, plus a space-joined string form of the same tokens
# for vectorizers that consume raw text.
data['tokens'] = data['text'].apply(preprocess)
data['clean_text'] = data['tokens'].map(" ".join)


### 5. Train-Test Split

In [None]:

# Hold out 20% of the rows for evaluation, keeping label proportions equal in
# both partitions; the fixed random_state makes the partition repeatable.
labels = data['label']
X_train, X_test, y_train, y_test = train_test_split(
    data['clean_text'],
    labels,
    stratify=labels,
    test_size=0.2,
    random_state=42,
)


### 6. TF-IDF Embeddings

In [None]:

# Unigram + bigram TF-IDF, capped at 5000 features.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# Vocabulary and IDF weights are learned from the training texts only;
# the test texts are merely projected onto that fitted vocabulary.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


### 7. Logistic Regression vs SVM (TF-IDF)

In [None]:

# Candidate classifiers, keyed by the display name used in the results tables.
# LinearSVC's solver relies on a random number generator, so it is seeded to
# keep its scores repeatable, in line with the random_state=42 used for the
# train/test splits.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Linear SVM": LinearSVC(random_state=42)
}

# One record per classifier: accuracy and weighted F1 on the held-out set.
results_tfidf = []

for name, model in models.items():
    # Fit on the training matrix, then score predictions on the test matrix.
    model.fit(X_train_tfidf, y_train)
    preds = model.predict(X_test_tfidf)
    results_tfidf.append({
        "Model": name,
        "Embedding": "TF-IDF",
        "Accuracy": accuracy_score(y_test, preds),
        "F1": f1_score(y_test, preds, average='weighted')
    })

pd.DataFrame(results_tfidf)


### 8. Word2Vec Training & Document Vectors

In [None]:

# Train the embeddings on the training documents only, mirroring how TF-IDF is
# fit on X_train: training on every row would leak test-set text into the
# features and inflate the Word2Vec scores.
# The Word2Vec split further down uses the same test_size / random_state /
# stratify as the split that produced X_train, so X_train.index selects exactly
# the rows that end up in X_train_w2v.
# seed + workers=1 make training repeatable; multi-threaded training is not
# deterministic (full determinism may additionally need PYTHONHASHSEED set).
w2v = Word2Vec(
    sentences=data.loc[X_train.index, 'tokens'].tolist(),
    vector_size=100,
    window=5,
    min_count=2,
    seed=42,
    workers=1,
)

def sentence_vector(tokens, model):
    """Represent a token list as the mean of its known word vectors.

    Args:
        tokens: Iterable of tokens for one document.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookups by
            token) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        none of the tokens are found.
    """
    known = []
    for token in tokens:
        # Skip tokens the model has no vector for.
        if token in model.wv:
            known.append(model.wv[token])

    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# One averaged embedding per document, stacked into a 2-D feature matrix.
doc_vectors = [sentence_vector(doc_tokens, w2v) for doc_tokens in data['tokens']]
X_w2v = np.array(doc_vectors)

# Same split settings as the text split (20% test, seed 42, stratified by label).
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(
    X_w2v,
    data['label'],
    stratify=data['label'],
    test_size=0.2,
    random_state=42,
)


### 9. Logistic Regression vs SVM (Word2Vec)

In [None]:

# Re-fit every classifier in `models` on the averaged Word2Vec features and
# collect accuracy and weighted F1 on the held-out rows.
results_w2v = []

for name, model in models.items():
    model.fit(X_train_w2v, y_train_w2v)
    preds = model.predict(X_test_w2v)

    accuracy = accuracy_score(y_test_w2v, preds)
    weighted_f1 = f1_score(y_test_w2v, preds, average='weighted')

    record = {
        "Model": name,
        "Embedding": "Word2Vec",
        "Accuracy": accuracy,
        "F1": weighted_f1,
    }
    results_w2v.append(record)

pd.DataFrame(results_w2v)


### 10. Combined Results Table

In [None]:

# Stack the TF-IDF records first, then the Word2Vec records, into one table.
all_records = [*results_tfidf, *results_w2v]
results_df = pd.DataFrame(all_records)
results_df


### 11. Visualization: Model Comparison

In [None]:

# Grouped bars: one group per feature representation, one bar per classifier,
# bar height = weighted F1.
plt.figure()
comparison_ax = sns.barplot(x="Embedding", y="F1", hue="Model", data=results_df)
comparison_ax.set_title("Traditional ML Models Comparison")
plt.show()
