In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Load dataset
train_df = pd.read_csv("/content/drive/MyDrive/Final project/train.csv", header=None)
test_df = pd.read_csv("/content/drive/MyDrive/Final project/test.csv", header=None)

train_df = train_df.rename(columns={
    train_df.columns[0]: "label",
    train_df.columns[1]: "title",
    train_df.columns[2]: "description"
})

test_df = test_df.rename(columns={
    test_df.columns[0]: "label",
    test_df.columns[1]: "title",
    test_df.columns[2]: "description"
})


print("Train size:", train_df.shape)
print("Test size:", test_df.shape)
print(train_df.head())

Train size: (120001, 3)
Test size: (7601, 3)
         label                                              title  \
0  Class Index                                              Title   
1            3  Wall St. Bears Claw Back Into the Black (Reuters)   
2            3  Carlyle Looks Toward Commercial Aerospace (Reu...   
3            3    Oil and Economy Cloud Stocks' Outlook (Reuters)   
4            3  Iraq Halts Oil Exports from Main Southern Pipe...   

                                         description  
0                                        Description  
1  Reuters - Short-sellers, Wall Street's dwindli...  
2  Reuters - Private investment firm Carlyle Grou...  
3  Reuters - Soaring crude prices plus worries\ab...  
4  Reuters - Authorities have halted oil export\f...  


In [None]:
train_df["text"] = train_df["title"] + " " + train_df["description"]
test_df["text"] = test_df["title"] + " " + test_df["description"]

# Keep only required columns
train_df = train_df[["label", "text"]]
test_df = test_df[["label", "text"]]

In [None]:
# Reduce TRAIN dataset to 50% per class
train_df = (
    train_df.groupby("label", group_keys=False)
    .apply(lambda x: x.sample(frac=0.5, random_state=42))
    .reset_index(drop=True)
)

# Reduce TEST dataset to 50% per class
test_df = (
    test_df.groupby("label", group_keys=False)
    .apply(lambda x: x.sample(frac=0.5, random_state=42))
    .reset_index(drop=True)
)

print("Subset train size:", train_df.shape)
print("Subset test size:", test_df.shape)

# Check class distribution
print("Train distribution:\n", train_df["label"].value_counts())
print("Test distribution:\n", test_df["label"].value_counts())


Subset train size: (60000, 2)
Subset test size: (3800, 2)
Train distribution:
 label
1    15000
2    15000
3    15000
4    15000
Name: count, dtype: int64
Test distribution:
 label
1    950
2    950
3    950
4    950
Name: count, dtype: int64


  .apply(lambda x: x.sample(frac=0.5, random_state=42))
  .apply(lambda x: x.sample(frac=0.5, random_state=42))


In [None]:
import re
import string

def clean_text(text):
    text = text.lower()                                # Lowercase
    text = re.sub(r'<.*?>', '', text)                  # Remove HTML tags
    text = re.sub(r'http\S+|www\S+|https\S+', '', text) # Remove URLs
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)         # Remove emojis & non-ASCII
    text = text.translate(str.maketrans('', '', string.punctuation)) # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()           # Remove extra whitespace
    return text

train_df["text"] = train_df["text"].apply(clean_text)
test_df["text"] = test_df["text"].apply(clean_text)

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

stop_words = set(stopwords.words("english"))

def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in stop_words])

train_df["text"] = train_df["text"].apply(remove_stopwords)
test_df["text"] = test_df["text"].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import pandas as pd

# Assuming your target column is named 'label' (change if needed)
target_column = 'label'

# Split train into X and y
X_train = train_df.drop(columns=[target_column])
y_train = train_df[[target_column]]
# Split test into X and y
X_test = test_df.drop(columns=[target_column])
y_test = test_df[[target_column]]

# Save to CSV
X_train.to_csv("/content/drive/MyDrive/Final project/split/X_train.csv", index=False)
y_train.to_csv("/content/drive/MyDrive/Final project/split/y_train.csv", index=False)
X_test.to_csv("/content/drive/MyDrive/Final project/split/X_test.csv", index=False)
y_test.to_csv("/content/drive/MyDrive/Final project/split/y_test.csv", index=False)

print("Files saved: X_train.csv, y_train.csv, X_test.csv, y_test.csv")

Files saved: X_train.csv, y_train.csv, X_test.csv, y_test.csv


In [None]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(60000, 1) (60000, 1)
(3800, 1) (3800, 1)


In [2]:
!pip install transformers datasets scikit-learn evaluate mlflow


Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting mlflow
  Downloading mlflow-3.3.2-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==3.3.2 (from mlflow)
  Downloading mlflow_skinny-3.3.2-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.3.2 (from mlflow)
  Downloading mlflow_tracing-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.5-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.3.2->mlflow)
  Downloading databricks_sdk-0.64.0-py3-none-any.whl.metadata (39 kB)
Collecting graphql-core<3.3,>=3.1 (from graphe

In [None]:
import torch
torch.cuda.empty_cache()


In [3]:
import numpy as np
import pandas as pd

In [4]:
# Load training and testing data
X_train = pd.read_csv("/content/drive/MyDrive/Final project/split/X_train.csv")
X_test = pd.read_csv("/content/drive/MyDrive/Final project/split/X_test.csv")
y_train = pd.read_csv("/content/drive/MyDrive/Final project/split/y_train.csv")
y_test = pd.read_csv("/content/drive/MyDrive/Final project/split/y_test.csv")

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
import numpy as np
import evaluate
import os
from transformers.integrations import MLflowCallback
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Label encoding
le = LabelEncoder()
y_train_series = y_train.iloc[:, 0]
y_test_series = y_test.iloc[:, 0]

y_train_enc = le.fit_transform(y_train_series)
y_test_enc  = le.transform(y_test_series)

# Create Hugging Face Datasets
train_dataset = Dataset.from_dict({
    'text': X_train['text'].astype(str).tolist(),
    'label': y_train_enc.tolist()
})
test_dataset = Dataset.from_dict({
    'text': X_test['text'].astype(str).tolist(),
    'label': y_test_enc.tolist()
})

# Tokenization
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset  = test_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Load model
num_labels = len(np.unique(y_train_enc))
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Metrics
accuracy_metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_metric.compute(predictions=predictions, references=labels)
    return {"accuracy": acc['accuracy']}

# TrainingArguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Final project/save models/TF/output",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    logging_dir="/content/drive/MyDrive/Final project/save models/TF/log",
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.remove_callback(MLflowCallback)

# Train and Evaluate
trainer.train()
results = trainer.evaluate()
print("Evaluation Results:", results)

# -------------------------
# Extra Evaluation Metrics
# -------------------------
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=-1)
y_true = predictions.label_ids

print("\nDetailed Evaluation Metrics (RoBERTa):")
print(f"Accuracy:  {accuracy_score(y_true, y_pred):.4f}")
print(f"Precision: {precision_score(y_true, y_pred, average='weighted'):.4f}")
print(f"Recall:    {recall_score(y_true, y_pred, average='weighted'):.4f}")
print(f"F1-score:  {f1_score(y_true, y_pred, average='weighted'):.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred))

print("\nClassification Report:")
print(classification_report(y_true, y_pred, zero_division=0))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3800 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mthanigai2307[0m ([33mthanigai2307-guvi[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.4201
1000,0.3059
1500,0.2871
2000,0.2786
2500,0.269
3000,0.2574
3500,0.2481
4000,0.2212
4500,0.1956
5000,0.1848


Step,Training Loss
500,0.4201
1000,0.3059
1500,0.2871
2000,0.2786
2500,0.269
3000,0.2574
3500,0.2481
4000,0.2212
4500,0.1956
5000,0.1848


Evaluation Results: {'eval_loss': 0.2611856162548065, 'eval_accuracy': 0.9321052631578948, 'eval_runtime': 12.5854, 'eval_samples_per_second': 301.938, 'eval_steps_per_second': 9.455, 'epoch': 3.0}

Detailed Evaluation Metrics (RoBERTa):
Accuracy:  0.9321
Precision: 0.9322
Recall:    0.9321
F1-score:  0.9320

Confusion Matrix:
[[887  12  26  25]
 [  3 937   5   5]
 [ 32   6 839  73]
 [ 23   2  46 879]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.93      0.94       950
           1       0.98      0.99      0.98       950
           2       0.92      0.88      0.90       950
           3       0.90      0.93      0.91       950

    accuracy                           0.93      3800
   macro avg       0.93      0.93      0.93      3800
weighted avg       0.93      0.93      0.93      3800



In [6]:
# Save Model and Tokenizer
save_path = r"/content/drive/MyDrive/Final project/save models/TF"
os.makedirs(save_path, exist_ok=True)
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"Transformer model saved at: {save_path}")

Transformer model saved at: /content/drive/MyDrive/Final project/save models/TF


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load training and testing data
X_train = pd.read_csv("/content/drive/MyDrive/Final project/split/X_train.csv")
X_test = pd.read_csv("/content/drive/MyDrive/Final project/split/X_test.csv")
y_train = pd.read_csv("/content/drive/MyDrive/Final project/split/y_train.csv")
y_test = pd.read_csv("/content/drive/MyDrive/Final project/split/y_test.csv")

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, Conv1D, GlobalMaxPooling1D, LSTM, Bidirectional
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# If it's a pandas Series or DataFrame with a 'text' column:
X_train = X_train['text'].astype(str).tolist()
X_test  = X_test['text'].astype(str).tolist()

# If X_train and X_test are already lists:
X_train = [str(x).strip() for x in X_train]  # make sure each element is a string
X_test  = [str(x).strip() for x in X_test]

# Encode labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test  = le.transform(y_test)

# Tokenize + Pad
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_VOCAB = 10000
MAX_LEN = 200

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post', truncating='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post', truncating='post')

print("X_train_pad shape:", X_train_pad.shape)  # should be (14400, 200)
print("y_train shape:", y_train.shape)          # should be (14400,)


# -------------------------
# 2. Build Models
# -------------------------

# Kim's CNN (TextCNN)
def build_cnn():
    model = Sequential([
        Embedding(MAX_VOCAB, 128, input_length=MAX_LEN),
        Conv1D(filters=128, kernel_size=3, activation="relu"),
        GlobalMaxPooling1D(),
        Dropout(0.5),
        Dense(len(np.unique(y_train)), activation="softmax")
    ])
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

# BiLSTM
def build_bilstm():
    model = Sequential([
        Embedding(MAX_VOCAB, 128, input_length=MAX_LEN),
        Bidirectional(LSTM(128, return_sequences=False)),
        Dropout(0.5),
        Dense(len(np.unique(y_train)), activation="softmax")
    ])
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

# -------------------------
# 3. Train & Evaluate
# -------------------------
models = {"CNN": build_cnn(), "BiLSTM": build_bilstm()}
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_pad, y_train, epochs=3, batch_size=64, verbose=1)


    y_pred = np.argmax(model.predict(X_test_pad), axis=1)

    acc  = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    rec  = recall_score(y_test, y_pred, average="weighted", zero_division=0)
    f1   = f1_score(y_test, y_pred, average="weighted", zero_division=0)

    print(f"\n{name} Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))
    print(f"{name} Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    results[name] = {"Accuracy": acc, "Precision": prec, "Recall": rec, "F1-score": f1}

# -------------------------
# 4. Model Comparison
# -------------------------
comparison_df = pd.DataFrame(results).T
print("\nModel Comparison:\n", comparison_df)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


X_train_pad shape: (60000, 200)
y_train shape: (60000,)





Training CNN...
Epoch 1/3
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 9ms/step - accuracy: 0.7306 - loss: 0.7374
Epoch 2/3
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.9235 - loss: 0.2384
Epoch 3/3
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9454 - loss: 0.1686
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step

CNN Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.89      0.90       950
           1       0.95      0.98      0.96       950
           2       0.87      0.85      0.86       950
           3       0.88      0.88      0.88       950

    accuracy                           0.90      3800
   macro avg       0.90      0.90      0.90      3800
weighted avg       0.90      0.90      0.90      3800

CNN Confusion Matrix:
 [[850  29  36  35]
 [ 10 929   3   8]
 [ 56   8 811  75]
 [ 26

In [None]:
# Find the best model based on Accuracy
best_model_name = comparison_df['Accuracy'].idxmax()
best_model = models[best_model_name]

print(f"Best model '{best_model_name}'")

Best model 'CNN'


In [None]:
import os

# Specify your desired folder path
save_path = r"/content/drive/MyDrive/Final project/save models/DL"  # <-- change this to your path

# Create folder if it doesn't exist
os.makedirs(save_path, exist_ok=True)

# Save all trained models to the specified path
for name, model in models.items():
    model_file = os.path.join(save_path, f"{name}_model.h5")
    model.save(model_file)
    print(f"Model '{name}' saved as {model_file}")




Model 'CNN' saved as /content/drive/MyDrive/Final project/save models/DL/CNN_model.h5
Model 'BiLSTM' saved as /content/drive/MyDrive/Final project/save models/DL/BiLSTM_model.h5


In [None]:
import pickle
import os

# Specify the path
save_path = r"/content/drive/MyDrive/Final project/save models/DL"
os.makedirs(save_path, exist_ok=True)


# Save DL Tokenizer
dl_tokenizer_file = os.path.join(save_path, "dl_tokenizer.pkl")
with open(dl_tokenizer_file, "wb") as f:
    pickle.dump(tokenizer, f)
print(f"DL Tokenizer saved at: {dl_tokenizer_file}")


DL Tokenizer saved at: /content/drive/MyDrive/Final project/save models/DL/dl_tokenizer.pkl


In [None]:
# Import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [None]:
# Load training and testing data
X_train = pd.read_csv("/content/drive/MyDrive/Final project/split/X_train.csv")
X_test = pd.read_csv("/content/drive/MyDrive/Final project/split/X_test.csv")
y_train = pd.read_csv("/content/drive/MyDrive/Final project/split/y_train.csv")
y_test = pd.read_csv("/content/drive/MyDrive/Final project/split/y_test.csv")

In [None]:
# Convert text into TF-IDF features
# -------------------------------
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train['text'])
X_test_tfidf = vectorizer.transform(X_test['text'])

# -------------------------------
# Define models
# -------------------------------
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "SVM": LinearSVC()
}

# -------------------------------
# Train and Evaluate
# -------------------------------
results = []

for name, model in models.items():
    print(f"\n🔹 Training {name}...")
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    # Evaluation
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average="weighted")
    rec = recall_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")

    results.append([name, acc, prec, rec, f1])

    print(f"\n📊 {name} Classification Report:\n")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# -------------------------------
# Show results in DataFrame
# -------------------------------
results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-score"])
print("\n✅ Model Comparison:\n")
print(results_df)


🔹 Training Logistic Regression...


  y = column_or_1d(y, warn=True)



📊 Logistic Regression Classification Report:

              precision    recall  f1-score   support

           1       0.91      0.90      0.91       950
           2       0.95      0.97      0.96       950
           3       0.86      0.86      0.86       950
           4       0.88      0.87      0.87       950

    accuracy                           0.90      3800
   macro avg       0.90      0.90      0.90      3800
weighted avg       0.90      0.90      0.90      3800

Confusion Matrix:
 [[853  25  47  25]
 [ 11 925   6   8]
 [ 41  11 814  84]
 [ 28  17  77 828]]

🔹 Training Naive Bayes...

📊 Naive Bayes Classification Report:

              precision    recall  f1-score   support

           1       0.90      0.88      0.89       950
           2       0.95      0.97      0.96       950
           3       0.86      0.85      0.85       950
           4       0.86      0.87      0.86       950

    accuracy                           0.89      3800
   macro avg       0.89      0

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



📊 SVM Classification Report:

              precision    recall  f1-score   support

           1       0.91      0.89      0.90       950
           2       0.94      0.97      0.96       950
           3       0.85      0.86      0.85       950
           4       0.88      0.86      0.87       950

    accuracy                           0.89      3800
   macro avg       0.89      0.89      0.89      3800
weighted avg       0.89      0.89      0.89      3800

Confusion Matrix:
 [[843  30  49  28]
 [ 14 921   9   6]
 [ 41  13 815  81]
 [ 28  13  89 820]]

✅ Model Comparison:

                 Model  Accuracy  Precision    Recall  F1-score
0  Logistic Regression  0.900000   0.899635  0.900000  0.899745
1          Naive Bayes  0.890526   0.890245  0.890526  0.890314
2                  SVM  0.894474   0.894312  0.894474  0.894287


In [None]:
# Select the best model (based on highest F1-score)
best_model_name = results_df.sort_values("Accuracy", ascending=False).iloc[0]["Model"]
best_model = models[best_model_name]
print(f"\nBest Model: {best_model_name}")


Best Model: Logistic Regression


In [None]:
import joblib
import os

# Create a folder to save models
save_dir = r"/content/drive/MyDrive/Final project/save models/ML"
os.makedirs(save_dir, exist_ok=True)
# Save each model
for name, model in models.items():
    file_path = os.path.join(save_dir, f"{name}.pkl")
    joblib.dump(model, file_path)
    print(f"{name} saved at: {file_path}")

Logistic Regression saved at: /content/drive/MyDrive/Final project/save models/ML/Logistic Regression.pkl
Naive Bayes saved at: /content/drive/MyDrive/Final project/save models/ML/Naive Bayes.pkl
SVM saved at: /content/drive/MyDrive/Final project/save models/ML/SVM.pkl


In [None]:
import pickle
import os

# Specify the path
save_path = r"/content/drive/MyDrive/Final project/save models/ML"
os.makedirs(save_path, exist_ok=True)

# Save ML Vectorizer
ml_vectorizer_file = os.path.join(save_path, "ml_vectorizer.pkl")
with open(ml_vectorizer_file, "wb") as f:
    pickle.dump(vectorizer, f)
print(f"ML Vectorizer saved at: {ml_vectorizer_file}")


ML Vectorizer saved at: /content/drive/MyDrive/Final project/save models/ML/ml_vectorizer.pkl


In [None]:
import pandas as pd
from tabulate import tabulate


# Best ML model metrics (replace with your ML evaluation)
best_ml_model_name = "Logistic Regression"
best_ml_accuracy = 0.85   # Example
best_ml_f1 = 0.84         # Example

# Best DL model metrics (from your BiLSTM/CNN results)
best_dl_model_name = "BiLSTM"
best_dl_accuracy = 0.90
best_dl_f1 = 0.90

# Best Pre-trained Transformer model (DistilBERT)
best_transformer_model_name = "DistilBERT"
best_transformer_accuracy = 0.93   # Use your actual variable
best_transformer_f1 = 0.93        # Use your actual variable

# Create Comparison Table
# -------------------------
comparison_data = {
    "Model Type": ["ML Model", "DL Model", "Pre-trained Model"],
    "Name of Model": [best_ml_model_name, best_dl_model_name, best_transformer_model_name],
    "Accuracy": [best_ml_accuracy, best_dl_accuracy, best_transformer_accuracy],
    "F1-Score": [best_ml_f1, best_dl_f1, best_transformer_f1],
    "Pros": [
        "Simple, fast, interpretable",
        "Captures sequential context well",
        "Lightweight, fast, retains BERT's performance"
    ],
    "Cons": [
        "Struggles with complex patterns",
        "Training is slower, needs more data",
        "Slightly less accurate than full BERT"
    ]
}

comparison_df = pd.DataFrame(comparison_data)

# Display as a table
print(tabulate(comparison_df, headers='keys', tablefmt='fancy_grid'))



╒════╤═══════════════════╤═════════════════════╤════════════╤════════════╤═══════════════════════════════════════════════╤═══════════════════════════════════════╕
│    │ Model Type        │ Name of Model       │   Accuracy │   F1-Score │ Pros                                          │ Cons                                  │
╞════╪═══════════════════╪═════════════════════╪════════════╪════════════╪═══════════════════════════════════════════════╪═══════════════════════════════════════╡
│  0 │ ML Model          │ Logistic Regression │       0.85 │       0.84 │ Simple, fast, interpretable                   │ Struggles with complex patterns       │
├────┼───────────────────┼─────────────────────┼────────────┼────────────┼───────────────────────────────────────────────┼───────────────────────────────────────┤
│  1 │ DL Model          │ BiLSTM              │       0.9  │       0.9  │ Captures sequential context well              │ Training is slower, needs more data   │
├────┼────────────────