In [9]:
# Attach Google Drive to this Colab runtime; the CSV files used below are read
# from the MyDrive folder underneath the mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)  # pass force_remount=True to refresh a stale mount

Mounted at /content/drive


In [10]:
!pip install -q unsloth
# Also get the latest nightly Unsloth!
!pip install -q --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
  Building wheel for unsloth (pyproject.toml) ... done


In [11]:
from unsloth import FastLanguageModel
import torch

# Sequence length handed to Unsloth. Any value may be chosen: Unsloth supports
# RoPE (Rotary Positional Embedding) scaling internally.
max_seq_length = 2048
# None lets Unsloth auto-detect. Float16 for Tesla T4 / V100, Bfloat16 for Ampere+.
dtype = None
# 4-bit quantization reduces memory usage; can be set to False.
load_in_4bit = True

# Everything passed to the loader, gathered in one place.
load_options = dict(
    model_name = "unsloth/Llama-3.2-1B-Instruct",  # Hugging Face repo id of the checkpoint
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,  # loads the 4-bit quantized model when True
)

model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

==((====))==  Unsloth 2025.12.9: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [14]:


# Switch the working directory to the Drive folder so that the relative CSV
# file names used in this notebook resolve against it.
%cd /content/drive/MyDrive
# !ls
import pandas as pd
# Loading of the training split is left disabled.
# train_spam_data = pd.read_csv('train_spam_data.csv')
# NOTE(review): the evaluation cell re-reads this same file through CSV_PATH,
# so this frame looks unused in the visible cells -- confirm before removing.
test_spam_data = pd.read_csv('test_spam_data.csv')


/content/drive/MyDrive


In [None]:
# Standard library
import csv
import re

# Third-party
import torch
from sklearn.metrics import accuracy_score, confusion_matrix

# Evaluation inputs
CSV_PATH = "test_spam_data.csv"  # CSV to classify; relative, so resolved against the current working directory
TEXT_COLUMN = "text"             # name of the column in CSV_PATH that holds the message text

In [None]:

# Prompt sent to the model. The wording and the two-space indentation are kept
# exactly as before so that generations stay comparable with earlier runs.
_PROMPT_TEMPLATE = (
    '\n'
    '  SMS: "{text}"\n'
    '  Classify as SPAM or HAM.\n'
    '  Reply with only one word: SPAM or HAM.\n'
    '  '
)

# A label word anywhere in the (upper-cased) model reply.
_LABEL_RE = re.compile(r"\b(SPAM|HAM)\b")
# Fallback heuristics, applied to the message itself only when the reply
# contains neither label: a run of 6+ digits, or a typical spam keyword.
_DIGIT_RUN_RE = re.compile(r"\b\d{6,}\b")
_SPAM_HINT_RE = re.compile(r"prize|win|congratulations|urgent|claim", re.I)


def _extract_label(reply: str, text: str) -> str:
    """Return the label named in the upper-cased reply, else guess from the message text."""
    match = _LABEL_RE.search(reply)
    if match:
        return match.group(1)
    if _DIGIT_RUN_RE.search(text) or _SPAM_HINT_RE.search(text):
        return "SPAM"
    return "HAM"


def classify_text(text: str, verbose: bool = True) -> str:
    """Classify one SMS message as "SPAM" or "HAM" with the loaded language model.

    Uses the notebook-level ``model`` and ``tokenizer`` globals.

    Args:
        text: The message to classify. Missing values (None / NaN, as pandas
            yields for empty CSV cells) are treated as an empty message and
            other non-string values are converted with ``str()``, so the
            fallback regexes never receive a non-string.
        verbose: When True (the default, matching the previous behaviour) the
            predicted label is printed.

    Returns:
        "SPAM" or "HAM". If the model's reply contains neither word, the label
        is guessed from the message: a run of six or more digits or a spam
        keyword yields "SPAM", anything else "HAM".
    """
    if not isinstance(text, str):
        text = "" if text is None or pd.isna(text) else str(text)

    prompt = _PROMPT_TEMPLATE.format(text=text)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=5,
            # Greedy decoding. No temperature is passed: it is only used by
            # sample-based generation and otherwise just triggers a warning.
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Drop the echoed prompt tokens so only the newly generated reply is decoded.
    prompt_length = inputs["input_ids"].shape[1]
    decoded = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    label = _extract_label(decoded.upper().strip(), text)

    if verbose:
        print("Prediction:", label)
    return label


# Read the evaluation CSV.
df = pd.read_csv(CSV_PATH)

# Classify every message. The ground-truth label goes through str() before
# normalisation so a missing or non-string cell cannot crash the loop, and a
# missing message is passed to the classifier as an empty string.
records = []
for text, raw_label in zip(df[TEXT_COLUMN], df["label"]):
    message = "" if pd.isna(text) else str(text)
    records.append({
        "label": str(raw_label).strip().upper(),
        "text": text,
        "prediction": classify_text(message),
    })

# Explicit columns keep the frame well-formed even when the CSV has no rows.
pred_df = pd.DataFrame(records, columns=["label", "text", "prediction"])

# Save new CSV
OUTPUT_CSV = "spam_ham_predictions.csv"
pred_df.to_csv(OUTPUT_CSV, index=False)

print(f"Saved predictions to {OUTPUT_CSV}")

# Score from the in-memory frame: re-reading the CSV would let pandas turn
# strings such as "NA" back into NaN.
VALID_LABELS = ["HAM", "SPAM"]
df_pred = pred_df.assign(prediction=pred_df["prediction"].str.upper())

# Keep only rows where BOTH the label and the prediction are HAM/SPAM, so the
# accuracy and the confusion matrix are computed over the same rows.
is_scorable = df_pred["label"].isin(VALID_LABELS) & df_pred["prediction"].isin(VALID_LABELS)
valid_df = df_pred[is_scorable]

n_skipped = len(df_pred) - len(valid_df)
if n_skipped:
    print(f"Skipped {n_skipped} row(s) whose label or prediction is not HAM/SPAM")
if valid_df.empty:
    raise ValueError(f"No scorable rows found in {CSV_PATH}")

y_true = valid_df["label"]
y_pred = valid_df["prediction"]

# Accuracy
accuracy = accuracy_score(y_true, y_pred)

# Confusion Matrix
cm = confusion_matrix(y_true, y_pred, labels=VALID_LABELS)

print("\nAccuracy:", round(accuracy * 100, 2), "%")

print("\nConfusion Matrix (GT rows, Pred columns)")
print("        HAM   SPAM")
print(f"HAM   {cm[0][0]:5d}  {cm[0][1]:5d}")
print(f"SPAM  {cm[1][0]:5d}  {cm[1][1]:5d}")






Prediction: SPAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: SPAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: SPAM
Prediction: SPAM
Prediction: SPAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: SPAM
Prediction: HAM
Prediction: SPAM
Prediction: SPAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: SPAM
Prediction: HAM
Prediction: SPAM
Prediction: HAM
Prediction: HAM
Prediction: SPAM
Prediction: HAM
Prediction: SPAM
Prediction: HAM
Prediction: SPAM
Prediction: HAM
Prediction: HAM
Prediction: SPAM
Prediction: SPAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: SPAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Prediction: HAM
Predicti