In [10]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt

# Make the project root importable so that `src.*` modules resolve.
# The root is taken to be the parent of the current working directory, so the
# notebook has to be launched from a folder directly beneath it.
PROJECT_ROOT = os.path.abspath("..")
# Re-running this cell must not pile up duplicate entries on sys.path: insert
# only when the root is not already the first (highest-priority) entry.
if sys.path[:1] != [PROJECT_ROOT]:
    sys.path.insert(0, PROJECT_ROOT)

from src.preprocessing import detect_label_and_text_columns, clean_dataframe

# Every artefact lives under <PROJECT_ROOT>/data; build the three locations in one pass.
RAW_CSV, CLEAN_OUT, SPLITS_DIR = (
    os.path.join(PROJECT_ROOT, "data", *tail)
    for tail in (
        ("raw", "spam-sms.csv"),
        ("cleaned", "sms_cleaned.csv"),
        ("splits",),
    )
)

# Create the output folders up front so the later CSV writes have somewhere to land.
os.makedirs(os.path.split(CLEAN_OUT)[0], exist_ok=True)
os.makedirs(SPLITS_DIR, exist_ok=True)

# Echo the resolved input location as a quick sanity check.
print("Paths ready")
print("RAW_CSV:", RAW_CSV)

Paths ready
RAW_CSV: /Users/amatyaumanga/Downloads/Applied-Machine-Learning-Coursework/data/raw/spam-sms.csv


In [11]:
# Read the raw SMS export; latin-1 is requested explicitly as the file encoding.
df_raw = pd.read_csv(RAW_CSV, encoding="latin-1")

# Quick look at what was loaded: dimensions, column names, first rows.
print("Shape:", df_raw.shape)
print("Columns:", df_raw.columns.to_list())
display(df_raw.head(n=5))

# Let the project helper decide which columns hold the label and the message text.
label_col, text_col = detect_label_and_text_columns(df_raw)
print("Detected label_col:", label_col)
print("Detected text_col :", text_col)

Shape: (5572, 5)
Columns: ['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


Detected label_col: v1
Detected text_col : v2


In [6]:
# Run the project's cleaning routine on the raw frame using the detected columns.
df = clean_dataframe(df_raw, text_col=text_col, label_col=label_col)

print("After cleaning:", df.shape)

# Class balance: absolute counts first, then percentages rounded to 2 d.p.
print("\nLabel distribution:")
print(df["label"].value_counts())
print("\nLabel %:")
print(df["label"].value_counts(normalize=True).mul(100).round(2))

# Reproducible spot-check of five cleaned rows.
display(df.sample(n=5, random_state=42))

# Persist the cleaned frame, leaving the index out of the file.
df.to_csv(CLEAN_OUT, index=False)
print("Saved cleaned:", CLEAN_OUT)

After cleaning: (5169, 3)

Label distribution:
label
ham     4516
spam     653
Name: count, dtype: int64

Label %:
label
ham     87.37
spam    12.63
Name: proportion, dtype: float64


Unnamed: 0,label,text,text_clean
1617,ham,Did u download the fring app?,did u download the fring app?
2064,ham,Pass dis to all ur contacts n see wat u get! R...,pass dis to all ur contacts n see wat u get! r...
1272,ham,Ok...,ok...
3020,ham,Am in film ill call you later.,am in film ill call you later.
3642,ham,"Sorry, left phone upstairs. OK, might be hecti...","sorry, left phone upstairs. ok, might be hecti..."


Saved cleaned: /Users/amatyaumanga/Downloads/Applied-Machine-Learning-Coursework/data/cleaned/sms_cleaned.csv


In [12]:
from sklearn.model_selection import train_test_split

SEED = 42

# Stage 1: hold out 20% of the cleaned rows, stratified on the label column.
train_df, temp_df = train_test_split(
    df,
    test_size=0.20,
    stratify=df["label"],
    random_state=SEED,
)

# Stage 2: halve the hold-out into validation and test sets (an 80/10/10 split
# overall), again stratified on the label.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df["label"],
    random_state=SEED,
)

# Report the size and per-class counts of each split.
print("Train:", train_df.shape, train_df["label"].value_counts().to_dict())
print("Val  :", val_df.shape,   val_df["label"].value_counts().to_dict())
print("Test :", test_df.shape,  test_df["label"].value_counts().to_dict())

# One CSV per split, all inside SPLITS_DIR.
train_path, val_path, test_path = (
    os.path.join(SPLITS_DIR, file_name)
    for file_name in ("train.csv", "val.csv", "test.csv")
)

# Write all three files (without the index) before announcing any of them.
train_df.to_csv(path_or_buf=train_path, index=False)
val_df.to_csv(path_or_buf=val_path, index=False)
test_df.to_csv(path_or_buf=test_path, index=False)

print("Saved:", train_path)
print("Saved:", val_path)
print("Saved:", test_path)

Train: (4135, 3) {'ham': 3613, 'spam': 522}
Val  : (517, 3) {'ham': 451, 'spam': 66}
Test : (517, 3) {'ham': 452, 'spam': 65}
Saved: /Users/amatyaumanga/Downloads/Applied-Machine-Learning-Coursework/data/splits/train.csv
Saved: /Users/amatyaumanga/Downloads/Applied-Machine-Learning-Coursework/data/splits/val.csv
Saved: /Users/amatyaumanga/Downloads/Applied-Machine-Learning-Coursework/data/splits/test.csv
