In [1]:
!pip install dvc





[notice] A new release of pip is available: 25.1.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

def load_data(path):
    df = pd.read_csv(path, sep="\t", header=None, names=["label", "text"])
    return df

def preprocess_data(df):
    df = df.copy()
    df["text"] = df["text"].str.lower().str.strip()
    df["label"] = df["label"].map({"ham": 0, "spam": 1})
    return df

def save_raw_data(df):
    os.makedirs("data", exist_ok=True)
    df.to_csv("data/raw_data.csv", index=False)
    print("✅ Saved raw_data.csv")


In [3]:
def split_data(df, seed=42):

    train_df, temp_df = train_test_split(
        df,
        test_size=0.3,
        stratify=df["label"],
        random_state=seed
    )

    val_df, test_df = train_test_split(
        temp_df,
        test_size=0.5,
        stratify=temp_df["label"],
        random_state=seed
    )

    return train_df, val_df, test_df


def save_splits(train_df, val_df, test_df):

    train_df.to_csv("data/train.csv", index=False)
    val_df.to_csv("data/validation.csv", index=False)
    test_df.to_csv("data/test.csv", index=False)

    print("✅ Saved train.csv, validation.csv, test.csv")


In [18]:
df = load_data("SMSSpamCollection")
df = preprocess_data(df)

save_raw_data(df)

train_df, val_df, test_df = split_data(df, seed=42)
save_splits(train_df, val_df, test_df)


✅ Saved raw_data.csv
✅ Saved train.csv, validation.csv, test.csv


In [32]:
df = load_data("SMSSpamCollection")
df = preprocess_data(df)

save_raw_data(df)

train_df, val_df, test_df = split_data(df, seed=42)
save_splits(train_df, val_df, test_df)


✅ Saved raw_data.csv
✅ Saved train.csv, validation.csv, test.csv


In [30]:
train_df, val_df, test_df = split_data(df, seed=99)
save_splits(train_df, val_df, test_df)


✅ Saved train.csv, validation.csv, test.csv


In [20]:
import pandas as pd

def print_dist(file):
    df = pd.read_csv(file)
    print("\n", file)
    print(df["label"].value_counts())

print_dist("data/train.csv")
print_dist("data/validation.csv")
print_dist("data/test.csv")



 data/train.csv
label
0    3377
1     523
Name: count, dtype: int64

 data/validation.csv
label
0    724
1    112
Name: count, dtype: int64

 data/test.csv
label
0    724
1    112
Name: count, dtype: int64


In [21]:
print_dist("data/train.csv")
print_dist("data/validation.csv")
print_dist("data/test.csv")



 data/train.csv
label
0    3377
1     523
Name: count, dtype: int64

 data/validation.csv
label
0    724
1    112
Name: count, dtype: int64

 data/test.csv
label
0    724
1    112
Name: count, dtype: int64


In [13]:
import os
print(os.getcwd())


c:\Users\titli\Documents\sms-spam-dvc


In [31]:
import pandas as pd

train = pd.read_csv(r"C:\Users\titli\Documents\sms-spam-dvc\data\train.csv")

print("VERSION 1 (seed=99) first SMS:")
print(train.iloc[0]["text"])


VERSION 1 (seed=99) first SMS:
congratulations - thanks to a good friend u have won the £2,000 xmas prize. 2 claim is easy, just call 08712103738 now! only 10p per minute. bt-national-rate


In [33]:
import pandas as pd

train = pd.read_csv(r"C:\Users\titli\Documents\sms-spam-dvc\data\train.csv")

print("VERSION 1 (seed=42) first SMS:")
print(train.iloc[0]["text"])


VERSION 1 (seed=42) first SMS:
goal! arsenal 4 (henry, 7 v liverpool 2 henry scores with a simple shot from 6 yards from a pass by bergkamp to give arsenal a 2 goal margin after 78 mins.
