# SPAM explorations


In [1]:
%load_ext autoreload
%autoreload 2

## Imports


In [30]:
from pathlib import Path

import evaluate
import numpy as np
import pandas as pd
import sklearn
import torch
from datasets import ClassLabel, Features, Value, load_dataset
from tqdm.rich import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    pipeline,
)
from transformers.pipelines.pt_utils import KeyDataset

In [47]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device


'cuda'

## Load data


In [3]:
data_path = Path("data").resolve()
train_path = data_path / "train_spam.csv"
test_path = data_path / "test_spam.csv"

In [4]:
train_raw = pd.read_csv(train_path)
test_raw = pd.read_csv(test_path)

In [5]:
spam_dataset = load_dataset(
    "csv",
    data_files=str(train_path),
    features=Features(
        {"text_type": ClassLabel(num_classes=2, names=["ham", "spam"]), "text": Value("string")}
    ),
    split="train",
).rename_column("text_type", "label")
spam_dataset

Dataset({
    features: ['label', 'text'],
    num_rows: 16278
})

## Explore data


In [6]:
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16278 entries, 0 to 16277
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text_type  16278 non-null  object
 1   text       16278 non-null  object
dtypes: object(2)
memory usage: 254.5+ KB


In [7]:
train_raw.head(3)

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...


In [8]:
test_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4070 entries, 0 to 4069
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    4070 non-null   object
dtypes: object(1)
memory usage: 31.9+ KB


In [9]:
test_raw.text[0]

'j jim whitehead ejw cse ucsc edu writes j you open sourced the new components you developed for this j project so the next person who comes along won t have to j reimplement them right no need all those components already exist either in the java class libraries or from the various java jar collections most of the classes i used came from the jakarta project and apachexml but if it s any consolation my threading of them all together into a newswire server is gpl and available on sourceforge gary lawrence murphy garym teledyn com teledynamics communications inc business advantage through community software url computers are useless they can only give you answers pablo picasso'

## Tries


In [19]:
tokenizer: DistilBertTokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert/distilbert-base-uncased"
)

In [20]:
tokenized_spam = spam_dataset.map(
    lambda row: tokenizer(row["text"], truncation=True, max_length=512),
    batched=True,
).train_test_split(0.2)  # type: ignore

In [21]:
id2label = {0: "ham", 1: "spam"}
label2id = {"ham": 0, "spam": 1}

model: DistilBertForSequenceClassification = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)  # type: ignore

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [23]:
roc_auc_score = evaluate.load("roc_auc")

In [24]:
def compute_metrics(eval_pred: EvalPrediction) -> dict[str, float] | None:
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return roc_auc_score.compute(prediction_scores=predictions, references=labels)

In [25]:
training_args = TrainingArguments(
    output_dir="./models/distilbert",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [26]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_spam["train"],
    eval_dataset=tokenized_spam["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # type: ignore
)

In [27]:
trainer.train()

  0%|          | 0/3256 [00:00<?, ?it/s]

{'loss': 0.23, 'grad_norm': 0.10618958622217178, 'learning_rate': 1.6928746928746932e-05, 'epoch': 0.31}
{'loss': 0.1451, 'grad_norm': 1.069311499595642, 'learning_rate': 1.3857493857493858e-05, 'epoch': 0.61}
{'loss': 0.1238, 'grad_norm': 0.021182937547564507, 'learning_rate': 1.0786240786240787e-05, 'epoch': 0.92}


  0%|          | 0/407 [00:00<?, ?it/s]

{'eval_loss': 0.1230822280049324, 'eval_roc_auc': 0.9595074135275801, 'eval_runtime': 80.3762, 'eval_samples_per_second': 40.51, 'eval_steps_per_second': 5.064, 'epoch': 1.0}
{'loss': 0.0761, 'grad_norm': 16.749755859375, 'learning_rate': 7.714987714987717e-06, 'epoch': 1.23}
{'loss': 0.0594, 'grad_norm': 0.06073906272649765, 'learning_rate': 4.643734643734644e-06, 'epoch': 1.54}
{'loss': 0.0501, 'grad_norm': 0.007219531107693911, 'learning_rate': 1.5724815724815726e-06, 'epoch': 1.84}


  0%|          | 0/407 [00:00<?, ?it/s]

{'eval_loss': 0.1235177218914032, 'eval_roc_auc': 0.966318865994447, 'eval_runtime': 33.6504, 'eval_samples_per_second': 96.76, 'eval_steps_per_second': 12.095, 'epoch': 2.0}
{'train_runtime': 2180.9648, 'train_samples_per_second': 11.942, 'train_steps_per_second': 1.493, 'train_loss': 0.11060477298952145, 'epoch': 2.0}


TrainOutput(global_step=3256, training_loss=0.11060477298952145, metrics={'train_runtime': 2180.9648, 'train_samples_per_second': 11.942, 'train_steps_per_second': 1.493, 'train_loss': 0.11060477298952145, 'epoch': 2.0})

In [44]:
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    batch_size=256,
    device=device,
)


In [34]:
spam_test_dataset = load_dataset(
    "csv",
    data_files=str(test_path),
    features=Features({"text": Value("string")}),
    split="train",
)
spam_test_dataset


Dataset({
    features: ['text'],
    num_rows: 4070
})

In [43]:
predict_iter = pipe(KeyDataset(spam_test_dataset, "text"))  # type: ignore
predicts = [row["label"] for row in tqdm(predict_iter)]


Output()

  predicts = [row["label"] for row in tqdm(predict_iter)]


In [58]:
test_raw["text_type"] = predicts

In [63]:
predicts_df = test_raw.reindex(columns=["text_type", "text"])

In [64]:
predicts_df.to_csv(data_path / "predict_spam.csv", index=False)