# SPAM explorations

In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Imports

In [2]:
from pathlib import Path

import evaluate
import numpy as np
import pandas as pd
import sklearn
import torch
from datasets import ClassLabel, Features, Value, load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
)

  from .autonotebook import tqdm as notebook_tqdm
2024-05-03 19:43:06.286366: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Load data

In [3]:
data_path = Path("data").resolve()
train_path = data_path / "train_spam.csv"
test_path = data_path / "test_spam.csv"

In [4]:
train_raw = pd.read_csv(train_path)
test_raw = pd.read_csv(test_path)

In [None]:
spam_dataset = load_dataset(
    "csv",
    data_files=str(train_path),
    features=Features(
        {"text_type": ClassLabel(num_classes=2, names=["ham", "spam"]), "text": Value("string")}
    ),
    split="train",
).rename_column("text_type", "label")
spam_dataset

## Explore data

In [5]:
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16278 entries, 0 to 16277
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text_type  16278 non-null  object
 1   text       16278 non-null  object
dtypes: object(2)
memory usage: 254.5+ KB


In [6]:
train_raw.head(3)

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...


In [7]:
test_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4070 entries, 0 to 4069
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    4070 non-null   object
dtypes: object(1)
memory usage: 31.9+ KB


In [8]:
test_raw.text[0]

'j jim whitehead ejw cse ucsc edu writes j you open sourced the new components you developed for this j project so the next person who comes along won t have to j reimplement them right no need all those components already exist either in the java class libraries or from the various java jar collections most of the classes i used came from the jakarta project and apachexml but if it s any consolation my threading of them all together into a newswire server is gpl and available on sourceforge gary lawrence murphy garym teledyn com teledynamics communications inc business advantage through community software url computers are useless they can only give you answers pablo picasso'

## Tries

In [50]:
tokenizer: DistilBertTokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert/distilbert-base-uncased"
)

In [51]:
tokenized_spam = spam_dataset.map(
    lambda row: tokenizer(row["text"], truncation=True),
    batched=True,
).train_test_split(0.2)  # type: ignore

Map:   0%|          | 0/16278 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 16278/16278 [00:25<00:00, 642.12 examples/s]


In [52]:
id2label = {0: "ham", 1: "spam"}
label2id = {"ham": 0, "spam": 1}

model: DistilBertForSequenceClassification = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)  # type: ignore

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [54]:
roc_auc_score = evaluate.load("roc_auc")

In [55]:
def compute_metrics(eval_pred: EvalPrediction) -> dict[str, float] | None:
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return roc_auc_score.compute(predictions=predictions, references=labels)

In [56]:
training_args = TrainingArguments(
    output_dir="./models/distilbert",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [57]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_spam["train"],
    eval_dataset=tokenized_spam["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics, # type: ignore
)

In [58]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 