# Text classification: fine-tuning BERT for spam detection


In [1]:
! pip install transformers datasets evaluate accelerate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
# Fetch the Hugging Face access token from the Kaggle user-secrets client so
# that the token itself never appears in the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# The secret is looked up by the name it was stored under: "huggingface_token".
hf_token = user_secrets.get_secret("huggingface_token")

In [3]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub using `hf_token`.
# The notebook later pushes the trained model to the Hub, so the token needs
# write permission.
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
# Select the compute device: use CUDA when a GPU is visible, otherwise the CPU.
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Report the choice so the run log records where training happened.
print(f"Using device: {device}")

Using device: cuda


## Load datasets

In [5]:
from datasets import load_dataset

# Load the "vishnun0027/spam-detection" dataset by its Hub identifier.
dataset = load_dataset("vishnun0027/spam-detection")

# Last expression of the cell: display the loaded object (splits, columns, row counts).
dataset

README.md:   0%|          | 0.00/407 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.92M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/663k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8175 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8175
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2725
    })
})

In [6]:
# Peek at the first training example to see the raw text and its label.
dataset["train"][0]

{'text': 'hey I am looking for Xray baggage datasets can you provide me with the same ',
 'label': 'not_spam'}

## Preprocess

In [7]:
# Mapping from the dataset's string labels to integer class ids.
label_map = {"not_spam": 0, "spam": 1}


def map_labels(example):
    """Convert one example's string label to its integer class id.

    The function is idempotent: a label that is already one of the integer
    ids in ``label_map`` is returned unchanged, so re-running the cell that
    applies this mapping does not fail on already-encoded data.

    Args:
        example: A mapping with at least a ``'label'`` entry holding either a
            key of ``label_map`` or an already-encoded integer id.

    Returns:
        The same ``example`` object, with ``example['label']`` set to the
        integer class id.

    Raises:
        KeyError: If the label is neither a known string label nor a known
            integer id.
    """
    label = example['label']
    # Already encoded (e.g. the cell was run twice): nothing to do.
    if isinstance(label, int) and label in label_map.values():
        return example
    # Unknown labels still raise KeyError rather than being silently kept.
    example['label'] = label_map[label]
    return example

# Run map_labels over the dataset and rebind `dataset` to the result, so that
# later cells see integer labels instead of strings.
dataset = dataset.map(map_labels)

# Show the first training example to verify the label is now an integer.
dataset['train'][0]


Map:   0%|          | 0/8175 [00:00<?, ? examples/s]

Map:   0%|          | 0/2725 [00:00<?, ? examples/s]

{'text': 'hey I am looking for Xray baggage datasets can you provide me with the same ',
 'label': 0}

In [8]:
from transformers import AutoTokenizer

# Load the tokenizer for the "google-bert/bert-base-uncased" checkpoint, the
# same checkpoint the classification model is later initialised from.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [9]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the notebook's tokenizer.

    Truncation is enabled (``truncation=True``); no padding is requested here.

    Args:
        examples: A mapping whose ``"text"`` entry holds the text to tokenize.

    Returns:
        Whatever ``tokenizer(...)`` returns for that text.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [10]:
# Tokenize the dataset with preprocess_function; batched=True hands the
# function batches of examples instead of one example at a time.
tokenized_data = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8175 [00:00<?, ? examples/s]

Map:   0%|          | 0/2725 [00:00<?, ? examples/s]

In [11]:
from transformers import DataCollatorWithPadding

# Tokenization above does not pad, so padding is left to this collator, which
# the Trainer uses when assembling batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Evaluation metric

In [12]:
import evaluate

# Load the "accuracy" metric; compute_metrics uses it to score predictions.
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
import numpy as np


def compute_metrics(eval_pred):
    """Score model predictions with the loaded accuracy metric.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` holds
            per-class scores; the arg-max over axis 1 is taken as the
            predicted class id.

    Returns:
        The result of ``accuracy.compute`` for the predicted ids against
        ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

## Train

In [14]:
# Human-readable class names stored in the model config. They must describe
# this task's classes (0 = not_spam, 1 = spam, matching the dataset's labels);
# the generic "NEGATIVE"/"POSITIVE" names would make the published model
# report misleading labels.
id2label = {0: "not_spam", 1: "spam"}
# Derive the inverse mapping so the two dictionaries cannot drift apart.
label2id = {name: idx for idx, name in id2label.items()}

In [15]:
# id2label = {0: "not_spam", 1: "spam"}
# label2id = {"not_spam": 0, "spam": 1}

In [16]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

# Build a two-class sequence classifier from the pretrained BERT checkpoint,
# recording the id <-> label name mappings in its configuration.
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Place the model on the device selected earlier (GPU when available).
# As the cell's last expression, this also displays the model architecture.
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [17]:

# Training configuration: evaluate and checkpoint once per epoch, keep the
# checkpoint with the best accuracy, and push the result to the Hub.
training_args = TrainingArguments(
    output_dir="spam-detection",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,  # upper bound; early stopping below may end the run sooner
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",  # kept in step with eval_strategy for load_best_model_at_end
    # Without a limit, a checkpoint is kept for every epoch (up to 15), which
    # can exhaust the disk. Keep only the most recent ones; the best
    # checkpoint is still retained because load_best_model_at_end=True.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
    report_to=["tensorboard"],
)

# NOTE(review): the "test" split is used as eval_dataset, so it drives both
# early stopping and best-checkpoint selection. The reported accuracy is
# therefore optimistic; consider carving a validation split out of "train"
# and keeping "test" for a single final evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop when accuracy has not improved for 3 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Run fine-tuning; as the cell's last expression the TrainOutput is displayed.
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.014879,0.997798
2,0.021800,0.01528,0.996697
3,0.021800,0.004024,0.998532
4,0.001100,0.005207,0.998532
5,0.001100,0.005533,0.998532
6,0.000000,0.005895,0.998532


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=1536, training_loss=0.0074465628800529276, metrics={'train_runtime': 1755.8882, 'train_samples_per_second': 69.836, 'train_steps_per_second': 2.187, 'total_flos': 8057515014899760.0, 'train_loss': 0.0074465628800529276, 'epoch': 6.0})

In [18]:
# Upload the trained model to the Hugging Face Hub; the returned commit info
# is displayed as the cell output.
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/vishnun0027/spam-detection/commit/3d9a4172d2d4b63b22dc2dc39cc8f3f54d4c9fd1', commit_message='End of training', commit_description='', oid='3d9a4172d2d4b63b22dc2dc39cc8f3f54d4c9fd1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/vishnun0027/spam-detection', endpoint='https://huggingface.co', repo_type='model', repo_id='vishnun0027/spam-detection'), pr_revision=None, pr_num=None)