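# Text classification: spam detection with BERT

Fine-tunes `google-bert/bert-base-uncased` on the `vishnun0027/spam-detection` dataset (spam vs. not spam), training only the pooler and the classification head, and pushes the resulting model to the Hugging Face Hub.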


In [1]:
! pip install transformers datasets evaluate accelerate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
import warnings

warnings.filterwarnings("ignore")

In [3]:
# Read the Hugging Face access token from Kaggle's secrets store (saved there
# under the name "huggingface_token") instead of hardcoding it in the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("huggingface_token")

In [4]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub using hf_token; the
# training configuration later pushes to the Hub (push_to_hub=True).
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
# Select the compute device: CUDA when a GPU is usable, otherwise the CPU.
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


## Load datasets

In [6]:
from datasets import load_dataset

# Fetch the spam-detection dataset published as "vishnun0027/spam-detection".
dataset = load_dataset("vishnun0027/spam-detection")

# Display the loaded object to inspect its splits, columns and row counts.
dataset

README.md:   0%|          | 0.00/407 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.92M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/663k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8175 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8175
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2725
    })
})

In [7]:
# Peek at the first training example to see the raw text and label format.
dataset["train"][0]

{'text': 'hey I am looking for Xray baggage datasets can you provide me with the same ',
 'label': 'not_spam'}

## Preprocess

In [8]:
# String class names used by the dataset -> integer class ids used for training.
label_map = {"not_spam": 0, "spam": 1}


def map_labels(example):
    """Convert one example's label to its integer class id.

    Args:
        example: A single dataset row (dict-like) whose ``'label'`` entry is
            either a class name from ``label_map`` or an already-mapped
            integer id.

    Returns:
        The same ``example`` object with ``'label'`` set to the integer id.
        Labels that are already valid ids are left unchanged, so applying the
        mapping twice (e.g. re-running the cell after ``dataset`` has been
        overwritten with the mapped version) does not fail.

    Raises:
        KeyError: If the label is neither a key nor a value of ``label_map``.
    """
    label = example['label']
    if label in label_map:
        example['label'] = label_map[label]
    elif label not in label_map.values():
        # Unknown label: fail loudly rather than train on a bad target.
        raise KeyError(label)
    return example

# Run map_labels over every example of every split, replacing the 'label' column.
dataset = dataset.map(map_labels)

# Show the first training example to confirm its label is now an integer.
dataset['train'][0]


Map:   0%|          | 0/8175 [00:00<?, ? examples/s]

Map:   0%|          | 0/2725 [00:00<?, ? examples/s]

{'text': 'hey I am looking for Xray baggage datasets can you provide me with the same ',
 'label': 0}

In [9]:
from transformers import AutoTokenizer

# Load the tokenizer of the google-bert/bert-base-uncased checkpoint, the same
# checkpoint the classification model is later initialised from.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for those texts.
        Truncation is enabled; no padding is requested here.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [11]:
# Tokenize every split; batched=True passes a batch of rows to each
# preprocess_function call instead of a single example.
tokenized_data = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8175 [00:00<?, ? examples/s]

Map:   0%|          | 0/2725 [00:00<?, ? examples/s]

In [12]:
from transformers import DataCollatorWithPadding

# Collator handed to the Trainer as data_collator; padding is left to it because
# preprocess_function tokenizes with truncation only.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Evaluation metric

In [13]:
import evaluate

# Load the "accuracy" metric; compute_metrics uses it to score predictions.
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [14]:
import numpy as np


def compute_metrics(eval_pred):
    """Score model outputs with the notebook-level ``accuracy`` metric.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; ``predictions`` holds one
            row of per-class scores per example.

    Returns:
        The result of ``accuracy.compute`` for the highest-scoring class of
        each row versus ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

## Train

In [15]:
# Human-readable class names passed to the model; ids match label_map
# (0 = not spam, 1 = spam). label2id is the inverse of id2label.
id2label = dict(enumerate(["NOT SPAM", "SPAM"]))
label2id = {class_name: class_id for class_id, class_name in id2label.items()}

In [16]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

# Build a two-class sequence classifier from the pretrained BERT checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Move the model to the selected device (GPU when available, otherwise CPU).
model.to(device)

# Count the parameters that currently receive gradients.
trainable_params = sum(
    weight.numel() for weight in model.parameters() if weight.requires_grad
)
print(f"Total trainable parameters: {trainable_params}")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total trainable parameters: 109483778


## Freeze layers

In [17]:

# Freeze the base model except for its pooler: only parameters whose name
# contains "pooler" keep requires_grad=True. Parameters outside base_model
# are not touched by this loop.
for name, param in model.base_model.named_parameters():
    param.requires_grad = "pooler" in name

# Recount the parameters that still receive gradients after freezing.
trainable_params = sum(
    weight.numel() for weight in model.parameters() if weight.requires_grad
)
print(f"Total trainable parameters: {trainable_params}")


Total trainable parameters: 592130


In [18]:

# Training configuration: evaluation and checkpointing both run once per epoch,
# and accuracy is the metric used to pick the best checkpoint.
training_args = TrainingArguments(
    output_dir="spam-detection_m1",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=15,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation and checkpoint selection
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    # Logging and publishing
    report_to=["tensorboard"],
    push_to_hub=True,
)

# NOTE(review): the "test" split doubles as the evaluation set that drives early
# stopping and best-checkpoint selection, so the reported accuracy is not
# measured on data held out from model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 3 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.114395,0.991927
2,0.220000,0.048302,0.992294
3,0.220000,0.032133,0.994862
4,0.036100,0.027547,0.994862
5,0.036100,0.024473,0.995229
6,0.023300,0.02316,0.995963
7,0.023300,0.022029,0.996697
8,0.017100,0.020931,0.996697
9,0.017100,0.021056,0.996697
10,0.014800,0.020189,0.996697


TrainOutput(global_step=2560, training_loss=0.061206577345728874, metrics={'train_runtime': 1425.8237, 'train_samples_per_second': 86.003, 'train_steps_per_second': 2.693, 'total_flos': 1.344907730302674e+16, 'train_loss': 0.061206577345728874, 'epoch': 10.0})

In [19]:
# Push the trainer's model files to its Hugging Face Hub repository.
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/vishnun0027/spam-detection_m1/commit/8dc1ab813066fc8b21aff13ec6a647137ec152fb', commit_message='End of training', commit_description='', oid='8dc1ab813066fc8b21aff13ec6a647137ec152fb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/vishnun0027/spam-detection_m1', endpoint='https://huggingface.co', repo_type='model', repo_id='vishnun0027/spam-detection_m1'), pr_revision=None, pr_num=None)