In [1]:
import numpy as np
import pandas as pd
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments, ProgressCallback
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from datasets import Dataset

from pathlib import Path

2024-05-25 17:41:53.829003: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-25 17:41:53.829113: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-25 17:41:53.946872: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Configuration
BATCH_SIZE = 128
MODEL_NAME = 'distilbert-base-uncased'
NUM_EPOCHS = 5
MAX_LR = 2.5e-4
WEIGHT_DECAY = 1e-5

## Load the data and check for missing values

In [3]:
data_path = Path('/kaggle/input/nlp-getting-started')
train = pd.read_csv(data_path / 'train.csv')
train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# check for missing values
train.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

## Dataset and tokenizer

In [5]:
# create dataset
train_dataset = Dataset.from_pandas(train[['text', 'target']]).class_encode_column("target")

# split into 80-20 training-test splits
train_dataset = train_dataset.train_test_split(test_size = 0.2,seed = 1)

train_dataset

Stringifying the column:   0%|          | 0/7613 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/7613 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'target'],
        num_rows: 6090
    })
    test: Dataset({
        features: ['text', 'target'],
        num_rows: 1523
    })
})

In [6]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=512)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
# Create a Dataset object from the DataFrame
tokenized_dataset = train_dataset.map(tokenize, batched=True, batch_size=BATCH_SIZE)

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

In [8]:
# Renaming the target column
tokenized_dataset = tokenized_dataset.rename_column("target", "labels")
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 6090
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1523
    })
})

## Load the pretrained model and train

In [9]:
# Load the DistilBERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
model

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [10]:
def compute_metrics(pred):
    labels = pred.label_ids
    logits = pred.predictions
    preds = logits.argmax(-1) # might not be the best threshold
    
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    auc = roc_auc_score(labels, logits[:, -1])
    
    return {"accuracy": accuracy, "f1": f1, "auc": auc}

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    report_to="none", # disable all integrations,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy="epoch", # to log training loss
    load_best_model_at_end=True,
    metric_for_best_model='f1'
)

In [11]:
# define the optimizer and scheduler
optimizer = AdamW(model.parameters(), weight_decay= WEIGHT_DECAY)
lr_scheduler = OneCycleLR(
    optimizer, max_lr = MAX_LR,
    epochs = NUM_EPOCHS,
    steps_per_epoch=int(np.ceil(len(tokenized_dataset['train']) / BATCH_SIZE)),
    anneal_strategy='cos',
    pct_start=0.2
)


# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, lr_scheduler),
)

# Fine-tune the model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Auc
1,0.5223,0.479255,0.822062,0.790734,0.885883
2,0.3577,0.443848,0.799081,0.769925,0.874286
3,0.205,0.491359,0.826658,0.784314,0.877261
4,0.1058,0.625332,0.803677,0.775019,0.874617
5,0.0647,0.678389,0.815496,0.776096,0.871656


TrainOutput(global_step=240, training_loss=0.25109943151474, metrics={'train_runtime': 130.7473, 'train_samples_per_second': 232.892, 'train_steps_per_second': 1.836, 'total_flos': 660211822942968.0, 'train_loss': 0.25109943151474, 'epoch': 5.0})

## Test predictions and submission

In [12]:
test = pd.read_csv(data_path / 'test.csv')
test_dataset_tokenized = (
    Dataset
    .from_pandas(test[['text']])
    .map(tokenize, batched=True, batch_size=BATCH_SIZE)
)

# returns logits
test_predictions = trainer.predict(test_dataset_tokenized)

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

In [13]:
submission = pd.DataFrame({
    'id': test['id'],
    'target':  test_predictions.predictions.argmax(-1)
})

submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [14]:
submission.to_csv('submission.csv', index=False)