In [1]:
import numpy as np
import pandas as pd
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments, ProgressCallback
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from datasets import Dataset

from pathlib import Path

2024-05-27 21:53:51.383512: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-27 21:53:51.383666: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-27 21:53:51.506039: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Configuration
BATCH_SIZE = 128
MODEL_NAME = 'roberta-base'
NUM_EPOCHS = 5
MAX_LR = 2.5e-4
WEIGHT_DECAY = 1e-5

## Load the data and check for missing values

In [3]:
data_path = Path('/kaggle/input/nlp-getting-started')
train = pd.read_csv(data_path / 'train.csv')
train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# check for missing values
train.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

## Dataset and tokenizer

In [5]:
# create dataset
train_dataset = Dataset.from_pandas(train[['text', 'target']]).class_encode_column("target")

# split into 80-20 training-test splits
train_dataset = train_dataset.train_test_split(test_size = 0.2,seed = 1)

train_dataset

Stringifying the column:   0%|          | 0/7613 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/7613 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'target'],
        num_rows: 6090
    })
    test: Dataset({
        features: ['text', 'target'],
        num_rows: 1523
    })
})

In [6]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=512)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [7]:
# Create a Dataset object from the DataFrame
tokenized_dataset = train_dataset.map(tokenize, batched=True, batch_size=BATCH_SIZE)

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

In [8]:
# Renaming the target column
tokenized_dataset = tokenized_dataset.rename_column("target", "labels")
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 6090
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1523
    })
})

## Load the pretrained model and train

In [9]:
# Load the DistilBERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [10]:
def compute_metrics(pred):
    labels = pred.label_ids
    logits = pred.predictions
    preds = logits.argmax(-1) # might not be the best threshold
    
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    auc = roc_auc_score(labels, logits[:, -1])
    
    return {"accuracy": accuracy, "f1": f1, "auc": auc}

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    report_to="none", # disable all integrations,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy="epoch", # to log training loss
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    fp16=True
)

In [11]:
# define the optimizer and scheduler
optimizer = AdamW(model.parameters(), weight_decay= WEIGHT_DECAY)
lr_scheduler = OneCycleLR(
    optimizer, max_lr = MAX_LR,
    epochs = NUM_EPOCHS,
    steps_per_epoch=int(np.ceil(len(tokenized_dataset['train']) / BATCH_SIZE)),
    anneal_strategy='cos',
    pct_start=0.2
)


# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, lr_scheduler),
)

# Fine-tune the model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Auc
1,0.5728,0.479593,0.782009,0.748865,0.865277
2,0.5693,0.684491,0.588969,0.039877,0.539666
3,0.7252,0.759777,0.41891,0.590467,0.665241
4,0.6985,0.680048,0.58109,0.0,0.413188
5,0.6851,0.679762,0.58109,0.0,0.547274


TrainOutput(global_step=240, training_loss=0.6501601775487263, metrics={'train_runtime': 254.4033, 'train_samples_per_second': 119.692, 'train_steps_per_second': 0.943, 'total_flos': 1325690191319160.0, 'train_loss': 0.6501601775487263, 'epoch': 5.0})

## Test predictions and submission

In [12]:
test = pd.read_csv(data_path / 'test.csv')
test_dataset_tokenized = (
    Dataset
    .from_pandas(test[['text']])
    .map(tokenize, batched=True, batch_size=BATCH_SIZE)
)

# returns logits
test_predictions = trainer.predict(test_dataset_tokenized)

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

In [13]:
submission = pd.DataFrame({
    'id': test['id'],
    'target':  test_predictions.predictions.argmax(-1)
})

submission.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1


In [14]:
submission.to_csv('submission.csv', index=False)