### A few functions taken from "Natural Language Processing with Transformers: Building Language Applications with Hugging Face"

This notebook demonstrate how to use transformers with fine tuning. I used only the basic distilbert model, without any special care and it done well.

In [1]:
import numpy as np
import pandas as pd
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [2]:
!ls /kaggle/input

nlp-getting-started


In [3]:
X_raw = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv", index_col=0)
# target should be renamed to label for transformers to recognize it as its label
X_raw = X_raw.rename(columns={"target": "label"})
X_test_raw = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv", index_col=0)

# test init labels to 0
X_test_raw["label"] = 0
submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv", index_col=0)

print(X_raw.columns)
print(X_test_raw.columns)
print(submission.columns)

Index(['keyword', 'location', 'text', 'label'], dtype='object')
Index(['keyword', 'location', 'text', 'label'], dtype='object')
Index(['target'], dtype='object')


In [4]:
col_unuse = [
    'keyword', 'location'
]

X = X_raw.drop(col_unuse, axis = 1)
X_test = X_test_raw.drop(col_unuse, axis = 1)
print(X.shape)
print(X_test.shape)
X

(7613, 2)
(3263, 2)


Unnamed: 0_level_0,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Our Deeds are the Reason of this #earthquake M...,1
4,Forest fire near La Ronge Sask. Canada,1
5,All residents asked to 'shelter in place' are ...,1
6,"13,000 people receive #wildfires evacuation or...",1
7,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
10869,Two giant cranes holding a bridge collapse int...,1
10870,@aria_ahrary @TheTawniest The out of control w...,1
10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
10872,Police investigating after an e-bike collided ...,1


In [5]:
# simple validation
val_pct = 0.3
train_test_roll = np.random.uniform(size=X.shape[0])
X_train = X[train_test_roll >= val_pct]
X_val = X[train_test_roll < val_pct]
print(X_train.shape)
print(X_val.shape)

(5342, 2)
(2271, 2)


In [6]:
# Prepare data for ingestion
from datasets import Dataset, DatasetDict

dataset_train = Dataset.from_pandas(X_train[["text", "label"]])
dataset_val = Dataset.from_pandas(X_val[["text", "label"]])
dataset_test = Dataset.from_pandas(X_test[["text", "label"]])
dataset = DatasetDict({"train": dataset_train, "val": dataset_val, "test": dataset_test})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'id'],
        num_rows: 5342
    })
    val: Dataset({
        features: ['text', 'label', 'id'],
        num_rows: 2271
    })
    test: Dataset({
        features: ['text', 'label', 'id'],
        num_rows: 3263
    })
})

In [7]:
from transformers import AutoTokenizer

model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [8]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

In [9]:
# Tokenize
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)
dataset_encoded

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'id', 'input_ids', 'attention_mask'],
        num_rows: 5342
    })
    val: Dataset({
        features: ['text', 'label', 'id', 'input_ids', 'attention_mask'],
        num_rows: 2271
    })
    test: Dataset({
        features: ['text', 'label', 'id', 'input_ids', 'attention_mask'],
        num_rows: 3263
    })
})

In [10]:
# Get latest input stage
def extract_hidden_states(batch):
    # Place model inputs on the GPU
    inputs = {k:v.to(device) for k,v in batch.items()
              if k in tokenizer.model_input_names}
    # Extract last hidden states
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    # Return vector for [CLS] token
    return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}

In [11]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [12]:
# Preparing code to run in model
dataset_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])


In [13]:
from transformers import AutoModelForSequenceClassification

num_labels = 2
# Ready model for finetuning for classification
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=num_labels)
         .to(device))

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifi

In [14]:
from transformers import Trainer, TrainingArguments

batch_size = 64
logging_steps = len(dataset_encoded["train"]) // batch_size
model_name = f"{model_ckpt}-finetuned-disaster"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=4,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  report_to="none",
                                  logging_steps=logging_steps,
                                  push_to_hub=False,
                                  log_level="error")

In [15]:
from transformers import Trainer

trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=dataset_encoded["train"],
                  eval_dataset=dataset_encoded["val"],
                  tokenizer=tokenizer)
trainer.train();

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4928,0.407684,0.822985,0.822067
2,0.3521,0.409629,0.82915,0.828362
3,0.303,0.434415,0.816821,0.816821
4,0.2628,0.444045,0.81638,0.816393


In [16]:
dataset_train_full = Dataset.from_pandas(X[["text", "label"]])
dataset_test = Dataset.from_pandas(X_test[["text", "label"]])
dataset = DatasetDict({"train": dataset_train_full, "test": dataset_test})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'id'],
        num_rows: 7613
    })
    test: Dataset({
        features: ['text', 'label', 'id'],
        num_rows: 3263
    })
})

In [17]:
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [18]:
dataset_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])


In [19]:
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=num_labels)
         .to(device))

In [20]:
model_name = f"{model_ckpt}-finetuned-disaster_full"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=4,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  disable_tqdm=False,
                                  report_to="none",
                                  logging_steps=logging_steps,
                                  push_to_hub=False,
                                  log_level="error")

In [21]:
from transformers import Trainer

trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=dataset_encoded["train"],
                  tokenizer=tokenizer)
trainer.train();

Step,Training Loss
83,0.4861
166,0.3555
249,0.3442
332,0.2987
415,0.2872


In [22]:
preds_output = trainer.predict(dataset_encoded["test"])
preds_output

PredictionOutput(predictions=array([[-1.3586378,  1.1642065],
       [-2.1424394,  2.1221132],
       [-2.039575 ,  1.9563769],
       ...,
       [-2.340807 ,  2.264688 ],
       [-1.7295129,  1.8334826],
       [-1.4943331,  1.6878994]], dtype=float32), label_ids=array([0, 0, 0, ..., 0, 0, 0]), metrics={'test_loss': 1.4131245613098145, 'test_accuracy': 0.6068035550107264, 'test_f1': 0.7552927713141331, 'test_runtime': 4.6194, 'test_samples_per_second': 706.368, 'test_steps_per_second': 11.04})

In [23]:
submission["target"] = (preds_output.predictions[:, 1] > 0).astype(int)
submission

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,1
3,1
9,1
11,1
...,...
10861,1
10865,1
10868,1
10874,1


In [24]:
submission.to_csv("submission.csv")