# BERT-Based NLP Project

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

In [2]:
# Pretrained "bert-base-cased" checkpoint with a 2-label sequence-classification head.
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2,
)

# Freeze every parameter whose name does not contain 'classifier', so that
# only the classification head is left trainable.
for name, param in model.named_parameters():
    if 'classifier' in name:
        continue
    param.requires_grad = False

# Use the GPU when one is available, otherwise fall back to the CPU.
# `model.to(device)` stays the last expression so the notebook displays the
# module structure as the cell output.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [3]:
# Load the labelled training data (the classes are assumed to be balanced;
# that is not verified here).
og_df = pd.read_csv('train.csv')

# Shuffle the rows. The shuffle is seeded: without a random_state here the
# row order differs on every run, so the seeded split below would still
# produce a different train/validation partition each time.
shuffle_df = og_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Hold out 20% of the shuffled rows as the validation set.
from sklearn.model_selection import train_test_split
train, valid = train_test_split(shuffle_df, test_size=0.2, random_state=42)

In [4]:
# Fold the keyword and location columns into the text column as
# "<keyword>; <location>; <text>"; missing keywords/locations become ''.
# NOTE: 'text' is overwritten in place, so re-running this cell prepends the
# prefix a second time.
for frame in (train, valid):
    prefix = frame['keyword'].fillna('') + '; ' + frame['location'].fillna('') + '; '
    frame['text'] = prefix + frame['text']

In [5]:
# Load the tokenizer for the same "bert-base-cased" checkpoint the model was
# loaded from, requesting the fast implementation (use_fast=True).
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)

In [6]:
# Tokenize the train and validation texts, one batched call per split, with
# truncation and padding enabled and a maximum length of 128 tokens.
# No return_tensors argument is passed; the conversion to torch tensors
# happens per item inside DisasterDataset.__getitem__.
tokenizer_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train['text'].tolist(), **tokenizer_options)
valid_encodings = tokenizer(valid['text'].tolist(), **tokenizer_options)

In [7]:
# create a torch dataset and dataloader for the train and test data
from torch.utils.data import Dataset, DataLoader

class DisasterDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a sequence holding one entry
            per example; every entry is converted with ``torch.tensor`` when
            an item is fetched.
        labels: Sequence with one label per example, or ``None`` for
            unlabeled data (e.g. a test set). When ``None``, returned items
            carry no ``'labels'`` key, so no placeholder targets are needed.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples."""
        if self.labels is not None:
            return len(self.labels)
        # Without labels, take the size from the first encoding field
        # (0 when there are no fields at all).
        return len(next(iter(self.encodings.values()), ()))

# Wrap the tokenized splits, using the 'target' column as the labels.
train_dataset = DisasterDataset(train_encodings, list(train['target']))
valid_dataset = DisasterDataset(valid_encodings, list(valid['target']))

# Data loaders for the train and validation splits.
batch_size_curr = 16

# Reshuffle the training examples every epoch so batches are not presented in
# the same fixed order each time; the validation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size_curr, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size_curr, shuffle=False)


## Manual PyTorch training loop

In [8]:
# Sanity check: push one validation batch and one training batch through the
# model. These are forward passes only, so gradient tracking is disabled.
with torch.no_grad():
    for batch in valid_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        break

    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        break

# Optimizer shared by the rest of the notebook. transformers.AdamW is
# deprecated in favour of torch.optim.AdamW; eps and weight_decay are given
# explicitly to match the defaults of the transformers implementation this
# replaces. All parameters are registered (not only the trainable ones)
# because a later cell unfreezes the whole model and keeps using this optimizer.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Test a single optimisation step on one training batch.
model.train()
for batch in train_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    optimizer.zero_grad()
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    break

# Clear the gradients of the test step so they cannot accumulate into the
# first real training step.
optimizer.zero_grad()



In [12]:
# training loop
from tqdm import tqdm
from sklearn.metrics import f1_score

def train_loop(epochs):
    """Train the notebook-level ``model`` and report validation F1 per epoch.

    Relies on the globals ``model``, ``optimizer``, ``device``,
    ``train_loader`` and ``valid_loader``. Each epoch runs one pass over
    ``train_loader``, then evaluates on ``valid_loader`` and prints the F1
    score.

    Args:
        epochs: Number of passes over ``train_loader``.

    Returns:
        The highest validation F1 observed across the epochs (``0`` when no
        epoch produced a score above zero).
    """
    best_f1 = 0
    for epoch in range(epochs):
        model.train()
        # Drop any gradients left behind by code that ran before this call so
        # they are not accumulated into the first step.
        optimizer.zero_grad()
        for batch in tqdm(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        model.eval()
        preds = []
        labels = []
        # Validation is inference only: skip building the autograd graph.
        with torch.no_grad():
            for batch in valid_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                logits = outputs.logits
                preds.extend(torch.argmax(logits, dim=1).tolist())
                labels.extend(batch['labels'].tolist())
        f1 = f1_score(labels, preds)
        if f1 > best_f1:
            best_f1 = f1
        print(f'Epoch {epoch} - f1: {f1}')
    return best_f1


In [15]:
# Baseline: F1 score on the validation set before train_loop has been run.
model.eval()
preds = []
labels = []
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    for batch in valid_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        preds.extend(torch.argmax(logits, dim=1).tolist())
        labels.extend(batch['labels'].tolist())
f1 = f1_score(labels, preds)
print(f'Untrained model - f1: {f1}')


Untrained model - f1: 0.1769230769230769


In [None]:
# Train the model for several epochs and get the best F1 score.
# (Disabled: the two lines below are commented out, so this cell only saves.)
# best_f1 = train_loop(5)
# print(f'Train 1 Epoch - f1: {best_f1}')

# Save the current model weights (state_dict) to disk.
torch.save(model.state_dict(), 'model_f1_05333.pt')

In [18]:
# Run a single training epoch; train_loop returns the best validation F1.
best_f1 = train_loop(epochs=1)
print(f'Train 1 Epoch - f1: {best_f1}')

100%|██████████| 381/381 [06:00<00:00,  1.06it/s]


Epoch 0 - f1: 0.49907918968692455
Train 1 Epoch - f1: 0.49907918968692455


In [22]:
# Restore the weights saved in 'model_f1_05333.pt'. map_location=device lets
# a checkpoint written on a GPU machine be loaded on a CPU-only one.
model.load_state_dict(torch.load('model_f1_05333.pt', map_location=device))
# Freeze everything except the classifier head again.
for name, param in model.named_parameters():
    if 'classifier' not in name:
        param.requires_grad = False

In [23]:
# Accuracy of the reloaded model on the validation set.
model.eval()
preds = []
labels = []
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    for batch in valid_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        preds.extend(torch.argmax(logits, dim=1).tolist())
        labels.extend(batch['labels'].tolist())

# Fraction of validation tweets whose predicted class equals the label.
correct = sum(pred == label for pred, label in zip(preds, labels))

print(f'Percentage correctly classified: {correct/len(preds)}')

Percentage correctly classified: 0.6572554169402495


In [24]:
# Train the model for 5 epochs and report the best validation F1 among them.
# (The message previously said "1 Epoch" although 5 epochs are run.)
best_f1 = train_loop(5)
print(f'Train 5 Epochs - f1: {best_f1}')

100%|██████████| 381/381 [06:12<00:00,  1.02it/s]


Epoch 0 - f1: 0.5576592082616179


100%|██████████| 381/381 [06:10<00:00,  1.03it/s]


Epoch 1 - f1: 0.5530434782608695


100%|██████████| 381/381 [06:55<00:00,  1.09s/it]


Epoch 2 - f1: 0.5847953216374269


100%|██████████| 381/381 [06:08<00:00,  1.03it/s]


Epoch 3 - f1: 0.6014790468364831


100%|██████████| 381/381 [07:09<00:00,  1.13s/it]


Epoch 4 - f1: 0.6166263115415657
Train 1 Epoch - f1: 0.6166263115415657


In [25]:
# Accuracy on the validation set after the additional classifier-head epochs.
model.eval()
preds = []
labels = []
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    for batch in valid_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        preds.extend(torch.argmax(logits, dim=1).tolist())
        labels.extend(batch['labels'].tolist())

# Fraction of validation tweets whose predicted class equals the label.
correct = sum(pred == label for pred, label in zip(preds, labels))

print(f'Percentage correctly classified: {correct/len(preds)}')

Percentage correctly classified: 0.6881155613919895


In [26]:
# Save the current model weights (state_dict) to disk.
# NOTE(review): '68811' matches the accuracy printed by the preceding cell
# (0.6881...), not an F1 score, despite the 'f1' in the file name.
torch.save(model.state_dict(), 'model_f1_68811.pt')

In [27]:
# Restore the weights saved in 'model_f1_68811.pt'. map_location=device lets
# a checkpoint written on a GPU machine be loaded on a CPU-only one.
model.load_state_dict(torch.load('model_f1_68811.pt', map_location=device))

<All keys matched successfully>

In [28]:
# Unfreeze: mark every model parameter as trainable.
for param in model.parameters():
    param.requires_grad_(True)

# Fine-tune the whole model for one epoch; train_loop prints the validation F1.
best_f1 = train_loop(epochs=1)

100%|██████████| 381/381 [13:54<00:00,  2.19s/it]


Epoch 0 - f1: 0.7812739831158865


In [29]:
# Accuracy on the validation set after one epoch with all parameters trainable.
model.eval()
preds = []
labels = []
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    for batch in valid_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        preds.extend(torch.argmax(logits, dim=1).tolist())
        labels.extend(batch['labels'].tolist())

# Fraction of validation tweets whose predicted class equals the label.
correct = sum(pred == label for pred, label in zip(preds, labels))

print(f'Percentage correctly classified: {correct/len(preds)}')

Percentage correctly classified: 0.8128693368351937


In [30]:
# Save the current model weights (state_dict) to disk.
# NOTE(review): '81286' matches the accuracy printed by the preceding cell
# (0.8128...), not an F1 score, despite the 'f1' in the file name.
torch.save(model.state_dict(), 'model_f1_81286.pt')

In [31]:
# Run one more training epoch; train_loop prints the validation F1 it reaches.
best_f1 = train_loop(epochs=1)

100%|██████████| 381/381 [13:59<00:00,  2.20s/it]


Epoch 0 - f1: 0.7792828685258963


In [32]:
# Accuracy on the validation set after the second full fine-tuning epoch.
model.eval()
preds = []
labels = []
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    for batch in valid_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        preds.extend(torch.argmax(logits, dim=1).tolist())
        labels.extend(batch['labels'].tolist())

# Fraction of validation tweets whose predicted class equals the label.
correct = sum(pred == label for pred, label in zip(preds, labels))

print(f'Percentage correctly classified: {correct/len(preds)}')

Percentage correctly classified: 0.8181221273801708


In [34]:
# Load the test dataset and make predictions.
test_df = pd.read_csv('test.csv')

# Prepare the text the same way as for training: "<keyword>; <location>; <text>".
test_df['text'] = test_df['keyword'].fillna('') + '; ' + test_df['location'].fillna('') + '; ' + test_df['text']

# Tokenize with the same settings used for the train/validation splits.
test_encodings = tokenizer(list(test_df['text']), truncation=True, padding=True, max_length=128)

# The test set has no targets; placeholder labels are supplied only to
# construct the dataset and are removed again before the forward pass.
fake_targets = [0] * len(test_df)
test_dataset = DisasterDataset(test_encodings, fake_targets)

# Keep the original row order so predictions line up with test_df['id'].
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Make predictions (inference only, so gradient tracking is switched off).
model.eval()
preds = []
with torch.no_grad():
    for batch in test_loader:
        # Drop the placeholder labels so the model does not compute a
        # meaningless loss against them.
        batch = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        outputs = model(**batch)
        logits = outputs.logits
        preds.extend(torch.argmax(logits, dim=1).tolist())

# Create the submission file: one predicted target per test id.
submission = pd.DataFrame({'id': test_df['id'], 'target': preds})
submission.to_csv('submission.csv', index=False)


In [35]:
# Save the final model weights (state_dict) to disk.
torch.save(model.state_dict(), 'not_bad_bert_model.pt')