### BERT on Kaggle

#### Data Preparation

In [1]:
# Import resources
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# Load the pickled train/validation and test tweet frames and key both by tweet id.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
tweets_train_val = pd.read_pickle('/kaggle/input/dataset/df_train.pkl').set_index('tweet_id')
tweets_test = pd.read_pickle('/kaggle/input/dataset/df_test.pkl').set_index('tweet_id')

In [3]:
from sklearn import preprocessing, metrics, decomposition, pipeline, dummy
from sklearn.model_selection import train_test_split

# Encode the emotion strings as integer class ids, stored in a new `label` column.
mle = preprocessing.LabelEncoder()
tweets_train_val['label'] = mle.fit_transform(tweets_train_val['emotion'])

# Stratified 80/20 split over the tweet ids, so both parts keep the class proportions.
all_labels = tweets_train_val.label.values
tweets_train, tweets_val, y_train, y_val = train_test_split(
    tweets_train_val.index.values,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=all_labels,
)

# Record which split every row belongs to in a `data_type` column.
tweets_train_val['data_type'] = 'not_set'
for split_name, split_ids in (("train", tweets_train), ("val", tweets_val)):
    tweets_train_val.loc[split_ids, 'data_type'] = split_name

# Row counts per emotion / label / split (displayed as the cell output).
tweets_train_val.groupby(['emotion', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,hashtags,text
emotion,label,data_type,Unnamed: 3_level_1,Unnamed: 4_level_1
anger,0,train,31894,31894
anger,0,val,7973,7973
anticipation,1,train,199148,199148
anticipation,1,val,49787,49787
disgust,2,train,111281,111281
disgust,2,val,27820,27820
fear,3,train,51199,51199
fear,3,val,12800,12800
joy,4,train,412813,412813
joy,4,val,103204,103204


#### Data Preprocessing and Feature Engineering

In [4]:
# Load the tokenizer that belongs to the pre-trained 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path='bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
# Tokenise the tweet text of each split with identical settings: special tokens
# added, every sequence padded or truncated to exactly 100 tokens, attention
# masks included, and the result returned as PyTorch tensors.
encode_options = dict(
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    max_length=100,
    return_tensors='pt',
    truncation=True,
)

train_texts = tweets_train_val[tweets_train_val.data_type == "train"].text.values
val_texts = tweets_train_val[tweets_train_val.data_type == "val"].text.values

encoded_data_train = tokenizer.batch_encode_plus(train_texts, **encode_options)
encoded_data_val = tokenizer.batch_encode_plus(val_texts, **encode_options)

In [6]:
from torch.utils.data import DataLoader, TensorDataset

# Boolean row masks selecting each split of the labelled frame.
is_train = tweets_train_val.data_type == "train"
is_val = tweets_train_val.data_type == "val"

# Integer class labels per split, as tensors.
labels_train = torch.tensor(tweets_train_val[is_train].label.values)
labels_val = torch.tensor(tweets_train_val[is_val].label.values)

# Token ids and attention masks produced by the tokenizer.
input_ids_train = encoded_data_train['input_ids']
input_ids_val = encoded_data_val['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
attention_masks_val = encoded_data_val['attention_mask']

# Each dataset item is an (input_ids, attention_mask, label) triple.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

# Both loaders shuffle and yield mini-batches of 64 examples.
dataloader_train = DataLoader(dataset_train, batch_size=64, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=64, shuffle=True)







#### Model Setting and Training

In [13]:
from transformers import get_scheduler

# Mapping from emotion name to the integer id assigned by the label encoder.
label_dict = dict(zip(mle.classes_, mle.transform(mle.classes_)))

# Pre-trained BERT encoder with a sequence-classification head that has one
# output per emotion class. Attention maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False
)

# AdamW optimizer. The PyTorch implementation replaces the deprecated
# `transformers.AdamW`; `weight_decay=0.0` keeps the default of the old class
# (torch's own default is 0.01).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-05,
    eps=1e-08,
    weight_decay=0.0
)

# Learning-rate schedule: linear decay to zero over all optimizer steps, no
# warm-up. The scheduler name must be the lowercase string "linear".
num_epochs = 4
scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_epochs * len(dataloader_train)
)


Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [14]:
import random
from sklearn.metrics import f1_score

# Seed every random number generator used here (Python, NumPy, PyTorch CPU and
# all CUDA devices) with the same value.
# NOTE(review): the model object was created in an earlier cell, so this
# seeding presumably does not cover its weight initialisation -- confirm.
seed_val = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

# Move the model parameters onto the selected device.
model.to(device)

def evaluate(dataloader_val):
    """Return the weighted F1 score of the global `model` on a labelled loader.

    Args:
        dataloader_val: iterable of batches whose first three elements are
            input ids, attention masks and integer labels.

    Returns:
        The F1 score computed by `f1_score` with `average='weighted'`, using
        the arg-max of the logits as the predicted class.
    """
    model.eval()
    logit_chunks = []
    label_chunks = []

    # Inference only: no gradients are needed for any batch.
    with torch.no_grad():
        for batch in dataloader_val:
            on_device = tuple(t.to(device) for t in batch)
            outputs = model(
                input_ids=on_device[0],
                attention_mask=on_device[1],
                labels=on_device[2],
            )
            # With labels supplied, element 0 is the loss and element 1 the logits.
            logit_chunks.append(outputs[1].detach().cpu().numpy())
            label_chunks.append(on_device[2].cpu().numpy())

    all_logits = np.concatenate(logit_chunks, axis=0)
    predicted = np.argmax(all_logits, axis=1).flatten()
    expected = np.concatenate(label_chunks, axis=0).flatten()

    return f1_score(expected, predicted, average='weighted')

In [15]:
# Fine-tune the model. Each epoch makes one pass over the training loader,
# saves a checkpoint, and then scores the model on the validation loader.
print("starting training...")

for epoch in range(num_epochs):
    model.train()
    loss_train_total = 0
    progress_bar = tqdm(dataloader_train,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)

    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }
        outputs = model(**inputs)
        # With labels supplied, the first output element is the loss.
        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as-is. `batch` is the 3-tuple of tensors, so
        # dividing by len(batch) would only divide the value by 3.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

    # Checkpoint after every epoch; the prediction section loads one of these files.
    torch.save(model.state_dict(), 'finetuned_BERT_emoji_epoch_{}.model'.format(epoch))
    tqdm.write(f'\nEpoch {epoch}')
    # Mean loss over the batches of this epoch (guarded against an empty loader).
    loss_train_avg = loss_train_total / max(len(dataloader_train), 1)
    tqdm.write("epoch {} training loss: {}".format(epoch, loss_train_avg))
    f1_acc = evaluate(dataloader_val)
    tqdm.write("epoch {} validation f1 score: {}".format(epoch, f1_acc))

starting training...


  0%|          | 0/18195 [00:00<?, ?it/s]


Epoch 0
epoch 0 validation f1 score: 0.6391105586785453


  0%|          | 0/18195 [00:00<?, ?it/s]

KeyboardInterrupt: 

#### Test data prediction

In [16]:
# Restore the weights saved after the first training epoch (epoch index 0).
# The tensors are mapped to the CPU while loading.
checkpoint_path = '/kaggle/working/finetuned_BERT_emoji_epoch_0.model'
state_dict = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(state_dict)

# Tokenise the test tweets: special tokens added, padded or truncated to 100
# tokens, attention masks included, PyTorch tensors returned.
test_encode_options = dict(
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    max_length=100,
    return_tensors='pt',
    truncation=True,
)
encoded_data_test = tokenizer.batch_encode_plus(tweets_test.text.values, **test_encode_options)

In [17]:
# The test dataset holds only token ids and attention masks (no label tensor).
attention_masks_test = encoded_data_test['attention_mask']
input_ids_test = encoded_data_test['input_ids']
dataset_test = TensorDataset(input_ids_test, attention_masks_test)

# No shuffling: batches are produced in the dataset's own order.
dataloader_test = DataLoader(dataset=dataset_test, batch_size=64, shuffle=False)

In [18]:
model.to(device)


def testing(dataloader_test):
    """Predict a class id for every example of an unlabelled loader.

    Args:
        dataloader_test: iterable of batches whose first two elements are
            input ids and attention masks.

    Returns:
        A 1-D NumPy array with the arg-max class index of the logits of the
        global `model`, in the order the loader yields the examples.
    """
    model.eval()
    logit_chunks = []

    # Inference only: gradients are disabled for the whole pass.
    with torch.no_grad():
        for batch in dataloader_test:
            on_device = tuple(t.to(device) for t in batch)
            outputs = model(input_ids=on_device[0], attention_mask=on_device[1])
            # Without labels, the first output element holds the logits.
            logit_chunks.append(outputs[0].detach().cpu().numpy())

    all_logits = np.concatenate(logit_chunks, axis=0)
    return np.argmax(all_logits, axis=1).flatten()


# Run the prediction pass over the test loader.
predictions = testing(dataloader_test)

In [32]:
# Inverse mapping: numeric label id -> emotion name.
label_dict_inverse = {v: k for k, v in label_dict.items()}

# Translate the predicted ids to emotion names, aligned with the test frame rows.
predicted_emotions = pd.Series(predictions, index=tweets_test.index).map(label_dict_inverse)

# Store the predictions as the first column. `DataFrame.insert` raises a
# ValueError when the column already exists, so overwrite it in that case;
# this keeps the cell safe to re-run.
if "emotion" in tweets_test.columns:
    tweets_test["emotion"] = predicted_emotions
else:
    tweets_test.insert(0, "emotion", predicted_emotions)

In [35]:
# Build the submission frame: only the predicted emotion, with the index named 'id'.
# `rename_axis` returns a new frame instead of assigning to `.index.names`,
# which could also rename the index of `tweets_test` if the Index object is shared.
result_bert = tweets_test[['emotion']].rename_axis('id')
result_bert

Unnamed: 0_level_0,emotion
id,Unnamed: 1_level_1
0x28b412,anticipation
0x2de201,trust
0x218443,joy
0x2939d5,trust
0x26289a,trust
...,...
0x2913b4,anticipation
0x2a980e,anticipation
0x316b80,sadness
0x29d0cb,joy


In [34]:
# Write the predictions (index column included) to a CSV file in the working directory.
result_bert.to_csv(path_or_buf='bert_emoji.csv')