### imports

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, Dataset
from torch.utils.data import RandomSampler, SequentialSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics import precision_recall_curve, plot_precision_recall_curve

import transformers
from transformers import RobertaModel, RobertaTokenizerFast
from transformers import AdamW

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
wpt = nltk.WordPunctTokenizer()
stop_words = stopwords.words('english')

### Data preprocessing

In [5]:
def normalize_str(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[:!.,\-]', ' ', doc, re.I|re.A)
    doc = re.sub(r'ё', 'е', doc, re.T|re.A)
    doc = re.sub(r'[^@a-zA-ZА-я\s\d]', ' ', doc, re.I|re.A)
    doc = doc.strip()
    
    # clean text
    doc = re.sub(r'[^А-яa-zA-Z\s]', '', doc, re.I|re.A)
    doc = re.sub(r'[a-zA-z\d]{1,3}', '', doc, re.I|re.A)
    doc = re.sub(r' \w ', ' ', doc, re.A|re.I)
    
    # tokenize document
    tokens = wpt.tokenize(doc)
    
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

In [6]:
class TwitterDataset(Dataset):
    def __init__(self, dataframe, is_train=True, is_val=False, is_test=False):
        # set tokenizer
        self.tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
        self.is_train, self.is_val, self.is_test = is_train, is_val, is_test
        # prepate text data
        self.text_data = dataframe['tweet'].apply(normalize_str)
        
        tokens = self.tokenizer.batch_encode_plus(self.text_data.tolist(),
                                                  max_length=100,
                                                  pad_to_max_length=True,
                                                  truncation=True)
        
        self.seq = torch.tensor(tokens['input_ids'])
        self.mask = torch.tensor(tokens['attention_mask'])
        
        # set labels
        self.labels = torch.tensor(dataframe['label'].tolist())
    
    
    def __len__(self):
        return len(self.text_data)
    
    
    def __getitem__(self, index):
        if self.is_train or self.is_val:
            return (self.seq[index], self.mask[index], self.labels[index])
        else:
            return (self.seq[index], self.mask[index])

### Model definition

In [7]:
device = torch.device("cuda")

In [8]:
# import pretrained model
roberta = RobertaModel.from_pretrained('roberta-base')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




In [9]:
batch_size = 32

In [10]:
# freeze all the layers of the model before fine-tuning it
for param in roberta.parameters():
    param.requires_grad = False

In [11]:
class Roberta(nn.Module):
    def __init__(self, roberta):
        super(Roberta, self).__init__()
        
        self.roberta = roberta
        
        self.text = nn.Sequential(nn.Linear(768, 512),
                                  nn.BatchNorm1d(512),
                                  nn.ReLU(),
                                  nn.Dropout(p=0.4),
                                  nn.Linear(512, 512),
                                  )
        
        #dense layer (Output)
        self.output = nn.Linear(512, 2)
        
        #sigmoid activation
        self.softmax = nn.LogSoftmax(dim=1)
        
        
    # define forward pass
    def forward(self, sent_id, mask):
        
        # pass inputs to the model
        _, cls_hs = self.roberta(sent_id, attention_mask=mask)

        x = self.text(cls_hs)
        
        # output layer
        out = self.output(x)
        
        #apply softmax
        out = self.softmax(out)
        
        return out

In [12]:
# pass pre-trained RoBERTa to our architecture
model = Roberta(roberta)

# pass the model to gpu
model = model.to(device)

In [13]:
#define optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

### Initialize data

In [15]:
full_dataset = pd.read_csv('train_tweet.csv')

train_size = round(full_dataset.shape[0] * 0.8)
train_dataset = full_dataset[:train_size]
val_dataset = full_dataset[train_size:]

In [16]:
train = TwitterDataset(train_dataset,
                       is_train=True, 
                       is_val=False, 
                       is_test=False)

train_sampler = RandomSampler(train)

train_dataloader = DataLoader(train,
                              sampler=train_sampler,
                              batch_size=batch_size)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…






In [17]:
val = TwitterDataset(val_dataset,
                     is_train=False,
                     is_val=True, 
                     is_test=False)

val_sampler = SequentialSampler(val)

val_dataloader = DataLoader(val,
                            sampler=val_sampler,
                            batch_size=batch_size)



In [18]:
# compute class weights
class_weights = compute_class_weight('balanced',
                                     np.unique(train_dataset['label']),
                                     train_dataset['label'])
print('Class weights:', class_weights)

Class weights: [0.53784023 7.10672596]


In [19]:
# convert list of class weights to tensor
weights = torch.tensor(class_weights, dtype=torch.float)

#push to GPU
weights = weights.to(device)

# define loss function
cross_entropy = nn.NLLLoss(weight=weights)

# number of epochs
epochs = 10

### Function for fitting

In [20]:
def train():
    model.train()
    
    total_loss, total_accuracy = 0, 0
    
    # empty list to save model predictions
    total_preds = []
    
    #iterate over batches
    for batch in tqdm(train_dataloader):
        
        # push the batch to GPU
        batch = [r.to(device) for r in batch]
        
        sent_id, mask, labels = batch
                
        # clear previously calculated gradients
        model.zero_grad()
        
        # get the model predictions for current batch
        preds = model(sent_id, mask)
        
        # compute the loss between actual and predicted values
        loss = cross_entropy(preds, labels)
        
        # add on to the total loss
        total_loss = total_loss + loss.item()
        
        # backward pass to calculate the gradients
        loss.backward()
        
        # clip the gradients to 1.0. (anti exploding gradient)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # update parameters
        optimizer.step()
        
        # model predictions are stored on GPU. Push them to CPU
        preds = preds.detach().cpu().numpy()
        
        # append model predictions
        total_preds.append(preds)
        
    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)
    
    # predictions are in the form of (no. of batches, size of batch, no. of classes).
    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)
    
    return avg_loss, total_preds

### Function for evaluating

In [21]:
def evaluate():
    print('\nEvaluating...')
    
    # deactivate dropout layers
    model.eval()
    
    total_loss, total_accuracy = 0, 0
    
    # empty list to save model predictions
    total_preds = []
    
    # iterate over batches
    for batch in tqdm(val_dataloader):
        
        # push batch to GPU
        batch = [t.to(device) for t in batch]
        
        sent_id, mask, labels = batch
        
        # deactivate autograd
        with torch.no_grad():
            # model predictions
            preds = model(sent_id, mask)
            
            # compute validation loss between actual and predicted values
            loss = cross_entropy(preds, labels)
            
            total_loss = total_loss + loss.item()
            
            preds = preds.detach().cpu().numpy()
            
            total_preds.append(preds)
    
    avg_loss = total_loss / len(val_dataloader)
    
    total_preds = np.concatenate(total_preds, axis=0)
    
    return avg_loss, total_preds

### Finally train the model

In [22]:
# set initial loss to +inf
best_valid_loss = float('inf')

# empty lists for train and val losses for each epoch
train_losses = []
valid_losses = []

for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    
    # train model
    train_loss, _ = train()
    
    # evaluate model
    valid_loss, _ = evaluate()
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'roberta-parameters.pt')
    
    # append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    
    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

  0%|          | 0/800 [00:00<?, ?it/s]


 Epoch 1 / 10


100%|██████████| 800/800 [02:20<00:00,  5.71it/s]
  0%|          | 1/200 [00:00<00:30,  6.47it/s]


Evaluating...


100%|██████████| 200/200 [00:32<00:00,  6.11it/s]
  0%|          | 1/800 [00:00<02:20,  5.68it/s]


Training Loss: 0.736
Validation Loss: 0.685

 Epoch 2 / 10


100%|██████████| 800/800 [02:33<00:00,  5.20it/s]
  0%|          | 1/200 [00:00<00:31,  6.27it/s]


Evaluating...


100%|██████████| 200/200 [00:33<00:00,  5.92it/s]
  0%|          | 1/800 [00:00<02:32,  5.23it/s]


Training Loss: 0.732
Validation Loss: 0.687

 Epoch 3 / 10


100%|██████████| 800/800 [02:35<00:00,  5.16it/s]
  0%|          | 1/200 [00:00<00:32,  6.20it/s]


Evaluating...


100%|██████████| 200/200 [00:33<00:00,  5.93it/s]
  0%|          | 1/800 [00:00<02:36,  5.10it/s]


Training Loss: 0.736
Validation Loss: 0.694

 Epoch 4 / 10


  1%|▏         | 11/800 [00:02<02:33,  5.13it/s]


KeyboardInterrupt: ignored

### Testing

In [36]:
# pass pre-trained BERT to our architecture
model_test = Roberta(roberta)

# pass the model to gpu
model_test = model_test.to(device)

#load weights of best model
path = 'roberta-parameters.pt'
model_test.load_state_dict(torch.load(path))

<All keys matched successfully>

In [38]:
def test():

    model.eval()

    cur_preds = []
    cur_labels = []

    # predict
    for batch in tqdm(val_dataloader):
        batch = [t.to(device) for t in batch]

        sent_id, mask, labels = batch

        with torch.no_grad():
            preds = model(sent_id, mask)

            preds = preds.detach().cpu().numpy()
            labels = labels.detach().cpu().numpy()

            cur_preds.append(preds)
            cur_labels.append(labels)

    cur_preds = np.concatenate(cur_preds, axis=0)
    cur_labels = np.concatenate(cur_labels, axis=0)

    cur_preds_binary = np.argmin(cur_preds, axis=1)

    cur_accuracy = accuracy_score(cur_labels, cur_preds_binary)
    
    pr_curve = precision_recall_curve(cur_labels, cur_preds_binary)
    
    print("\nAccuracy:  " + str(cur_accuracy))

In [39]:
test()

100%|██████████| 200/200 [00:30<00:00,  6.49it/s]


Accuracy:  0.9306946182728411



