In [1]:
#install dependecies
!pip install transformers==3.0.2



In [2]:
# This code is adapted from multiple sources. The links are listed below:
# HuggingFace transformer notebooks and community notebooks -
# https://huggingface.co/transformers/notebooks.html
# https://github.com/DhavalTaunk08/NLP_scripts

# importing dependencies
import pandas as pd
import numpy as np
import torch
from transformers import RobertaModel, RobertaTokenizer
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
import logging
# Configure the root logger at ERROR level so lower-severity records are hidden.
logging.basicConfig(level=logging.ERROR)
# Seed PyTorch's default random generator for reproducibility.
torch.manual_seed(7)

<torch._C.Generator at 0x7fecdcd9eb88>

In [3]:
# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
# reading datasets
# Both CSV files must provide at least the `title` and `label` columns used below.
train = pd.read_csv('train_attr.csv')
test = pd.read_csv('test_attr.csv')

# extracting required fields
# `text_final` joins the string form of every column in `cols` with '. ' per row;
# with the single column listed here it is just the title rendered as a string.
cols =['title']
train['text_final'] = train[cols].apply(lambda row: '. '.join(row.values.astype(str)), axis=1).tolist()
test['text_final'] = test[cols].apply(lambda row: '. '.join(row.values.astype(str)), axis=1).tolist()
# Only `title` and `label` are carried forward; `text_final` is not selected here.
train_df = train[['title', 'label']]
test_df = test[['title', 'label']]

In [5]:
# Setting hyper parameters
MAX_LEN = 128  # max_len handed to every StockData dataset
TRAIN_BATCH_SIZE = 36  # batch size of both training loaders
VALID_BATCH_SIZE = 16  # batch size shared by the validation and test loaders
# EPOCHS = 1
LEARNING_RATE = 2.5e-05  # learning rate passed to the Adam optimizer
# NOTE(review): `do_lower_case=True` is passed although roberta-base is a cased
# checkpoint — confirm it is intended and whether this tokenizer honours it.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

In [6]:
class StockData(Dataset):
    """Torch ``Dataset`` that tokenizes the ``title`` column of a dataframe on access.

    Args:
        dataframe: Frame exposing a ``title`` (text) and a ``label`` column.
        tokenizer: Object with an ``encode_plus`` method (a ``RobertaTokenizer``
            in this notebook).
        max_len: Length every encoded sequence is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.title
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (one example per title)."""
        return len(self.text)

    def __getitem__(self, index):
        """Encode the ``index``-th row and return its tensors.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (``torch.long``)
            and ``targets`` (``torch.float``; the training loop casts it to long).
        """
        # Positional lookup: samplers hand out 0..len-1, which only coincides
        # with label-based lookup when the frame has a fresh RangeIndex.
        text = str(self.text.iloc[index])
        # Collapse any run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit forms of the deprecated `pad_to_max_length=True` and of
            # the implicit truncation that `max_length` alone falls back to.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [7]:
# Randomly sample 85% of the training frame; the remaining rows become validation.
train_size = 0.85
train_data=train_df.sample(frac=train_size,random_state=2)
# Must run before train_data's index is reset below: the sampled index labels
# identify which rows to drop from train_df.
val_data = train_df.drop(train_data.index).reset_index(drop=True)
test_data=test_df
train_data = train_data.reset_index(drop=True)
# The whole training frame (sampled rows + validation rows), used by the final training loader.
final_train = train_df
print("TRAIN Dataset: {}".format(train_data.shape))
print("Val dataset : {}".format(val_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

# One tokenizing dataset per frame, all sharing the tokenizer and MAX_LEN.
training_set = StockData(train_data, tokenizer, MAX_LEN)
final_training_set = StockData(final_train, tokenizer, MAX_LEN)
val_set = StockData(val_data, tokenizer, MAX_LEN)
testing_set = StockData(test_data, tokenizer, MAX_LEN)

TRAIN Dataset: (7367, 2)
Val dataset : (1300, 2)
TEST Dataset: (2040, 2)


In [8]:
# Training batches are shuffled; evaluation batches keep dataset order.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

# Both training loaders share train_params; validation and test share test_params.
training_loader = DataLoader(training_set, **train_params)
final_training_loader = DataLoader(final_training_set, **train_params)
val_loader = DataLoader(val_set, **test_params)
testing_loader = DataLoader(testing_set, **test_params)

In [9]:
class RobertaClass(torch.nn.Module):
    """Pretrained RoBERTa encoder followed by a small classification head.

    The head is Linear -> ReLU -> Dropout -> Linear, applied to the hidden
    state of the first token of each sequence.

    Args:
        num_classes: Number of output logits. Defaults to 5, the value that was
            previously hard-coded.
            NOTE(review): the recorded losses (~0.69) hint that the labels may
            be binary — confirm the label set and pass ``num_classes`` to match.
        dropout: Dropout probability used inside the head (default 0.3).
        pretrained_name: Checkpoint passed to ``RobertaModel.from_pretrained``
            (default ``"roberta-base"``).
    """

    def __init__(self, num_classes=5, dropout=0.3, pretrained_name="roberta-base"):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained(pretrained_name)
        # Read the encoder width from its config rather than hard-coding 768,
        # so other checkpoint sizes work too.
        hidden_size = self.l1.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        # Built once here instead of on every forward pass (ReLU has no parameters).
        self.activation = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the classification logits for a batch of encoded sequences."""
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # First element of the encoder output; position 0 along dim 1 is then
        # taken as the per-sequence summary vector.
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = self.activation(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [10]:
# Build the classifier and move it to the selected device.
model = RobertaClass()
model.to(device)

RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, eleme

In [11]:
# Creating the loss function and optimizer
# CrossEntropyLoss is applied to the model's raw outputs and integer class targets.
loss_function = torch.nn.CrossEntropyLoss()
# Adam updates every parameter of `model` with a single learning rate.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [12]:
def calcuate_accuracy(preds, targets):
    """Count the positions where ``preds`` equals ``targets``.

    Despite the name, this returns the raw number of matches (a Python number),
    not a ratio; callers divide by the example count themselves.
    """
    matches = torch.eq(preds, targets)
    return matches.sum().item()

# Training function used to fine-tune the RoBERTa model (first on the 85% training split, later on the full training set)

def train(epoch, dataloader):
    """Run one training epoch over ``dataloader`` and print running / epoch metrics.

    Relies on the notebook-level ``model``, ``optimizer``, ``loss_function`` and
    ``device``.

    Args:
        epoch: Epoch number; only used in the printed summary.
        dataloader: Iterable of batches, each a dict holding ``ids``, ``mask``,
            ``token_type_ids`` and ``targets`` tensors.

    Raises:
        ValueError: If ``dataloader`` yields no batches (previously an opaque
            ZeroDivisionError).
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    # Wrap the loader itself rather than the enumerate() generator, so tqdm can
    # read its length and show real progress.
    for step, data in enumerate(tqdm(dataloader)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        # Class indices must be integer typed for CrossEntropyLoss.
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # Predicted class = index of the largest logit; detach() keeps this
        # bookkeeping out of the autograd graph.
        big_val, big_idx = torch.max(outputs.detach(), dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        # Running averages, reported on the first batch and every 5000th after.
        if step % 5000 == 0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if nb_tr_steps == 0:
        raise ValueError("dataloader produced no batches; nothing to train on")

    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f'The Total Accuracy for Epoch {epoch}: {epoch_accu}')
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [13]:
# Fine-tune on the 85% training split for EPOCHS passes over the data.
EPOCHS = 3
for epoch in range(EPOCHS):
    train(epoch, training_loader)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Training Loss per 5000 steps: 1.5827823877334595
Training Accuracy per 5000 steps: 38.888888888888886

The Total Accuracy for Epoch 0: 50.90267408714538
Training Loss Epoch: 0.7504771290755854
Training Accuracy Epoch: 50.90267408714538


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Training Loss per 5000 steps: 0.7049793601036072
Training Accuracy per 5000 steps: 55.55555555555556

The Total Accuracy for Epoch 1: 50.99769241210805
Training Loss Epoch: 0.7050577870229395
Training Accuracy Epoch: 50.99769241210805


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Training Loss per 5000 steps: 0.7196435928344727
Training Accuracy per 5000 steps: 44.44444444444444

The Total Accuracy for Epoch 2: 53.68535360390933
Training Loss Epoch: 0.6967494124319495
Training Accuracy Epoch: 53.68535360390933


In [14]:
def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` and return its accuracy in percent.

    Relies on the notebook-level ``loss_function`` and ``device``. Gradients are
    disabled and the model is switched to eval mode for the whole pass.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)`` and
            returning one row of logits per example.
        testing_loader: Iterable of batches, each a dict holding ``ids``,
            ``mask``, ``token_type_ids`` and ``targets`` tensors.

    Returns:
        Accuracy over all examples, as a percentage (0-100).

    Raises:
        ValueError: If ``testing_loader`` yields no batches (previously an
            opaque ZeroDivisionError).
    """
    model.eval()
    n_correct = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    with torch.no_grad():
        # Wrap the loader itself so tqdm can read its length and show real progress.
        for step, data in enumerate(tqdm(testing_loader)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            # No .squeeze() here: it would collapse a final batch of size 1 from
            # (1, C) to (C,), breaking both the loss and the dim=1 max below.
            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            # Running averages, reported on the first batch and every 5000th
            # after; the labels now match that interval.
            if step % 5000 == 0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 5000 steps: {loss_step}")
                print(f"Validation Accuracy per 5000 steps: {accu_step}")

    if nb_tr_steps == 0:
        raise ValueError("testing_loader produced no batches; nothing to evaluate")

    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu


In [15]:
# Score the model trained on the 85% split against the held-out validation rows.
acc = valid(model, val_loader)
print("Accuracy on val data = %0.2f%%" % acc)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Validation Loss per 100 steps: 0.687965452671051
Validation Accuracy per 100 steps: 50.0

Validation Loss Epoch: 0.6917874958457016
Validation Accuracy Epoch: 50.76923076923077
Accuracy on val data = 50.77%


In [16]:
# Score the same model on the test set (`acc` is overwritten).
acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Validation Loss per 100 steps: 0.6695374846458435
Validation Accuracy per 100 steps: 81.25

Validation Loss Epoch: 0.6793883149512112
Validation Accuracy Epoch: 59.80392156862745
Accuracy on test data = 59.80%


In [17]:
## Adding back validation to train set and train with best hyper parameters
# A new RobertaClass instance replaces the previously trained `model`, so the
# final run does not continue from the earlier fine-tuning.
model = RobertaClass()
model.to(device)

RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, eleme

In [18]:
# Rebuild the optimizer so it tracks the new model's parameters; the earlier
# optimizer was created from the previous model instance.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [19]:
# Train the new model on the full training frame (sampled rows + validation rows).
EPOCHS = 3
for epoch in range(EPOCHS):
    train(epoch, final_training_loader)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Training Loss per 5000 steps: 1.6481839418411255
Training Accuracy per 5000 steps: 0.0

The Total Accuracy for Epoch 0: 49.867312795661704
Training Loss Epoch: 0.7464312371871283
Training Accuracy Epoch: 49.867312795661704


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Training Loss per 5000 steps: 0.6764538884162903
Training Accuracy per 5000 steps: 61.111111111111114

The Total Accuracy for Epoch 1: 51.02111457251644
Training Loss Epoch: 0.7014699919589822
Training Accuracy Epoch: 51.02111457251644


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Training Loss per 5000 steps: 0.6806009411811829
Training Accuracy per 5000 steps: 55.55555555555556

The Total Accuracy for Epoch 2: 56.27091265720549
Training Loss Epoch: 0.6843340394407882
Training Accuracy Epoch: 56.27091265720549


In [20]:
# NOTE: the validation rows are part of the frame behind final_training_loader,
# so this model has already been trained on them — this is not a held-out score.
acc = valid(model, val_loader)
print("Accuracy on val data = %0.2f%%" % acc)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Validation Loss per 100 steps: 0.7438175678253174
Validation Accuracy per 100 steps: 50.0

Validation Loss Epoch: 0.6369505408333569
Validation Accuracy Epoch: 65.84615384615384
Accuracy on val data = 65.85%


In [21]:
# Test-set accuracy of the model trained on the full training frame.
acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Validation Loss per 100 steps: 0.6399781107902527
Validation Accuracy per 100 steps: 62.5

Validation Loss Epoch: 0.6547616759780794
Validation Accuracy Epoch: 61.81372549019608
Accuracy on test data = 61.81%
