In [1]:
import transformers
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
from sklearn import model_selection
from sklearn import metrics
import numpy as np
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import torch.nn as nn
import torch


# --- Hyperparameters ---------------------------------------------------------
MAX_LEN = 512               # every review is truncated / padded to this many token ids
TRAINING_BATCH_SIZE = 16
VALIDATION_BATCH_SIZE = 8
EPOCHS = 2

# --- Locations ---------------------------------------------------------------
TRAINING_PATH = '../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'  # reviews CSV
BERT_PATH = '../input/bert-base-uncased/'   # directory with the pretrained BERT files
MODEL_PATH = 'model.bin'                    # where the fine-tuned weights are written

# --- Tokenizer ---------------------------------------------------------------
# BERT uses a WordPiece tokenizer. It is loaded once here and shared by every
# dataset instance; do_lower_case=True lower-cases the text before tokenising.
TOKENIZER = transformers.BertTokenizer.from_pretrained(BERT_PATH, do_lower_case=True)





In [2]:
class BERTsentiment(nn.Module):
    """BERT encoder with a dropout + single-logit linear head for binary sentiment.

    The head consumes BERT's pooled output: the final hidden state of the
    first ([CLS]) token passed through BERT's own Linear + Tanh pooler.
    ``forward`` returns raw logits (no sigmoid applied), so it is meant to be
    paired with a logits-based loss such as ``nn.BCEWithLogitsLoss``.
    """

    def __init__(self):
        super(BERTsentiment, self).__init__()

        self.bert = transformers.BertModel.from_pretrained(BERT_PATH)
        self.drop = nn.Dropout(0.5)  # regularisation applied to the pooled vector
        # Size the head from the loaded checkpoint instead of hard-coding 768,
        # so the class also works with BERT variants of a different width.
        # One output unit because the task is binary.
        self.out_layer = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Compute one logit per sequence.

        Args:
            input_ids: token indices.
            attention_mask: 1 for real tokens, 0 for padding positions.
            token_type_ids: segment indices; always 0 here because every
                input is a single segment.

        Returns:
            Tensor of logits with a trailing dimension of size 1.
        """
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Index instead of tuple-unpacking: newer transformers releases return
        # a dict-like ModelOutput, and unpacking that yields its string keys
        # rather than tensors. Position 1 is the pooled output in both the
        # plain-tuple and the ModelOutput form.
        pooled_output = encoder_outputs[1]

        return self.out_layer(self.drop(pooled_output))
    

In [3]:



class BERTdataset:
    """Map-style dataset that turns (review, sentiment) pairs into BERT inputs.

    Each item is a dict of fixed-length tensors: ``input_ids``,
    ``attention_mask`` and ``token_type_ids`` (all ``torch.long``, padded
    with 0 up to ``max_length``) plus ``sentiments`` (``torch.float``).
    """

    def __init__(self, review, sentiment):
        self.review = review        # model inputs (indexable collection of texts)
        self.sentiment = sentiment  # targets, aligned with `review`
        self.tokenizer = TOKENIZER
        self.max_length = MAX_LEN

    def __len__(self):
        return len(self.review)

    def __getitem__(self, item_index):
        # Collapse any run of whitespace into a single space.
        text = ' '.join(str(self.review[item_index]).split())

        # Single-sentence encoding: the tokenizer adds [CLS] at the start and
        # [SEP] at the end (BERT expects both even without a second sentence)
        # and truncates to max_length.
        encoded = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
        )

        # Pad by hand: the same run of zeros is appended to all three lists so
        # every sample ends up exactly max_length long.
        pad = [0] * (self.max_length - len(encoded['input_ids']))
        sample = {
            key: torch.tensor(encoded[key] + pad, dtype=torch.long)
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        sample['sentiments'] = torch.tensor(self.sentiment[item_index], dtype=torch.float)
        return sample
    

In [4]:

def loss_function(outputs, sentiments):
    """Binary cross-entropy between raw model logits and the 0/1 targets.

    ``BCEWithLogitsLoss`` fuses the sigmoid and the BCE loss into one class,
    so ``outputs`` must be logits, not probabilities. The targets are viewed
    as a column so they line up with logits that have one value per row.
    """
    criterion = nn.BCEWithLogitsLoss()
    targets = sentiments.view(-1, 1)
    return criterion(outputs, targets)




def training_loop(training_data_loader, model, optimizer, scheduler, device):
    """Run one pass over the training loader, updating the model in place.

    For every batch: move the tensors to ``device``, clear the gradients, run
    the forward pass, compute the loss, back-propagate, then step the
    optimizer and the scheduler (the scheduler advances once per batch).
    The loss is printed every 500th batch, except for batch 0.
    """
    model.train()  # training mode

    progress = tqdm(enumerate(training_data_loader), total=len(training_data_loader))
    for step, batch in progress:
        # Move the three token-level inputs and the targets to the device.
        token_inputs = {
            name: batch[name].to(device, dtype=torch.long)
            for name in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        targets = batch['sentiments'].to(device, dtype=torch.float)

        # PyTorch accumulates gradients, so clear them before each backward pass.
        optimizer.zero_grad()
        logits = model(**token_inputs)

        loss = loss_function(logits, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()

        if step != 0 and step % 500 == 0:
            print('BATCH_INDEX: ',step ,'==========', 'LOSS: ', loss.item())
            
            
def evaluation_loop(validation_data_loader, model, device):
    """Run the model over the validation loader without computing gradients.

    Returns:
        ``(final_outputs, final_sentiments)``: two Python lists built batch by
        batch. ``final_outputs`` holds the sigmoid of the model outputs and
        ``final_sentiments`` the matching targets, both converted with
        ``.tolist()``.
    """
    model.eval()  # evaluation mode
    final_outputs, final_sentiments = [], []

    # Autograd is switched off for the whole pass, which also saves memory.
    with torch.no_grad():
        batches = tqdm(enumerate(validation_data_loader), total=len(validation_data_loader))
        for _, batch in batches:
            # Move the three token-level inputs and the targets to the device.
            token_inputs = {
                name: batch[name].to(device, dtype=torch.long)
                for name in ('input_ids', 'attention_mask', 'token_type_ids')
            }
            targets = batch['sentiments'].to(device, dtype=torch.float)

            logits = model(**token_inputs)
            probabilities = torch.sigmoid(logits)

            # device tensor -> CPU -> detached -> NumPy array -> plain list
            final_sentiments += targets.cpu().detach().numpy().tolist()
            final_outputs += probabilities.cpu().detach().numpy().tolist()

    return final_outputs, final_sentiments
        

In [5]:

    
# Load the reviews.
dataframe = pd.read_csv(TRAINING_PATH)

# The target is categorical, so label-encode it by hand: 'positive' -> 1,
# anything else -> 0 (sklearn.preprocessing.LabelEncoder would also work).
dataframe['sentiment'] = dataframe['sentiment'].apply(
    lambda label: 1 if label == 'positive' else 0
)

# Hold out 10% for validation. The fixed random_state makes the shuffled split
# repeatable across runs, and stratifying on the label keeps the class balance
# the same in both parts. Each part gets a fresh 0..n-1 index afterwards so
# positional lookups line up with the row order.
df_train, df_valid = (
    split.reset_index(drop=True)
    for split in model_selection.train_test_split(
        dataframe,
        test_size=0.1,
        random_state=42,
        shuffle=True,
        stratify=dataframe['sentiment'].values,
    )
)
    
    # make datasets with our class in order to make data loaders
# Wrap the raw NumPy columns of each split in the dataset class ...
training_dataset = BERTdataset(df_train.review.values, df_train.sentiment.values)
validation_dataset = BERTdataset(df_valid.review.values, df_valid.sentiment.values)

# ... and each dataset in a DataLoader. Only the training loader shuffles;
# validation keeps the row order.
training_data_loader = torch.utils.data.DataLoader(
    training_dataset,
    batch_size=TRAINING_BATCH_SIZE,
    shuffle=True,
    num_workers=4,
)
validation_data_loader = torch.utils.data.DataLoader(
    validation_dataset,
    batch_size=VALIDATION_BATCH_SIZE,
    shuffle=False,
    num_workers=4,
)
    
# Use the GPU when one is available; otherwise fall back to the CPU so the
# cell does not fail on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERTsentiment()
model.to(device)  # move the model's parameters to the chosen device

# Split the parameters into two groups: biases and LayerNorm parameters get no
# weight decay, everything else gets a small decay.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.00},
]

# The scheduler is stepped once per batch, so the total step count is
# batches-per-epoch * epochs. Take it from the loader: the loader also yields
# the final partial batch, which len(df_train) / batch_size truncated away,
# making the learning rate hit zero one step before training finished.
number_of_training_steps = len(training_data_loader) * EPOCHS

# AdamW: Adam with decoupled weight decay.
optimizer = AdamW(
    params=optimizer_parameters,
    lr=3e-5,
)
# Linear decay of the learning rate to zero over the whole run, no warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=number_of_training_steps,
)

In [6]:
# Best validation accuracy seen so far. This must start as a number: with the
# previous empty-list initial value, `accuracy > best_accuracy` either raised
# TypeError (plain float) or produced an empty, falsy array (NumPy scalar), so
# the checkpoint below was never written.
best_accuracy = 0.0

for epoch in range(EPOCHS):
    print('EPOCH:', epoch + 1)
    training_loop(
        training_data_loader,
        model,
        optimizer,
        scheduler,
        device,
    )
    outputs, sentiments = evaluation_loop(
        validation_data_loader,
        model,
        device,
    )
    # The classes are balanced 50/50, so plain accuracy is a reasonable metric.
    outputs = np.array(outputs) >= 0.5  # probability >= 0.5 -> positive class
    # Flatten the predictions so their shape matches the flat list of targets.
    accuracy = metrics.accuracy_score(sentiments, outputs.ravel())
    print('ACCURACY SCORE:', accuracy)

    # Keep only the weights of the best epoch (saved in the working directory).
    if accuracy > best_accuracy:
        torch.save(model.state_dict(), MODEL_PATH)
        best_accuracy = accuracy


EPOCH: 1


 18%|█▊        | 501/2813 [07:07<32:38,  1.18it/s]



 36%|███▌      | 1001/2813 [14:12<25:42,  1.18it/s]



 53%|█████▎    | 1501/2813 [21:17<18:42,  1.17it/s]



 71%|███████   | 2001/2813 [28:22<11:34,  1.17it/s]



 89%|████████▉ | 2501/2813 [35:28<04:25,  1.18it/s]



100%|██████████| 2813/2813 [39:53<00:00,  1.18it/s]
100%|██████████| 625/625 [01:37<00:00,  6.41it/s]

ACCURACY SCORE {0.937}
EPOCH: 2



 18%|█▊        | 501/2813 [07:06<32:40,  1.18it/s]



 36%|███▌      | 1001/2813 [14:11<25:37,  1.18it/s]



 53%|█████▎    | 1501/2813 [21:17<18:38,  1.17it/s]



 71%|███████   | 2001/2813 [28:21<11:35,  1.17it/s]



 89%|████████▉ | 2501/2813 [35:27<04:23,  1.18it/s]



100%|██████████| 2813/2813 [39:52<00:00,  1.18it/s]
100%|██████████| 625/625 [01:36<00:00,  6.45it/s]

ACCURACY SCORE {0.9504}



