In [20]:
import torch 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import transformers  
from transformers import DistilBertModel
from os import listdir
from os.path import isfile, join
import torch.nn as nn
import torch.nn.functional as F

In [21]:
# Load the pretrained fast tokenizer published with the 'distilbert-base-uncased' checkpoint.
tokenizer = transformers.DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-uncased'
)

In [22]:
# For every split/label folder, keep the names of the directory entries that are
# regular files (anything that is not a file, e.g. a sub-directory, is dropped).
pos_train = list(filter(
    lambda name: isfile(join("aclImdb/train/pos", name)),
    listdir("aclImdb/train/pos"),
))
neg_train = list(filter(
    lambda name: isfile(join("aclImdb/train/neg", name)),
    listdir("aclImdb/train/neg"),
))
pos_test = list(filter(
    lambda name: isfile(join("aclImdb/test/pos", name)),
    listdir("aclImdb/test/pos"),
))
neg_test = list(filter(
    lambda name: isfile(join("aclImdb/test/neg", name)),
    listdir("aclImdb/test/neg"),
))

In [23]:
# Sanity check on the first positive-review file name: take the second "_"-separated
# field, keep the part before its first ".", and parse that as an integer.
# NOTE(review): presumably names look like "<id>_<rating>.txt", which would make this
# the review's rating — confirm against the dataset README.
int(pos_train[0].split("_")[1].split('.')[0])

9

In [24]:
# Empty accumulators for the training split: one label list and one review-text
# list per class. Each name is bound to its own, distinct list object.
pos_labels, neg_labels = [], []
pos_reviews, neg_reviews = [], []

In [25]:
# Load the positive training reviews (label 1).
# The accumulators are emptied in place first, so re-running this cell does not
# append every review and label a second time.
pos_labels.clear()
pos_reviews.clear()
for file in pos_train:
    filepath = "aclImdb/train/pos/" + file
    # Explicit encoding: a bare open() decodes with the platform's locale encoding,
    # which makes the result machine-dependent and can raise UnicodeDecodeError.
    # NOTE(review): assumes the aclImdb text files are UTF-8 — confirm.
    with open(filepath, encoding="utf-8") as f:
        # Only the first line of the file is kept, as before. readline() yields ""
        # for an empty file, where readlines()[0] raised IndexError.
        pos_reviews.append(f.readline())
    pos_labels.append(1)

In [26]:
# Load the negative training reviews (label 0).
# The accumulators are emptied in place first, so re-running this cell does not
# append every review and label a second time.
neg_labels.clear()
neg_reviews.clear()
for file in neg_train:
    filepath = "aclImdb/train/neg/" + file
    # Explicit encoding: a bare open() decodes with the platform's locale encoding,
    # which makes the result machine-dependent and can raise UnicodeDecodeError.
    # NOTE(review): assumes the aclImdb text files are UTF-8 — confirm.
    with open(filepath, encoding="utf-8") as f:
        # Only the first line of the file is kept, as before. readline() yields ""
        # for an empty file, where readlines()[0] raised IndexError.
        neg_reviews.append(f.readline())
    neg_labels.append(0)

In [27]:
# Merge the two classes into parallel lists, positives first and negatives after,
# so that labels[i] is the label of reviews[i].
reviews = [*pos_reviews, *neg_reviews]
labels = [*pos_labels, *neg_labels]

In [28]:
class imdb(torch.utils.data.Dataset):
    """Map-style dataset that pairs each sample with its label.

    Args:
        data: Sized, indexable collection of samples (here: review strings).
        labels: Sized, indexable collection holding one label per sample, in the
            same order as ``data``.

    Raises:
        ValueError: If ``data`` and ``labels`` do not have the same length.
    """

    def __init__(self, data, labels):
        # Fail fast: with mismatched lengths the error would otherwise only show up
        # as an IndexError at whichever shuffled index happens to be out of range.
        if len(data) != len(labels):
            raise ValueError(
                f"data and labels must have the same length, "
                f"got {len(data)} and {len(labels)}"
            )
        self.data = data
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair stored at ``index``."""
        return self.data[index], self.labels[index]

In [29]:
# Wrap the merged review/label lists in the map-style dataset defined above.
train_data = imdb(data=reviews, labels=labels)

In [30]:
# Build the training loader directly from `reviews`/`labels` instead of from the
# previous value of `train_data`. The old form rebound `train_data` from a Dataset
# to a DataLoader, so re-running the cell wrapped a DataLoader inside a DataLoader.
# drop_last=True discards the final incomplete batch, so every batch has 64 items.
train_data = torch.utils.data.DataLoader(
    imdb(reviews, labels),
    batch_size=64,
    shuffle=True,
    drop_last=True,
)

In [31]:
# Bare pretrained DistilBERT encoder.
# NOTE(review): no later cell visible here references `model` (CustomBERTModel loads
# its own copy of the same checkpoint) — confirm whether this is still needed.
model = DistilBertModel.from_pretrained(
    'distilbert-base-uncased'
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [33]:
class CustomBERTModel(nn.Module):
    """DistilBERT encoder with a one-logit head for binary sequence classification.

    Each token's hidden state goes through dropout and a linear layer that emits a
    single score per token. The scores of the non-padding tokens are averaged and
    passed through a sigmoid, giving one probability per input sequence.
    """

    def __init__(self):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        # Classification head: dropout followed by a 768 -> 1 linear projection.
        self.dropout = nn.Dropout(0.3)
        self.out = nn.Linear(768, 1)

    def forward(self, ids, mask):
        """Return one probability per sequence.

        Args:
            ids: Token ids of shape (batch_size, sequence_length).
            mask: Attention mask of the same shape (non-zero for real tokens, zero
                for padding), or None to treat every position as a real token.

        Returns:
            Tensor of shape (batch_size,) with values in [0, 1].
        """
        encoder_output = self.bert(
            ids,
            attention_mask=mask,
        )

        # Element 0 of the encoder output is the per-token hidden state,
        # shape (batch_size, sequence_length, 768).
        hidden = self.dropout(encoder_output[0])

        # The head yields (batch_size, sequence_length, 1). Drop only that trailing
        # axis: a dimensionless squeeze would also remove the batch axis when the
        # batch holds a single sequence and break the pooling below.
        token_logits = self.out(hidden).squeeze(-1)

        if mask is None:
            pooled = token_logits.mean(dim=1)
        else:
            # Average over real tokens only, so the result does not depend on how
            # much padding the rest of the batch forced onto this sequence.
            weights = mask.to(token_logits.dtype)
            pooled = (token_logits * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)

        # torch.sigmoid replaces the deprecated F.sigmoid.
        return torch.sigmoid(pooled)


In [34]:
# Instantiate the classifier; its constructor loads the pretrained
# 'distilbert-base-uncased' encoder and creates a fresh dropout + linear head.
dist_model = CustomBERTModel()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [35]:
# Optimisation setup for fine-tuning.
# Every parameter of the pretrained encoder is being updated, so the step size must
# be small: plain SGD at 1e-1 is orders of magnitude above the 2e-5..5e-5 range
# normally used to fine-tune BERT-style models and wipes out the pretrained weights.
learning_rate = 2e-5
optimizer = torch.optim.AdamW(dist_model.parameters(), lr=learning_rate)
# BCELoss expects probabilities; CustomBERTModel.forward already applies a sigmoid.
loss_fn = nn.BCELoss()

In [36]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs (slowly) on machines without CUDA instead of failing as soon
# as the model is moved to the device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [37]:
# Report the selected device, then move the model's parameters and buffers onto it.
# Being the last expression of the cell, the value returned by .to() is also shown
# as the cell's output.
print('Using device:', device)
dist_model.to(device)

Using device: cuda


In [None]:
# Fine-tune `dist_model` on the training loader.
epochs = 1
for epoch in range(epochs):
    # Enable training-mode behaviour (dropout active) once per epoch rather than
    # once per batch.
    dist_model.train()
    for j, (x_train, y_train) in enumerate(train_data):
        # The loader yields the batch's raw review strings and their labels.
        data = list(x_train)

        # Tokenise the batch: pad to the longest review in the batch and truncate
        # at 512 tokens. Calling the tokenizer directly replaces the deprecated
        # batch_encode_plus().
        encoding = tokenizer(
            data,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512,
            add_special_tokens=True,
        )

        # Tensor.to() returns a new tensor and does not modify the original, so its
        # result must be kept. Inputs and targets all have to be on the model's device.
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        targets = y_train.float().to(device)

        optimizer.zero_grad()
        outputs = dist_model(input_ids, attention_mask)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

        # .item() prints the plain number instead of the tensor repr.
        print(f"Epoch {epoch+1} , LOSS :  {loss.item()}")
    
        