In [None]:
pip install transformers



In [None]:
import pandas as pd
import numpy as np
import transformers
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torchsummary import summary
from tqdm import tqdm

In [None]:
class BertDataset(Dataset):
  """Sentence/label pairs from a header-less TSV, tokenized to a fixed length.

  Column 0 of the TSV is read as the text and column 1 as the integer label.
  Each item is a dict of ``torch.long`` tensors with keys ``ids``, ``mask``,
  ``token_type_ids`` and ``target``.
  """

  # Default training split (tab-separated, no header row).
  DEFAULT_DATA_URL = 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'

  def __init__(self, tokenizer, max_length, data_path=None):
    """Load the TSV and remember the tokenization settings.

    Args:
      tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
      max_length: length every encoded sequence is padded / truncated to.
      data_path: optional path or URL of the TSV; defaults to DEFAULT_DATA_URL.
    """
    super(BertDataset, self).__init__()
    source = self.DEFAULT_DATA_URL if data_path is None else data_path
    self.train_csv = pd.read_csv(source, delimiter='\t', header=None)
    self.tokenizer = tokenizer
    self.n = self.train_csv.shape[0]
    self.target = self.train_csv.iloc[:, 1]
    self.max_length = max_length

  def __len__(self):
    """Number of rows in the loaded TSV."""
    return len(self.train_csv)

  def getNumSample(self):
    """Number of rows in the loaded TSV (same value as ``len(self)``)."""
    return self.train_csv.shape[0]

  def __getitem__(self, index):
    """Tokenize row ``index`` and return its tensors together with the label."""
    text1 = self.train_csv.iloc[index, 0]

    # `padding` / `truncation` replace the deprecated `pad_to_max_length`;
    # requesting truncation explicitly keeps every sample at exactly
    # `max_length` tokens so the default collate function can stack them.
    inputs = self.tokenizer.encode_plus(
        text1,
        None,
        padding = "max_length",
        truncation = True,
        add_special_tokens = True,
        return_attention_mask = True,
        return_token_type_ids = True,
        max_length = self.max_length,
    )

    ids = inputs["input_ids"]
    token_type_ids = inputs["token_type_ids"]
    mask = inputs["attention_mask"]

    return{
        "ids" : torch.tensor(ids, dtype = torch.long),
        "mask" : torch.tensor(mask, dtype = torch.long),
        "token_type_ids":torch.tensor(token_type_ids, dtype = torch.long),
        "target":torch.tensor(self.train_csv.iloc[index, 1], dtype = torch.long)
    }
  

In [None]:
# Tokenization length and batch size used to build the dataset and loader.
MAX_LENGTH = 100
BATCH_SIZE = 32

tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
dataset = BertDataset(tokenizer, max_length = MAX_LENGTH)
dataloader = DataLoader(dataset, batch_size = BATCH_SIZE)

In [None]:
class BERT(nn.Module):
  """Pre-trained BERT encoder followed by a linear head with one output logit.

  ``forward`` returns the raw (un-squashed) output of the linear head; pair it
  with a logits-based loss such as ``nn.BCEWithLogitsLoss``.
  """

  def __init__(self, model_name = "bert-base-uncased"):
    """Load the encoder checkpoint and build the classification head.

    Args:
      model_name: checkpoint identifier passed to ``BertModel.from_pretrained``.
    """
    super(BERT, self).__init__()
    self.bert_model = transformers.BertModel.from_pretrained(model_name)
    # Size the head from the checkpoint's config rather than hard-coding 768,
    # so other BERT sizes work without editing this class.
    self.out = nn.Linear(self.bert_model.config.hidden_size, 1)

  def forward(self, ids, mask, token_type_ids):
    """Run the encoder and apply the linear head to its second output.

    Args:
      ids: token id tensor.
      mask: attention mask tensor.
      token_type_ids: segment id tensor.
    """
    # return_dict=False makes the encoder return a tuple; only the second
    # element (the pooled output) feeds the head.
    _, pooled = self.bert_model(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict = False)

    return self.out(pooled)

# Build the classifier; BERT() loads the pre-trained encoder weights.
model = BERT()

# The model emits a single raw logit, so the loss applies the sigmoid itself.
loss_fn = nn.BCEWithLogitsLoss()
# NOTE(review): the optimizer is handed every parameter here; the encoder is
# only frozen (requires_grad = False) in a later cell.
optimizer = optim.Adam(model.parameters(), lr = 0.001)



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Freeze the whole BERT encoder so that only the linear head stays trainable.
# The trailing semicolon keeps the notebook from echoing the returned module.
model.bert_model.requires_grad_(False);


In [None]:
def finetune(epochs,dataloader,model,loss_fn,optimizer):
    """Train ``model`` for ``epochs`` passes over ``dataloader``.

    Args:
        epochs: number of passes over the data.
        dataloader: sized iterable of dict batches with keys ``ids``,
            ``mask``, ``token_type_ids`` and ``target``.
        model: module called as ``model(ids=..., mask=..., token_type_ids=...)``
            that returns one logit per sample.
        loss_fn: callable ``loss_fn(output, label)`` returning a scalar loss.
        optimizer: optimizer over the model's trainable parameters.

    Returns:
        The same ``model`` object, left in training mode.
    """
    # Run every batch on whatever device the model already lives on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for epoch in range(epochs):
        epoch_correct = 0
        epoch_samples = 0

        loop = tqdm(enumerate(dataloader), leave=False, total=len(dataloader))
        loop.set_description(f'Epoch={epoch + 1}/{epochs}')
        for _, dl in loop:
            ids = dl['ids'].to(device)
            token_type_ids = dl['token_type_ids'].to(device)
            mask = dl['mask'].to(device)
            # (batch,) -> (batch, 1) so the labels line up with the logits.
            label = dl['target'].to(device).unsqueeze(1)

            optimizer.zero_grad()

            output = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids)
            label = label.type_as(output)

            loss = loss_fn(output, label)
            loss.backward()

            optimizer.step()

            # A logit >= 0 corresponds to a sigmoid probability >= 0.5.
            # Stay in torch so this also works for tensors on an accelerator.
            with torch.no_grad():
                pred = (output >= 0).type_as(label)
                num_correct = int((pred == label).sum().item())
            num_samples = label.shape[0]
            accuracy = num_correct / num_samples if num_samples else 0.0

            epoch_correct += num_correct
            epoch_samples += num_samples

            # Per-batch metrics go to the progress bar instead of stdout.
            loop.set_postfix(loss=loss.item(), acc=accuracy)

        if epoch_samples:
            print(f'Got {epoch_correct} / {epoch_samples} with accuracy {float(epoch_correct)/float(epoch_samples)*100:.2f}')

    return model

In [None]:
# model=finetune(5, dataloader, model, loss_fn, optimizer)

In [None]:
# Persist only the weights (state_dict) under a relative file name.
# NOTE(review): the following cell loads from a different, absolute path,
# not from this file; confirm which checkpoint is intended.
torch.save(model.state_dict(), "model1.pth")

In [None]:
# map_location lets a checkpoint that was saved from a GPU load on a CPU-only
# runtime; load_state_dict then copies the values into the model's own tensors.
model.load_state_dict(torch.load("/content/drive/MyDrive/Fine-Tune-Bert/model1.pth", map_location = "cpu"))

<All keys matched successfully>

In [None]:
#model= torch.load("/content/drive/MyDrive/Fine-Tune-Bert/model1.pth")

In [None]:
# Measure accuracy over the whole dataloader.
# eval() switches off dropout and no_grad() skips autograd bookkeeping, so the
# result is deterministic and evaluation does not build gradient graphs.
model.eval()
eval_param = next(model.parameters(), None)
eval_device = eval_param.device if eval_param is not None else torch.device("cpu")

loop=tqdm(enumerate(dataloader),leave=False,total=len(dataloader))
num_samples = 0
num_correct = 0
with torch.no_grad():
  for batch, dl in loop:
    ids = dl['ids'].to(eval_device)
    token_type_ids = dl['token_type_ids'].to(eval_device)
    mask = dl['mask'].to(eval_device)

    output = model(
                ids=ids,
                mask=mask,
                token_type_ids = token_type_ids)
    # (batch,) -> (batch, 1) to line up with the logits.
    label = dl['target'].to(eval_device).unsqueeze(1).type_as(output)

    # A logit >= 0 corresponds to a sigmoid probability >= 0.5.
    pred = (output >= 0).type_as(output)
    num_correct += int((pred == label).sum().item())
    # Count what was actually seen rather than trusting the dataset size.
    num_samples += label.shape[0]

if num_samples:
  print(f'Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}')
else:
  print('No samples were evaluated')
            

In [None]:
# def evaluate(dataloader, model, dataset):
#   num_correct = 0
#   for batch, dl in enumerate(dataloader):
#     label = dl['target']
#     label = label.unsqueeze(1)
#     temp = dl['token_type_ids']
#     output = model(
#           ids = dl['ids'],
#           mask = dl['mask'],
#           token_type_ids = temp
#     )
#     print(batch)

#     predict = np.where(output >=0, 1, 0)
#     num_correct += sum(1 for a, b in zip(predict, label) if a[0] == b[0])
#   result = num_correct / float(dataset.getNumSample())
#   return result

In [None]:
# print(evaluate(dataloader, model, dataset))

In [None]:
def predict(text, tokenizer, max_length = 100):
  """Print ``text`` and whether the notebook-level ``model`` rates it positive or negative.

  Args:
    text: review to classify.
    tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
    max_length: length the encoded text is padded / truncated to.

  Returns:
    None; the result is only printed.
  """
  # `padding` / `truncation` replace the deprecated `pad_to_max_length` and
  # make the truncation of over-long reviews explicit.
  inputs = tokenizer.encode_plus(
      text,
      None,
      padding = "max_length",
      truncation = True,
      add_special_tokens =  True,
      return_attention_mask = True,
      return_token_type_ids = True,
      max_length = max_length,
  )

  # Build the single-row batch on whatever device the model lives on.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device("cpu")

  def as_batch(key):
    return torch.tensor(inputs[key], dtype = torch.long, device = device).reshape(1, -1)

  # eval() disables dropout so repeated calls give the same answer;
  # no_grad() avoids building an autograd graph for inference.
  model.eval()
  with torch.no_grad():
    output = model(
          ids = as_batch("input_ids"),
          mask = as_batch("attention_mask"),
          token_type_ids = as_batch("token_type_ids")
    )
  print("Nhận xét : ",text)
  # A logit >= 0 corresponds to a sigmoid probability >= 0.5.
  if output[0][0].item() >= 0:
    print("dự đoán: positive")
  else:
    print("dự đoán: negative")

In [None]:
# Sample reviews passed to predict() in the next cell.
text1 = "I really miss this movie so much. That time when I watched this movie on theatres with full of people and the crowd went crazy during epic fights"
text2 = "the film is strictly routine"
text3 = "Endgame was such a legendary movie that people come to re-watch it's trailer 2 years after it's release"
text4 = "All I can say is.. I'm grateful this movie was released in 2019 to be experienced the way it was meant to be"
text5 = "the drama discloses almost nothing"
text6 = "clockstoppers is one of those crazy , mixed up films that does n't know what it wants to be when it grows up"

In [None]:
# Classify each sample review in order.
for review in (text1, text2, text3, text4, text5, text6):
  predict(review, tokenizer)



Nhận xét :  I really miss this movie so much. That time when I watched this movie on theatres with full of people and the crowd went crazy during epic fights
dự đoán: positive
Nhận xét :  the film is strictly routine
dự đoán: negative
Nhận xét :  Endgame was such a legendary movie that people come to re-watch it's trailer 2 years after it's release
dự đoán: positive
Nhận xét :  All I can say is.. I'm grateful this movie was released in 2019 to be experienced the way it was meant to be
dự đoán: positive
Nhận xét :  the drama discloses almost nothing
dự đoán: negative
Nhận xét :  clockstoppers is one of those crazy , mixed up films that does n't know what it wants to be when it grows up
dự đoán: negative


In [None]:
# Second batch of sample reviews; note this rebinds text1..text3 from the
# earlier cell, so re-running that cell's predictions afterwards uses these.
text1 = "it's not too fast and not too slow"
text2 = "this movie is normal"
text3 = "This movie is neither good nor bad"

In [None]:
# Classify each of the three rebound sample reviews in order.
for review in (text1, text2, text3):
  predict(review, tokenizer)



Nhận xét :  it's not too fast and not too slow
dự đoán: negative
Nhận xét :  this movie is normal
dự đoán: negative
Nhận xét :  This movie is neither good nor bad
dự đoán: negative
