In [1]:
pip install transformers

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 28.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 71.1 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 63.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 7.0 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 66.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transform

In [2]:
import pandas as pd
import numpy as np
import transformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
import json
import tqdm

In [3]:
cuda = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
with open('/content/drive/MyDrive/Pytorch/datasetVNTC.json') as json_file:
    data = json.load(json_file)

In [5]:
data.keys()

dict_keys(['data', 'target', 'target_names'])

In [6]:
print(len(data['data']))
print(len(data['target']))
print(len(data['target_names']))

33759
33759
10


In [7]:
class TextDataset(Dataset):
  def __init__(self, X, y, tokenizer, max_length):
    super(TextDataset, self).__init__()
    self.tokenizer = tokenizer
    self.X = X
    self.y = y
    self.max_length = max_length
  def __len__(self):
    return len(self.y)
  def __getitem__(self, index):
    text = self.X[index]

    inputs = self.tokenizer.encode_plus(
        text,
        None,
        pad_to_max_length = True,
        add_special_tokens = True,
        return_attention_mask = True,
        max_length = self.max_length,
        truncation = True,
    )
    ids = inputs["input_ids"]
    token_type_ids = inputs["token_type_ids"]
    mask = inputs["attention_mask"]

    return{
        "ids" : torch.tensor(ids, dtype = torch.long, device = cuda),
        "mask" : torch.tensor(mask, dtype = torch.long, device = cuda),
        "token_type_ids" : torch.tensor(token_type_ids, dtype = torch.long, device = cuda),
        "target" : torch.tensor(self.y[index], dtype = torch.long, device = cuda),
    }


In [9]:
phobert = AutoModel.from_pretrained("vinai/phobert-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)

Downloading:   0%|          | 0.00/557 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/518M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/874k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
datatest = TextDataset(data['data'][30000:33744], data['target'][30000:33744],tokenizer, 256)
datatrain = TextDataset(data['data'][0:30000], data['target'][0:30000],tokenizer, 256)

In [11]:
dataloader = DataLoader(dataset = datatrain, batch_size = 16)
datatestloader = DataLoader(dataset = datatest, batch_size = 16)

In [12]:
class BERT(nn.Module):
  def __init__(self, bert):
    super(BERT, self).__init__()
    self.bert = bert

    self.hidden = nn.Linear(768, 512)
    self.relu = nn.ReLU()
    self.output = nn.Linear(512, 10)

  def forward(self, ids, mask, token_type_ids):
    _, o = self.bert(input_ids = ids, attention_mask = mask, token_type_ids = token_type_ids,  return_dict = False)

    out = self.hidden(o)
    out = self.relu(out)
    out = self.output(out)

    return out

In [13]:
model = BERT(phobert).to(cuda)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 1e-5)

In [None]:
for param in model.bert.parameters():
  param.requires_grad = False

In [None]:
# model,train_acc=fintune(10, dataloader, model, loss_function, optimizer)

In [None]:
model.train()
train_acc = []
loss_total = 0
for epoch in range(5):
  loss_total = 0
  print(epoch)
  num_correct_each_epoch = 0
  num_sample_each_epoch = 0
  loop = tqdm.tqdm(enumerate(dataloader), leave=False, total=len(dataloader))

  for batch, dl in loop:
    ids = dl['ids']
    token_type_ids = dl['token_type_ids']
    mask = dl['mask']
    label = dl['target']

    optimizer.zero_grad()

    output = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    loss = loss_function(output, label)
    loss_total = loss_total + loss
      # print()
      # print(torch.argmax(output, dim = 1))
      # print(label)
    predict =torch.argmax(output, dim = 1)

    correct = sum(1 for a, b in zip(predict, label) if a == b)
    num_correct_each_epoch += correct
    samples = output.shape[0]
    num_sample_each_epoch += samples
    accuracy = correct/samples
    loss.backward()
    optimizer.step()

    loop.set_description(f'Epoch={epoch}/10')
    loop.set_postfix(loss=loss.item(),acc=accuracy)
  train_acc.append(1.0*num_correct_each_epoch/num_sample_each_epoch)
  torch.save(model.state_dict(), '/content/drive/MyDrive/Fine-Tune-Bert/TextClassification/PhoBERT/PhoBERTmodel.pt')
  print(loss_total)

0




tensor(653.7325, device='cuda:0', grad_fn=<AddBackward0>)
1




tensor(588.5809, device='cuda:0', grad_fn=<AddBackward0>)
2




tensor(540.8531, device='cuda:0', grad_fn=<AddBackward0>)
3




tensor(506.2199, device='cuda:0', grad_fn=<AddBackward0>)
4




tensor(478.9281, device='cuda:0', grad_fn=<AddBackward0>)


In [None]:
f = open("/content/drive/MyDrive/input.txt",'w')

In [None]:
for i in train_acc:
  f.write(str(i)+"\n")

In [None]:
f.close()

In [14]:
def accuracy(datatest, model):
  num_correct = 0
  sample = 0
  for batch, dl in enumerate(datatest):
    ids = dl['ids']
    token_type_ids = dl['token_type_ids']
    mask = dl['mask']
    label = dl['target']

    output = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    
    predict =torch.argmax(output, dim = 1)
    sample += label.shape[0]
    num_correct += sum(1 for a, b in zip(predict, label) if a == b)
    print(str(batch)+" : "+ str(sum(1 for a, b in zip(predict, label) if a == b))+ " / " + str(label.shape[0])
    +" : "+ str(1.0 * sum(1 for a, b in zip(predict, label) if a == b)/label.shape[0]))
  return num_correct / sample

In [None]:
# torch.save(model.state_dict(), "model_text_classifi.pth")

In [20]:
model.load_state_dict(torch.load("/content/drive/MyDrive/model_text_classifi.pth"))

<All keys matched successfully>

In [22]:
print(accuracy(datatestloader, model))



0 : 16 / 16 : 1.0
1 : 12 / 16 : 0.75
2 : 15 / 16 : 0.9375
3 : 15 / 16 : 0.9375
4 : 15 / 16 : 0.9375
5 : 14 / 16 : 0.875
6 : 15 / 16 : 0.9375
7 : 15 / 16 : 0.9375
8 : 13 / 16 : 0.8125
9 : 16 / 16 : 1.0
10 : 16 / 16 : 1.0
11 : 16 / 16 : 1.0
12 : 15 / 16 : 0.9375
13 : 15 / 16 : 0.9375
14 : 15 / 16 : 0.9375
15 : 15 / 16 : 0.9375
16 : 14 / 16 : 0.875
17 : 15 / 16 : 0.9375
18 : 16 / 16 : 1.0
19 : 14 / 16 : 0.875
20 : 15 / 16 : 0.9375
21 : 15 / 16 : 0.9375
22 : 15 / 16 : 0.9375
23 : 14 / 16 : 0.875
24 : 15 / 16 : 0.9375
25 : 16 / 16 : 1.0
26 : 15 / 16 : 0.9375
27 : 14 / 16 : 0.875
28 : 12 / 16 : 0.75
29 : 14 / 16 : 0.875
30 : 15 / 16 : 0.9375
31 : 14 / 16 : 0.875
32 : 15 / 16 : 0.9375
33 : 13 / 16 : 0.8125
34 : 15 / 16 : 0.9375
35 : 15 / 16 : 0.9375
36 : 15 / 16 : 0.9375
37 : 13 / 16 : 0.8125
38 : 13 / 16 : 0.8125
39 : 12 / 16 : 0.75
40 : 16 / 16 : 1.0
41 : 16 / 16 : 1.0
42 : 16 / 16 : 1.0
43 : 14 / 16 : 0.875
44 : 15 / 16 : 0.9375
45 : 16 / 16 : 1.0
46 : 16 / 16 : 1.0
47 : 15 / 16 : 0.9375
4