In [1]:
#Download data from github
!git clone https://github.com/congnghia0609/ntc-scv.git
!unzip ./ntc-scv/data/data_test.zip -d ./data
!unzip ./ntc-scv/data/data_train.zip -d ./data
!rm -rf ./ntc-scv

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: ./data/data_train/train/pos/31228.txt  
  inflating: ./data/data_train/train/pos/3123.txt  
  inflating: ./data/data_train/train/pos/31232.txt  
  inflating: ./data/data_train/train/pos/31233.txt  
  inflating: ./data/data_train/train/pos/31234.txt  
  inflating: ./data/data_train/train/pos/31236.txt  
  inflating: ./data/data_train/train/pos/31238.txt  
  inflating: ./data/data_train/train/pos/31240.txt  
  inflating: ./data/data_train/train/pos/31242.txt  
  inflating: ./data/data_train/train/pos/31247.txt  
  inflating: ./data/data_train/train/pos/31250.txt  
  inflating: ./data/data_train/train/pos/31251.txt  
  inflating: ./data/data_train/train/pos/31252.txt  
  inflating: ./data/data_train/train/pos/31257.txt  
  inflating: ./data/data_train/train/pos/31260.txt  
  inflating: ./data/data_train/train/pos/31261.txt  
  inflating: ./data/data_train/train/pos/31262.txt  
  inflating: ./data/data_train/trai

In [2]:
import torch.nn as nn
import time

In [3]:
#Read data from file
import os
import pandas as pd

def load_data_from_path(folder_path):
  examples = []
  for label in os.listdir(folder_path):
    full_path = os.path.join(folder_path, label)
    for file_name in os.listdir(full_path):
      file_path = os.path.join(full_path, file_name)
      with open(file_path, "r", encoding = "utf-8") as f:
        line = f.readlines()
      sentences = " ".join(line)
      if label == "neg":
        label = 0
      if label == "pos":
        label = 1
      data = {
          'sentence': sentences,
          'label': label
      }
      examples.append(data)
    return pd.DataFrame(examples)

folder_paths = {
    'train': '/content/data/data_train/train',
    'valid': '/content/data/data_train/test',
    'test': '/content/data/data_test/test'
}
train_df = load_data_from_path(folder_paths['train'])
valid_df = load_data_from_path(folder_paths['valid'])
test_df = load_data_from_path(folder_paths['test'])


In [4]:
train_df

Unnamed: 0,sentence,label
0,Kifa hoa kiểng như cái tên cây_cối xanhh tươi ...,1
1,Hôm_nay lại vào bingsu ! Mình không ăn bingsu ...,1
2,Đi ngang qua quán hoài mà hôm ni mới có dịp gh...,1
3,Hôm_nay ghé ngang thấy quán giảm_giá 30% thì t...,1
4,Qua nhà ông anh chơi vô_tình đi ngang quán này...,1
...,...,...
14995,"Đi với gia_đình cứ lên đay là khoẻ nhất , có đ...",1
14996,Nhân_viên quán dễ_thương và nhiệt_tình khiến k...,1
14997,Ai kêu gà thì nên đi 3 người trở_lên . Kim bap...,1
14998,+ Món Ăn :\n Cá_hồi đút lò ngon bá cháy = ) ) ...,1


In [5]:
import re
import string

def preprocess_text(text):
  #remove URL
  url_pattern = re.compile(r'https?://\s+\wwww\.\s+')
  text = url_pattern.sub(r" ", text)
  #remove HTML Tags
  html_pattern = re.compile(r'<[^<>]+>')
  text = html_pattern.sub(r" ", text)
  #remove puncs n digits
  replace_chars = list(string.punctuation + string.digits)
  for char in replace_chars:
    text = text.replace(char, " ")
  #remove emoji
  emoji_pattern = re.compile("["
    u"\U0001F600-\U0001F64F"
    u"\U0001F300-\U0001F5FF"
    u"\U0001F680-\U0001F6FF"
    u"\U0001F1E0-\U0001F1FF"
    u"\U0001F1F2-\U0001F1F4"
    u"\U0001F1E6-\U0001F1FF"
    u"\U0001F600-\U0001F64F"
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u"\U0001F1F2"
    u"\U0001F1F4"
    u"\U0001F620"
    u"\u200d"
    u"\u2640-\u2642"
    "]+", flags = re.UNICODE)
  text = emoji_pattern.sub(r" ", text)

  #normalize whitespace
  text = " ".join(text.split())

  #lowercasing
  text = text.lower()
  return text

train_df['preprocess_sentence'] = [preprocess_text(row['sentence']) for index, row in train_df.iterrows()]
valid_df['preprocess_sentence'] = [preprocess_text(row['sentence']) for index, row in valid_df.iterrows()]
test_df['preprocess_sentence'] = [preprocess_text(row['sentence']) for index, row in test_df.iterrows()]

In [6]:
train_df['preprocess_sentence'][:10]

0    kifa hoa kiểng như cái tên cây cối xanhh tươi ...
1    hôm nay lại vào bingsu mình không ăn bingsu nữ...
2    đi ngang qua quán hoài mà hôm ni mới có dịp gh...
3    hôm nay ghé ngang thấy quán giảm giá thì thật ...
4    qua nhà ông anh chơi vô tình đi ngang quán này...
5    lâu rồi mình mới ghé lại quán cùng nhóm bạn ch...
6    thích nhất vị cotton candy ngọt dịu dịu mà dễ ...
7    hôm nay là ngày đầu tiên khai trương quán bò n...
8    chỗ chơi thoáng vắng nên chơi cũng thoải mái n...
9    wadan – tinh túy ẩm thực nhật nằm gần khu vực ...
Name: preprocess_sentence, dtype: object

In [7]:
def yield_tokens(sentences, tokinizer):
  for sentence in sentences:
    yield tokenizer(sentence)

from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer("basic_english")
#build vocabulary
from torchtext.vocab import build_vocab_from_iterator

vocab_size = 10000
vocabulary = build_vocab_from_iterator(
    yield_tokens(train_df['preprocess_sentence'], tokenizer),
    max_tokens = vocab_size,
    specials = ["<pad>", "<unk>"]
)

vocabulary.set_default_index(vocabulary["<unk>"])

#convert torchtext dataset
from torchtext.data.functional import to_map_style_dataset

def prepare_dataset(df):
  #create iterator for dataset: (sentence, label)
  for index, row in df.iterrows():
    sentence = row['preprocess_sentence']
    encoded_sentence = vocabulary(tokenizer(sentence))
    label = row['label']
    yield encoded_sentence, label

valid_dataset = prepare_dataset(valid_df)
valid_dataset = to_map_style_dataset(valid_dataset)

train_dataset = prepare_dataset(train_df)
train_dataset = to_map_style_dataset(train_dataset)

test_dataset = prepare_dataset(test_df)
test_dataset = to_map_style_dataset(test_dataset)

In [8]:
#DataLoader

import torch
seq_length = 100

def collate_batch(batch):
  #create inputs, offsets, labels for batch
  sentences, labels = list(zip(*batch))
  encoded_sentences = [
      sentence+([0]* (seq_length - len(sentence))) if len(sentence) < seq_length else sentence[:seq_length]
      for sentence in sentences
  ]
  encoded_sentences = torch.tensor(encoded_sentences, dtype = torch.int64)
  labels = torch.tensor(labels)
  return encoded_sentences, labels

from torch.utils.data import DataLoader
batch_size = 128

train_dataloader = DataLoader(
    train_dataset, batch_size = batch_size, shuffle = True, collate_fn = collate_batch
)
valid_dataloader = DataLoader(
    valid_dataset, batch_size = batch_size, shuffle = False, collate_fn = collate_batch
)
test_dataloader = DataLoader(
    test_dataset, batch_size = batch_size, shuffle = False, collate_fn = collate_batch
)


In [9]:
#Train per epoch
def train_epoch(model, optimizer, criterion, train_dataloader, device, epoch = 0, log_interval = 20):
  model.train()
  losses = []
  total_acc, total_cnt = 0, 0
  start_time = time.time()
  for idx, (inputs, labels) in enumerate(train_dataloader):
    inputs, labels = inputs.to(device), labels.to(device)

    predict = model(inputs)
    #compute loss
    loss = criterion(predict, labels)
    losses.append(loss)
    #optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_acc += (predict.argmax(1) == labels).sum().item()
    total_cnt = labels.size(0)
    if idx % log_interval == 0 and idx > 0:
      elapsed_time = time.time() - start_time
      print(
          " |epoch {:3d} | {:5d} / {:5d} batches "
          "accuracy {:8.3f}".format(
              epoch, idx, len(train_dataloader), total_acc / total_cnt
          )
      )
      total_acc, total_cnt = 0, 0
      start_time = time.time()
  epoch_acc = total_acc / total_cnt
  epoch_loss = sum(losses) / len(losses)
  return epoch_acc, epoch_loss

In [10]:
#evaluate
def evaluate_epoch(model, valid_dataloader, criterion, epoch):
  model.eval()
  total_acc, total_cnt = 0, 0
  losses = []

  with torch.no_grad():
    for idx, (inputs, labels) in enumerate(valid_dataloader):
      inputs, labels = inputs.to(device), labels.to(device)

      predict = model(inputs)

      loss = criterion(predict, labels)
      losses.append(loss)

      total_acc += (predict.argmax(1) == labels).sum().item()
      total_cnt += labels.size(0)

  epoch_acc = total_acc / total_cnt
  epoch_loss = sum(losses) / len(losses)
  return epoch_acc, epoch_loss

In [11]:
def train(model, model_name, save_model, optimizer, criterion, train_dataloader, valid_dataloader, num_epochs, device):
  train_accs, train_losses = [], []
  eval_accs, eval_losses = [], []
  best_loss_eval = 100
  times = []
  for epoch in range(num_epochs):
    epoch_start_time = time.time()
    #Train
    train_acc, train_loss = train_epoch(model, optimizer,criterion, train_dataloader, device, epoch)
    train_accs.append(train_acc)
    train_losses.append(train_loss)

    #Evaluate
    eval_acc, eval_loss = evaluate_epoch(model, valid_dataloader, criterion, epoch)
    eval_accs.append(eval_acc)
    eval_losses.append(eval_loss)

    #Save_best_model
    if eval_loss < best_loss_eval:
      torch.save(model.state_dict(), save_model + f'{model_name}.pt')

    times.append(time.time() - epoch_start_time)
    #Print loss, accuracy and epoch
    print("-" * 59)
    print(
        "| End of epoch {:3d} | Time: {:5.2f}s | Train Accuracy {:8.3f} | Train Loss {:8.3f} |"
        "| Valid Accuracy {:8.3f} | Valid Loss {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, train_acc, train_loss, eval_acc, eval_loss
        )
    )
    print("-" * 59)

    #Load best model
  model.load_state_dict(torch.load(save_model + f'{model_name}.pt'))
  model.eval()
  metrics = {
      'train_accuracy' : train_accs,
      'train_loss' : train_losses,
      'valid_accuracy' : eval_accs,
      'valid_loss' : eval_losses
  }
  return model, metrics

In [12]:
class TransformerEncoderBlock(nn.Module):
  def __init__(self, embed_dim, num_heads, ff_dim, dropout = 0.2):
    super().__init__()
    self.attn = nn.MultiheadAttention(
        embed_dim = embed_dim,
        num_heads = num_heads,
        batch_first = True
    )
    self.ffn = nn.Sequential(
        nn.Linear(in_features = embed_dim, out_features = ff_dim, bias = True),
        nn.ReLU(),
        nn.Linear(in_features = ff_dim, out_features = embed_dim, bias = True)
    )
    self.layernorm_1 = nn.LayerNorm(normalized_shape = embed_dim, eps = 1e-6)
    self.layernorm_2 = nn.LayerNorm(normalized_shape = embed_dim, eps = 1e-6)
    self.dropout_1 = nn.Dropout(p = dropout)
    self.dropout_2 = nn.Dropout(p = dropout)

  def forward(self, query, key, value):
    attn_output, _ = self.attn(query, key, value)
    attn_output = self.dropout_1(attn_output)
    out_1 = self.layernorm_1(query + attn_output)
    ffn_output = self.ffn(out_1)
    ffn_output = self.dropout_2(ffn_output)
    out = self.layernorm_2(out_1 + ffn_output)
    return out

In [13]:
class TransformerEncoder(nn.Module):
  def __init__(self, src_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim, dropout = 0.2, device = 'cpu'):
    super().__init__()
    self.embedding = TokenAndPositionEmbedding(src_vocab_size, embed_dim, max_length,device)
    self.layers = nn.ModuleList(
        [
            TransformerEncoderBlock(
                embed_dim, num_heads, ff_dim, dropout
            ) for i in range(num_layers)
        ]
    )

  def forward(self, x):
    output = self.embedding(x)
    for layer in self.layers:
      output = layer(output, output, output)
    return output

In [14]:
class TokenAndPositionEmbedding(nn.Module):
  def __init__(self, vocab_size, embed_dim, max_length, device):
    super().__init__()
    self.device = device
    self.word_emb = nn.Embedding(
        num_embeddings = vocab_size,
        embedding_dim = embed_dim
    )
    self.pos_emb = nn.Embedding(
        num_embeddings = max_length,
        embedding_dim = embed_dim
    )

  def forward(self, x):
    N, seq_len = x.size()
    positions = torch.arange(0, seq_len).expand(N, seq_len).to(self.device)
    output1 = self.word_emb(x)
    output2 = self.pos_emb(positions)
    output = output1 + output2
    return output

In [15]:
class TransformerEncoderClassifier(nn.Module):
  def __init__(self, vocab_size, max_length, num_layers, embed_dim, num_heads, ff_dim, drop_out = 0.2, device = 'cpu'):
    super().__init__()
    self.encoder = TransformerEncoder(vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim, dropout , device )
    self.pooling = nn.AvgPool1d(kernel_size = max_length)
    self.dropout = nn.Dropout(p = dropout)
    self.fc1 = nn.Linear(in_features = embed_dim, out_features = 20)
    self.fc2 = nn.Linear(in_features = 20, out_features = 2)
    self.relu = nn.ReLU()

  def forward(self, x):
    output = self.encoder(x)
    output = self.pooling(output.permute(0, 2, 1)).squeeze()
    output = self.dropout(output)
    output = self.fc1(output)
    output = self.dropout(output)
    output = self.fc2(output)
    return output

In [16]:
import torch.optim as optim

In [None]:

vocab_size = 10000
max_length = 100
embed_dim = 200
num_layers = 2
num_heads = 4
ff_dim = 128
dropout =0.3

model = TransformerEncoderClassifier(vocab_size, max_length, num_layers, embed_dim, num_heads, ff_dim, dropout)


device = torch.device('cuda' if torch.cuda.is_available () else 'cpu')

model = TransformerEncoderClassifier(vocab_size, max_length, num_layers, embed_dim, num_heads, ff_dim, dropout, device)
model.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr =0.005)
num_epochs = 50
save_model = './ model'
os.makedirs(save_model, exist_ok = True)
model_name = 'model'

model, metrics = train(model, model_name, save_model, optimizer, criterion, train_dataloader, valid_dataloader, num_epochs, device)


 |epoch   0 |    20 /   118 batches accuracy   20.023
 |epoch   0 |    40 /   118 batches accuracy   20.000
 |epoch   0 |    60 /   118 batches accuracy   20.000
 |epoch   0 |    80 /   118 batches accuracy   20.000
 |epoch   0 |   100 /   118 batches accuracy   20.000
-----------------------------------------------------------
| End of epoch   0 | Time:  5.05s | Train Accuracy   86.333 | Train Loss    0.008 || Valid Accuracy    1.000 | Valid Loss    0.000 
-----------------------------------------------------------
 |epoch   1 |    20 /   118 batches accuracy   21.000
 |epoch   1 |    40 /   118 batches accuracy   20.000
 |epoch   1 |    60 /   118 batches accuracy   20.000
 |epoch   1 |    80 /   118 batches accuracy   20.000
 |epoch   1 |   100 /   118 batches accuracy   20.000
-----------------------------------------------------------
| End of epoch   1 | Time:  3.68s | Train Accuracy   86.333 | Train Loss    0.000 || Valid Accuracy    1.000 | Valid Loss    0.000 
----------------

In [None]:
import matplotlib.pyplot as plt

def plot_result (num_epochs, train_accs, eval_accs, train_losses, eval_losses ):
  epochs = list(range(num_epochs))
  fig , axs = plt.subplots(nrows = 1, ncols =2 , figsize = (12 ,6))
  axs[0].plot(epochs , train_accs , label = " Training ")
  axs[0].plot(epochs , eval_accs , label = " Evaluation ")
  axs[1].plot(epochs , train_losses , label = " Training ")
  axs[1].plot(epochs , eval_losses , label = " Evaluation ")
  axs[0].set_xlabel(" Epochs ")
  axs[1].set_xlabel(" Epochs ")
  axs[0].set_ylabel(" Accuracy ")
  axs[1].set_ylabel(" Loss ")
  plt.legend()

In [None]:
train_accs = metrics['train_accuracy']
eval_accs = metrics['valid_accuracy']
train_losses = metrics['train_loss']
eval_losses = metrics['valid_loss']

In [None]:
result = plot_result(num_epochs, train_accs, eval_accs, train_losses, eval_losses)
result