# 1. Download Dataset NTC-SCV

In [1]:
!git clone https://github.com/congnghia0609/ntc-scv.git
!unzip ./ntc-scv/data/data_test.zip -d ./data
!unzip ./ntc-scv/data/data_train.zip -d ./data
!rm -rf ./ntc-scv

Cloning into 'ntc-scv'...
remote: Enumerating objects: 39, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 39 (delta 0), reused 4 (delta 0), pack-reused 35[K
Unpacking objects: 100% (39/39), 186.94 MiB | 10.10 MiB/s, done.
Archive:  ./ntc-scv/data/data_test.zip
   creating: ./data/data_test/
   creating: ./data/data_test/test/
   creating: ./data/data_test/test/neg/
  inflating: ./data/data_test/test/neg/10.txt  
  inflating: ./data/data_test/test/neg/10014.txt  
  inflating: ./data/data_test/test/neg/1003.txt  
  inflating: ./data/data_test/test/neg/10044.txt  
  inflating: ./data/data_test/test/neg/10055.txt  
  inflating: ./data/data_test/test/neg/1007.txt  
  inflating: ./data/data_test/test/neg/10070.txt  
  inflating: ./data/data_test/test/neg/10076.txt  
  inflating: ./data/data_test/test/neg/10079.txt  
  inflating: ./data/data_test/test/neg/10085.txt  
  inflating: ./data/data_test/tes

In [2]:
import os
import pandas as pd

def load_data_from_path(folder_path):
    examples = []
    for label in os.listdir(folder_path):
        full_path = os.path.join(folder_path, label)
        for file_name in os.listdir(full_path):
            file_path = os.path.join(full_path, file_name)
            with open(file_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
            sentence = " ".join(lines)
            if label == "neg":
                label = 0
            if label == "pos":
                label = 1
            data = {
                'sentence': sentence,
                'label': label
            }
            examples.append(data)
    return pd.DataFrame(examples)

folder_paths = {
    'train': '/kaggle/working/data/data_train/train',
    'valid': '/kaggle/working/data/data_train/test',
    'test': '/kaggle/working/data/data_test/test'
}

train_df = load_data_from_path(folder_paths['train'])
valid_df = load_data_from_path(folder_paths['valid'])
test_df = load_data_from_path(folder_paths['test'])

# 2. Preprocessing

In [3]:
!pip install langid



In [4]:
from langid.langid import LanguageIdentifier, model

def identify_vn(df):
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    not_vi_idx = set()
    THRESHOLD = 0.9
    for idx, row in df.iterrows():
        score = identifier.classify(row["sentence"])
        if score[0] != "vi" or (score[0] == "vi" and score[1] <= THRESHOLD):
            not_vi_idx.add(idx)
    vi_df = df[~df.index.isin(not_vi_idx)]
    not_vi_df = df[df.index.isin(not_vi_idx)]
    return vi_df, not_vi_df

train_df_vi, train_df_other = identify_vn(train_df)

In [5]:
import re
import string

def preprocess_text(text):
    
    url_pattern = re.compile(r'https?://\s+\wwww\.\s+')
    text = url_pattern.sub(r" ", text)
    html_pattern = re.compile(r'<[^<>]+>')
    text = html_pattern.sub(" ", text)
    
    replace_chars = list(string.punctuation + string.digits)
    for char in replace_chars:
        text = text.replace(char," ")
    
    emoji_pattern = re.compile("["
                                u"\U0001F600-\U0001F64F"
                                u"\U0001F300-\U0001F5FF"
                                u"\U0001F680-\U0001F6FF"
                                u"\U0001F1E0-\U0001F1FF"
                                u"\U0001F1F2-\U0001F1F4"
                                u"\U0001F1E6-\U0001F1FF"
                                u"\U0001F600-\U0001F64F"
                                u"\U00002702-\U000027B0"
                                u"\U000024C2-\U0001F251"
                                u"\U0001f926-\U0001f937"
                                u"\U0001F1F2"
                                u"\U0001F1F4"
                                u"\U0001F620"
                                u"\u200d"
                                u"\u2640-\u2642"
                               "]+", flags=re.UNICODE)
    
    text = emoji_pattern.sub(r" ", text)
    
    text = " ".join(text.split())
    
    return text.lower()

In [6]:
train_df_vi['preprocess_sentence'] = [
    preprocess_text(row['sentence']) for index, row in train_df_vi.iterrows()
]
valid_df['preprocess_sentence'] = [
    preprocess_text(row['sentence']) for index, row in valid_df.iterrows()
]
test_df['preprocess_sentence'] = [
    preprocess_text(row['sentence']) for index, row in test_df.iterrows()
]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_vi['preprocess_sentence'] = [


In [7]:
train_df_vi['preprocess_sentence']

0        quán rất đẹp khá yên tĩnh nhân viên dễ thương ...
1        giá cả bình dân thức uống và thức ăn rất ngon ...
2        lần nào có dịp cũng ghé quá cả quán đủ rộng ca...
3        chủ quán zui zẻ dễ thương đồ uống ngon và rẻ c...
4        được bạn bè rủ rê lên phố cổ chơi lang thang h...
                               ...                        
29995    mình cũng vừa quay lại quán ngói vào hôm chiều...
29996    quầy order máy lạnh không biết bị gì mà đi lần...
29997    hôm nay đi ăn trưa vào nhầm quán vì cùng là số...
29998    mình đã có xem review về hana từ trước và có t...
29999    bò lá lốt ngon mà rẻ nữa mua phần cho rau bánh...
Name: preprocess_sentence, Length: 29736, dtype: object

# 3. Text Vectorization

In [8]:
# Word-based tokenizer
from torchtext.data.utils import  get_tokenizer
tokenizer = get_tokenizer("basic_english")

# Create iter dataset
def yield_tokens(sentences, tokenizer):
    for sentence in sentences:
        yield tokenizer(sentence)

# Build vocabulary
from torchtext.vocab import build_vocab_from_iterator

vocab_size = 10000
vocabulary = build_vocab_from_iterator(
    yield_tokens(train_df_vi['preprocess_sentence'], tokenizer),
    max_tokens=vocab_size,
    specials=["<unk>"]
)
vocabulary.set_default_index(vocabulary["<unk>"])

# Convert iter into torchtext dataset
from torchtext.data.functional import to_map_style_dataset
def prepare_dataset(df):
    for index, row in df.iterrows():
        sentence = row['preprocess_sentence']
        encoded_sentence = vocabulary(tokenizer(sentence))
        label = row['label']
        yield encoded_sentence, label

train_dataset = prepare_dataset(train_df_vi)
train_dataset = to_map_style_dataset(train_dataset)

valid_dataset = prepare_dataset(valid_df)
valid_dataset = to_map_style_dataset(valid_dataset)

In [9]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    encoded_sentences, labels, offsets = [], [], [0]
    for encoded_sentence, label in batch:
        labels.append(label)
        encoded_sentence = torch.tensor(encoded_sentence, dtype=torch.int64)
        encoded_sentences.append(encoded_sentence)
        offsets.append(encoded_sentence.size(0))
        
    labels = torch.tensor(labels, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    encoded_sentences = torch.cat(encoded_sentences)
    
    return encoded_sentences.to(device), offsets.to(device), labels.to(device)

from torch.utils.data import DataLoader

batch_size = 128
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch
)

valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch
)

# 4. Classify Model

In [10]:
from torch import nn

class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()
    
    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
    
    def forward(self, inputs, offsets):
        embedded = self.embedding(inputs, offsets)
        return self.fc(embedded)

num_class = len(set(train_df_vi['label']))
vocab_size = len(vocabulary)
embed_dim = 256
model = TextClassificationModel(vocab_size, embed_dim, num_class).to(device)

In [11]:
learning_rate = 5

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# 5. Training

In [12]:
import time 

def train(model, optimizer, criterion, train_dataloader, epoch=0, log_interval=25):
    model.train()
    total_acc, total_count = 0,0 
    losses = []
    start_time = time.time()
    
    for idx, (inputs, offsets, labels) in enumerate(train_dataloader):
        optimizer.zero_grad()
        predictions = model(inputs, offsets)
        
        # Compute Loss
        loss = criterion(predictions, labels)
        losses.append(loss.item())
        
        # Backward
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predictions.argmax(1) == labels).sum().item()
        total_count += labels.size(0)
        
        if idx % log_interval == 0 and idx >0:
            elapsed = time.time() - start_time
            print(
            "| Epoch {:3d} | {:5d}/{:5d} batches"
            "| Accuracy {:8.3f}".format(epoch, idx, len(train_dataloader), total_acc / total_count)
            )
            
            total_acc, total_count = 0, 0
            start_time = time.time()
    
    epoch_acc = total_acc / total_count 
    epoch_loss = sum(losses) / len(losses)
    return epoch_acc, epoch_loss

In [13]:
def evaluate(model, criterion, valid_dataloader):
    model.eval()
    total_acc, total_count = 0, 0
    losses = []
    
    with torch.no_grad():
        for idx, (inputs, offsets, labels) in  enumerate(valid_dataloader):
            predictions = model(inputs, offsets)
            loss = criterion(predictions, labels)
            losses.append(loss)
            total_acc += (predictions.argmax(1) == labels).sum().item()
            total_count += labels.size(0)
    
    epoch_acc = total_acc / total_count
    epoch_loss = sum(losses) / len(losses)
    return epoch_acc , epoch_loss

In [14]:
num_class = len(set(train_df_vi['label']))
vocab_size = len(vocabulary)
embed_dim = 100
model = TextClassificationModel(vocab_size, embed_dim, num_class).to(device)

learning_rate = 5
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

num_epochs = 5
for epoch in range(1, num_epochs+1):
    epoch_start_time = time.time()
    train_acc, train_loss = train(model, optimizer, criterion, train_dataloader, epoch)
    eval_acc, eval_loss = evaluate(model, criterion, valid_dataloader)
    print("-"*59)
    print(
        "| End of epoch {:3d} Time: {:5.2f}s | Train Acc {:8.3f} | Train Loss: {:8.3f}"
        "| Valid Acc {:8.3f} | Valid Loss {:8.3f}".format(
            epoch, time.time() - epoch_start_time, train_acc, train_loss, eval_acc, eval_loss)
    )
    print("-"*59)

| Epoch   1 |    25/  233 batches| Accuracy    0.565
| Epoch   1 |    50/  233 batches| Accuracy    0.733
| Epoch   1 |    75/  233 batches| Accuracy    0.786
| Epoch   1 |   100/  233 batches| Accuracy    0.803
| Epoch   1 |   125/  233 batches| Accuracy    0.814
| Epoch   1 |   150/  233 batches| Accuracy    0.829
| Epoch   1 |   175/  233 batches| Accuracy    0.835
| Epoch   1 |   200/  233 batches| Accuracy    0.850
| Epoch   1 |   225/  233 batches| Accuracy    0.849
-----------------------------------------------------------
| End of epoch   1 Time:  2.45s | Train Acc    0.864 | Train Loss:    0.463| Valid Acc    0.835 | Valid Loss    0.395
-----------------------------------------------------------
| Epoch   2 |    25/  233 batches| Accuracy    0.858
| Epoch   2 |    50/  233 batches| Accuracy    0.850
| Epoch   2 |    75/  233 batches| Accuracy    0.850
| Epoch   2 |   100/  233 batches| Accuracy    0.843
| Epoch   2 |   125/  233 batches| Accuracy    0.859
| Epoch   2 |   150/

#  6. Evaluation

In [15]:
def predict(text):
    with torch.no_grad():
        encoded = torch.tensor(vocabulary(tokenizer(text)))
        output = model(encoded, torch.tensor([0]))
        return output.argmax(1).item()

In [16]:
predictions, labels = [], []
for index, row in test_df.iterrows():
    sentence = row['preprocess_sentence']
    label = row['label']
    prediction = predict(sentence)
    predictions.append(prediction)
    labels.append(label)

sum(torch.tensor(predictions) == torch.tensor(labels)) / len(labels)

tensor(0.8700)

In [17]:
vocab_size = 7
embedding_dim = 4
embedding_sum = nn.EmbeddingBag(vocab_size, embedding_dim, mode='sum')
inputs = torch.tensor([1, 2, 4, 5, 4, 3], dtype=torch.long)
offsets = torch.tensor([0, 3], dtype=torch.long)
outputs = embedding_sum(inputs, offsets)
outputs.shape

torch.Size([2, 4])