In [1]:
%%capture
!pip install transformers
!pip install datasets

In [2]:
import os

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import torch.nn as nn
from torch import optim
from torch.utils.data.dataset import Dataset, TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

In [3]:
def get_device():
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")

device = get_device()
print(device)

cuda


In [4]:
import pandas as pd

file_path = '/kaggle/input/vietnamese-sentiment-analyst/data.csv'

df = pd.read_csv(file_path)
df = df[['content', 'label']]

labels_map = {
    "POS": 0,
    "NEU": 1,
    "NEG": 2
}

df['label'] = df['label'].map(labels_map)
df = df.dropna(subset=['content'])

In [5]:
import re
import string

def remove_emoji(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  
        u"\U0001F300-\U0001F5FF"  
        u"\U0001F680-\U0001F6FF"  
        u"\U0001F700-\U0001F77F"  
        u"\U0001F780-\U0001F7FF"  
        u"\U0001F800-\U0001F8FF"  
        u"\U0001F900-\U0001F9FF"  
        u"\U0001FA00-\U0001FA6F"  
        u"\U0001FA70-\U0001FAFF"  
        u"\U00002702-\U000027B0"  
        u"\U000024C2-\U0001F251" 
        "]+", flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

def clean_text(text):
    text = text.lower() 
    text = re.sub(r'[^\w\s]', '', text)  
    text = re.sub(r'\d+', ' <num> ', text)  
    text = re.sub(r'\s+', ' ', text).strip() 
    return text

In [6]:
df_test = df.sample(n=15000, random_state=42)
df_test

Unnamed: 0,content,label
2604,Vẫn tặng bạn 5* vì nhiệt tình,0
16959,Dù có 5k nhưg mất uy tín,1
19018,Hàng đẹp dã man chị trang ạ,0
20374,Và con nào ăn thì con đấy chết,2
11124,Chất lượng tốt có nhiều quà tặng kèm,0
...,...,...
9676,Có giảm nhưng uống vào người rất mệt và buồn nôn,1
622,Đóng gói sản phẩm rất đẹp và chắc chắn Shop ph...,0
6249,Đây chỉ là góp ý và không nỡ để shop bị rate t...,1
926,"Chất vải và kiểu áo đều ok, dễ mặc",0


In [7]:
df_test['content'] = df_test['content'].apply(remove_emoji)  
df_test['content'] = df_test['content'].apply(clean_text)  

In [8]:
train_df, val_df = train_test_split(df_test, test_size=0.2, random_state=42)

In [9]:
from datasets import Dataset
from transformers import DataCollatorWithPadding

checkpoint = 'distilbert-base-multilingual-cased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess_function(examples):
    return tokenizer(examples["content"], truncation=True, padding = True, max_length = 128)

2024-07-09 13:04:06.984539: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-09 13:04:06.984637: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-09 13:04:07.083454: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

In [10]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [11]:
train_dataset = train_dataset.remove_columns(column_names='content')
train_dataset = train_dataset.remove_columns(column_names='__index_level_0__')
train_dataset = train_dataset.remove_columns(column_names='attention_mask')
val_dataset = val_dataset.remove_columns(column_names='content')
val_dataset = val_dataset.remove_columns(column_names='__index_level_0__')
val_dataset = val_dataset.remove_columns(column_names='attention_mask')

In [12]:
train_dataset

Dataset({
    features: ['label', 'input_ids'],
    num_rows: 12000
})

In [13]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=10, collate_fn=data_collator)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=10, collate_fn=data_collator)

In [14]:
for batch in train_loader:
    print(batch.keys())
    break 


dict_keys(['input_ids', 'attention_mask', 'labels'])


In [15]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, max_length):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.3)
        self.fc1 = nn.Linear(hidden_size * 2, 64)
        self.fc2 = nn.Linear(64, 8)
        self.fc3 = nn.Linear(8, output_size)
        self.relu = nn.ReLU()
#         self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        x = self.embedding(x)
        batch_size = x.size(0)
        h0 = torch.zeros(2, batch_size, hidden_size).to(x.device)
        c0 = torch.zeros(2, batch_size, hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        
        out = self.relu(self.fc1(out[:, -1, :]))
        out = self.dropout(out)
        out = self.relu(self.fc2(out))
        out = self.dropout(out)
#         out = self.softmax(self.fc3(out))
        return out

In [16]:
# HYPER PARAMS

vocab_size = tokenizer.vocab_size
embedding_dim = 128
hidden_size = 64
output_size = 3
max_length = 128
num_epochs = 10
device = device

print(f"Vocabulary size: {vocab_size}")
print(f"Embedding dimension: {embedding_dim}")
print(f"Hidden size: {hidden_size}")
print(f"Output size: {output_size}")
print(f"Max length: {max_length}")
print(f"Number of epochs: {num_epochs}")

Vocabulary size: 119547
Embedding dimension: 128
Hidden size: 64
Output size: 3
Max length: 128
Number of epochs: 10


In [17]:
import torch
import torch.nn as nn
import torch.optim as optim

def train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs=10, save_best_model_path=None, save_last_model_path=None):
    best_val_accuracy = 0.0
    best_model_state_dict = None

    for epoch in range(num_epochs):
        model.train()
        running_train_loss = 0.0
        correct_predictions = 0
        total_predictions = 0
        
        # Training loop
        for i, batch in enumerate(train_loader):
            inputs, labels = batch['input_ids'], batch['labels'] 
            
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            running_train_loss += loss.item()
            
            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)
        
        # Calculate average train loss and accuracy for the epoch
        train_loss = running_train_loss / len(train_loader)
        train_accuracy = correct_predictions / total_predictions
        
        # Validation loop
        val_loss, val_accuracy = evaluate(model, val_loader, criterion, device)
        
        # Save best model
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_model_state_dict = model.state_dict()
            if save_best_model_path:
                torch.save(model.state_dict(), save_best_model_path)
        
        # Print epoch statistics
        print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Accuracy: {val_accuracy:.4f}")
        
    # Save last model
    if save_last_model_path:
        torch.save(model.state_dict(), save_last_model_path)
        
    print("Training finished.")
    
    return best_model_state_dict, best_val_accuracy

# Evaluation function (for validation and testing)
def evaluate(model, loader, criterion, device):
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    running_val_loss = 0.0
    
    with torch.no_grad():
        for i, batch in enumerate(loader):
            inputs, labels = batch['input_ids'], batch['labels']
            
            inputs, labels = inputs.to(device), labels.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_val_loss += loss.item()
            
            _, predicted = torch.max(outputs, 1)
#             print(predicted)
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)
    
    val_loss = running_val_loss / len(loader)
    val_accuracy = correct_predictions / total_predictions
    
    return val_loss, val_accuracy


In [18]:
model = LSTMModel(vocab_size, embedding_dim, hidden_size, output_size, max_length).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs=20)

Epoch [1/20], Train Loss: 1.4818, Val Loss: 1.1812, Train Accuracy: 0.5256, Val Accuracy: 0.6367
Epoch [2/20], Train Loss: 1.4859, Val Loss: 1.2484, Train Accuracy: 0.5173, Val Accuracy: 0.6367
Epoch [3/20], Train Loss: 1.4791, Val Loss: 1.2281, Train Accuracy: 0.5188, Val Accuracy: 0.6367
Epoch [4/20], Train Loss: 1.4678, Val Loss: 1.2370, Train Accuracy: 0.5182, Val Accuracy: 0.6367
Epoch [5/20], Train Loss: 1.4698, Val Loss: 1.2104, Train Accuracy: 0.5160, Val Accuracy: 0.6367
Epoch [6/20], Train Loss: 1.4566, Val Loss: 1.2196, Train Accuracy: 0.5212, Val Accuracy: 0.6367
Epoch [7/20], Train Loss: 1.4594, Val Loss: 1.0371, Train Accuracy: 0.5178, Val Accuracy: 0.7137
Epoch [8/20], Train Loss: 1.2285, Val Loss: 0.8265, Train Accuracy: 0.6073, Val Accuracy: 0.7737
Epoch [9/20], Train Loss: 1.1439, Val Loss: 0.8036, Train Accuracy: 0.6627, Val Accuracy: 0.7747
Epoch [10/20], Train Loss: 1.1153, Val Loss: 0.7834, Train Accuracy: 0.6909, Val Accuracy: 0.7810
Epoch [11/20], Train Loss: 1.

(OrderedDict([('embedding.weight',
               tensor([[ 0.3786,  0.6287, -0.2462,  ..., -0.7713, -0.9360, -0.2332],
                       [ 0.3519,  0.3484, -0.4021,  ..., -0.1368, -0.8654, -1.7044],
                       [ 1.1962, -2.1301,  0.6990,  ...,  0.6244,  0.7711, -1.4690],
                       ...,
                       [ 1.2096,  0.2743,  1.3105,  ...,  1.2970, -0.6052, -1.2101],
                       [-0.0731, -0.4180, -0.7601,  ..., -0.5014,  0.3567,  0.7195],
                       [ 0.9466, -0.5554, -0.5342,  ..., -0.7263, -0.6534, -0.6060]],
                      device='cuda:0')),
              ('lstm.weight_ih_l0',
               tensor([[ 1.5408e-04,  4.7901e-01, -6.7895e-02,  ..., -1.0348e-01,
                        -2.9151e-02,  3.9889e-01],
                       [-1.7677e-01,  8.7951e-02, -1.1052e-01,  ..., -1.7651e-01,
                         1.4918e-01, -1.7673e-01],
                       [ 7.9375e-02,  1.2843e-01, -2.4665e-03,  ..., -1.1243e-01,
 