In [35]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoModel, AutoTokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import cuda
from torch import nn
import matplotlib.pyplot as plt
from torch.nn import CrossEntropyLoss
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm

In [36]:
train_df = pd.read_csv('processed_train.csv').dropna()
test_df = pd.read_csv('processed_test.csv').dropna()
val_df = pd.read_csv('processed_dev.csv').dropna()
combine_df = pd.concat([train_df, test_df, val_df], axis=0)

In [37]:
from gensim.models import FastText
from underthesea import word_tokenize



tokens_list_combine = [word_tokenize(text) for text in combine_df.text]
train_text_tokens = [word_tokenize(text) for text in train_df.text]
test_text_tokens = [word_tokenize(text) for text in test_df.text]
val_text_tokens = [word_tokenize(text) for text in val_df.text]

# Tạo tập dữ liệu huấn luyện (mỗi câu là một danh sách từ)
sentences = [text.split() for text in combine_df.text]

# Huấn luyện FastText
fasttext = FastText(tokens_list_combine, vector_size=100, window=10, min_count=5, workers=4)

# Kiểm tra vector của một từ
print(fasttext.wv["học"])  # Lấy vector của từ "học"

[-0.6153785  -0.61388886  0.90783226  0.5145612  -0.3856001   0.6797288
  0.47787538  1.5822386  -0.531349    1.0337026  -0.62286025  0.59915113
  0.2966425  -0.06621573 -0.8295355  -1.3731369   1.5562274  -0.21414232
  0.09784391 -1.842506    0.45541006 -0.40848184 -1.1805248   2.0406997
  0.30389574 -0.2157323   0.35437942 -0.21296045  0.02300942  0.3998008
  0.6409063  -0.276465    0.96379364  0.08648127  0.3920531   0.35120586
  0.6351549   0.05228269 -0.4131215   1.063778   -0.23628558  0.12369056
  0.872923   -0.776259    0.5232993   0.2150041  -0.01439277 -0.02133981
 -0.02704658 -0.22635224 -0.16978635  0.50162387  0.24102178 -0.78074455
 -0.07552272  0.2646801  -1.1614178   0.48484072  0.35768503  0.30494592
  0.21741958  0.3005198  -0.31258944 -0.1500939  -0.03845323  0.8236267
 -0.5461406   0.15894063 -0.03859643 -0.03383168 -0.16196874 -1.6143469
  0.37854356  0.16508856 -0.3793962  -0.30716252 -0.09697782 -0.30116063
 -0.0369316  -1.0605574  -0.64138263 -0.34890005  0.0574

In [38]:
import numpy as np
import torch
import torch.nn as nn
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

tokenizer_data = Tokenizer(filters='!"#$%&*+,-./;<=>?@[\\]^{|}~\t\n')
tokenizer_data.fit_on_texts(tokens_list_combine)

tokenized_data_text_train = tokenizer_data.texts_to_sequences(train_text_tokens)
train_features = pad_sequences(tokenized_data_text_train, maxlen=160)

tokenized_data_text_test = tokenizer_data.texts_to_sequences(test_text_tokens)
test_features = pad_sequences(tokenized_data_text_test, maxlen=160)

tokenized_data_text_val = tokenizer_data.texts_to_sequences(val_text_tokens)
val_features = pad_sequences(tokenized_data_text_val, maxlen=160)

pickle.dump(tokenizer_data, open("tokenizer_data.pkl", "wb"))
data_vocab_size = len(tokenizer_data.word_index) + 1

# Convert to PyTorch tensors
train_features = torch.tensor(train_features, dtype=torch.long)
test_features = torch.tensor(test_features, dtype=torch.long)
val_features = torch.tensor(val_features, dtype=torch.long)

print("input data shape:", train_features.shape)
print("data_vocab_size:", data_vocab_size)
print("training sample:", len(train_features))
print("validation sample:", len(val_features))
print("test sample:", len(test_features))

input data shape: torch.Size([11425, 160])
data_vocab_size: 4095
training sample: 11425
validation sample: 1583
test sample: 3166


In [39]:
import numpy as np
embeddings_index = {}
for w in fasttext.wv.key_to_index.keys():
    embeddings_index[w] = fasttext.wv[w]
print('Found %s word vectors.' % len(embeddings_index))
words = fasttext.wv.key_to_index.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)
# Initialize the embedding matrix with the correct shape
embedding_dim = 100  # FastText vector size
embedding_matrix = np.zeros((10000, embedding_dim))

# Populate the embedding matrix
for word, i in tokenizer_data.word_index.items():
    if i >= data_vocab_size:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)
print(f'Fasttext embedding shape: {embedding_matrix.shape}')

Found 1339 word vectors.
Vocab size 1339
Fasttext embedding shape: torch.Size([10000, 100])


In [40]:
y_train = torch.tensor(train_df["label"].astype("category").cat.codes.tolist())
y_test = torch.tensor(test_df["label"].astype("category").cat.codes.tolist())
y_val = torch.tensor(val_df["label"].astype("category").cat.codes.tolist())

In [54]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN_LSTM_DTHSH(nn.Module):
    def __init__(self, embedding_dim, embedding_matrix, num_labels):
        super(CNN_LSTM_DTHSH, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix)
        
        # Corrected Conv1d layers
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=128, kernel_size=3)
        self.conv2 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=3)  # Fixed in_channels
        
        # Corrected LSTM input_size
        self.lstm1 = nn.LSTM(input_size=embedding_dim, hidden_size=384, num_layers=3, batch_first=True)
        
        # Corrected MultiheadAttention embed_dim
        self.multihead_attn = nn.MultiheadAttention(embed_dim=384, num_heads=8, dropout=0.3, batch_first=True)
        
        self.layer_norm = nn.LayerNorm(512)  # 128 (CNN) + 128 (Attention) = 256
        self.fc1 = nn.Linear(512, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, num_labels)

    def forward(self, x):
        x = self.embedding(x)  # Shape: (batch_size, seq_len, embedding_dim)
        
        # CNN Path
        x_cnn = x.permute(0, 2, 1)  # Shape: (batch_size, embedding_dim, seq_len)
        x_cnn = F.relu(self.conv1(x_cnn))  # Shape: (batch_size, 128, seq_len - 2)
        x_cnn = F.relu(self.conv2(x_cnn))  # Shape: (batch_size, 128, seq_len - 5)
        x_cnn = torch.max(x_cnn, dim=2)[0]  # Global max pooling: (batch_size, 128)
        
        # LSTM + Attention Path
        x_lstm, _ = self.lstm1(x)  # Output shape: (batch_size, seq_len, 128)
        x_att, _ = self.multihead_attn(x_lstm, x_lstm, x_lstm)  # Output shape: (batch_size, seq_len, 128)
        x_att = torch.mean(x_att, dim=1)  # Corrected mean: (batch_size, 128)
        
        # Combine CNN and Attention outputs
        x_combined = torch.cat((x_cnn, x_att), dim=1)  # Shape: (batch_size, 256)
        x_combined = self.layer_norm(x_combined)
        
        # Fully connected layers
        x_combined = F.relu(self.fc1(x_combined))
        x_combined = F.relu(self.fc2(x_combined))
        outputs = self.fc3(x_combined)  # No softmax here if using nn.CrossEntropyLoss
        return outputs       

In [42]:
print("Max token index in train_features:", train_features.max().item())
print("Max token index in val_features:", val_features.max().item())
print("Max token index in test_features:", test_features.max().item())

Max token index in train_features: 3568
Max token index in val_features: 4094
Max token index in test_features: 3920


In [55]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_dim = 100
embedding_matrix = embedding_matrix.to(device)
num_labels = 3
model = CNN_LSTM_DTHSH(embedding_dim, embedding_matrix, num_labels).to(device)
print(model)

epochs = 50
batch_size = 64
early_stopping_patience = 5
early_stopping_counter = 0
best_val_loss = float('inf')

train_dataset = TensorDataset(train_features, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = TensorDataset(val_features, y_val)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataset = TensorDataset(test_features, y_test)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)
criterion = nn.CrossEntropyLoss()

print("train_dataloader length:", len(train_dataloader))
print("val_dataloader length:", len(val_dataloader))
print("test_dataloader length:", len(test_dataloader))

print("Training on device:", device)

print("Training...")

# Training loop with accuracy calculation
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}/{epochs}"):
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    train_loss /= len(train_dataloader)
    train_accuracy = 100. * train_correct / train_total
    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_accuracy:.2f}%")

    # Validation
    model.eval()
    val_loss = 0.0
    val_preds = []
    val_labels = []
    
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validating"):
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            
            _, predicted = torch.max(outputs.data, 1)
            val_preds.extend(predicted.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    val_loss /= len(val_dataloader)
    val_accuracy = accuracy_score(val_labels, val_preds) * 100
    print(f"Epoch {epoch+1}/{epochs}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stopping_counter = 0
        torch.save(model.state_dict(), "best_model.pth")
        print("Model saved!")
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= early_stopping_patience:
            print("Early stopping triggered.")
            break

print("Evaluating on test set...")
model.load_state_dict(torch.load("best_model.pth"))
model.eval()

test_preds = []
test_labels = []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        test_preds.extend(predicted.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Print test metrics
print("\nTest Set Results:")
print("Confusion Matrix:")
print(confusion_matrix(test_labels, test_preds))
print("\nClassification Report:")
print(classification_report(test_labels, test_preds, 
                            target_names=[f"Class {i}" for i in range(num_labels)]))


CNN_LSTM_DTHSH(
  (embedding): Embedding(10000, 100)
  (conv1): Conv1d(100, 128, kernel_size=(3,), stride=(1,))
  (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,))
  (lstm1): LSTM(100, 384, num_layers=3, batch_first=True)
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True)
  )
  (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (fc1): Linear(in_features=512, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=3, bias=True)
)
train_dataloader length: 179
val_dataloader length: 25
test_dataloader length: 50
Training on device: cpu
Training...


Training Epoch 1/50:  37%|███▋      | 66/179 [01:08<01:57,  1.04s/it]


KeyboardInterrupt: 