In [4]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoModel, AutoTokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import cuda
from torch import nn
import matplotlib.pyplot as plt
from torch.nn import CrossEntropyLoss
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm

In [5]:
train_df = pd.read_csv('processed_train.csv').dropna()
test_df = pd.read_csv('processed_test.csv').dropna()
val_df = pd.read_csv('processed_dev.csv').dropna()
combine_df = pd.concat([train_df, test_df, val_df], axis=0)

In [6]:
from gensim.models import FastText
from underthesea import word_tokenize



tokens_list_combine = [word_tokenize(text) for text in combine_df.text]
train_text_tokens = [word_tokenize(text) for text in train_df.text]
test_text_tokens = [word_tokenize(text) for text in test_df.text]
val_text_tokens = [word_tokenize(text) for text in val_df.text]

# Tạo tập dữ liệu huấn luyện (mỗi câu là một danh sách từ)
sentences = [text.split() for text in combine_df.text]

# Huấn luyện FastText
fasttext = FastText(tokens_list_combine, vector_size=300, window=7, min_count=1, workers=4)

# Kiểm tra vector của một từ
print(fasttext.wv["học"])  # Lấy vector của từ "học"

[ 0.02360233  0.62782395 -0.11770976  0.74578    -0.6986473   0.4496135
 -0.01290128 -0.55761874  0.8163744  -0.5296226  -0.21645746  0.40514842
  0.19725057  0.1587309  -0.21678615 -0.176767    0.36918408  0.16100915
  0.5359108  -0.15905893 -0.29934022 -0.01921103  0.11644315  0.3774891
  0.22186467  1.3479227  -0.40699008  0.5787796  -0.04306652 -0.3846931
  0.03471125  0.24263023  1.0618961   0.04159101  0.35302082  0.22493672
  0.33135733 -0.38879484  0.476584    0.6932649   0.23081763  0.24549386
  0.26727518 -0.59697276 -0.70675045  0.19075868  0.11562022 -0.09988599
  0.16031721 -0.00994855  0.02239961  0.919814   -0.19972864 -0.6629287
  0.54217523  0.24028456  0.11828121  0.37713417 -0.01696712  0.5403281
  0.5920293  -0.43277305  0.30704734 -0.02188688  0.26979274 -0.7358136
  0.40156502 -0.26405737 -0.72858936 -0.5658097  -0.4339098  -0.15935731
  0.39170715  0.35274282 -0.08615943  0.16624998  0.4166899   0.57527596
 -0.4941464   0.24102594 -0.37186357  0.19306757  0.14365

In [11]:
import numpy as np
import torch
import torch.nn as nn
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

tokenizer_data = Tokenizer(filters='!"#$%&*+,-./;<=>?@[\\]^{|}~\t\n')
tokenizer_data.fit_on_texts(tokens_list_combine)

tokenized_data_text_train = tokenizer_data.texts_to_sequences(train_text_tokens)
train_features = pad_sequences(tokenized_data_text_train, maxlen=160)

tokenized_data_text_test = tokenizer_data.texts_to_sequences(test_text_tokens)
test_features = pad_sequences(tokenized_data_text_test, maxlen=160)

tokenized_data_text_val = tokenizer_data.texts_to_sequences(val_text_tokens)
val_features = pad_sequences(tokenized_data_text_val, maxlen=160)

pickle.dump(tokenizer_data, open("tokenizer_data.pkl", "wb"))
data_vocab_size = len(tokenizer_data.word_index)

# Convert to PyTorch tensors
train_features = torch.tensor(train_features, dtype=torch.long)
test_features = torch.tensor(test_features, dtype=torch.long)
val_features = torch.tensor(val_features, dtype=torch.long)

print("input data shape:", train_features.shape)
print("data_vocab_size:", data_vocab_size)
print("training sample:", len(train_features))
print("validation sample:", len(val_features))
print("test sample:", len(test_features))

input data shape: torch.Size([11425, 160])
data_vocab_size: 4094
training sample: 11425
validation sample: 1583
test sample: 3166


In [14]:
import numpy as np
embeddings_index = {}
for w in fasttext.wv.key_to_index.keys():
    embeddings_index[w] = fasttext.wv[w]
print('Found %s word vectors.' % len(embeddings_index))
words = fasttext.wv.key_to_index.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)
embedding_matrix = np.zeros((vocab_size, 300))

# Populate the embedding matrix
for word, i in tokenizer_data.word_index.items():
    if i >= data_vocab_size:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = fasttext.wv[w]

embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)
print(f'Fasttext embedding shape: {embedding_matrix.shape}')

Found 4094 word vectors.
Vocab size 4094
Fasttext embedding shape: torch.Size([4094, 300])


In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN_LSTM_DTHSH(nn.Module):
    def __init__(self, embedding_dim, embedding_matrix, num_labels):
        super(CNN_LSTM_DTHSH, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix)
        
        # Corrected Conv1d layers
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=128, kernel_size=3)
        self.conv2 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=4)  # Fixed in_channels
        
        # Corrected LSTM input_size
        self.lstm1 = nn.LSTM(input_size=embedding_dim, hidden_size=128, num_layers=3, batch_first=True)
        
        # Corrected MultiheadAttention embed_dim
        self.multihead_attn = nn.MultiheadAttention(embed_dim=128, num_heads=8, dropout=0.3, batch_first=True)
        
        self.layer_norm = nn.LayerNorm(256)  # 128 (CNN) + 128 (Attention) = 256
        self.fc1 = nn.Linear(256, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, num_labels)

    def forward(self, x):
        x = self.embedding(x)  # Shape: (batch_size, seq_len, embedding_dim)
        
        # CNN Path
        x_cnn = x.permute(0, 2, 1)  # Shape: (batch_size, embedding_dim, seq_len)
        x_cnn = F.relu(self.conv1(x_cnn))  # Shape: (batch_size, 128, seq_len - 2)
        x_cnn = F.relu(self.conv2(x_cnn))  # Shape: (batch_size, 128, seq_len - 5)
        x_cnn = torch.max(x_cnn, dim=2)[0]  # Global max pooling: (batch_size, 128)
        
        # LSTM + Attention Path
        x_lstm, _ = self.lstm1(x)  # Output shape: (batch_size, seq_len, 128)
        x_att, _ = self.multihead_attn(x_lstm, x_lstm, x_lstm)  # Output shape: (batch_size, seq_len, 128)
        x_att = torch.mean(x_att, dim=1)  # Corrected mean: (batch_size, 128)
        
        # Combine CNN and Attention outputs
        x_combined = torch.cat((x_cnn, x_att), dim=1)  # Shape: (batch_size, 256)
        x_combined = self.layer_norm(x_combined)
        
        # Fully connected layers
        x_combined = F.relu(self.fc1(x_combined))
        x_combined = F.relu(self.fc2(x_combined))
        outputs = self.fc3(x_combined)  # No softmax here if using nn.CrossEntropyLoss
        return outputs       