In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import transformers
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, Trainer, TrainingArguments, BertModel
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader, Subset




In [2]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes a single text each time an item is fetched.

    Args:
        texts: Indexable collection of raw texts; each entry is passed through ``str()``.
        labels: Indexable collection of class ids, aligned with ``texts`` by position.
        tokenizer: Object exposing ``encode_plus`` (called with ``return_tensors='pt'``).
        max_length: Length every encoded example is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        # The dataset size follows the number of texts (labels are not consulted here).
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for item ``idx``."""
        sample_text = str(self.texts[idx])
        target = self.labels[idx]

        encoded = self.tokenizer.encode_plus(
            sample_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer output is flattened to 1-D so that a DataLoader can stack items.
        item = {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

In [None]:
class Bert_LSTM_Conv_Fc(nn.Module):
    """Encoder -> 3-layer LSTM -> Conv1d -> mean pooling -> two linear layers.

    Args:
        bert_model: Encoder module; it is called with ``input_ids`` and
            ``attention_mask`` and must return an object exposing
            ``last_hidden_state``.
        num_classes: Number of output classes (width of the final linear layer).
    """

    def __init__(self, bert_model, num_classes):
        super(Bert_LSTM_Conv_Fc, self).__init__()
        self.bert_model = bert_model

        # Take the encoder width from its config when available so that encoders
        # other than the 768-wide base model also work; 768 remains the fallback.
        encoder_config = getattr(bert_model, "config", None)
        encoder_dim = getattr(encoder_config, "hidden_size", 768)

        self.lstm = nn.LSTM(input_size=encoder_dim, hidden_size=256, num_layers=3, batch_first=True, bidirectional=False)
        self.conv1d = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3)

        # fc1 and fc2 are sized to fit the 128 conv channels.
        self.fc1 = nn.Linear(128, 128)  # fc1 input and output are both 128
        self.fc2 = nn.Linear(128, num_classes)  # fc2 output matches the number of classes

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute class logits and, when ``labels`` is given, the cross-entropy loss.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            labels: Optional class-id tensor. Defaults to ``None`` so the model
                can be called for inference without targets.

        Returns:
            ``(loss, logits)`` when ``labels`` is provided, otherwise ``logits``.
        """
        bert_output = self.bert_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        lstm_output, _ = self.lstm(bert_output)  # [batch_size, seq_length, 256]
        lstm_output = lstm_output.permute(0, 2, 1)  # [batch_size, 256, seq_length]
        conv_out = self.conv1d(lstm_output)  # [batch_size, 128, seq_length-2] (because kernel_size=3)
        # NOTE(review): the mean runs over every position, padded ones included
        # (attention_mask is not applied here) — confirm this is intended.
        x_out = conv_out.mean(dim=2)  # [batch_size, 128]
        # NOTE(review): there is no activation between fc1 and fc2.
        x = self.fc1(x_out)  # [batch_size, 128]
        logits = self.fc2(x)  # [batch_size, num_classes]

        if labels is None:
            return logits

        loss = nn.CrossEntropyLoss()(logits, labels)
        return (loss, logits)


In [None]:
# Dataset
# Directory holding the six `<split>_text.txt` / `<split>_labels.txt` files.
DATA_DIR = '/kaggle/input/semeval'


def _read_lines(path):
    """Return every line of ``path`` with surrounding whitespace stripped.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]


def _read_labels(path):
    """Return every line of ``path`` parsed as an ``int``.

    Raises:
        ValueError: If a line is not a valid integer.
    """
    return [int(line) for line in _read_lines(path)]


train_texts = _read_lines(f'{DATA_DIR}/train_text.txt')
train_labels = _read_labels(f'{DATA_DIR}/train_labels.txt')

val_texts = _read_lines(f'{DATA_DIR}/val_text.txt')
val_labels = _read_labels(f'{DATA_DIR}/val_labels.txt')

test_texts = _read_lines(f'{DATA_DIR}/test_text.txt')
test_labels = _read_labels(f'{DATA_DIR}/test_labels.txt')

# Fail fast when a split's texts and labels are not aligned; otherwise the
# mismatch would only surface later as an index error or mis-paired examples.
for _split, _texts, _labels in (
    ('train', train_texts, train_labels),
    ('val', val_texts, val_labels),
    ('test', test_texts, test_labels),
):
    if len(_texts) != len(_labels):
        raise ValueError(
            f"{_split} split: {len(_texts)} texts but {len(_labels)} labels"
        )

In [None]:
# preprocessing text
# remove emoji
import re

# Compiled once at definition time instead of on every call.
# The previous single range U+24C2..U+1F251 also covered CJK ideographs, kana,
# Hangul and many other non-emoji characters; it is replaced by explicit ranges.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
    "\U0001F1E6-\U0001F1FF"  # regional indicators (flags)
    "\U0001F200-\U0001F251"  # Enclosed Ideographic Supplement
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only the code-point ranges listed in ``_EMOJI_PATTERN`` are stripped;
    ordinary letters of any script (including CJK, kana and Hangul) are kept.

    Args:
        text: Input string.

    Returns:
        The string with every run of matching characters deleted.
    """
    return _EMOJI_PATTERN.sub(r'', text)

# lower case
def lower_case(text: str):
    """Return the result of calling ``.lower()`` on ``text``."""
    lowered = text.lower()
    return lowered

# remove @user
def remove_mentions(text):
    """Delete every ``@`` that is followed by one or more word characters, together with those characters."""
    mention_re = re.compile(r'@\w+')
    return mention_re.sub('', text)

def remove_tag(text: str):
    """Delete every ``#`` that is followed by one or more word characters, together with those characters."""
    hashtag_re = re.compile(r'#\w+')
    without_tags = hashtag_re.sub('', text)
    return without_tags

def remove_urls(text):
    """Remove URL-like tokens that start with ``http`` or ``www``.

    A token is removed from its ``http`` / ``www`` prefix up to the next
    whitespace. The prefix must begin at a word boundary, so ordinary words
    that merely contain those letters (e.g. "awwww") are left untouched.

    Args:
        text: Input string.

    Returns:
        The string with matching tokens deleted.
    """
    # `http\S+` already covers `https...`, so no separate https branch is needed.
    url_pattern = re.compile(r'\b(?:http|www)\S+')
    return url_pattern.sub(r'', text)

from bs4 import BeautifulSoup

def remove_html_tags(text):
    """Parse ``text`` with BeautifulSoup's ``"html.parser"`` and return ``get_text()`` of the result."""
    return BeautifulSoup(text, "html.parser").get_text()

def clean_text(data_text: list):
    """Apply the full cleaning pipeline to each text and return the results as a new list.

    Steps, in order: emoji removal, lower-casing, @mention removal,
    #hashtag removal, URL removal, HTML-tag removal.
    """
    pipeline = (
        remove_emojis,
        lower_case,
        remove_mentions,
        remove_tag,
        remove_urls,
        remove_html_tags,
    )

    cleaned = []
    for raw in data_text:
        current = raw
        for step in pipeline:
            current = step(current)
        cleaned.append(current)
    return cleaned

In [None]:
# Use the GPU when torch reports one as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer and encoder are both loaded from the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

# Wrap the encoder with the LSTM/Conv/FC head (3 output classes) and move it to `device`.
bert_lstm_fc = Bert_LSTM_Conv_Fc(bert_model, 3)
bert_lstm_fc = bert_lstm_fc.to(device)

In [None]:
# Length passed to CustomDataset, which pads / truncates every example to it.
max_length = 512

# Run the same cleaning pipeline over each split.
train_clean, val_clean, test_clean = [
    clean_text(split_texts) for split_texts in (train_texts, val_texts, test_texts)
]

# Pair each cleaned split with its labels.
train_dataset, val_dataset, test_dataset = [
    CustomDataset(split_texts, split_labels, tokenizer, max_length=max_length)
    for split_texts, split_labels in (
        (train_clean, train_labels),
        (val_clean, val_labels),
        (test_clean, test_labels),
    )
]

In [None]:
# Accuracy is computed directly with numpy: `datasets.load_metric` is no longer
# available in current `datasets` releases, so importing it breaks this cell.
def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max predictions.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` has the class scores
            on its last axis; if it is a tuple, its first element is used.

    Returns:
        ``{"accuracy": <float>}`` — the fraction of positions where the
        arg-max class equals the label.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.asarray(logits).argmax(-1)
    accuracy = float((predictions == np.asarray(labels)).mean())
    return {"accuracy": accuracy}

In [None]:
training_args = TrainingArguments(
    # --- where artifacts and logs are written ---
    output_dir='./results',
    logging_dir='./logs',
    # --- evaluation / checkpoint / logging cadence ---
    # NOTE(review): no eval_steps is given here — confirm which evaluation
    # interval the installed transformers version applies in that case.
    eval_strategy='steps',
    save_strategy='steps',
    save_steps=1000,
    logging_steps=200,
    save_total_limit=2,
    # --- optimisation budget ---
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # --- best-checkpoint selection: higher accuracy is better ---
    metric_for_best_model='accuracy',
    greater_is_better=True,
    load_best_model_at_end=True,
)

# Initialise the Trainer
trainer = Trainer(
    model=bert_lstm_fc,
    args=training_args,
    compute_metrics=compute_metrics,  # custom evaluation function
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

In [None]:
# Run the trained model over the test split.
predictions = trainer.predict(test_dataset)

# Ground-truth ids as returned by the trainer, and the arg-max class per example.
labels = predictions.label_ids
preds = predictions.predictions.argmax(-1)

# Compute the confusion matrix


In [None]:
# NOTE: this cell is disabled. `le` (used for the tick labels below) is not defined
# in any cell of this notebook, so supply explicit class names before re-enabling it.
# cm = confusion_matrix(labels, preds)
# print("Accuracy: {}".format(accuracy_score(test_labels, preds)))
# # Display the confusion matrix
# plt.figure(figsize=(10, 7))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.title('Confusion Matrix')
# plt.show()

In [None]:
# Persist the model currently held by the trainer to ./best_model.
trainer.save_model(output_dir='./best_model')