In [4]:
import json
import underthesea # For Vietnamese word tokenization
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np
from seqeval.metrics import classification_report
import time
import os

# --- Configuration ---
# Module-level settings read by the data pipeline, the model constructor and
# the training script further down in this file.
# Corpus path; load_data() parses this file with json.load despite the .txt suffix.
DATA_FILE = 'output_semeval_format_v3_no_overlap.txt'
MAX_LEN = 100       # Every sentence is truncated or right-padded to this many tokens
EMBEDDING_DIM = 100 # Dimension of word embeddings
LSTM_HIDDEN_DIM = 128 # Dimension of LSTM hidden states (per direction)
CNN_FILTERS = 50    # Number of Conv1d output channels built for each kernel size
CNN_KERNEL_SIZES = [2, 3, 4] # One Conv1d layer is built per kernel size
DROPOUT_RATE = 0.5  # Dropout probability used on the embeddings and on the LSTM output
BATCH_SIZE = 32     # Batch size of the train/validation/test DataLoaders
EPOCHS = 10         # Number of passes over the training set; adjust as needed
LEARNING_RATE = 0.001 # Learning rate handed to the Adam optimizer
PAD_TOKEN = "<PAD>" # Padding token; build_vocab_and_tags maps it to index 0
UNK_TOKEN = "<UNK>" # Out-of-vocabulary token; build_vocab_and_tags maps it to index 1

# --- 1. Load and Prepare Data ---

def load_data(filepath):
    """Load the list of annotated sentence records from a JSON file.

    Two layouts are accepted: ``{"sentences": {"sentence": [...]}}`` or a
    top-level JSON list of sentence records.

    Args:
        filepath: Path to a UTF-8 encoded JSON file.

    Returns:
        The sentence list. An empty list is returned (and the reason is
        printed) when the file cannot be read, is not valid JSON, or matches
        neither supported layout; no exception is propagated for those cases.
    """
    # Only the I/O and parsing are guarded, so unrelated programming errors
    # are no longer swallowed by a blanket handler.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error loading data: {e}")
        return []

    if isinstance(data, list):  # Data is directly a list of sentences
        return data
    if isinstance(data, dict):
        sentences = data.get("sentences")
        if isinstance(sentences, dict) and "sentence" in sentences:
            return sentences["sentence"]

    print("Error loading data: Cannot find sentence list in the JSON structure")
    return []

def word_tokenize(sentence):
    """Split a Vietnamese sentence into word tokens via underthesea."""
    tokens = underthesea.word_tokenize(sentence)
    return tokens

def compute_token_spans(text, tokens):
    """Return the ``(start, end)`` character span of every token in ``text``.

    Tokens are laid out left to right. Before each token any whitespace
    actually present in ``text`` is skipped, so the offsets stay aligned when
    punctuation is attached to the previous word ("tot.") or when tokens are
    separated by more than one space, instead of assuming exactly one space
    between consecutive tokens.

    Args:
        text: The original sentence string.
        tokens: Tokens produced from ``text``, in order.

    Returns:
        A list of ``(start, end)`` tuples (end exclusive), one per token.
    """
    spans = []
    cursor = 0
    text_len = len(text)
    for token in tokens:
        while cursor < text_len and text[cursor].isspace():
            cursor += 1
        end = cursor + len(token)
        spans.append((cursor, end))
        cursor = end
    return spans


def generate_bio_tags(token_spans, aspects):
    """Build unified ``B-/I-<category>-<polarity>`` tags for one sentence.

    Args:
        token_spans: ``(start, end)`` character spans, one per token.
        aspects: Iterable of aspect dicts with ``from``, ``to``, ``category``
            and ``polarity`` entries. A falsy value is treated as "no aspects".

    Returns:
        A list of tag strings with the same length as ``token_spans``. Tokens
        not covered by any valid aspect are tagged ``'O'``.
    """
    tags = ['O'] * len(token_spans)
    for aspect in aspects or ():
        try:
            aspect_start = int(aspect['from'])
            aspect_end = int(aspect['to'])
            category = aspect['category'].strip().replace(" ", "_")
            polarity = aspect['polarity'].strip()
        except (ValueError, KeyError, TypeError, AttributeError):
            # Skip malformed aspects: missing keys, non-numeric offsets, or
            # None / non-string values.
            continue
        if not category or polarity not in ('positive', 'negative', 'neutral'):
            continue

        b_tag = f"B-{category}-{polarity}"
        i_tag = f"I-{category}-{polarity}"
        first_token_in_span = True

        for i, (tok_start, tok_end) in enumerate(token_spans):
            token_overlaps = (tok_start >= aspect_start and tok_start < aspect_end) or \
                             (aspect_start >= tok_start and aspect_start < tok_end)
            # Never overwrite a tag set by an earlier aspect.
            if token_overlaps and tags[i] == 'O':
                if first_token_in_span:
                    tags[i] = b_tag
                    first_token_in_span = False
                else:
                    tags[i] = i_tag
    return tags


def build_vocab_and_tags(data):
    """Build the word vocabulary and the unified tag vocabulary.

    Args:
        data: Iterable of sentence records; records lacking a ``'text'`` or
            ``'aspects'`` key are ignored.

    Returns:
        ``(word2idx, idx2word, tag2idx, idx2tag)``. ``PAD_TOKEN`` maps to 0 and
        ``UNK_TOKEN`` to 1; tag indices follow the sorted order of tag names.
    """
    words = Counter()
    tags = Counter()
    tags['O'] = 1 # Ensure 'O' tag exists

    print("Building vocabulary and tag set...")
    for sentence_data in data:
        if 'text' not in sentence_data or 'aspects' not in sentence_data:
            continue
        text = sentence_data['text']
        tokens = word_tokenize(text)
        words.update(tokens)

        # Tags are generated here only to enumerate the tag set; the same
        # helpers are used by preprocess_data so both stay in agreement.
        token_spans = compute_token_spans(text, tokens)
        tags.update(generate_bio_tags(token_spans, sentence_data['aspects']))

    # Create word to index mapping
    word2idx = {word: i + 2 for i, word in enumerate(words)} # Start from 2
    word2idx[PAD_TOKEN] = 0
    word2idx[UNK_TOKEN] = 1
    idx2word = {i: word for word, i in word2idx.items()}

    # Create tag to index mapping
    tag2idx = {tag: i for i, tag in enumerate(sorted(tags.keys()))}
    idx2tag = {i: tag for tag, i in tag2idx.items()}

    print(f"Vocabulary size: {len(word2idx)}")
    print(f"Tag set size: {len(tag2idx)}")

    return word2idx, idx2word, tag2idx, idx2tag


def preprocess_data(data, word2idx, tag2idx, max_len):
    """Convert sentences and their tags to fixed-length index sequences.

    Args:
        data: Iterable of sentence records; records lacking a ``'text'`` or
            ``'aspects'`` key are ignored.
        word2idx: Word-to-index mapping containing ``PAD_TOKEN`` and
            ``UNK_TOKEN``.
        tag2idx: Tag-to-index mapping containing ``'O'``.
        max_len: Length every sequence is truncated or right-padded to.

    Returns:
        Two ``torch.long`` tensors ``(sentences, tags)`` with one row of
        ``max_len`` entries per kept sentence. Padded tag positions hold -100.
    """
    processed_sentences = []
    processed_tags = []

    pad_word_idx = word2idx[PAD_TOKEN]
    unk_word_idx = word2idx[UNK_TOKEN]
    # Use -100 for padding tags, CrossEntropyLoss ignores this index by default
    pad_tag_idx = -100

    print("Preprocessing data into sequences...")
    for sentence_data in data:
        if 'text' not in sentence_data or 'aspects' not in sentence_data:
            continue
        text = sentence_data['text']
        tokens = word_tokenize(text)

        # Generate unified BIO tags
        token_spans = compute_token_spans(text, tokens)
        bio_tags = generate_bio_tags(token_spans, sentence_data['aspects'])

        # Convert tokens and tags to indices, truncated to max_len
        sentence_indices = [word2idx.get(token, unk_word_idx) for token in tokens[:max_len]]
        # Default to 'O' if tag not found
        tag_indices = [tag2idx.get(tag, tag2idx['O']) for tag in bio_tags[:max_len]]

        # Right-pad up to max_len (no-op when the sentence already fills it)
        pad_count = max_len - len(sentence_indices)
        sentence_indices.extend([pad_word_idx] * pad_count)
        tag_indices.extend([pad_tag_idx] * pad_count)

        processed_sentences.append(sentence_indices)
        processed_tags.append(tag_indices)

    return torch.tensor(processed_sentences, dtype=torch.long), torch.tensor(processed_tags, dtype=torch.long)


# --- 2. Define CNN-LSTM Model ---

class CNN_LSTM_Tagger(nn.Module):
    """BiLSTM sequence tagger over word embeddings.

    Despite the name, the convolution layers are only constructed, never
    applied: the LSTM reads the (dropout-regularised) embeddings directly.
    ``self.convs`` is kept so that the module's parameter set, initialisation
    order and ``state_dict`` keys stay the same as before.

    Args:
        vocab_size: Number of rows in the embedding table.
        tagset_size: Number of output tags (size of the last logits axis).
        embedding_dim: Word embedding dimension; also the LSTM input size.
        lstm_hidden_dim: LSTM hidden size per direction.
        cnn_filters: Output channels of each (currently unused) Conv1d layer.
        cnn_kernel_sizes: Kernel size of each (currently unused) Conv1d layer.
        dropout_rate: Dropout probability.
        pad_idx: Vocabulary index of the padding token.
    """

    def __init__(self, vocab_size, tagset_size, embedding_dim, lstm_hidden_dim,
                 cnn_filters, cnn_kernel_sizes, dropout_rate, pad_idx):
        super().__init__()

        self.embedding_dim = embedding_dim
        self.lstm_hidden_dim = lstm_hidden_dim
        self.tagset_size = tagset_size

        # Embedding Layer (consider loading pre-trained embeddings here)
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # CNN layers: constructed but not used by forward(). Feeding them to
        # the LSTM would require length-preserving padding (e.g.
        # padding='same') and an LSTM input size of
        # cnn_filters * len(cnn_kernel_sizes).
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim,
                      out_channels=cnn_filters,
                      kernel_size=ks)
            for ks in cnn_kernel_sizes
        ])

        # Dropout Layer (shared by the embedding output and the LSTM output)
        self.dropout = nn.Dropout(dropout_rate)

        # LSTM Layer
        # Input to LSTM: (batch_size, sequence_length, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim,  # input size is embedding_dim, not the CNN output size
                            lstm_hidden_dim,
                            num_layers=1,  # Can increase layers
                            bidirectional=True,
                            batch_first=True)  # Input/output tensors have batch dim first

        # Final Linear Layer (maps LSTM output to tag space)
        # Input: (batch_size, seq_len, lstm_hidden_dim * 2) -> Output: (batch_size, seq_len, tagset_size)
        self.hidden2tag = nn.Linear(lstm_hidden_dim * 2, tagset_size)  # *2 for bidirectional

    def forward(self, sentence_indices):
        """Compute per-token tag logits.

        Args:
            sentence_indices: Long tensor of word indices, (batch_size, seq_len).

        Returns:
            Logits of shape (batch_size, seq_len, tagset_size). No softmax is
            applied; the training code uses CrossEntropyLoss on the raw logits.
        """
        # embedded: (batch_size, seq_len, embedding_dim)
        embedded = self.dropout(self.embedding(sentence_indices))

        # The convolutions used to be evaluated here and their result thrown
        # away. That cost compute for nothing and failed for sequences shorter
        # than the largest kernel, so they are no longer run.

        # lstm_out: (batch_size, seq_len, lstm_hidden_dim * 2)
        lstm_out, _ = self.lstm(embedded)

        # tag_space: (batch_size, seq_len, tagset_size)
        tag_space = self.hidden2tag(self.dropout(lstm_out))
        return tag_space

# --- Helper Functions for Training/Evaluation ---

def train_epoch(model, dataloader, optimizer, criterion, device, clip=1.0):
    """Run one optimisation pass over ``dataloader``.

    Each batch is indexed as ``batch[0]`` (inputs) and ``batch[1]`` (targets).
    Gradients are clipped to a total norm of ``clip`` before every optimizer
    step.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    running_loss = 0
    for batch in dataloader:
        inputs = batch[0].to(device)
        labels = batch[1].to(device)  # (batch_size, seq_len)

        optimizer.zero_grad()

        # logits: (batch_size, seq_len, tagset_size)
        logits = model(inputs)
        num_tags = logits.shape[-1]

        # Flatten the batch and time axes so the criterion sees one row per
        # token; which targets are ignored is decided by the criterion itself.
        loss = criterion(logits.view(-1, num_tags), labels.view(-1))
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device, idx2tag, pad_tag_idx):
    """Evaluate ``model`` on ``dataloader`` without updating it.

    Args:
        model: Module mapping (batch, seq_len) indices to
            (batch, seq_len, tagset_size) logits.
        dataloader: Iterable of batches indexed as ``batch[0]`` (inputs) and
            ``batch[1]`` (target tag indices).
        criterion: Loss applied to the flattened logits and targets.
        device: Device the batches are moved to.
        idx2tag: Mapping from tag index to tag string.
        pad_tag_idx: Target value marking padded positions; those positions
            are left out of the returned sequences.

    Returns:
        ``(avg_loss, all_trues, all_preds)``: the mean per-batch loss (0.0 when
        the dataloader yields no batches) and two parallel lists of tag-string
        sequences. Sentences made of padding only are omitted.
    """
    model.eval()
    epoch_loss = 0.0
    num_batches = 0
    all_preds = []
    all_trues = []

    with torch.no_grad():
        for batch in dataloader:
            sentence_in = batch[0].to(device)
            targets = batch[1].to(device)  # (batch_size, seq_len)

            predictions = model(sentence_in)  # (batch_size, seq_len, tagset_size)

            loss = criterion(predictions.view(-1, predictions.shape[-1]), targets.view(-1))
            epoch_loss += loss.item()
            num_batches += 1

            # Convert each tensor to nested Python lists once per batch rather
            # than calling .item() for every single token.
            true_rows = targets.tolist()
            pred_rows = predictions.argmax(dim=-1).tolist()

            for true_row, pred_row in zip(true_rows, pred_rows):
                kept = [(t, p) for t, p in zip(true_row, pred_row) if t != pad_tag_idx]
                if kept:  # Only add if sequence is not fully padded
                    all_trues.append([idx2tag[t] for t, _ in kept])
                    all_preds.append([idx2tag[p] for _, p in kept])

    # Guard against an empty dataloader instead of dividing by zero.
    avg_loss = epoch_loss / num_batches if num_batches else 0.0
    return avg_loss, all_trues, all_preds

# --- Main Execution ---

def predict_sentence(sentence, model, word2idx, idx2tag, device, max_len):
    """Tag one raw sentence with a trained model.

    The sentence is tokenized, mapped to indices (unknown words become
    ``UNK_TOKEN``), truncated to ``max_len`` and right-padded to exactly
    ``max_len``, the same way the training sequences are built.

    Returns:
        ``(tokens, tags)``: the (possibly truncated) tokens and one predicted
        tag string per token. Two empty lists when tokenization yields nothing.
    """
    model.eval()
    tokens = word_tokenize(sentence)
    if not tokens:
        return [], []

    unk_idx = word2idx[UNK_TOKEN]
    pad_idx = word2idx[PAD_TOKEN]
    indices = [word2idx.get(t, unk_idx) for t in tokens[:max_len]]
    orig_len = len(indices)  # Number of real (non-padding) tokens kept
    indices.extend([pad_idx] * (max_len - orig_len))

    # Convert to tensor and add batch dimension
    sentence_tensor = torch.tensor([indices], dtype=torch.long).to(device)

    with torch.no_grad():
        predictions = model(sentence_tensor)  # (1, seq_len, tagset_size)

    predicted_indices = predictions.argmax(dim=-1)[0].tolist()  # (seq_len)

    # Convert indices back to tags, dropping the padded positions
    predicted_tags = [idx2tag[idx] for idx in predicted_indices[:orig_len]]

    return tokens[:orig_len], predicted_tags


if __name__ == "__main__":
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # 1. Load and process data
    raw_data = load_data(DATA_FILE)
    if not raw_data:
        # Stop with a non-zero status and a message rather than the
        # interactive-only exit() helper.
        raise SystemExit(f"No sentences loaded from {DATA_FILE}; aborting.")

    # NOTE: the vocabulary and tag set are built from the full corpus, i.e.
    # before the train/validation/test split below.
    word2idx, idx2word, tag2idx, idx2tag = build_vocab_and_tags(raw_data)
    sentences_idx, tags_idx = preprocess_data(raw_data, word2idx, tag2idx, MAX_LEN)

    # 2. Create datasets and dataloaders
    dataset = TensorDataset(sentences_idx, tags_idx)

    # Split data (using indices): 20% test, then 15% of the remaining 80%
    # (12% of all data) for validation, leaving 68% for training.
    train_indices, test_indices = train_test_split(
        list(range(len(dataset))), test_size=0.2, random_state=42
    )
    train_indices, val_indices = train_test_split(
        train_indices, test_size=0.15, random_state=42
    )

    train_dataset = torch.utils.data.Subset(dataset, train_indices)
    val_dataset = torch.utils.data.Subset(dataset, val_indices)
    test_dataset = torch.utils.data.Subset(dataset, test_indices)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    print("\nDataLoaders created:")
    print(f"Train batches: {len(train_loader)}")
    print(f"Validation batches: {len(val_loader)}")
    print(f"Test batches: {len(test_loader)}")

    # 3. Initialize model, optimizer, loss
    VOCAB_SIZE = len(word2idx)
    TAGSET_SIZE = len(tag2idx)
    PAD_IDX = word2idx[PAD_TOKEN]
    PAD_TAG_IDX = -100 # Important for CrossEntropyLoss ignore_index
    BEST_MODEL_PATH = 'cnn-lstm-aspect-sentiment-best.pt'

    model = CNN_LSTM_Tagger(
        VOCAB_SIZE, TAGSET_SIZE, EMBEDDING_DIM, LSTM_HIDDEN_DIM,
        CNN_FILTERS, CNN_KERNEL_SIZES, DROPOUT_RATE, PAD_IDX
    ).to(device)

    print("\nModel Architecture:")
    print(model)

    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss(ignore_index=PAD_TAG_IDX)

    # 4. Training Loop
    best_val_loss = float('inf')
    print("\n--- Starting Training ---")
    for epoch in range(EPOCHS):
        start_time = time.time()

        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss, val_trues, val_preds = evaluate(model, val_loader, criterion, device, idx2tag, PAD_TAG_IDX)

        end_time = time.time()
        epoch_mins, epoch_secs = divmod(end_time - start_time, 60)

        print(f'Epoch: {epoch+1:02} | Time: {int(epoch_mins)}m {int(epoch_secs)}s')
        print(f'\tTrain Loss: {train_loss:.3f}')
        print(f'\t Val. Loss: {val_loss:.3f}')

        # Save best model based on validation loss
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), BEST_MODEL_PATH)
            print("\t -> Saved best model")

        # Report only the micro-averaged F1 during training. The report is
        # computed once per epoch (it used to be computed twice, with one
        # result discarded).
        try:
            val_report = classification_report(val_trues, val_preds, output_dict=True, zero_division=0)
            f1_micro = val_report.get('micro avg', {}).get('f1-score', 0)
            print("\nValidation Seqeval Report (sample):")
            print(f"\t Micro Avg F1: {f1_micro:.3f}")
        except Exception as e:
            # Reporting is best-effort; a metrics failure must not stop training.
            print(f"Could not generate validation seqeval report: {e}")

    print("--- Training Finished ---")

    # 5. Evaluate on Test Set
    print("\n--- Evaluating on Test Set ---")
    # Load the best model; map_location keeps the load valid whichever device
    # the checkpoint tensors were saved from.
    model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=device))
    test_loss, test_trues, test_preds = evaluate(model, test_loader, criterion, device, idx2tag, PAD_TAG_IDX)

    print(f'Test Loss: {test_loss:.3f}')

    try:
        print("\nTest Set Seqeval Classification Report:")
        test_report = classification_report(test_trues, test_preds, output_dict=False, zero_division=0)
        print(test_report)
    except Exception as e:
        print(f"Could not generate test seqeval report: {e}")

    # --- 6. Prediction Example ---
    print("\n--- Prediction Example ---")
    test_sentence = "giáo viên nhiệt tình nhưng cơ sở vật chất cần cải thiện ."
    pred_tokens, pred_tags = predict_sentence(test_sentence, model, word2idx, idx2tag, device, MAX_LEN)
    print(f"Sentence: {test_sentence}")
    print(f"Tokens: {pred_tokens}")
    print(f"Predicted Tags: {pred_tags}")

    # You can add the tag grouping logic here if needed (similar to CRF example)

Using device: cpu
Building vocabulary and tag set...


KeyboardInterrupt: 