# Set up

In [1]:
# First cell - Imports and Setup
import os
import torch
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import gc
import json
import zipfile
from sklearn.model_selection import train_test_split, KFold
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel, get_cosine_schedule_with_warmup, set_seed
from torch import nn
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
from torch.cuda.amp import autocast, GradScaler
import warnings
import random
warnings.filterwarnings('ignore')

In [2]:
# Configure PyTorch memory allocation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:32'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

if torch.cuda.is_available():
    print(f"GPU Memory Available: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

def set_seed(seed=42):
    """Set all random seeds for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

Using device: cuda
GPU Memory Available: 15.83 GB


In [3]:
class SarcasmTextDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = f"{row['caption']} {row['emojis']} {row['emoji_explain']}"
        
        encoding = self.tokenizer(
            text, 
            padding="max_length", 
            truncation=True, 
            max_length=self.max_length, 
            return_tensors="pt"
        )
        
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        encoding['labels'] = torch.tensor(row['label'], dtype=torch.long)
        return encoding


In [4]:
# define hyperparameter
hidden_size = 2160
num_labels = 2
batch_size = 8
num_epochs = 3
l_r = 1e-5

In [5]:
# Text Sarcasm Classifier
class TextSarcasmClassifier(nn.Module):
    def __init__(self, hidden_size, num_labels):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        self.classifier = nn.Sequential(
            nn.Linear(self.roberta.config.hidden_size, hidden_size),  # Increased layer size
            nn.ReLU(),
            nn.Dropout(0.1),
            
            nn.Linear(hidden_size, hidden_size // 2),  # Additional layer
            nn.ReLU(),
            nn.Dropout(0.1),
            
            nn.Linear(hidden_size // 2, num_labels)  # Ensure correct output size
        )

    def forward(self, input_ids, attention_mask=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        cls_token = outputs.last_hidden_state[:, 0, :]  # CLS token
        logits = self.classifier(cls_token)
        probabilities = nn.functional.softmax(logits, dim=1)
        return probabilities

In [6]:
class EarlyStopping:
    def __init__(self, patience=3, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
        elif val_loss > self.best_loss - self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                return True
        else:
            self.best_loss = val_loss
            self.counter = 0
        return False

In [7]:
def train_epoch(model, dataloader, optimizer, scheduler, criterion, device):
    model.train()
    total_loss = 0
    predictions = []
    true_labels = []

    # Thêm tqdm để theo dõi tiến trình
    progress_bar = tqdm(dataloader, desc='Training')

    for batch in progress_bar:
        try:
            # Clear cache nếu cần
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            # Move data to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass với gradient scaling
            with autocast():
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)

           
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

            # Collect metrics
            total_loss += loss.item()
            predictions.extend(outputs.argmax(dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

            # Update progress bar
            progress_bar.set_postfix({'loss': loss.item()})

        except Exception as e:
            print(f"Error in batch: {str(e)}")
            continue

       

    # Convert lists to numpy arrays for sklearn metrics
    predictions = np.array(predictions)
    true_labels = np.array(true_labels)
    cm = confusion_matrix(true_labels, predictions)
    # Calculate metrics
    metrics = {
        'loss': total_loss / len(dataloader),
        'accuracy': accuracy_score(true_labels, predictions),
        'f1_score': f1_score(true_labels, predictions, average='weighted'),
        'precision': precision_score(true_labels, predictions, average='weighted'),
        'recall': recall_score(true_labels, predictions, average='weighted'),
        'confusion_matrix': cm
    }
    
    return metrics

In [8]:

def validate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    all_probabilities = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Tính xác suất
            probabilities = torch.softmax(outputs, dim=1)
            
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            predictions = outputs.argmax(-1)
            
            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_probabilities.extend(probabilities.cpu().numpy())

    accuracy = np.mean(np.array(all_preds) == np.array(all_labels))
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
    cm = confusion_matrix(all_labels, all_preds)
    
#     plt.figure(figsize=(8, 6))
#     sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
#     plt.title('Confusion Matrix')
#     plt.ylabel('True Label')
#     plt.xlabel('Predicted Label')
#     plt.show()
    
    metrics = {
        'loss': total_loss / len(dataloader),
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'confusion_matrix': cm,
        'probabilities': all_probabilities,
        'labels': all_labels
    }
    
    return metrics

def print_metrics(phase, metrics):
    print(f"\n{phase} Metrics:")
    print(f"Loss: {metrics['loss']:.4f}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"F1-Score: {metrics['f1_score']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    # Thêm try-except để xử lý trường hợp không có confusion matrix
    try:
        plt.figure(figsize=(8, 6))
        sns.heatmap(metrics['confusion_matrix'], annot=True, fmt='d', cmap='Blues')
        plt.title(f'{phase} Confusion Matrix')  # Thêm phase vào title
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()
    except KeyError:
        print("Warning: Confusion matrix not found in metrics")

In [9]:
def cross_validation_train(df, n_splits=3, test_size=0.15,
                      hidden_size=hidden_size, 
                      epochs=num_epochs, batch_size=batch_size, 
                      learning_rate=l_r,
                      temperature=2.0,
                      clip_value=1.0):
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Split the dataset into train/validation and test sets
    train_val_df, test_df = train_test_split(df, test_size=test_size, random_state=42, stratify=df['label'])

    fold_results = []
    all_val_labels = []  # To store validation labels
    early_stopping = EarlyStopping(patience=3, min_delta=0.001)

    for fold, (train_idx, val_idx) in enumerate(kfold.split(train_val_df)):
        print(f"\nTraining Fold {fold + 1}/{n_splits}")

        train_data = train_val_df.iloc[train_idx].reset_index(drop=True)
        val_data = train_val_df.iloc[val_idx].reset_index(drop=True)
        
        # Store validation labels
        all_val_labels.append(val_data['label'].values)
        
        # Create test dataset and dataloader once
        test_dataset = SarcasmTextDataset(test_df, tokenizer)
        test_loader = DataLoader(
            test_dataset,
            batch_size=batch_size,
            num_workers=4,
            pin_memory=True
        )
        
        # Create datasets and dataloaders
        train_dataset = SarcasmTextDataset(train_data, tokenizer)
        val_dataset = SarcasmTextDataset(val_data, tokenizer)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        # Initialize model
        model = TextSarcasmClassifier(hidden_size, num_labels).to(device)

        # AdamW optimizer with weight decay and gradient clipping
        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=learning_rate,
            weight_decay=0.01
        )
        
        # Loss function without class weights
        criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

        # Learning rate scheduler with warmup
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=learning_rate,
            epochs=epochs,
            steps_per_epoch=len(train_loader),
            pct_start=0.3,  # 30% epochs for warmup
            anneal_strategy='linear'
        )

        best_val_f1 = 0
        for epoch in range(epochs):
            print(f"\nEpoch {epoch + 1}/{epochs}")

            train_metrics = train_epoch(model, train_loader, optimizer, scheduler, criterion, device)
            val_metrics = validate(model, val_loader, criterion, device)

            if val_metrics['accuracy'] > best_val_f1:
                best_val_f1 = val_metrics['accuracy']
                torch.save({
                    'fold': fold,
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(), 
                    'metrics': val_metrics,
                    'probabilities': val_metrics['probabilities'],
                    'val_labels': val_metrics['labels'],
                }, f'roberta_fold_{fold}.pt')
                
            if early_stopping(val_metrics['loss']):
                print("Early stopping triggered")
                break

        final_val_metrics = validate(model, val_loader, criterion, device)
        test_metrics = validate(model, test_loader, criterion, device)

        fold_results.append({
            'fold': fold,
            'val_metrics': final_val_metrics,
            'test_metrics': test_metrics
        })

        # Clear memory
        del model, optimizer, scheduler, train_loader, val_loader
        torch.cuda.empty_cache()
        gc.collect()

    # Print average results across folds
    print("\nCross-validation Results:")
    
    # Calculate average validation metrics
    avg_val_metrics = {
        'accuracy': np.mean([r['val_metrics']['accuracy'] for r in fold_results]),
        'f1_score': np.mean([r['val_metrics']['f1_score'] for r in fold_results]),
        'precision': np.mean([r['val_metrics']['precision'] for r in fold_results]),
        'recall': np.mean([r['val_metrics']['recall'] for r in fold_results])
    }
    
    # Calculate average test metrics
    avg_test_metrics = {
        'accuracy': np.mean([r['test_metrics']['accuracy'] for r in fold_results]),
        'f1_score': np.mean([r['test_metrics']['f1_score'] for r in fold_results]),
        'precision': np.mean([r['test_metrics']['precision'] for r in fold_results]),
        'recall': np.mean([r['test_metrics']['recall'] for r in fold_results])
    }

    print("\nValidation Metrics:")
    print(f"Average Accuracy: {avg_val_metrics['accuracy']:.4f}")
    print(f"Average F1-Score: {avg_val_metrics['f1_score']:.4f}")
    print(f"Average Precision: {avg_val_metrics['precision']:.4f}")
    print(f"Average Recall: {avg_val_metrics['recall']:.4f}")
    
    print("\nTest Metrics:")
    print(f"Average Accuracy: {avg_test_metrics['accuracy']:.4f}")
    print(f"Average F1-Score: {avg_test_metrics['f1_score']:.4f}")
    print(f"Average Precision: {avg_test_metrics['precision']:.4f}")
    print(f"Average Recall: {avg_test_metrics['recall']:.4f}")

    return fold_results, avg_val_metrics, avg_test_metrics


In [10]:
if __name__ == "__main__":

    # Configure paths
#     TEXT_DIR = "/kaggle/input/vimmsd-uit/UIT/IT/datasets/input/train/training-images/train-images/"
#     TRAIN_DATA_PATH = "/kaggle/input/vimmsd-uit/train_data_2.csv"
    # Load data/
    df = pd.read_csv('/kaggle/input/approach-2/text_data.csv')
    # Set random seed
    set_seed(42)

    # Train model
    fold_results, val_metrics, test_metrics = cross_validation_train(
    df,
    n_splits=3,
    test_size=0.15,  # có thể điều chỉnh tỉ lệ test
    epochs=num_epochs,
    
    )

    print("\nTraining completed!")
    print(f"Final Test Metrics:")
    print(f"Accuracy: {test_metrics['accuracy']:.4f}")
    print(f"F1-Score: {test_metrics['f1_score']:.4f}")
    print(f"Precision: {test_metrics['precision']:.4f}")
    print(f"Recall: {test_metrics['recall']:.4f}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]


Training Fold 1/3


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training:   0%|          | 0/766 [00:00<?, ?it/s]

Validating:   0%|          | 0/383 [00:00<?, ?it/s]


Epoch 2/3


Training:   0%|          | 0/766 [00:00<?, ?it/s]

Validating:   0%|          | 0/383 [00:00<?, ?it/s]


Epoch 3/3


Training:   0%|          | 0/766 [00:00<?, ?it/s]

Validating:   0%|          | 0/383 [00:00<?, ?it/s]

Validating:   0%|          | 0/383 [00:00<?, ?it/s]

Validating:   0%|          | 0/203 [00:00<?, ?it/s]


Training Fold 2/3


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training:   0%|          | 0/766 [00:00<?, ?it/s]

Validating:   0%|          | 0/383 [00:00<?, ?it/s]


Epoch 2/3


Training:   0%|          | 0/766 [00:00<?, ?it/s]

Validating:   0%|          | 0/383 [00:00<?, ?it/s]


Epoch 3/3


Training:   0%|          | 0/766 [00:00<?, ?it/s]

Validating:   0%|          | 0/383 [00:00<?, ?it/s]

Validating:   0%|          | 0/383 [00:00<?, ?it/s]

Validating:   0%|          | 0/203 [00:00<?, ?it/s]


Training Fold 3/3


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training:   0%|          | 0/766 [00:00<?, ?it/s]

Validating:   0%|          | 0/383 [00:00<?, ?it/s]


Epoch 2/3


Training:   0%|          | 0/766 [00:00<?, ?it/s]

Validating:   0%|          | 0/383 [00:00<?, ?it/s]


Epoch 3/3


Training:   0%|          | 0/766 [00:00<?, ?it/s]

Validating:   0%|          | 0/383 [00:00<?, ?it/s]

Early stopping triggered


Validating:   0%|          | 0/383 [00:00<?, ?it/s]

Validating:   0%|          | 0/203 [00:00<?, ?it/s]


Cross-validation Results:

Validation Metrics:
Average Accuracy: 0.6858
Average F1-Score: 0.6890
Average Precision: 0.7094
Average Recall: 0.6858

Test Metrics:
Average Accuracy: 0.6878
Average F1-Score: 0.6908
Average Precision: 0.7147
Average Recall: 0.6878

Training completed!
Final Test Metrics:
Accuracy: 0.6878
F1-Score: 0.6908
Precision: 0.7147
Recall: 0.6878


In [11]:
md_1 = torch.load('/kaggle/working/roberta_fold_1.pt')
md_1['metrics']

{'loss': 0.6037260562414911,
 'accuracy': 0.7043449852989219,
 'precision': 0.7204842471837796,
 'recall': 0.7043449852989219,
 'f1_score': 0.706917630618046,
 'confusion_matrix': array([[1227,  588],
        [ 317,  929]]),
 'probabilities': [array([0.73093456, 0.26906544], dtype=float32),
  array([0.4284019, 0.5715981], dtype=float32),
  array([0.73092514, 0.26907486], dtype=float32),
  array([0.4266049, 0.5733951], dtype=float32),
  array([0.7307729 , 0.26922715], dtype=float32),
  array([0.32101232, 0.6789877 ], dtype=float32),
  array([0.39508304, 0.60491693], dtype=float32),
  array([0.42864782, 0.5713522 ], dtype=float32),
  array([0.315245  , 0.68475497], dtype=float32),
  array([0.6816772 , 0.31832278], dtype=float32),
  array([0.73094636, 0.2690536 ], dtype=float32),
  array([0.42872497, 0.571275  ], dtype=float32),
  array([0.6942479 , 0.30575207], dtype=float32),
  array([0.40033358, 0.5996665 ], dtype=float32),
  array([0.3838861, 0.6161139], dtype=float32),
  array([0.470

# Lấy test pred

In [12]:
# Dataset class cho public test
class SarcasmTestDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=77):
        self.df = df
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def prepare_text(self, row):
        return str(row['caption']) if pd.notna(row['caption']) else ''

    def __getitem__(self, idx):
        try:
            row = self.df.iloc[idx]

            # Prepare text
            text = self.prepare_text(row)

        
            # Process inputs
            encoding = self.tokenizer(
                text=text,
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt"
            )

            # Remove batch dimension
            for k, v in encoding.items():
                encoding[k] = v.squeeze()

            # Thêm index và label nếu có
            encoding['index'] = torch.tensor(int(row['index']))
            if 'label' in row and pd.notna(row['label']):
                encoding['labels'] = torch.tensor(int(row['label']))

            return encoding

        except Exception as e:
            print(f"Error processing index {idx}: {e}")
            return None

In [13]:
def evaluate_test_set(model, test_loader, device):
    model.eval()
    predictions = []
    indices = []
    true_labels = []
    probabilities = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Forward pass
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # Get predictions
            batch_preds = logits.argmax(-1).cpu().numpy()
            predictions.extend(batch_preds)
            indices.extend(batch['index'].cpu().numpy())

            # Lưu xác suất dự đoán vào danh sách
            probabilities.extend(torch.softmax(logits, dim=-1).cpu().numpy())

            # Nếu có nhãn thực tế, lưu lại
            if 'labels' in batch:
                true_labels.extend(batch['labels'].cpu().numpy())

    return indices, predictions, true_labels, probabilities

In [14]:
def save_submission(predictions, probabilities, file_name='roberta_priv.pt'):
    """
    Lưu trữ predictions và probabilities vào file .pt
    Args:
        predictions: list các predictions tương ứng
        probabilities: list các xác suất dự đoán tương ứng
        file_name: tên file để lưu trữ
    """
    submission_data = {
        'predictions': predictions,
        'probabilities': probabilities
    }
    torch.save(submission_data, file_name)
    print(f"Saved submission data to {file_name}")

In [15]:
def main_test():
    # Configuration
    BATCH_SIZE = 64
    #IMAGE_DIR = "/kaggle/input/vimmsd-uit/UIT/IT/datasets/input/test/public-test-images/dev-images"
    MODEL_PATH = "/kaggle/working/roberta_fold_1.pt"

    # Load test data
    file = '/kaggle/input/private-set/vimmsd-private-test.json'
    with open(file) as data:
        dict_data = json.load(data)

    test_df = pd.DataFrame.from_dict(dict_data, orient='index')
    test_df.reset_index(level=0, inplace=True)
    print(f"Test samples: {len(test_df)}")

    test_df['index'] = pd.to_numeric(test_df['index'], errors='coerce').fillna(-1).astype(int)
    if 'label' in test_df.columns:
        test_df['label'] = pd.to_numeric(test_df['label'], errors='coerce').fillna(-1).astype(int)

    print(test_df.dtypes)

    # Initialize processor and model
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    model = TextSarcasmClassifier(hidden_size = hidden_size, num_labels =2)

    # Load trained model weights
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    checkpoint = torch.load(MODEL_PATH, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'], strict = False)
    model = model.to(device)

    # Create test dataset and dataloader
    test_dataset = SarcasmTestDataset(test_df, tokenizer)
    test_loader = DataLoader(
        test_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=2
    )

    # Run evaluation
    indices, predictions, true_labels, probabilities = evaluate_test_set(model, test_loader, device)

    # Lưu file submission chỉ với xác suất và nhãn dự đoán
    save_submission(predictions, probabilities)

In [16]:
if __name__ == "__main__":
    main_test()

Test samples: 1504
index       int64
image      object
caption    object
label       int64
dtype: object


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Testing:   0%|          | 0/24 [00:00<?, ?it/s]

Saved submission data to roberta_priv.pt


In [17]:
predict = torch.load('/kaggle/working/roberta_priv.pt')
predict['predictions'][-5:]

[0, 0, 1, 1, 1]