# data_set.py

In [1]:
from torch.utils.data import Dataset
import os
from PIL import Image
import json


class MyDataset(Dataset):
    """Image/caption dataset backed by a JSON annotation file.

    The annotation file is a JSON object mapping a sample key to a record
    with the fields ``image`` (file name), ``caption`` and ``label``.
    Records whose image file does not exist on disk are reported and skipped.

    Each item is the tuple ``(caption, PIL image, label, key)``.
    """

    # Class index assigned to each label string found in the training file.
    LABEL_MAPPING = {
        "not-sarcasm": 0,
        "multi-sarcasm": 1,
        "text-sarcasm": 2,
        "image-sarcasm": 3
    }

    def __init__(self, args, mode, limit=None):
        """Load the split selected by ``mode`` ("train" or "test").

        Args:
            args: object exposing ``text_train``/``image_train`` (train mode)
                or ``text_test``/``image_test`` (test mode).
            mode: "train" or "test"; any other value yields an empty dataset.
            limit: optional maximum number of samples to keep.
        """
        self.args = args
        self.data = self.load_data(args, mode, limit)
        self.image_ids = list(self.data)  # Use unique indices as keys

    def load_data(self, args, mode, limit=None):
        """Read the annotation file for ``mode`` and return ``{key: record}``.

        Every record holds ``image``, ``caption``, ``label`` and the resolved
        ``image_path``. In train mode the label string is converted to its
        class index; in test mode the label is kept exactly as stored.
        ``limit`` (when not None) caps the number of kept records in both
        modes.
        """
        if mode in ["train"]:
            text_path = self.args.text_train
            image_dir = self.args.image_train
            encode_label = True
        elif mode in ["test"]:
            text_path = self.args.text_test
            image_dir = self.args.image_test
            encode_label = False
        else:
            print("Not found correct mode in MyDataset class!!!")
            return {}

        with open(text_path, 'r', encoding='utf-8') as f:
            entries = json.load(f)

        data_set = {}
        for key, entry in entries.items():
            if limit is not None and len(data_set) >= limit:
                break

            file_name = entry['image']
            sentence = entry['caption']
            label = entry['label']
            if encode_label:
                label = self.LABEL_MAPPING[label]

            cur_img_path = os.path.join(image_dir, file_name)
            if not os.path.exists(cur_img_path):
                print(f"{cur_img_path} not found!")
                continue

            data_set[key] = {
                "image": file_name,
                "caption": sentence,
                "label": label,
                "image_path": cur_img_path
            }

        return data_set

    def image_loader(self, id):
        """Open (lazily, via PIL) the image belonging to sample ``id``."""
        return Image.open(self.data[id]["image_path"])

    def text_loader(self, id):
        """Return the caption of sample ``id``."""
        return self.data[id]["caption"]

    def __getitem__(self, index):
        """Return ``(caption, image, label, key)`` for the sample at ``index``."""
        sample_id = self.image_ids[index]  # Access by unique key (index from JSON)
        text = self.text_loader(sample_id)
        image_feature = self.image_loader(sample_id)
        label = self.data[sample_id]["label"]
        return text, image_feature, label, sample_id

    def __len__(self):
        return len(self.image_ids)

    @staticmethod
    def collate_func(batch_data):
        """Transpose a list of samples into four parallel lists.

        Returns ``(texts, images, labels, ids)``. An empty batch returns an
        empty dict, which is kept for compatibility with existing callers.
        """
        if len(batch_data) == 0:
            return {}

        text_list = [instance[0] for instance in batch_data]
        image_list = [instance[1] for instance in batch_data]
        label_list = [instance[2] for instance in batch_data]
        id_list = [instance[3] for instance in batch_data]
        return text_list, image_list, label_list, id_list


# model.py

In [2]:
from transformers import CLIPModel,BertConfig
from transformers.models.bert.modeling_bert import BertLayer
import torch.nn as nn
import torch
import torch.nn.functional as F
import copy

class MultimodalEncoder(nn.Module):
    """Stack of ``BertLayer`` blocks applied one after another.

    All blocks are deep copies of a single freshly built ``BertLayer``, so
    they start from identical weights.
    """

    def __init__(self, config, layer_number):
        super().__init__()
        prototype = BertLayer(config)
        clones = [copy.deepcopy(prototype) for _ in range(layer_number)]
        self.layer = nn.ModuleList(clones)

    def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
        """Run the input through every block.

        Returns ``(states, attentions)``: ``attentions`` has one entry per
        block; ``states`` has one entry per block when
        ``output_all_encoded_layers`` is truthy, otherwise only the final
        hidden state.
        """
        collected_states = []
        attention_maps = []
        for block in self.layer:
            hidden_states, block_attention = block(
                hidden_states, attention_mask, output_attentions=True
            )
            attention_maps.append(block_attention)
            if output_all_encoded_layers:
                collected_states.append(hidden_states)

        if output_all_encoded_layers:
            return collected_states, attention_maps

        # Only the last state was requested.
        collected_states.append(hidden_states)
        return collected_states, attention_maps


class MV_CLIP(nn.Module):
    """CLIP-based classifier with three heads: text, image and fused.

    CLIP encodes the caption and the image. The projected token sequences of
    both modalities are concatenated and passed through a small
    ``MultimodalEncoder``; one text vector and one image vector are read from
    its output and blended with learned attention weights. Separate linear
    classifiers score the text-only, image-only and fused views, and the
    final score is the sum of their softmax outputs.
    """

    def __init__(self, args, class_weights=None):
        """Build the model.

        Args:
            args: configuration exposing ``clip_model``, ``layers``,
                ``simple_linear``, ``text_size``, ``image_size``,
                ``dropout_rate`` and ``label_number``.
            class_weights: optional per-class weight tensor handed to the
                cross-entropy loss.
        """
        super(MV_CLIP, self).__init__()
        self.model = CLIPModel.from_pretrained(args.clip_model)
        # The fusion encoder reuses the BERT layer configuration, resized to
        # a hidden width of 512 with 8 attention heads.
        self.config = BertConfig.from_pretrained("bert-base-uncased")
        self.config.hidden_size = 512
        self.config.num_attention_heads = 8
        self.trans = MultimodalEncoder(self.config, layer_number=args.layers)
        if args.simple_linear:
            self.text_linear = nn.Linear(args.text_size, args.text_size)
            self.image_linear = nn.Linear(args.image_size, args.image_size)
        else:
            self.text_linear = nn.Sequential(
                nn.Linear(args.text_size, args.text_size),
                nn.Dropout(args.dropout_rate),
                nn.GELU()
            )
            self.image_linear = nn.Sequential(
                nn.Linear(args.image_size, args.image_size),
                nn.Dropout(args.dropout_rate),
                nn.GELU()
            )

        self.classifier_fuse = nn.Linear(args.text_size, args.label_number)
        self.classifier_text = nn.Linear(args.text_size, args.label_number)
        self.classifier_image = nn.Linear(args.image_size, args.label_number)

        # Optional class weights counter class imbalance in the loss.
        self.loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        self.att = nn.Linear(args.text_size, 1, bias=False)

    def forward(self, inputs, labels=None):
        """Score a batch and optionally compute the training loss.

        Args:
            inputs: processor output; must contain ``input_ids`` and
                ``attention_mask`` plus whatever else ``CLIPModel`` needs.
            labels: optional tensor of class indices.

        Returns:
            ``(score,)`` when ``labels`` is None, otherwise ``(loss, score)``.
            ``score`` is the sum of the three heads' softmax outputs and
            ``loss`` is the sum of their cross-entropy losses.
        """
        output = self.model(**inputs, output_attentions=True)
        text_features = output['text_model_output']['last_hidden_state']
        image_features = output['vision_model_output']['last_hidden_state']
        text_feature = output['text_model_output']['pooler_output']
        image_feature = output['vision_model_output']['pooler_output']
        text_feature = self.text_linear(text_feature)
        image_feature = self.image_linear(image_feature)

        text_embeds = self.model.text_projection(text_features)
        image_embeds = self.model.visual_projection(image_features)
        input_embeds = torch.cat((image_embeds, text_embeds), dim=1)

        # Number of image tokens emitted by the vision tower. It used to be
        # hard-coded as 50 (to be hand-edited to 257 for a patch-14 model);
        # reading it from the tensor keeps the mask and the slicing below
        # correct for any CLIP variant.
        num_image_tokens = image_embeds.shape[1]
        text_mask = inputs['attention_mask']
        # Image tokens are never padded, so they are all attended to.
        image_mask = torch.ones(
            text_mask.shape[0], num_image_tokens,
            dtype=text_mask.dtype, device=text_mask.device
        )
        attention_mask = torch.cat((image_mask, text_mask), dim=-1)
        # Turn the 0/1 mask into an additive mask: 0 where attended, -10000
        # where padded, broadcast over heads and query positions.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        fuse_hiddens, all_attentions = self.trans(input_embeds, extended_attention_mask, output_all_encoded_layers=False)
        fuse_hiddens = fuse_hiddens[-1]

        # Text tokens follow the image tokens in the fused sequence.
        new_text_features = fuse_hiddens[:, num_image_tokens:, :]
        # Pick, per sample, the position holding the largest token id
        # (presumably CLIP's end-of-text token -- confirm if the tokenizer
        # is ever changed).
        new_text_feature = new_text_features[
            torch.arange(new_text_features.shape[0], device=inputs['input_ids'].device),
            inputs['input_ids'].to(torch.int).argmax(dim=-1)
        ]

        # The first fused position is the first image token.
        new_image_feature = fuse_hiddens[:, 0, :]

        # Two-way softmax over a scalar score per modality gives the mixing
        # weights for the fused feature.
        text_weight = self.att(new_text_feature)
        image_weight = self.att(new_image_feature)
        att = nn.functional.softmax(torch.stack((text_weight, image_weight), dim=-1), dim=-1)
        tw, iw = att.split([1, 1], dim=-1)
        fuse_feature = tw.squeeze(1) * new_text_feature + iw.squeeze(1) * new_image_feature

        logits_fuse = self.classifier_fuse(fuse_feature)
        logits_text = self.classifier_text(text_feature)
        logits_image = self.classifier_image(image_feature)

        fuse_score = nn.functional.softmax(logits_fuse, dim=-1)
        text_score = nn.functional.softmax(logits_text, dim=-1)
        image_score = nn.functional.softmax(logits_image, dim=-1)

        score = fuse_score + text_score + image_score

        outputs = (score,)
        if labels is not None:
            loss_fuse = self.loss_fct(logits_fuse, labels)
            loss_text = self.loss_fct(logits_text, labels)
            loss_image = self.loss_fct(logits_image, labels)
            loss = loss_fuse + loss_text + loss_image

            outputs = (loss,) + outputs
        return outputs

# train.py


In [3]:
import os
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm, trange
from sklearn import metrics
import numpy as np


# def train(args, model, device, train_data, dev_data, processor):
def train(args, model, device, train_data, processor):
    """Train ``model`` on ``train_data`` and save a checkpoint after each epoch.

    Args:
        args: configuration exposing ``output_dir``, ``train_batch_size``,
            ``num_train_epochs``, ``optimizer_name`` ('adafactor' or 'adam'),
            ``model``, ``learning_rate``, ``clip_learning_rate``,
            ``weight_decay``, ``warmup_proportion``, ``adam_epsilon``,
            ``max_len`` and ``current_epoch``.
        model: module returning ``(loss, score)`` when called with labels.
        device: device the model and the batches are moved to.
        train_data: dataset whose samples ``MyDataset.collate_func`` can batch.
        processor: callable turning captions and images into model inputs.

    Raises:
        Exception: if ``args.optimizer_name`` is not recognised.
        ValueError: if ``args.model`` is not 'MV_CLIP' (no input pipeline
            exists for other models).
    """
    os.makedirs(args.output_dir, exist_ok=True)

    train_loader = DataLoader(dataset=train_data,
                              batch_size=args.train_batch_size,
                              collate_fn=MyDataset.collate_func,
                              shuffle=True)
    total_steps = int(len(train_loader) * args.num_train_epochs)

    model.to(device)

    if args.optimizer_name == 'adafactor':
        from transformers.optimization import Adafactor, AdafactorSchedule

        print('Use Adafactor Optimizer for Training.')
        # lr=None together with relative_step lets Adafactor pick its own
        # step size.
        optimizer = Adafactor(
            model.parameters(),
            lr=None,
            weight_decay=args.weight_decay,
            relative_step=True,
            scale_parameter=True,
            warmup_init=True
        )
        scheduler = AdafactorSchedule(optimizer)
    elif args.optimizer_name == 'adam':
        print('Use AdamW Optimizer for Training.')
        from transformers.optimization import AdamW, get_linear_schedule_with_warmup
        if args.model == 'MV_CLIP':
            # The pretrained CLIP backbone gets its own learning rate; all
            # remaining parameters use the base learning rate.
            clip_param_ids = {id(p) for p in model.model.parameters()}
            base_params = [p for p in model.parameters() if id(p) not in clip_param_ids]
            optimizer = AdamW([
                    {"params": base_params},
                    {"params": model.model.parameters(), "lr": args.clip_learning_rate}
                    ], lr=args.learning_rate, weight_decay=args.weight_decay)
        else:
            optimizer = AdamW(model.parameters(), lr=args.learning_rate, eps=args.adam_epsilon, weight_decay=args.weight_decay)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(args.warmup_proportion * total_steps),
                                                    num_training_steps=total_steps)
    else:
        raise Exception('Wrong Optimizer Name!!!')

    for i_epoch in trange(0, int(args.num_train_epochs), desc="Epoch", disable=False):
        sum_loss = 0.
        sum_step = 0

        iter_bar = tqdm(train_loader, desc="Iter (loss=X.XXX)", disable=False)
        model.train()

        for batch in iter_bar:
            text_list, image_list, label_list, _ = batch
            if args.model != 'MV_CLIP':
                # Previously this fell through to a NameError on `inputs`.
                raise ValueError(f"Unsupported model for training: {args.model!r}")
            inputs = processor(text=text_list, images=image_list, padding='max_length', truncation=True, max_length=args.max_len, return_tensors="pt").to(device)
            labels = torch.tensor(label_list).to(device)

            loss, _ = model(inputs, labels=labels)
            loss_value = loss.item()  # one host/device sync instead of two
            sum_loss += loss_value
            sum_step += 1

            iter_bar.set_description("Iter (loss=%5.3f)" % loss_value)
            loss.backward()
            optimizer.step()
            if args.optimizer_name == 'adam':
                scheduler.step()
            optimizer.zero_grad()

        print(f"Epoch {i_epoch + 1}")
        # max(..., 1) avoids a ZeroDivisionError when the loader is empty.
        print(f"Train loss {sum_loss / max(sum_step, 1)}")

        path_to_save = os.path.join(args.output_dir, args.model)
        os.makedirs(path_to_save, exist_ok=True)
        model_to_save = model.module if hasattr(model, "module") else model
        checkpoint_path = os.path.join(path_to_save, f'model{args.current_epoch+i_epoch}.pt')
        torch.save(model_to_save.state_dict(), checkpoint_path)
        # Report the path that was actually written.
        print(f"Saved model at {checkpoint_path}")

        torch.cuda.empty_cache()
    print('Train done')

# main.py

In [4]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = '2'
import torch
import argparse
import random
import numpy as np
from transformers import CLIPProcessor, AutoTokenizer, AutoProcessor
import pickle
from PIL import ImageFile
from sklearn.model_selection import train_test_split
ImageFile.LOAD_TRUNCATED_IMAGES = True


def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) with the same value.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True

    
def compute_class_weights(args, train_data):
    """Return balanced per-class weights ``total / (count * num_classes)``.

    Args:
        args: configuration exposing ``label_number`` (number of classes).
        train_data: either an object with a ``data`` dict whose records carry
            a ``"label"`` entry, or any iterable of samples whose third
            element is the integer label.

    Returns:
        A float tensor with ``args.label_number`` entries. A class with no
        samples is treated as having one, so its weight stays finite.
    """
    class_counts = torch.zeros(args.label_number)

    for label in _collect_labels(train_data):
        class_counts[label] += 1

    total_samples = class_counts.sum().item()
    # Clamp so an unseen class does not cause a division by zero.
    safe_counts = class_counts.clamp(min=1)
    class_weights = total_samples / (safe_counts * len(class_counts))

    return class_weights


def _collect_labels(train_data):
    """Return the labels of ``train_data`` without loading samples if possible."""
    records = getattr(train_data, "data", None)
    if isinstance(records, dict) and all(
        isinstance(record, dict) and "label" in record for record in records.values()
    ):
        # Reading the stored records avoids going through __getitem__, which
        # would open every image just to look at its label.
        return [record["label"] for record in records.values()]
    return [sample[2] for sample in train_data]
    
    
class Args:
    """Plain attribute container: every keyword becomes an attribute."""

    def __init__(self, **entries):
        vars(self).update(entries)
        

def main():
    """Train MV_CLIP with the hard-coded Kaggle configuration below."""
    args = Args(
        device='0',
        model='MV_CLIP',
        text_train='/kaggle/input/hateam-processed-en/HAteam_processed_en.json',
        image_train='/kaggle/input/muiltimodal-sarcasm/data/training-images',
        simple_linear=False,
        num_train_epochs=4,
        train_batch_size=32,
        label_number=4,
        text_size=512,
        image_size=768,
        adam_epsilon=1e-8,
        optimizer_name='adam',
        learning_rate=5e-4,
        clip_learning_rate=1e-6,
        max_len=77,
        layers=6,
        max_grad_norm=5.0,
        weight_decay=0.05,
        warmup_proportion=0.2,
        dropout_rate=0.1,
        output_dir='/kaggle/working/',
        limit=None,
        seed=42,
        model_path='/kaggle/working/MV_CLIP',
        clip_model='openai/clip-vit-base-patch32',
        current_epoch=9,
        attempt='',
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    seed_everything(args.seed)

    # Load the full training split.
    train_data = MyDataset(args, mode='train', limit=None)

    # Class weights go on the same device as the model.
    class_weights = compute_class_weights(args, train_data).to(device)

    if args.model != 'MV_CLIP':
        raise RuntimeError('Error model name!')

    processor = CLIPProcessor.from_pretrained(args.clip_model)
    model = MV_CLIP(args, class_weights=class_weights)

    # To resume from a checkpoint, load its state dict into `model` here,
    # e.g. torch.load('/kaggle/working/MV_CLIP/model4.pt', map_location="cpu").
    model.to(device)

    train(args, model, device, train_data, processor)

In [None]:
# Notebook entry point: run the training pipeline defined in main().
main()

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



Use AdamW Optimizer for Training.


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iter (loss=4.284):   0%|          | 0/478 [00:01<?, ?it/s][A
Iter (loss=4.284):   0%|          | 1/478 [00:01<14:29,  1.82s/it][A
Iter (loss=4.263):   0%|          | 1/478 [00:02<14:29,  1.82s/it][A
Iter (loss=4.263):   0%|          | 2/478 [00:02<09:45,  1.23s/it][A
Iter (loss=4.247):   0%|          | 2/478 [00:03<09:45,  1.23s/it][A
Iter (loss=4.247):   1%|          | 3/478 [00:03<08:07,  1.03s/it][A
Iter (loss=4.179):   1%|          | 3/478 [00:03<08:07,  1.03s/it][A
Iter (loss=4.179):   1%|          | 4/478 [00:04<07:30,  1.05it/s][A
Iter (loss=4.240):   1%|          | 4/478 [00:04<07:30,  1.05it/s][A
Iter (loss=4.240):   1%|          | 5/478 [00:05<06:54,  1.14it/s][A
Iter (loss=4.264):   1%|          | 5/478 [00:05<06:54,  1.14it/s][A
Iter (loss=4.264):   1%|▏         | 6/478 [00:05<06:35,  1.19it/s][A
Iter (loss=4.252):   1%|▏         | 6/478 [00:06<06:35,  1.19it/s][A
Iter (loss=4.252):   1%|▏         | 7/478 [00:06<06:

# predict.py

In [30]:
import os
from transformers import CLIPProcessor
from torch.utils.data import DataLoader
import torch
import argparse
from tqdm import tqdm
import json
import numpy as np
from zipfile import ZipFile


def predict(args, model, device, data, processor, pre = None):
    """Run inference over ``data`` and write the predictions to a zip file.

    The archive at ``pre`` contains a single ``results.json`` of the form
    ``{"results": {index: label_name, ...}, "phase": "dev"}``, where
    ``index`` counts predictions from 0 in loader order.

    Args:
        args: configuration exposing ``test_batch_size`` and ``max_len``.
        model: module returning a tuple whose first element holds the class
            scores when called with ``labels=None``.
        device: device the processed inputs are moved to.
        data: dataset whose samples ``MyDataset.collate_func`` can batch.
        processor: callable turning captions and images into model inputs.
        pre: path of the zip file to create (required despite the default).

    Raises:
        TypeError: if ``pre`` is None.
    """
    if pre is None:
        raise TypeError("predict() requires `pre`, the path of the output zip file")

    label_names = ['not-sarcasm', 'multi-sarcasm', 'text-sarcasm', 'image-sarcasm']
    data_loader = DataLoader(data, batch_size=args.test_batch_size, collate_fn=MyDataset.collate_func, shuffle=False)
    results = {}  # prediction index -> label name
    index = 0  # running key used in the results file

    # NOTE(review): results are keyed by a running counter, not by the
    # dataset's own sample ids. If the dataset skipped any sample (e.g. a
    # missing image), later keys no longer line up with the annotation
    # file -- confirm whether the ids from the batch should be used instead.
    model.eval()
    with torch.no_grad():
        for text_list, image_list, _, _ in data_loader:  # labels and ids are not needed
            # Build the model inputs for this batch.
            inputs = processor(text=text_list, images=image_list, padding='max_length', truncation=True, max_length=args.max_len, return_tensors="pt").to(device)

            # Predict and take the highest-scoring class per sample.
            t_outputs = model(inputs, labels=None)
            predicted = torch.argmax(t_outputs[0], -1).cpu().numpy().tolist()

            for pred in predicted:
                results[index] = label_names[pred]
                index += 1

    # Save predictions to JSON and compress into a zip file
    with ZipFile(pre, 'w') as zipf:
        with zipf.open('results.json', 'w') as json_file:
            json_data = json.dumps({"results": results, "phase": "dev"}, ensure_ascii=False)
            json_file.write(json_data.encode('utf-8'))

    print("Predictions have been saved to", pre)

class Args:
    """Plain attribute container: every keyword becomes an attribute."""

    def __init__(self, **entries):
        vars(self).update(entries)

def main_predict():
    """Load a saved MV_CLIP checkpoint and write test-set predictions."""
    args = Args(
        device='0',
        max_len=77,
        text_size=512,
        image_size=768,
        dropout_rate=0,
        label_number=4,
        test_batch_size=32,
        model_path="/kaggle/working/MV_CLIP",
        save_file="B32_HAdata_processed_en_epoch4.zip",
        text_test='/kaggle/input/text-processed-en/vimmsd-test-processed-en.json',
        image_test='/kaggle/input/muiltimodal-sarcasm/data/public-test-images',
        layers=6,
        simple_linear=False,
        clip_model='openai/clip-vit-base-patch32',
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    processor = CLIPProcessor.from_pretrained(args.clip_model)
    model = MV_CLIP(args)

    test_data = MyDataset(args, mode='test', limit=None)

    # strict=False tolerates keys that exist on only one side of the
    # checkpoint/model pair (presumably the loss class-weight buffer saved
    # during training -- verify against the checkpoint).
    checkpoint = torch.load('/kaggle/working/MV_CLIP/model4.pt', map_location="cpu")
    model.load_state_dict(checkpoint, strict=False)
    model.to(device)
    model.eval()

    predict(args, model, device, test_data, processor, pre=args.save_file)

In [31]:
# Notebook entry point: run inference and write the zipped predictions.
main_predict()

  model.load_state_dict(torch.load('/kaggle/working/MV_CLIP/model5.pt', map_location="cpu"), strict=False)


Predictions have been saved to B32_vi_epoch5_augment_vi.zip
