In [1]:
# Colab-only setup: mount Google Drive at /content/drive so the paths used
# by the later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Sanity check: list the project folder, its image folder and the processed-text
# folder on the mounted Drive (IPython shell escapes).
!ls /content/drive/MyDrive/ColabNotebooks/5242GP/CS5242_Project
!ls /content/drive/MyDrive/ColabNotebooks/5242GP/CS5242_Project/image
!ls /content/drive/MyDrive/ColabNotebooks/5242GP/CS5242_Project/preprocess/wsb/data_processed

cnn_featuers.csv  image  label.csv  MVSA_Single  Network  preprocess
data_preprocessing.ipynb  data_processed


ls: cannot access '/content/drive/MyDrive/ColabNotebooks/5242GP/CS5242_Project': No such file or directory


In [2]:
import os
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from transformers import CLIPProcessor, CLIPModel  # Huggingface CLIP
from sklearn.metrics import accuracy_score, classification_report
from tqdm.auto import tqdm # For progress bars

# --- 1. Configuration ---

In [19]:
# Single table of hyper-parameters and paths read by the later cells.
CONFIG = {
    # -- batching / backbone -------------------------------------------------
    "batch_size": 16,  # lower this if the GPU runs out of memory
    "clip_model_name": "openai/clip-vit-base-patch32",  # Hugging Face checkpoint id
    # -- data locations ------------------------------------------------------
    "data_dir": "data",  # generic data folder name
    "data_dir_img": "/content/drive/MyDrive/ColabNotebooks/5242GP/CS5242_Project/image",  # <ID>.jpg files
    "data_dir_text": "/content/drive/MyDrive/ColabNotebooks/5242GP/CS5242_Project/preprocess/wsb/data_processed",  # <ID>.txt files
    # -- runtime -------------------------------------------------------------
    "device": ("cuda" if torch.cuda.is_available() else "cpu"),
    "freeze_clip": True,  # True keeps the CLIP weights fixed
    "label_file": "/content/drive/MyDrive/ColabNotebooks/5242GP/label.csv",  # CSV with 'ID' and 'label' columns
    # -- optimisation --------------------------------------------------------
    "learning_rate_clip": 1e-6,  # small step for the pre-trained CLIP weights
    "learning_rate_head": 1e-4,  # larger step for the custom head
    "max_token_length": 77,  # text is padded/truncated to this many tokens
    "num_classes": 3,  # positive, negative, neutral
    "num_epochs": 10,  # passes over the training split
    "seed": 42,  # fixes the split and torch RNG
    "test_split_ratio": 0.15,  # fraction of rows held out for testing
    # -- ablation study flags --------------------------------------------------
    "use_cross_attention": True,
    "use_cnn_layer": True,
    "val_split_ratio": 0.15,  # fraction of rows held out for validation
}

In [4]:
# Sentiment name -> integer class id used as the training target.
label_map = dict(negative=0, neutral=1, positive=2)
# Reverse lookup (class id -> sentiment name) for human-readable reports.
inv_label_map = dict((class_id, name) for name, class_id in label_map.items())

# --- 2. Dataset and DataLoader ---

In [5]:
class MultimodalBlogDataset(Dataset):
    """Image/text/label triples looked up on disk from a label table.

    Every row of ``dataframe`` provides an ``ID`` and a ``label``. The sample's
    image is read from ``<data_dir_img>/<ID>.jpg`` and its text from
    ``<data_dir_text>/<ID>.txt``. Items are returned raw; batching and CLIP
    preprocessing are left to the collate function.
    """

    def __init__(self, data_dir, dataframe, clip_processor, label_map, data_dir_img, data_dir_text):
        # Label table and the label-name -> class-id mapping.
        self.dataframe = dataframe
        self.label_map = label_map
        self.processor = clip_processor

        # Image and text currently live in two separate folders
        # (suggestion from the original author: merge them into one later).
        self.data_dir = data_dir
        self.data_dir_img = data_dir_img
        self.data_dir_text = data_dir_text

        # torchvision ``transforms`` is an image-preprocessing toolkit (not a
        # transformer model). ``Compose`` builds a pipeline that here holds a
        # single ``ToTensor`` step. The pipeline is stored on the instance but
        # is not applied in ``__getitem__``; resize/normalise are left to the
        # CLIP processor.
        to_tensor_only = [transforms.ToTensor()]
        self.image_loader = transforms.Compose(to_tensor_only)

    def __len__(self):
        """Number of rows in the label table."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(image, text, label)`` for row ``idx``, or ``None`` if a file is missing."""
        record = self.dataframe.iloc[idx]
        sample_id = record['ID']
        target = self.label_map[record['label']]

        # Image: opened with PIL and forced to three-channel RGB.
        image_file = os.path.join(self.data_dir_img, f"{sample_id}.jpg")
        try:
            image = Image.open(image_file).convert("RGB")
        except FileNotFoundError:
            print(f"Warning: Image file not found {image_file}, returning None.")
            return None  # the collate function is expected to drop None items

        # Text: the whole file content, decoded as UTF-8.
        text_file = os.path.join(self.data_dir_text, f"{sample_id}.txt")
        try:
            with open(text_file, 'r', encoding='utf-8') as handle:
                text = handle.read()
        except FileNotFoundError:
            print(f"Warning: Text file not found {text_file}, returning None.")
            return None

        # Raw data only; tokenisation / image preprocessing happens per batch.
        return image, text, target


In [15]:
def collate_fn(batch, processor, device, max_length):
    """Turn a list of raw dataset items into model-ready tensors.

    Args:
        batch: List of ``(image, text, label)`` tuples; ``None`` entries
            (samples whose files were missing) are dropped.
        processor: Callable invoked with ``text=``, ``images=`` and the
            padding/truncation options below; its result must expose
            ``.items()`` yielding tensors.
        device: Target device for every returned tensor.
        max_length: Token length the text is padded/truncated to.

    Returns:
        ``(inputs, labels)`` where ``inputs`` is a plain dict of tensors on
        ``device`` and ``labels`` is a ``torch.long`` tensor, or ``None`` when
        no usable sample remains.
    """
    usable = [sample for sample in batch if sample is not None]
    if not usable:
        return None

    # Regroup [(img, text, label), ...] into three parallel lists.
    image_list, text_list, label_list = [], [], []
    for image, text, label in usable:
        image_list.append(image)
        text_list.append(text)
        label_list.append(label)

    # The Hugging Face processor takes lists for both modalities and returns
    # PyTorch tensors ("pt"). Expected keys, per the original notes:
    #   input_ids      - vocabulary ids of the tokens (not embeddings)
    #   attention_mask - 0/1 mask telling attention to ignore padding
    #   pixel_values   - resized image tensor, not yet encoded
    encoded = processor(
        text=text_list,
        images=image_list,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # Copy every tensor onto the requested device.
    inputs = {}
    for key, tensor in encoded.items():
        inputs[key] = tensor.to(device)

    # Class ids are stored as long integers.
    labels = torch.tensor(label_list, dtype=torch.long).to(device)
    return inputs, labels

# --- 3. Model Architecture ---

In [17]:
class MultimodalClassifier(nn.Module):
    """CLIP-backed image+text classifier with an optional fusion head.

    Pipeline: CLIP image/text features -> (optional) bidirectional
    cross-attention -> concatenation -> (optional) 1x1 ``Conv1d`` reduction ->
    two-layer MLP producing ``num_classes`` logits.

    Args:
        clip_model_name: Checkpoint id passed to ``CLIPModel.from_pretrained``.
        num_classes: Size of the output logit vector.
        use_cross_attention: Add the two attention branches to the fusion.
        use_cnn_layer: Insert the ``Conv1d`` reduction before the MLP.
        freeze_clip: Disable gradients for every CLIP parameter.
        device: Device the CLIP backbone is moved to at construction time.
    """

    def __init__(self, clip_model_name, num_classes,
                 use_cross_attention=True, use_cnn_layer=True, freeze_clip=True, device='cpu'):
        super().__init__()
        self.use_cross_attention = use_cross_attention
        self.use_cnn_layer = use_cnn_layer
        self.device = device

        # Pretrained backbone, placed on the requested device right away.
        self.clip_model = CLIPModel.from_pretrained(clip_model_name).to(self.device)

        # The Hugging Face CLIP implementation can be fine-tuned; freezing is opt-in.
        if not freeze_clip:
            print("CLIP model parameters will be fine-tuned.")
        else:
            print("Freezing CLIP model parameters.")
            for weight in self.clip_model.parameters():
                weight.requires_grad = False

        # Width of the projected CLIP features (e.g. 512 or 768).
        self.embed_dim = self.clip_model.projection_dim

        # --- fusion width ---------------------------------------------------
        # Without attention: [img, txt]. With attention: [img, txt, img->txt, txt->img].
        fused_width = self.embed_dim * 2
        if self.use_cross_attention:
            # Features are (batch, embed_dim); forward() adds a length-1
            # sequence axis so they fit the batch_first attention layout.
            self.img_to_txt_attention = nn.MultiheadAttention(
                self.embed_dim, num_heads=8, batch_first=True, dropout=0.1)
            self.txt_to_img_attention = nn.MultiheadAttention(
                self.embed_dim, num_heads=8, batch_first=True, dropout=0.1)
            fused_width = self.embed_dim * 4

        # --- optional 1x1 convolution -----------------------------------------
        # On a length-1 sequence a kernel_size=1 Conv1d is equivalent to a
        # Linear layer; the original author kept the conv form so it is easier
        # to extend later. It halves the channel count.
        head_in = fused_width
        if self.use_cnn_layer:
            self.cnn_out_channels = fused_width // 2
            self.conv1d = nn.Conv1d(
                in_channels=fused_width,
                out_channels=self.cnn_out_channels,
                kernel_size=1,
                padding=0,
            )
            self.relu_cnn = nn.ReLU()
            head_in = self.cnn_out_channels

        # --- classifier head: widen, then project to the classes ------------
        self.classifier_hidden_dim = head_in * 2
        self.fc1 = nn.Linear(head_in, self.classifier_hidden_dim)
        self.relu_fc1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(self.classifier_hidden_dim, num_classes)

    def forward(self, inputs):
        """Map a batch dict to logits of shape ``(batch, num_classes)``.

        ``inputs`` must provide ``pixel_values``, ``input_ids`` and
        ``attention_mask``.
        """
        image_features = self.clip_model.get_image_features(pixel_values=inputs['pixel_values'])
        text_features = self.clip_model.get_text_features(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )

        # Pieces to concatenate along the feature axis.
        pieces = [image_features, text_features]
        if self.use_cross_attention:
            # (batch, embed_dim) -> (batch, 1, embed_dim)
            img_tokens = image_features.unsqueeze(1)
            txt_tokens = text_features.unsqueeze(1)
            # NOTE(review): with a single key per query the softmax weight is 1,
            # so each branch amounts to learned projections of the other
            # modality's feature - confirm this is the intended design.
            # Image queries text (Q=img, K=V=txt).
            img_context, _ = self.img_to_txt_attention(img_tokens, txt_tokens, txt_tokens)
            pieces.append(img_context.squeeze(1))
            # Text queries image (Q=txt, K=V=img).
            txt_context, _ = self.txt_to_img_attention(txt_tokens, img_tokens, img_tokens)
            pieces.append(txt_context.squeeze(1))
        fused = torch.cat(pieces, dim=1)

        if self.use_cnn_layer:
            # (batch, C) -> (batch, C, 1) -> conv + ReLU -> (batch, C // 2)
            reduced = self.relu_cnn(self.conv1d(fused.unsqueeze(2)))
            head_input = reduced.squeeze(2)
        else:
            head_input = fused

        hidden = self.dropout1(self.relu_fc1(self.fc1(head_input)))
        return self.fc2(hidden)

# --- 4. Training and Evaluation Functions ---

In [8]:
def train_epoch(model, dataloader, optimizer, criterion, device, clip_processor, max_length):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called as ``model(inputs)`` and returning class logits.
        dataloader: Iterable of ``(inputs, labels)`` batches. A batch may be
            ``None`` (the collate function found no usable sample); such
            batches are skipped.
        optimizer: Optimizer stepped once per processed batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        device, clip_processor, max_length: Not used here; kept so existing
            call sites keep working.

    Returns:
        ``(avg_loss, accuracy)``. The loss is averaged over the batches that
        were actually processed. ``(0.0, 0.0)`` is returned if every batch was
        skipped or the loader is empty.
    """
    model.train()  # enable dropout etc.
    total_loss = 0.0
    processed_batches = 0
    all_preds = []
    all_labels = []

    progress_bar = tqdm(dataloader, desc="Training", leave=False)
    for batch in progress_bar:
        if batch is None:
            continue  # nothing usable in this batch
        inputs, labels = batch

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        processed_batches += 1

        # Keep predictions/labels on the CPU for the epoch-level accuracy.
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

        progress_bar.set_postfix({'loss': batch_loss})

    if processed_batches == 0:
        # Avoid dividing by zero / scoring empty lists.
        return 0.0, 0.0

    # Divide by the number of processed batches, not len(dataloader):
    # skipped (None) batches contributed no loss and must not dilute the mean.
    avg_loss = total_loss / processed_batches
    accuracy = accuracy_score(all_labels, all_preds)
    return avg_loss, accuracy

def evaluate_epoch(model, dataloader, criterion, device, clip_processor, max_length):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Module called as ``model(inputs)`` and returning class logits.
        dataloader: Iterable of ``(inputs, labels)`` batches; ``None`` batches
            are skipped.
        criterion: Loss called as ``criterion(logits, labels)``.
        device, clip_processor, max_length: Not used here; kept so existing
            call sites keep working.

    Returns:
        ``(avg_loss, accuracy, report, all_labels, all_preds)``. The loss is
        averaged over the processed batches. ``report`` is the text returned
        by ``classification_report``, built from the module-level
        ``label_map``. If nothing was evaluated, the loss and accuracy are
        ``0.0`` and the report is a short notice.
    """
    model.eval()  # disable dropout etc.
    total_loss = 0.0
    processed_batches = 0
    all_preds = []
    all_labels = []

    progress_bar = tqdm(dataloader, desc="Evaluating", leave=False)
    with torch.no_grad():  # no gradients needed for evaluation
        for batch in progress_bar:
            if batch is None:
                continue
            inputs, labels = batch

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            batch_loss = loss.item()
            total_loss += batch_loss
            processed_batches += 1

            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
            progress_bar.set_postfix({'loss': batch_loss})

    if processed_batches == 0:
        # Nothing to average or score.
        return 0.0, 0.0, "No samples were evaluated.", all_labels, all_preds

    # Average over processed batches only; skipped batches carry no loss.
    avg_loss = total_loss / processed_batches
    accuracy = accuracy_score(all_labels, all_preds)

    # Pass the class ids explicitly. Otherwise the report infers them from the
    # data and raises when a split happens to lack one of the classes, because
    # the number of inferred classes no longer matches len(target_names).
    class_names = list(label_map.keys())
    class_ids = [label_map[name] for name in class_names]
    report = classification_report(
        all_labels,
        all_preds,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    )

    return avg_loss, accuracy, report, all_labels, all_preds


# --- 5. Main Execution ---

In [9]:
if __name__ == "__main__":
    print(f"Using device: {CONFIG['device']}")
    torch.manual_seed(CONFIG['seed'])

    # --- Load Data ---
    # Read the label table, check its schema, then normalise the dtypes.
    print("Loading labels...")
    try:
        df = pd.read_csv(CONFIG['label_file']).dropna(how='all')  # drop fully empty rows

        # Validate BEFORE touching the columns: casting first would raise an
        # uncaught KeyError and the friendly message below would never show.
        missing_columns = [name for name in ('ID', 'label') if name not in df.columns]
        if missing_columns:
            raise ValueError(f"CSV must contain 'ID' and 'label' columns (missing: {missing_columns}).")

        df['ID'] = df['ID'].astype(int)
        if 'class' in df.columns:
            # Optional numeric class column; only normalised when present.
            df['class'] = df['class'].astype(int)
        print(df)

        if not all(label in label_map for label in df['label'].unique()):
            raise ValueError(f"Labels in CSV must be one of {list(label_map.keys())}")
        print(f"Found {len(df)} samples.")
    except FileNotFoundError:
        # NOTE(review): execution continues after this message, so later code
        # that needs `df` will fail - consider re-raising here.
        print(f"Error: Label file not found at {CONFIG['label_file']}")
        #exit()
    except ValueError as e:
        print(f"Error: {e}")
        #exit()


Using device: cuda
Loading labels...
        ID      text     image     label  class
0        1   neutral  positive  positive      2
1        2   neutral  positive  positive      2
2        3   neutral  positive  positive      2
3        4  positive  positive  positive      2
4        5  positive  positive  positive      2
...    ...       ...       ...       ...    ...
4506  5125   neutral  positive  positive      2
4507  5126  positive   neutral  positive      2
4508  5127  positive  positive  positive      2
4509  5128   neutral  positive  positive      2
4510  5129  positive  positive  positive      2

[4511 rows x 5 columns]
Found 4511 samples.


In [None]:

    # --- Split Data ---
    # Derive absolute split sizes from the configured ratios; the training
    # split takes whatever remains after validation and test.
    total_size = len(df) # number of data rows (the CSV header row is not counted)
    test_size = int(CONFIG['test_split_ratio'] * total_size)  # rows held out for testing
    val_size = int(CONFIG['val_split_ratio'] * total_size) # rows held out for validation
    train_size = total_size - val_size - test_size

    print(f"Splitting data: Train={train_size}, Val={val_size}, Test={test_size}")
    if train_size <= 0 or val_size <= 0 or test_size <= 0:
        print("Error: Dataset too small for specified split ratios.")
        # NOTE(review): exit() inside a notebook may terminate the kernel - confirm this is intended.
        exit()

    # Perform the split with torch.utils.data.random_split.
    # Arguments: (1) the object to split - here the DataFrame itself,
    # (2) the list of split sizes, (3) a seeded generator so the split is reproducible.
    train_df, val_df, test_df = random_split(df, [train_size, val_size, test_size],
                                             generator=torch.Generator().manual_seed(CONFIG['seed']))

    # Convert subsets back to DataFrames for easier indexing if needed by Dataset class
    # Note: random_split returns Subset objects. We get the indices and select from the original df.
    # reset_index(drop=True) discards the old (shuffled) index instead of keeping it as a column.
    train_df = df.iloc[train_df.indices].reset_index(drop=True)
    val_df = df.iloc[val_df.indices].reset_index(drop=True)
    test_df = df.iloc[test_df.indices].reset_index(drop=True)


    # --- Initialize Processor, Dataset, DataLoader ---
    print("Initializing CLIP Processor...")
    clip_processor = CLIPProcessor.from_pretrained(CONFIG['clip_model_name']) # processor for the checkpoint named in CONFIG

    print("Creating Datasets...")
    # Each dataset receives one split of the label table and looks up the matching
    # image/text files in the configured folders; __getitem__ yields (image, text, label).
    # Constructing a dataset only sets up the lookup - no sample is read yet.
    train_dataset = MultimodalBlogDataset(CONFIG['data_dir'], train_df, clip_processor, label_map,CONFIG['data_dir_img'],CONFIG['data_dir_text'])
    val_dataset = MultimodalBlogDataset(CONFIG['data_dir'], val_df, clip_processor, label_map,CONFIG['data_dir_img'],CONFIG['data_dir_text'])
    test_dataset = MultimodalBlogDataset(CONFIG['data_dir'], test_df, clip_processor, label_map,CONFIG['data_dir_img'],CONFIG['data_dir_text'])

    print("Creating DataLoaders...")
    # Define the collate function with necessary arguments partially filled
    collate_fn_partial = lambda batch: collate_fn(batch, clip_processor, CONFIG['device'], CONFIG['max_token_length'])
    # DataLoader(dataset, batch_size, shuffle) - shuffle is enabled for training only.
    # For every batch the loader fetches the items via __getitem__ and hands the
    # resulting list to collate_fn.
    # The loader calls collate_fn with the batch as its only argument, so the extra
    # arguments (processor, device, max length) are bound in collate_fn_partial above.
    # These lines only build the loader objects; data is produced when they are iterated.
    train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True, collate_fn=collate_fn_partial)
    val_loader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'], shuffle=False, collate_fn=collate_fn_partial)
    test_loader = DataLoader(test_dataset, batch_size=CONFIG['batch_size'], shuffle=False, collate_fn=collate_fn_partial)


    # --- Initialize Model, Loss, Optimizer ---
    print("Initializing Model...")
    # Pass ablation flags here
    model = MultimodalClassifier(
        clip_model_name=CONFIG['clip_model_name'],
        num_classes=CONFIG['num_classes'],
        use_cross_attention=CONFIG["use_cross_attention"],
        use_cnn_layer=CONFIG["use_cnn_layer"],
        freeze_clip=CONFIG["freeze_clip"],
        device=CONFIG['device']
    )
    # __init__ moves only the CLIP backbone; this call moves the whole model (head included).
    model = model.to(CONFIG['device'])

    criterion = nn.CrossEntropyLoss()

    # Separate parameters for different learning rates
    # NOTE(review): clip_params is handed to the optimizer even when freeze_clip is set;
    # those parameters then have requires_grad=False - confirm this is intended.
    clip_params = list(model.clip_model.parameters())
    head_params = []
    if CONFIG["use_cross_attention"]:
        head_params.extend(list(model.img_to_txt_attention.parameters()))
        head_params.extend(list(model.txt_to_img_attention.parameters()))
    if CONFIG["use_cnn_layer"]:
        head_params.extend(list(model.conv1d.parameters()))
    head_params.extend(list(model.fc1.parameters()))
    head_params.extend(list(model.fc2.parameters()))


    optimizer = optim.AdamW([
        {'params': clip_params, 'lr': CONFIG['learning_rate_clip']},
        {'params': head_params, 'lr': CONFIG['learning_rate_head']}
    ])

    # --- Training Loop ---
    # Train for num_epochs, validate after each epoch and keep the best checkpoint.
    print("Starting Training...")
    best_val_accuracy = 0.0
    best_epoch = -1

    for epoch in range(CONFIG['num_epochs']):
        print(f"\n--- Epoch {epoch+1}/{CONFIG['num_epochs']} ---")

        train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, CONFIG['device'], clip_processor, CONFIG['max_token_length'])
        print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")

        val_loss, val_acc, val_report, _, _ = evaluate_epoch(model, val_loader, criterion, CONFIG['device'], clip_processor, CONFIG['max_token_length'])
        print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")
        print("Validation Classification Report:\n", val_report)

        # Save best model based on validation accuracy
        if val_acc > best_val_accuracy:
            best_val_accuracy = val_acc
            best_epoch = epoch
            # Create a directory to save models if it doesn't exist
            os.makedirs("models", exist_ok=True)
            model_save_path = os.path.join("models", "best_multimodal_model.pth")
            print(f"Validation accuracy improved. Saving model to {model_save_path}")
            torch.save(model.state_dict(), model_save_path)
            # You might want to save optimizer state and epoch number too for resuming training

    print(f"\nTraining finished. Best validation accuracy ({best_val_accuracy:.4f}) achieved at epoch {best_epoch+1}.")

    # --- Final Evaluation on Test Set ---
    print("\n--- Evaluating on Test Set using Best Model ---")
    # Load the best model weights
    # NOTE(review): a checkpoint left over from an earlier run at the same path would
    # also be picked up here if this run never saved one - verify.
    best_model_path = os.path.join("models", "best_multimodal_model.pth")
    if os.path.exists(best_model_path):
        model.load_state_dict(torch.load(best_model_path, map_location=CONFIG['device']))
        print("Loaded best model weights for testing.")

        test_loss, test_acc, test_report, test_labels, test_preds = evaluate_epoch(model, test_loader, criterion, CONFIG['device'], clip_processor, CONFIG['max_token_length'])
        print(f"\nTest Loss: {test_loss:.4f}")
        print(f"Test Accuracy: {test_acc:.4f}")
        print("Test Set Classification Report:\n", test_report)
        # You can further analyze test_labels and test_preds here (e.g., confusion matrix)
    else:
        print("Warning: Best model file not found. Skipping test set evaluation.")

    print("\nDone.")
