In [10]:
import os
import json
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
import unicodedata
import re
import ast
import gc

import torch
import torch.nn as nn
from transformers import AutoProcessor, AutoModelForCausalLM
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import train_test_split

In [11]:
IMG_DIR = '/kaggle/input/2025-sum-dpl-302-m/devset_images/devset_images'
META_JSON = '/kaggle/input/2025-sum-dpl-302-m/devset_images_metadata.json'
GT_CSV = '/kaggle/input/2025-sum-dpl-302-m/devset_images_gt.csv'

In [12]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("✅ Using device:", DEVICE)

✅ Using device: cuda


In [13]:
from transformers import AutoProcessor, AutoModelForCausalLM

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ✅ Dùng tên rõ ràng để tránh ghi đè model MLP
processor_git2 = AutoProcessor.from_pretrained("microsoft/git-large")
model_git2 = AutoModelForCausalLM.from_pretrained("microsoft/git-large").to(DEVICE)
model_git2.eval()

GitForCausalLM(
  (git): GitModel(
    (embeddings): GitEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(1024, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (image_encoder): GitVisionModel(
      (vision_model): GitVisionTransformer(
        (embeddings): GitVisionEmbeddings(
          (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
          (position_embedding): Embedding(257, 1024)
        )
        (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (encoder): GitVisionEncoder(
          (layers): ModuleList(
            (0-23): 24 x GitVisionEncoderLayer(
              (self_attn): GitVisionAttention(
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
      

In [14]:
with open(META_JSON, 'r') as f:
    json_data = json.load(f)
train_data = json_data['images']
train_df = pd.DataFrame(train_data)

# Clean columns
cols_needed = ['image_id', 'title', 'description', 'user_tags']
train_df = train_df[[col for col in cols_needed if col in train_df.columns]]
train_df = train_df.rename(columns={'image_id': 'id'})
train_df['id'] = train_df['id'].astype(int)

# Clean user_tags
train_df["user_tags"] = train_df["user_tags"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
train_df["user_tags"] = train_df["user_tags"].apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))

# Basic text clean
def basic_clean(text):
    if not isinstance(text, str): return ""
    text = unicodedata.normalize('NFKC', text)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^\w\s\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af\-_#]+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.lower().strip()

for col in ['title', 'description', 'user_tags']:
    train_df[col] = train_df[col].fillna("").astype(str).apply(basic_clean)

train_df["text"] = train_df.apply(lambda row: f"Title: {row['title']} | Description: {row['description']} | Tags: {row['user_tags']}", axis=1)

# Merge label
label_df = pd.read_csv(GT_CSV)
label_df['id'] = label_df['id'].apply(lambda x: int(float(x)))
train_df = train_df.merge(label_df, on="id", how="left")

In [15]:
def find_image_path(image_id, exts=[".jpg", ".jpeg", ".png", ".bmp", ".gif"]):
    for ext in exts:
        path = os.path.join(IMG_DIR, f"{image_id}{ext}")
        if os.path.isfile(path):
            return path
    return None

In [16]:
@torch.no_grad()
def extract_git2_feature(image_path, text, fallback_dim=768):
    try:
        from PIL import Image
        image = Image.open(image_path).convert("RGB")

        # Cắt bớt text nếu quá dài
        text = " ".join(text.split()[:80])

        # Tokenize
        inputs = processor_git2(images=image, text=text, return_tensors="pt").to(DEVICE)

        # Forward GIT-2
        outputs = model_git2(**inputs, output_hidden_states=True)

        # Trích đặc trưng từ token cuối ở layer cuối
        feature = outputs.hidden_states[-1][:, -1, :]  # shape (1, hidden_dim)

        return feature.squeeze(0).cpu().numpy()  # shape (hidden_dim,)
    
    except Exception as e:
        print(f"❌ GIT-2 lỗi ảnh: {image_path} | {e}")
        return np.zeros(fallback_dim, dtype=np.float32)

In [17]:
success_count = 0
error_count = 0
missing_image_count = 0
invalid_feature_count = 0

all_features = []
all_labels = []

expected_shape = (768,)

for idx, row in tqdm(train_df.iterrows(), total=len(train_df), desc="🔄 Extracting GIT-2 features"):
    try:
        image_id = str(int(row['id']))
        image_path = find_image_path(image_id)
        text_input = row.get('text', '')

        feature = None
        if image_path is not None:
            feature = extract_git2_feature(image_path, text_input, fallback_dim=expected_shape[0])

        if feature is None or not isinstance(feature, np.ndarray) or feature.shape != expected_shape:
            if len(all_features) > 0:
                feature = np.mean(all_features, axis=0)
            else:
                feature = np.zeros(expected_shape, dtype=np.float32)

            if image_path is None:
                print(f"⚠️ Không tìm thấy ảnh ID {image_id}, gán vector trung bình.")
                missing_image_count += 1
            else:
                print(f"⚠️ Feature lỗi ID {image_id}, gán vector trung bình.")
                invalid_feature_count += 1

        all_features.append(feature)
        all_labels.append(row['label'])
        success_count += 1

        if idx % 200 == 0:
            gc.collect()
            torch.cuda.empty_cache()

    except Exception as e:
        print(f"❌ Lỗi tại idx {idx} (ID {row['id']}): {e}")
        error_count += 1

X = np.stack(all_features)
y = np.array(all_labels)

print(f"✅ Done: {success_count} success, {missing_image_count} missing images, {invalid_feature_count} invalid features, {error_count} errors")

🔄 Extracting GIT-2 features:  98%|█████████▊| 5177/5280 [06:05<00:07, 14.18it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (601 > 512). Running this sequence through the model will result in indexing errors
🔄 Extracting GIT-2 features: 100%|██████████| 5280/5280 [06:13<00:00, 14.15it/s]

✅ Done: 5280 success, 0 missing images, 0 invalid features, 0 errors





In [18]:
from torch.utils.data import TensorDataset

X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

dataset = TensorDataset(X_tensor, y_tensor)

print("Feature shape:", X.shape)
print("Label shape  :", y.shape)

Feature shape: (5280, 768)
Label shape  : (5280,)


In [41]:
import torch.nn as nn
import torch.nn.functional as F

class FusionNetGIT2(nn.Module):
    def __init__(self, input_dim=768):
        super(FusionNetGIT2, self).__init__()
        self.bn_input = nn.BatchNorm1d(input_dim)

        self.fc1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.5)

        self.fc2 = nn.Linear(512, 384)
        self.bn2 = nn.BatchNorm1d(384)
        self.dropout2 = nn.Dropout(0.3)

        self.fc3 = nn.Linear(384, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.dropout3 = nn.Dropout(0.1)

        # self.fc4 = nn.Linear(128, 64)
        # self.bn4 = nn.BatchNorm1d(64)
        # self.dropout4 = nn.Dropout(0.1)


        self.out = nn.Linear(128, 1)

    def forward(self, x):
        x = self.bn_input(x)

        x = self.fc1(x)
        x = self.bn1(x)
        x = F.silu(x)
        x = self.dropout1(x)

        x = self.fc2(x)
        x = self.bn2(x)
        x = F.silu(x)
        x = self.dropout2(x)

        x = self.fc3(x)
        x = self.bn3(x)
        x = F.silu(x)
        x = self.dropout3(x)

        # x = self.fc4(x)
        # x = self.bn4(x)
        # x = F.silu(x)
        # x = self.dropout4(x)

        return self.out(x).squeeze(1)

In [42]:
from torch.utils.data import TensorDataset, Subset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score

pos_weight = torch.tensor(3360 / 1920, dtype=torch.float32).to(DEVICE)  # điều chỉnh nếu cần
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

dataset = TensorDataset(torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32))


In [43]:
def train_fold(model, train_loader, val_loader, fold_id, total_epochs=50, lr_max=0.00068):
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr_max, weight_decay=0.001)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=3, min_lr=1e-7, verbose=True
    )

    best_f1 = 0.0
    best_threshold = 0.5
    patience = 9
    patience_counter = 0

    for epoch in range(total_epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            preds = model(xb)
            loss = criterion(preds, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        all_probs, all_targets = [], []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                preds = model(xb)
                probs = torch.sigmoid(preds).cpu().numpy().flatten()
                all_probs.extend(probs)
                all_targets.extend(yb.cpu().numpy().flatten())

        probs = np.array(all_probs)
        targets = np.array(all_targets)

        def find_best_threshold(y_true, y_prob):
            thresholds = np.arange(0.1, 0.91, 0.01)
            best_t, best_f1 = 0.5, 0.0
            for t in thresholds:
                preds = (y_prob >= t).astype(int)
                f1 = f1_score(y_true, preds, zero_division=0)
                if f1 > best_f1:
                    best_t, best_f1 = t, f1
            return best_t, best_f1

        best_threshold_epoch, best_f1_epoch = find_best_threshold(targets, probs)

        # Log
        auc = roc_auc_score(targets, probs)
        f1 = f1_score(targets, (probs >= 0.5).astype(int), zero_division=0)
        precision = precision_score(targets, (probs >= 0.5).astype(int), zero_division=0)
        recall = recall_score(targets, (probs >= 0.5).astype(int), zero_division=0)
        acc = (targets == (probs >= 0.5).astype(int)).mean()

        scheduler.step(f1)

        print(f"[Fold {fold_id}] Epoch {epoch+1}: F1 = {f1:.4f} | AUC = {auc:.4f} | "
              f"Best Threshold = {best_threshold_epoch:.2f} | Best F1 = {best_f1_epoch:.4f}")

        if best_f1_epoch > best_f1:
            best_f1 = best_f1_epoch
            best_threshold = best_threshold_epoch
            patience_counter = 0
            torch.save(model.state_dict(), f"best_model_fold{fold_id}.pt")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break

    return best_threshold, probs, targets


In [44]:
best_thresholds = []
val_probs_all = []
val_targets_all = []

for fold_id, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\n📦 Fold {fold_id + 1}")
    train_subset = Subset(dataset, train_idx)
    val_subset = Subset(dataset, val_idx)

    train_loader = DataLoader(train_subset, batch_size=128, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=128, shuffle=False)

    model = FusionNetGIT2(input_dim=X.shape[1]).to(DEVICE)
    best_t, val_probs, val_targets = train_fold(model, train_loader, val_loader, fold_id + 1)
    best_thresholds.append(best_t)
    val_probs_all.extend(val_probs)
    val_targets_all.extend(val_targets)



📦 Fold 1
[Fold 1] Epoch 1: F1 = 0.8620 | AUC = 0.9643 | Best Threshold = 0.74 | Best F1 = 0.8865
[Fold 1] Epoch 2: F1 = 0.8835 | AUC = 0.9683 | Best Threshold = 0.55 | Best F1 = 0.8873
[Fold 1] Epoch 3: F1 = 0.9152 | AUC = 0.9716 | Best Threshold = 0.49 | Best F1 = 0.9179
[Fold 1] Epoch 4: F1 = 0.8822 | AUC = 0.9699 | Best Threshold = 0.45 | Best F1 = 0.8883
[Fold 1] Epoch 5: F1 = 0.8683 | AUC = 0.9705 | Best Threshold = 0.68 | Best F1 = 0.8971
[Fold 1] Epoch 6: F1 = 0.8941 | AUC = 0.9669 | Best Threshold = 0.51 | Best F1 = 0.8964
[Fold 1] Epoch 7: F1 = 0.8698 | AUC = 0.9686 | Best Threshold = 0.71 | Best F1 = 0.8912
[Fold 1] Epoch 8: F1 = 0.8741 | AUC = 0.9713 | Best Threshold = 0.69 | Best F1 = 0.9010
[Fold 1] Epoch 9: F1 = 0.8872 | AUC = 0.9685 | Best Threshold = 0.53 | Best F1 = 0.8941
[Fold 1] Epoch 10: F1 = 0.8622 | AUC = 0.9704 | Best Threshold = 0.71 | Best F1 = 0.8971
[Fold 1] Epoch 11: F1 = 0.9021 | AUC = 0.9693 | Best Threshold = 0.42 | Best F1 = 0.9105
[Fold 1] Epoch 12: F

In [45]:
val_probs_all = np.array(val_probs_all)
val_targets_all = np.array(val_targets_all)

def find_best_threshold(y_true, y_probs):
    thresholds = np.arange(0.1, 0.91, 0.01)
    best_t, best_f1 = 0.5, 0.0
    for t in thresholds:
        preds = (y_probs >= t).astype(int)
        f1 = f1_score(y_true, preds, zero_division=0)
        if f1 > best_f1:
            best_t, best_f1 = t, f1
    return best_t, best_f1

global_thresh, global_f1 = find_best_threshold(val_targets_all, val_probs_all)

print(f"\n🌟 Global Threshold: {global_thresh:.4f} | Global F1: {global_f1:.4f}")


🌟 Global Threshold: 0.5100 | Global F1: 0.9036


In [46]:
thresh2 = np.mean(best_thresholds)
print(thresh2)

0.5699999999999997


In [47]:
import pandas as pd
import ast
import re
import unicodedata

# --- Đọc dữ liệu ---
test_df = pd.read_csv("/kaggle/input/2025-sum-dpl-302-m/test.csv")

# --- Làm sạch ID ---
test_df['image_id'] = test_df['image_id'].apply(lambda x: int(float(x)) if pd.notnull(x) else x)

# --- Hàm chuyển user_tags từ chuỗi → list an toàn ---
def safe_list(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            val = ast.literal_eval(x)
            if isinstance(val, list):
                return val
        except Exception:
            pass
        return [x] if x else []
    return []

test_df["user_tags"] = test_df["user_tags"].apply(safe_list)

# --- Hàm làm sạch từng cột text ---
def basic_clean(text):
    if not isinstance(text, str):
        return ""
    
    text = unicodedata.normalize('NFKC', text)  # Chuẩn Unicode
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Bỏ URL
    text = re.sub(r"\S+@\S+", "", text)  # Bỏ email
    text = re.sub(r"<.*?>", "", text)  # Bỏ HTML tags
    text = re.sub(r"[^\w\s\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af\-_#]+", "", text)  # Giữ emoji + đa ngôn ngữ
    text = re.sub(r"\s+", " ", text)  # Rút gọn khoảng trắng
    return text.lower().strip()

# --- Làm sạch từng trường ---
test_df["title"] = test_df["title"].fillna("").astype(str).apply(basic_clean)
test_df["description"] = test_df["description"].fillna("").astype(str).apply(basic_clean)
test_df["user_tags"] = test_df["user_tags"].apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))
test_df["user_tags"] = test_df["user_tags"].apply(basic_clean)

# --- Gộp thành text đầu vào ---
test_df["text"] = test_df.apply(
    lambda row: f"Title: {row['title']} | Description: {row['description']} | Tags: {row['user_tags']}",
    axis=1
)

# ✅ Kết quả
test_df = test_df.rename(columns={'image_id': 'id'})
test_df[["id", "text"]].head()

Unnamed: 0,id,text
0,3483809003,Title: flooded parking lot at emily fowler lib...
1,3712805295,Title: larc de barà the roman arch of barà | D...
2,379845620,Title: highest point over the sea level that i...
3,7343264988,Title: lagos after the rains | Description: af...
4,3843337492,Title: flooded corley ave | Description: also ...


In [48]:
import os
import numpy as np
from tqdm import tqdm

# --- Đường dẫn thư mục chứa ảnh ---
IMG_TEST_DIR = "/kaggle/input/2025-sum-dpl-302-m/testset_images/testset_images"

# --- Đuôi ảnh hợp lệ ---
valid_exts = [".jpg", ".png"]

# --- Lọc danh sách file ảnh ---
image_files = [
    f for f in os.listdir(IMG_TEST_DIR)
    if os.path.isfile(os.path.join(IMG_TEST_DIR, f))
    and not f.startswith("._")
    and os.path.splitext(f.lower())[1] in valid_exts
]
print(f"🖼️ Tổng số ảnh hợp lệ: {len(image_files)}")

🖼️ Tổng số ảnh hợp lệ: 1320


In [49]:
def find_image_path_test(image_id, exts=[".jpg", ".png"]):
    for ext in exts:
        path = os.path.join(IMG_TEST_DIR, f"{image_id}{ext}")
        if os.path.isfile(path):
            return path
    return None

In [50]:
expected_shape = (768,)
all_features = []
error_count = 0

for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="🔍 Extracting GIT-2 test features"):
    try:
        image_id = str(int(row["id"]))
        image_path = find_image_path_test(image_id)
        text_input = row["text"]

        if image_path is not None:
            feat = extract_git2_feature(image_path, text_input, fallback_dim=expected_shape[0])
        else:
            print(f"⚠️ Missing image: {image_id}")
            feat = np.zeros(expected_shape, dtype=np.float32)

        if feat.shape != expected_shape:
            print(f"⚠️ Wrong shape for ID {image_id}")
            feat = np.zeros(expected_shape, dtype=np.float32)

        all_features.append(feat)

    except Exception as e:
        print(f"❌ Error at idx {idx}, id {row['id']}: {e}")
        all_features.append(np.zeros(expected_shape, dtype=np.float32))
        error_count += 1

# --- Save ra file .npy ---
all_features = np.stack(all_features)
np.save("git2_features_test.npy", all_features)
print("✅ Đã lưu git2_features_test.npy")
print(f"📐 Shape: {all_features.shape}")
print(f"❗ Số lượng lỗi: {error_count}")

🔍 Extracting GIT-2 test features: 100%|██████████| 1320/1320 [01:23<00:00, 15.89it/s]

✅ Đã lưu git2_features_test.npy
📐 Shape: (1320, 768)
❗ Số lượng lỗi: 0





In [64]:
# --- 1. Nạp đặc trưng test ---
git2_test = np.load("git2_features_test.npy")  # shape: (M, 768)
X_test_tensor = torch.tensor(git2_test, dtype=torch.float32).to(DEVICE)

# --- 2. Dự đoán theo từng fold ---
NUM_FOLDS = 10
all_probs = []

for fold in range(1, NUM_FOLDS + 1):
    print(f"🔁 Predicting with Fold {fold}")
    
    model = FusionNetGIT2(input_dim=768).to(DEVICE)
    model.load_state_dict(torch.load(f"best_model_fold{fold}.pt", map_location=DEVICE))
    model.eval()

    with torch.no_grad():
        logits = model(X_test_tensor)
        probs = torch.sigmoid(logits).cpu().numpy().flatten()  # xác suất
        all_probs.append(probs)

# --- 3. Trung bình xác suất giữa các fold ---
ensemble_probs = np.mean(all_probs, axis=0)
ensemble_preds = (ensemble_probs >= thresh2).astype(int)


🔁 Predicting with Fold 1
🔁 Predicting with Fold 2
🔁 Predicting with Fold 3
🔁 Predicting with Fold 4
🔁 Predicting with Fold 5
🔁 Predicting with Fold 6
🔁 Predicting with Fold 7
🔁 Predicting with Fold 8
🔁 Predicting with Fold 9
🔁 Predicting with Fold 10


In [65]:
results_df = test_df.copy()  # test_df phải có cột 'id'
results_df["label"] = ensemble_preds
results_df["probability"] = ensemble_probs

In [66]:
results_df.to_csv("fusionnet_git2_predictions.csv", index=False)
print("✅ Saved full results → 'fusionnet_git2_predictions.csv'")

✅ Saved full results → 'fusionnet_git2_predictions.csv'


In [67]:
results_df = results_df[["id", "label", "probability"]].copy()
results_df.sort_values(by="probability", ascending=False, inplace=True)
results_df.to_csv("result_git2.csv", index=False)

In [68]:
submission_df = results_df[["id", "label"]].copy()
submission_df.to_csv("submission_git2.csv", index=False)
print("✅ Saved submission → 'submission_git2.csv'")

✅ Saved submission → 'submission_git2.csv'


In [69]:
num_positives = (results_df["label"] == 1).sum()
print(f"📊 Số lượng dự đoán là 1: {num_positives}")

📊 Số lượng dự đoán là 1: 478
