In [2]:
%%capture

!pip install open_clip_torch
!pip install -r requirements.txt
!pip install -r reddit-memes-virality-prediction/requirements1.txt
!pip install -r reddit-memes-virality-prediction/requirements2.txt

In [1]:
import os
import json
import torch
import pandas as pd
from torch import nn
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torchvision.transforms as transforms
import open_clip
from transformers import get_scheduler
from torchvision.transforms import RandAugment
from sklearn.metrics import roc_auc_score, accuracy_score
import matplotlib.pyplot as plt

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Root folder of the dataset; the image folder is the same location.
data_dir = images_dir = "./facebook-hateful-memes-dataset"

# Annotation files for the training and development splits.
train_path, dev_path = (
    os.path.join(data_dir, split_file)
    for split_file in ("train.jsonl", "dev.jsonl")
)

### Dataset Preparation

In [26]:
class HatefulMemesDataset(Dataset):
    """Dataset backed by a JSONL annotation file of image/text pairs.

    Every non-blank line of ``jsonl_path`` must be a JSON object with an
    ``"img"`` path (joined onto ``img_dir``) and a ``"text"`` string.
    ``"label"`` and ``"id"`` are optional and default to ``None``; a present
    label is stored as a ``float``.

    Args:
        jsonl_path: Path of the JSONL annotation file.
        img_dir: Folder that the ``"img"`` entries are relative to.
        transform: Optional callable applied to the RGB image in
            ``__getitem__``.
    """

    def __init__(self, jsonl_path, img_dir, transform=None):
        self.img_dir = img_dir
        self.transform = transform
        self.samples = []
        # Explicit UTF-8 so decoding does not depend on the platform default.
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF)
                    # instead of crashing inside json.loads.
                    continue
                record = json.loads(line)
                label = record.get("label")
                self.samples.append({
                    "image": os.path.join(img_dir, record["img"]),
                    "text": record["text"],
                    "label": None if label is None else float(label),
                    "id": record.get("id"),
                })

    def __len__(self):
        """Return the number of annotated samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, text, label, id)`` for the sample at ``idx``."""
        entry = self.samples[idx]
        # Open inside a context manager so the file handle is released right
        # away; convert() hands back an independent, fully loaded image.
        with Image.open(entry["image"]) as raw:
            image = raw.convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, entry["text"], entry["label"], entry["id"]

In [27]:
# Load the pretrained CLIP backbone; the two preprocessing transforms that
# come back with it are discarded in favour of the pipelines built below.
model_clip, _, _ = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="openai"
)
clip_image_size = 224


def _clip_normalize():
    """Build the channel normalisation shared by both image pipelines."""
    return transforms.Normalize(
        mean=(0.48145466, 0.4578275, 0.40821073),
        std=(0.26862954, 0.26130258, 0.27577711),
    )


# Training pipeline: random resized crop plus horizontal flip as augmentation.
train_transform = transforms.Compose(
    [
        transforms.RandomResizedCrop(clip_image_size, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ToTensor(),
        _clip_normalize(),
    ]
)

# Validation pipeline: deterministic resize followed by a centre crop.
val_transform = transforms.Compose(
    [
        transforms.Resize(clip_image_size),
        transforms.CenterCrop(clip_image_size),
        transforms.ToTensor(),
        _clip_normalize(),
    ]
)

train_dataset = HatefulMemesDataset(
    train_path, images_dir, transform=train_transform
)
val_dataset = HatefulMemesDataset(
    dev_path, images_dir, transform=val_transform
)

# Tokenizer matching the CLIP variant loaded above.
tokenizer = open_clip.get_tokenizer("ViT-B-32")

def collate_fn(batch):
    """Collate ``(image, text, label, id)`` samples into a single batch.

    Returns:
        ``(images, text_tokens, labels, ids)`` where ``images`` is the stacked
        image tensor, ``text_tokens`` is whatever the module-level
        ``tokenizer`` returns for the list of texts, ``labels`` is a float32
        tensor (or ``None`` when the first sample carries no label) and
        ``ids`` is a list.
    """
    images, captions, targets, sample_ids = zip(*batch)
    image_batch = torch.stack(images)
    token_batch = tokenizer(list(captions))
    if targets[0] is None:
        # Only the first sample is inspected to decide whether labels exist.
        label_batch = None
    else:
        label_batch = torch.tensor(targets, dtype=torch.float32)
    return image_batch, token_batch, label_batch, list(sample_ids)

# Mini-batch loaders; only the training split is shuffled.
train_loader = DataLoader(
    train_dataset,
    collate_fn=collate_fn,
    num_workers=2,
    shuffle=True,
    batch_size=16,
)
val_loader = DataLoader(
    val_dataset,
    collate_fn=collate_fn,
    num_workers=2,
    shuffle=False,
    batch_size=16,
)



### Model Initialization

In [28]:
class HateCLIPMultimodalModel(nn.Module):
    """Binary classifier that fuses CLIP image and text embeddings.

    Both embeddings are linearly projected to ``proj_dim`` and combined
    through their outer product; the flattened ``proj_dim * proj_dim`` matrix
    is fed to a small MLP that emits one logit per sample.

    Every parameter of ``clip_model`` is frozen except those whose name
    contains ``visual.transformer.resblocks.11``.

    Args:
        clip_model: Module exposing ``named_parameters``, ``encode_image``
            and ``encode_text``.
        image_dim: Size of the embedding returned by ``encode_image``.
        text_dim: Size of the embedding returned by ``encode_text``.
        proj_dim: Size of each projected embedding.
        hidden_dim: Width of the classifier's hidden layer.
    """

    def __init__(self, clip_model, image_dim=512, text_dim=512,
                 proj_dim=512, hidden_dim=128):
        super().__init__()
        self.clip_model = clip_model
        # Freeze all CLIP parameters except the last visual transformer block.
        for name, param in self.clip_model.named_parameters():
            param.requires_grad = "visual.transformer.resblocks.11" in name

        self.image_proj = nn.Linear(image_dim, proj_dim)
        self.text_proj = nn.Linear(text_dim, proj_dim)
        self.classifier = nn.Sequential(
            nn.Linear(proj_dim * proj_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, images, text_tokens):
        """Return one raw logit per sample, as a tensor of shape ``(batch,)``."""
        # The image tower must run with autograd enabled: wrapping it in
        # no_grad() would cut the graph and the unfrozen block would never
        # receive gradients. Layers whose parameters are frozen and whose
        # inputs do not require grad record no graph, so this stays cheap.
        img_features = self.clip_model.encode_image(images)
        # No text-tower parameter is unfrozen above, so skip graph building.
        with torch.no_grad():
            text_features = self.clip_model.encode_text(text_tokens)
        p_i = self.image_proj(img_features)
        p_t = self.text_proj(text_features)
        # Pairwise products of the two projections: (batch, proj, proj).
        outer = torch.einsum("bi,bj->bij", p_i, p_t)
        r = outer.flatten(start_dim=1)
        logit = self.classifier(r)
        return logit.squeeze(1)


# Build the classifier around the loaded CLIP backbone and move it to `device`.
model = HateCLIPMultimodalModel(model_clip).to(device)

### Training Setup

In [29]:
# Training budget and early-stopping bookkeeping.
max_epochs = 10
patience = 3
best_auroc = 0.0
no_improve = 0

# Optimise only the parameters that are still marked as trainable.
optimizer = torch.optim.AdamW(
    [p for p in model.parameters() if p.requires_grad],
    lr=1e-4,
    weight_decay=1e-4,
)
criterion = nn.BCEWithLogitsLoss()
scaler = GradScaler()

def evaluate(model, loader, criterion, device):
    """Evaluate ``model`` on every labelled batch yielded by ``loader``.

    Args:
        model: Module called as ``model(images, text_tokens)`` that returns
            one logit per sample.
        loader: Iterable of ``(images, text_tokens, labels, ids)`` batches;
            ``labels`` may be ``None`` for an unlabelled batch, which is
            skipped.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(avg_loss, metrics)``. ``avg_loss`` is the mean per-sample loss over
        the labelled samples, or ``None`` when no batch carried labels.
        ``metrics`` holds ``"auroc"`` and ``"accuracy"`` (probability
        threshold 0.5), or is empty when no batch carried labels.
    """
    was_training = model.training
    model.eval()
    logit_chunks = []
    label_chunks = []
    total_loss = 0.0
    n_labelled = 0
    try:
        with torch.no_grad():
            for images, text_tokens, labels, _ in loader:
                if labels is None:
                    # Nothing can be scored without targets; skipping also
                    # keeps logits and labels aligned for the metrics below.
                    continue
                images = images.to(device)
                text_tokens = text_tokens.to(device)
                labels = labels.to(device)
                logits = model(images, text_tokens)
                loss = criterion(logits, labels)
                total_loss += loss.item() * labels.size(0)
                n_labelled += labels.size(0)
                logit_chunks.append(logits.detach().float().cpu())
                label_chunks.append(labels.detach().cpu())
    finally:
        # Restore whichever mode the caller had set instead of forcing
        # training mode on a model that was being used for inference.
        model.train(was_training)

    if n_labelled == 0:
        return None, {}

    probs = torch.sigmoid(torch.cat(logit_chunks)).numpy()
    targets = torch.cat(label_chunks).numpy()
    preds = (probs >= 0.5).astype(int)
    metrics = {
        "auroc": roc_auc_score(targets, probs),
        "accuracy": accuracy_score(targets, preds),
    }
    # Average over the samples that actually contributed to the loss.
    return total_loss / n_labelled, metrics

  scaler = GradScaler()


### Training Loop

In [30]:
# Train with mixed precision on GPU, validate after every epoch, keep a copy
# of the weights from the epoch with the best validation AUROC, and stop after
# `patience` consecutive epochs without improvement.
use_amp = device.type == "cuda"
for epoch in range(1, max_epochs + 1):
    model.train()
    train_loss = 0.0
    for images, text_tokens, labels, _ in train_loader:
        images = images.to(device)
        text_tokens = text_tokens.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        # Device-agnostic autocast replaces the deprecated
        # torch.cuda.amp.autocast; it is disabled when running on the CPU.
        with torch.autocast(device_type=device.type, enabled=use_amp):
            logits = model(images, text_tokens)
            loss = criterion(logits, labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # Weight the batch mean by the batch size to get a per-sample average.
        train_loss += loss.item() * labels.size(0)
    avg_train_loss = train_loss / len(train_loader.dataset)
    val_loss, val_metrics = evaluate(model, val_loader, criterion, device)
    auroc = val_metrics.get("auroc", 0.0)
    acc = val_metrics.get("accuracy", 0.0)
    print(f"Epoch {epoch}: Train Loss = {avg_train_loss:.4f}, Val Loss = {val_loss:.4f}, "
          f"Val AUROC = {auroc:.4f}, Val Acc = {acc:.4f}")
    if auroc > best_auroc:
        best_auroc = auroc
        # copy=True forces a real copy: a plain .cpu() returns the very same
        # tensor when the model already lives on the CPU, so the "snapshot"
        # would keep changing as training continues.
        best_state = {
            k: v.detach().to("cpu", copy=True)
            for k, v in model.state_dict().items()
        }
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= patience:
            print("Early stopping")
            break

  with autocast():
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ffa72317380>
Traceback (most recent call last):
  File "/home/kali/.local/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 1663, in __del__
    self._shutdown_workers()
  File "/home/kali/.local/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 1627, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.11/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
          ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/connection.py", line 947, in wait
    ready = selector.select(timeout)
            ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/selectors.py", line 415, in select
    fd_event_list = s

KeyboardInterrupt: 

In [None]:
class RedditHatefulDataset(Dataset):
    """Dataset of memes described by the rows of a DataFrame.

    Each row must provide ``id`` (the image is read from
    ``<img_dir>/<id>.jpg``) and ``viral_score``; ``title`` is used as the
    text when present.

    Args:
        df: DataFrame with one row per meme; its index is reset.
        img_dir: Folder holding the ``<id>.jpg`` images.
        transform: Optional callable applied to the RGB image.
        tokenizer: Stored on the instance; ``__getitem__`` does not use it.
    """

    def __init__(self, df, img_dir, transform, tokenizer):
        self.df = df.reset_index(drop=True)
        self.img_dir = img_dir
        self.transform = transform
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the DataFrame."""
        return len(self.df)

    @staticmethod
    def _clean_text(value):
        """Return ``value`` as a string, mapping missing values to ``""``."""
        if isinstance(value, str):
            return value
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ""
        return str(value)

    def __getitem__(self, idx):
        """Return ``(image, text, viral_score, id)`` for the row at ``idx``."""
        row = self.df.iloc[idx]
        img_path = os.path.join(self.img_dir, f"{row['id']}.jpg")
        # Open inside a context manager so the file handle is released right
        # away; convert() hands back an independent, fully loaded image.
        with Image.open(img_path) as raw:
            image = raw.convert("RGB")
        if self.transform:
            image = self.transform(image)
        # Use the title as the text input; a missing title (NaN/None) becomes
        # an empty string so the tokenizer always receives a str.
        text = self._clean_text(row.get("title", ""))
        return image, text, row["viral_score"], row["id"]

def reddit_collate_fn(batch):
    """Collate ``(image, text, score, id)`` samples into a single batch.

    Texts are tokenised with the module-level ``tokenizer`` and the scores
    are packed into a float32 tensor; ids are returned as a list.
    """
    images, titles, viral_scores, post_ids = zip(*batch)
    image_batch = torch.stack(images)
    token_batch = tokenizer(list(titles))  # reuse the CLIP tokenizer
    score_batch = torch.tensor(viral_scores, dtype=torch.float32)
    return image_batch, token_batch, score_batch, list(post_ids)

## Boxplot da viralidade por rótulo

In [None]:
# Box plot of the viral score for each value of the `is_hateful` column.
# NOTE(review): `res_df` is not defined in the cells visible here — confirm
# which cell builds it before relying on Restart & Run All.
plt.figure(figsize=(6, 4))
data = [
    res_df.loc[res_df["is_hateful"] == flag, "viral_score"]
    for flag in (0, 1)
]
plt.boxplot(data, labels=["Não Hateful", "Hateful"])
plt.title("Distribuição de Viralidade por Classificação de Hatefulness")
plt.ylabel("Viral Score")
plt.tight_layout()
plt.show()

## Viralidade média em cada grupo

In [None]:
# Mean viral score for each value of `is_hateful`, computed explicitly for
# 0 and 1 so the heights always line up with the two tick labels below.
# A plain groupby returns a single value when only one class is present, and
# that value would be silently broadcast to both bars. An empty class yields
# NaN here, so its bar is simply not drawn.
means = pd.Series(
    {
        flag: res_df.loc[res_df["is_hateful"] == flag, "viral_score"].mean()
        for flag in (0, 1)
    },
    name="viral_score",
).rename_axis("is_hateful")
plt.figure(figsize=(5,4))
plt.bar(["Não Hateful", "Hateful"], means)
plt.ylabel("Viral Score Médio")
plt.title("Média de Viralidade por Classificação")
plt.tight_layout()
plt.show()

In [None]:
# Load the weights kept in `best_state` and switch the model to eval mode.
model.load_state_dict(best_state)
model.to(device).eval()

# 1. Split out the two ventiles that are compared below
df_v1, df_v20 = (
    viral_df.loc[viral_df["Ventile"] == ventile].reset_index(drop=True)
    for ventile in (1, 20)
)

# 2. Define a function that runs the whole inference block
def infer_on_df(df_subset, img_folder):
    """Classify every row of ``df_subset`` with the module-level ``model``.

    The function does not change the model's mode; it relies on the caller
    having put ``model`` in eval mode beforehand.

    Args:
        df_subset: Rows to score; handed to ``RedditHatefulDataset``.
        img_folder: Folder containing the images for those rows.

    Returns:
        DataFrame with a ``viral_score`` column and an ``is_hateful`` column
        (1 when the sigmoid of the logit is at least 0.5, else 0), in loader
        order.
    """
    # Dataset + loader built from the existing Reddit dataset class
    dataset = RedditHatefulDataset(
        df_subset,
        img_dir=img_folder,
        transform=transform_viral,
        tokenizer=tokenizer,
    )
    batches = DataLoader(
        dataset,
        batch_size=32,
        shuffle=False,
        num_workers=4,
        collate_fn=reddit_collate_fn,
    )

    # Inference
    viral_column, hateful_column = [], []
    with torch.no_grad():
        for images, tokens, batch_scores, _ in batches:
            logits = model(images.to(device), tokens.to(device))
            hateful = (torch.sigmoid(logits).cpu().numpy() >= 0.5).astype(int)
            viral_column.extend(batch_scores.numpy())
            hateful_column.extend(hateful)

    # Results as a DataFrame
    return pd.DataFrame({"viral_score": viral_column, "is_hateful": hateful_column})

# 3. Run inference for each ventile, pointing at its own image folder
base_path = "reddit-virality-dataset"
res_v1 = infer_on_df(df_v1, os.path.join(base_path, "Ventile_1"))
res_v20 = infer_on_df(df_v20, os.path.join(base_path, "Ventile_20"))

# 4. Compare the share of memes flagged as hateful in each ventile
h_rate_v1 = res_v1["is_hateful"].mean()
h_rate_v20 = res_v20["is_hateful"].mean()
print(f"Ventile 1 hateful rate:  {h_rate_v1:.3f}")
print(f"Ventile 20 hateful rate: {h_rate_v20:.3f}")

# 5. Plot both comparisons side by side
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: viral-score distribution of each ventile.
axes[0].boxplot(
    [res_v1["viral_score"], res_v20["viral_score"]],
    labels=["Ventile 1", "Ventile 20"],
)
axes[0].set_title("Distribuição de Viral Score")

# Right panel: hateful rate of each ventile on a fixed 0-1 axis.
axes[1].bar(["Ventile 1", "Ventile 20"], [h_rate_v1, h_rate_v20])
axes[1].set(
    ylim=(0, 1),
    ylabel="Hateful Rate",
    title="Percentual de Memes Hateful",
)

fig.tight_layout()
plt.show()