In [29]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from torch.utils.data import DataLoader, TensorDataset

# DataLoader pour modèle texte
val_loader_text = DataLoader(
    TensorDataset(input_ids_val, y_val),
    batch_size=64,
    shuffle=False
)

# DataLoader pour modèle image
val_loader_img = DataLoader(
    TensorDataset(val_images, y_val),
    batch_size=64,
    shuffle=False
)


# 1) Validation : recherche du meilleur alpha
model_text.eval()
model_img.eval()

preds_val_text, preds_val_img, targets_val = [], [], []

with torch.no_grad():
    for (input_ids_batch, y_text_batch), (images_batch, y_img_batch) in zip(val_loader_text, val_loader_img):
        # Texte
        input_ids_batch = input_ids_batch.to(device)
        preds_text = model_text(input_ids_batch)
        preds_val_text.extend(preds_text.cpu().numpy())

        # Image
        images_batch = images_batch.to(device)
        preds_img = model_img(images_batch)
        preds_val_img.extend(preds_img.cpu().numpy())

        targets_val.extend(y_text_batch.cpu().numpy())  # mêmes cibles

preds_val_text = np.array(preds_val_text)
preds_val_img = np.array(preds_val_img)
targets_val = np.array(targets_val)

# Recherche de l'alpha optimal
best_alpha, best_loss = 0.0, float('inf')
loss_fn = nn.HuberLoss()

for alpha in np.linspace(0, 1, 21):
    combined = alpha * preds_val_text + (1 - alpha) * preds_val_img
    loss = loss_fn(torch.tensor(combined), torch.tensor(targets_val)).item()
    if loss < best_loss:
        best_loss = loss
        best_alpha = alpha

print(f"✅ Meilleur alpha trouvé : {best_alpha:.2f} | Validation Loss: {best_loss:.4f}")

# 2)Test : génération des prédictions 

# 2.1 Texte
test_df["text"] = test_df["title"].fillna("") + " " + test_df["description"].fillna("")
tokenized_test = [text.lower() for text in test_df["text"]]
X_text_test = torch.tensor([text_to_indices(t, vocab) for t in tokenized_test], dtype=torch.long)

model_text.eval()
with torch.no_grad():
    preds_text_test = model_text(X_text_test.to(device)).cpu().numpy()

# 2.2 Images
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor()
])

class TestImageDataset(Dataset):
    def __init__(self, df, data_path, transform=None):
        self.df = df
        self.data_path = data_path
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_id = self.df.iloc[idx]["id"]
        path = os.path.join(self.data_path, "test", f"{img_id}.jpg")
        image = Image.open(path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image

image_dataset = TestImageDataset(test_df, data_path=data_path, transform=transform)
image_loader = DataLoader(image_dataset, batch_size=64)

model_img.eval()
preds_img_test = []
with torch.no_grad():
    for batch in tqdm(image_loader, desc="🖼 Predicting image model"):
        batch = batch.to(device)
        preds = model_img(batch)
        preds_img_test.extend(preds.cpu().numpy())

preds_img_test = np.array(preds_img_test)
preds_text_test = np.array(preds_text_test)

# 3) Moyenne pondérée & création du CSV 
avg_preds = best_alpha * preds_text_test + (1 - best_alpha) * preds_img_test
final_views = np.expm1(avg_preds).astype(int)

submission = pd.DataFrame({
    "ID": test_df["id"],
    "views": final_views
})
submission.to_csv("/kaggle/working/submission.csv", index=False)
print(f" submission.csv généré avec succès avec alpha = {best_alpha:.2f}")


✅ Meilleur alpha trouvé : 0.70 | Validation Loss: 0.7375


🖼 Predicting image model: 100%|██████████| 54/54 [00:12<00:00,  4.35it/s]

✅ submission.csv généré avec succès avec alpha = 0.70





In [1]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

# Données de validation 
val_df = train_df.iloc[idx_val]
val_texts = val_df["title"].fillna("") + " " + val_df["description"].fillna("")
val_prompts = val_df.apply(lambda row: f"Video titled '{row['title']}', from channel '{row['channel']}', published in {row['year']}.", axis=1)
val_input_ids = torch.tensor([text_to_indices(t, vocab) for t in val_texts], dtype=torch.long).to(device)

# Images val
def load_val_images(df_subset):
    images = []
    for img_id in df_subset["id"]:
        path = get_image_path(img_id)
        image = Image.open(path).convert("RGB")
        image = transform(image)
        images.append(image)
    return torch.stack(images)

# On garde les images PIL pour le modèle CLIP, et on transforme pour les autres
val_images_pil = []
val_images_tensor = []

for img_id in val_df["id"]:
    path = get_image_path(img_id)
    img = Image.open(path).convert("RGB")
    val_images_pil.append(img)                            # pour CLIP
    val_images_tensor.append(transform(img))              # pour CNN

val_images_tensor = torch.stack(val_images_tensor).to(device)

val_targets = y[idx_val]

# Prédictions sur les modèles de base 
model_text.eval()
model_img.eval()
model_clip.eval()
model_1.eval()
model_2.eval()
model_3.eval()
model_4.eval()
model_5.eval()

# Métadonnées pour validation
meta_val = torch.tensor(meta_features[idx_val], dtype=torch.float32).to(device)


with torch.no_grad():
    val_preds_text = model_text(val_input_ids).cpu().numpy()
    val_preds_img = model_img(val_images_tensor).cpu().numpy()

    clip_inputs = model_clip.processor(
        images=val_images_pil,
        text=list(val_prompts),
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(device)
    output = model_clip.clip(**clip_inputs)
    clip_feats = torch.cat([output.image_embeds, output.text_embeds], dim=1)
    val_preds_clip = model_clip.head(clip_feats).squeeze(1).cpu().numpy()

    val_preds_1 = model_1(val_images_tensor, meta_val).cpu().numpy()
    val_preds_2 = model_2(val_images_tensor, meta_val).cpu().numpy()
    val_preds_3 = model_3(val_images_tensor, meta_val).cpu().numpy()
    val_preds_4 = model_4(val_images_tensor, meta_val).cpu().numpy()
    val_preds_5 = model_5(val_images_tensor, meta_val).cpu().numpy()



# Apprentissage des pondérations
X_stack = np.vstack([
    val_preds_text,
    val_preds_img,
    val_preds_clip,
    val_preds_1,
    val_preds_2,
    val_preds_3,
    val_preds_4,
    val_preds_5
]).T

reg = Ridge(alpha=1.0)
reg.fit(X_stack, val_targets)

coefs = reg.coef_
coef_names = ["text", "img", "clip", "model_1", "model_2", "model_3", "model_4", "model_5"]

print("Pondérations apprises :")
for name, val in zip(coef_names, coefs):
    print(f"  {name:>8} : {val:.4f}")


NameError: name 'train_df' is not defined

In [86]:
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Calcul du nombre total de features
meta_input_dim = text_features.shape[1] + channel_features.shape[1] + 1

X_train, X_val, idx_train, idx_val = train_test_split(
    np.arange(len(train_df)), np.arange(len(train_df)), test_size=0.1, random_state=42)

train_dataset = YouTubeDataset(train_df.iloc[idx_train], text_features[idx_train], channel_features[idx_train], year_features[idx_train], y[idx_train], data_path)
val_dataset = YouTubeDataset(train_df.iloc[idx_val], text_features[idx_val], channel_features[idx_val], year_features[idx_val], y[idx_val], data_path)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

meta_input_dim=5047
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_1 = YouTubeModel(meta_input_dim=meta_input_dim).to(device)
model_2 = YouTubeModel_upgrade1(meta_input_dim=meta_input_dim).to(device)
model_3 = YouTubeModel_LiteAttention(meta_input_dim=meta_input_dim).to(device)
model_4 = YouTubeModelSimple(meta_input_dim=meta_input_dim).to(device)
model_5 = YouTubeModel_CNN_MetaAttention(meta_input_dim=meta_input_dim).to(device)

model = model_5

def entraine(premier_model):
    optimizer = torch.optim.Adam(premier_model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)
    loss_fn = nn.HuberLoss()

    from tqdm.notebook import tqdm  

    for epoch in range(7):
        premier_model.train()
        train_losses = []
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1} [train]")
        for images, metas, targets in pbar:
            images, metas, targets = images.to(device), metas.to(device), targets.to(device)
            preds = premier_model(images, metas)
            loss = loss_fn(preds, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
            pbar.set_postfix({"loss": np.mean(train_losses)})

        # Validation
        premier_model.eval()
        val_losses = []
        with torch.no_grad():
            for images, metas, targets in val_loader:
                images, metas, targets = images.to(device), metas.to(device), targets.to(device)
                preds = premier_model(images, metas)
                val_losses.append(loss_fn(preds, targets).item())
        scheduler.step(np.mean(val_losses))
        print(f" Epoch {epoch+1} done | Train Loss: {np.mean(train_losses):.4f} | Val Loss: {np.mean(val_losses):.4f}")

for model in [model_1, model_2, model_3, model_4, model_5]:
    entraine(model)


Epoch 1 [train]:   0%|          | 0/436 [00:00<?, ?it/s]

RuntimeError: Given normalized_shape=[5047], expected input with shape [*, 5047], but got input of size[32, 559]

In [81]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

#Données de validation
val_df = train_df.iloc[idx_val]
val_texts = val_df["title"].fillna("") + " " + val_df["description"].fillna("")
val_prompts = val_df.apply(lambda row: f"Video titled '{row['title']}', from channel '{row['channel']}', published in {row['year']}.", axis=1)
val_input_ids = torch.tensor([text_to_indices(t, vocab) for t in val_texts], dtype=torch.long).to(device)

# Images val
val_images_pil = []
val_images_tensor = []

for img_id in val_df["id"]:
    path = get_image_path(img_id)
    img = Image.open(path).convert("RGB")
    val_images_pil.append(img)                            # pour CLIP
    val_images_tensor.append(transform(img))              # pour CNN

val_images_tensor = torch.stack(val_images_tensor).to(device)
meta_val = torch.tensor(meta_features[idx_val], dtype=torch.float32).to(device)
val_targets = y[idx_val]

def predict_in_batches(model, images_tensor, meta_tensor=None, batch_size=64):
    model.eval()
    preds = []
    with torch.no_grad():
        for i in range(0, len(images_tensor), batch_size):
            imgs = images_tensor[i:i+batch_size]
            if meta_tensor is not None:
                metas = meta_tensor[i:i+batch_size]
                out = model(imgs, metas)
            else:
                out = model(imgs)
            preds.extend(out.cpu().numpy())
    return np.array(preds)

#Modèles en éval
model_text.eval()
model_img.eval()
model_clip.eval()
model_1.eval()
model_2.eval()
model_3.eval()
model_4.eval()
model_5.eval()

#Prédictions
with torch.no_grad():
    # Texte
    val_preds_text = model_text(val_input_ids).cpu().numpy()

    # Image CNN
    val_preds_img = predict_in_batches(model_img, val_images_tensor)

    # CLIP
    clip_inputs = model_clip.processor(
        images=val_images_pil,
        text=list(val_prompts),
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(device)
    output = model_clip.clip(**clip_inputs)
    clip_feats = torch.cat([output.image_embeds, output.text_embeds], dim=1)
    val_preds_clip = model_clip.head(clip_feats).squeeze(1).cpu().numpy()

    # Modèles fusionnés image + meta
    val_preds_1 = predict_in_batches(model_1, val_images_tensor, meta_val)
    val_preds_2 = predict_in_batches(model_2, val_images_tensor, meta_val)
    val_preds_3 = predict_in_batches(model_3, val_images_tensor, meta_val)
    val_preds_4 = predict_in_batches(model_4, val_images_tensor, meta_val)
    val_preds_5 = predict_in_batches(model_5, val_images_tensor, meta_val)

# Apprentissage des pondérations
X_stack = np.vstack([
    val_preds_text,
    val_preds_img,
    val_preds_clip,
    val_preds_1,
    val_preds_2,
    val_preds_3,
    val_preds_4,
    val_preds_5
]).T

reg = Ridge(alpha=1.0)
reg.fit(X_stack, val_targets)

coefs = reg.coef_
coef_names = ["text", "img", "clip", "model_1", "model_2", "model_3", "model_4", "model_5"]

print(" Pondérations apprises :")
for name, val in zip(coef_names, coefs):
    print(f"  {name:>8} : {val:.4f}")


✅ Pondérations apprises :
      text : 0.4020
       img : 0.1964
      clip : 0.6108
   model_1 : 0.2133
   model_2 : -0.1588
   model_3 : -0.9006
   model_4 : -0.1647
   model_5 : -0.1082


In [84]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import transforms
import numpy as np
from tqdm import tqdm
import pandas as pd

# Texte brut et prompt CLIP
test_df["text"] = test_df["title"].fillna("") + " " + test_df["description"].fillna("")
test_df["prompt"] = test_df.apply(lambda row: f"Video titled '{row['title']}', from channel '{row['channel']}', published in {row['year']}.", axis=1)

# Transformation image
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor()
])

# Dataset unifié
class UnifiedTestDataset(Dataset):
    def __init__(self, df, vocab, meta_feats, transform=None):
        self.df = df
        self.vocab = vocab
        self.meta = meta_feats
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_id = row["id"]
        image_path = os.path.join(data_path, "test", f"{img_id}.jpg")
        image = Image.open(image_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        text = row["text"].lower()
        input_ids = text_to_indices(text, self.vocab)
        prompt = row["prompt"]
        meta = self.meta[idx]

        return image, torch.tensor(input_ids, dtype=torch.long), prompt, torch.tensor(meta, dtype=torch.float32), row["id"]

#Collate function
def collate_fn(batch):
    images, input_ids, prompts, metas, ids = zip(*batch)
    return (
        torch.stack(images),
        torch.stack(input_ids),
        list(prompts),
        torch.stack(metas),
        list(ids)
    )

# DataLoader
test_dataset = UnifiedTestDataset(test_df, vocab, meta_features_test, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=64, collate_fn=collate_fn)

# Modèles en éval
models = [model_text, model_img, model_clip, model_1, model_2, model_3, model_4, model_5]
for m in models:
    m.eval()

clip_processor = model_clip.processor
coefs = np.array(coefs)  # les coefficients appris précédemment
coefs = coefs / coefs.sum()  # normalisation

#Prédiction en batch
preds_all = [[] for _ in range(len(coefs))]
all_ids = []

with torch.no_grad():
    for images, input_ids, prompts, metas, ids in tqdm(test_loader, desc="🔮 Test predictions"):
        images = images.to(device)
        input_ids = input_ids.to(device)
        metas = metas.to(device)

        # Texte
        pred_text = model_text(input_ids).cpu().numpy()
        preds_all[0].extend(pred_text)

        # Image seule
        pred_img = model_img(images).cpu().numpy()
        preds_all[1].extend(pred_img)

        # CLIP
        clip_inputs = clip_processor(images=images, text=prompts, return_tensors="pt", padding=True, truncation=True).to(device)
        output = model_clip.clip(**clip_inputs)
        clip_feats = torch.cat([output.image_embeds, output.text_embeds], dim=1)
        pred_clip = model_clip.head(clip_feats).squeeze(1).cpu().numpy()
        preds_all[2].extend(pred_clip)

        # Modèles 1 à 5
                # Modèles 1 à 5
        for i, model in enumerate(models[3:], start=3):
            # Préparer les features individuellement
            pred = model(
                images,
                metas  # Assuré d'être [batch_size, 5047] comme attendu
            ).cpu().numpy()
            preds_all[i].extend(pred)


        all_ids.extend(ids)

# Moyenne pondérée finale
final_preds = sum(c * np.array(p) for c, p in zip(coefs, preds_all))
final_views = np.expm1(final_preds).astype(int)

# 9. Soumission
submission = pd.DataFrame({
    "ID": all_ids,
    "views": final_views
})
submission.to_csv("/kaggle/working/submission.csv", index=False)
print("Fichier submission.csv généré avec succès avec les 8 modèles pondérés.")


🔮 Test predictions:   0%|          | 0/54 [00:00<?, ?it/s]


RuntimeError: Given normalized_shape=[559], expected input with shape [*, 559], but got input of size[64, 5047]