Importation des données

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


data_path = "/kaggle/input/CSC_43M04_EP_challenge/dataset"

train_df = pd.read_csv(f"{data_path}/train_val.csv")
test_df = pd.read_csv(f"{data_path}/test.csv")

def get_image_path(img_id, is_test=False):
    folder = "test" if is_test else "train_val"
    return os.path.join(data_path, folder, f"{img_id}.jpg")

/kaggle/input/CSC_43M04_EP_challenge/dataset/train_val.csv
/kaggle/input/CSC_43M04_EP_challenge/dataset/test.csv
/kaggle/input/CSC_43M04_EP_challenge/dataset/test/1269.jpg
/kaggle/input/CSC_43M04_EP_challenge/dataset/test/623.jpg
/kaggle/input/CSC_43M04_EP_challenge/dataset/test/2193.jpg
/kaggle/input/CSC_43M04_EP_challenge/dataset/test/2008.jpg
/kaggle/input/CSC_43M04_EP_challenge/dataset/test/2081.jpg
/kaggle/input/CSC_43M04_EP_challenge/dataset/test/3138.jpg
/kaggle/input/CSC_43M04_EP_challenge/dataset/test/764.jpg
/kaggle/input/CSC_43M04_EP_challenge/dataset/test/1700.jpg
/kaggle/input/CSC_43M04_EP_challenge/dataset/test/1786.jpg
/kaggle/input/CSC_43M04_EP_challenge/dataset/test/2907.jpg
/kaggle/input/CSC_43M04_EP_challenge/dataset/test/1075.jpg
/kaggle/input/CSC_43M04_EP_challenge/dataset/test/2863.jpg
/kaggle/input/CSC_43M04_EP_challenge/dataset/test/771.jpg
/kaggle/input/CSC_43M04_EP_challenge/dataset/test/208.jpg
/kaggle/input/CSC_43M04_EP_challenge/dataset/test/2628.jpg
/kaggl

Récupération des données et préparation à l'exploitation

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

data_path = "/kaggle/input/CSC_43M04_EP_challenge/dataset"

train_df = pd.read_csv(f"{data_path}/train_val.csv")
test_df = pd.read_csv(f"{data_path}/test.csv")

# Texte
train_df["text"] = train_df["title"].fillna("") + " " + train_df["description"].fillna("")
vectorizer = TfidfVectorizer(max_features=512)
text_features = vectorizer.fit_transform(train_df["text"]).toarray()

# Catégorie
encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
channel_features = encoder.fit_transform(train_df[["channel"]])

# Année
year_features = train_df[["year"]].values

# Cible
train_df["views_log"] = np.log1p(train_df["views"])
y = train_df["views_log"].values

# Fusion des features texte, chaîne, année
meta_features = np.concatenate([
    text_features,          # TF-IDF features
    channel_features,       # One-hot encoding
    year_features           # Année brute
], axis=1)



Préparation du dataset

In [3]:
import torch
from torch.utils.data import Dataset
from PIL import Image
import torchvision.transforms as transforms
import numpy as np
import os

transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])


class YouTubeDataset(Dataset):
    def __init__(self, df, text_feat, channel_feat, year_feat, targets, data_path):
        self.df = df
        self.text_feat = text_feat
        self.channel_feat = channel_feat
        self.year_feat = year_feat
        self.targets = targets
        self.data_path = data_path

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_id = self.df.iloc[idx]["id"]
        img_path = os.path.join(self.data_path, "train_val", f"{img_id}.jpg")

        image = Image.open(img_path).convert("RGB")
        image = transform(image)

        meta_features = np.concatenate([
            self.text_feat[idx],
            self.channel_feat[idx],
            self.year_feat[idx]
        ])

        return image, torch.tensor(meta_features, dtype=torch.float32), torch.tensor(self.targets[idx], dtype=torch.float32)


Modèle simple 1)

In [4]:
import torch
import torch.nn as nn

class YouTubeModel(nn.Module):
    def __init__(self, meta_input_dim, image_size=128):
        super().__init__()

        # CNN to extract features from thumbnails
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),     # (B,3,H,W) -> (B,32,H,W)
            nn.BatchNorm2d(32),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2),                                # -> (B,32,H/2,W/2)

            nn.Conv2d(32, 64, kernel_size=3, padding=1),    # -> (B,64,H/2,W/2)
            nn.BatchNorm2d(64),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2),                                # -> (B,64,H/4,W/4)

            nn.Conv2d(64, 128, kernel_size=3, padding=1),   # -> (B,128,H/4,W/4)
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.1),
            nn.AdaptiveAvgPool2d((4, 4)),                   # -> (B,128,4,4)

            nn.Flatten()                                    # -> (B,128*4*4)
        )


        self.image_output_dim = 128 * 4 * 4

        # Final regressor that combines image + metadata
        self.fc = nn.Sequential(
            nn.Linear(self.image_output_dim + meta_input_dim, 512),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.2),
            nn.Linear(128, 1)
        )
        self.meta_norm = nn.LayerNorm(meta_input_dim)

    def forward(self, image, meta):
        img_feat = self.cnn(image)
        meta = self.meta_norm(meta)
        x = torch.cat([img_feat, meta], dim=1)
        return self.fc(x).squeeze(1)


Modèle simple 2

In [5]:
import torch
import torch.nn as nn

class YouTubeModel_upgrade1(nn.Module):
    def __init__(self, meta_input_dim, image_size=128):
        super().__init__()

        # CNN to extract features from thumbnails
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),     # (B,3,H,W) -> (B,32,H,W)
            nn.BatchNorm2d(32),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2),                                # -> (B,32,H/2,W/2)

            nn.Conv2d(32, 64, kernel_size=3, padding=1),    # -> (B,64,H/2,W/2)
            nn.BatchNorm2d(64),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2),                                # -> (B,64,H/4,W/4)

            nn.Conv2d(64, 128, kernel_size=3, padding=1),   # -> (B,128,H/4,W/4)
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.1),
            nn.AdaptiveAvgPool2d((4, 4)),                   # -> (B,128,4,4)

            nn.Flatten()                                    # -> (B,128*4*4)
        )


        self.image_output_dim = 128 * 4 * 4

        # Final regressor that combines image + metadata
        self.fc = nn.Sequential(
            nn.Linear(self.image_output_dim + 64, 512),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.2),
            nn.Linear(128, 1)
        )
        self.meta_mlp = nn.Sequential(
            nn.LayerNorm(meta_input_dim),
            nn.Linear(meta_input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64)  # Tu peux ajuster 64 selon ta préférence
        )

    def forward(self, image, meta):
        img_feat = self.cnn(image)
        meta_feat = self.meta_mlp(meta)  
        x = torch.cat([img_feat, meta_feat], dim=1)
        return self.fc(x).squeeze(1)


Modèle simple 3)

In [6]:
import torch
import torch.nn as nn

class YouTubeModel_LiteAttention(nn.Module):
    def __init__(self, meta_input_dim, image_size=128, d_model=96, patch_size=16, nhead=2, num_layers=1):
        super().__init__()
        self.d_model = d_model

        self.patch_embed = nn.Conv2d(3, d_model, kernel_size=patch_size, stride=patch_size)
        num_patches = (image_size // patch_size) ** 2

        self.cls_token = nn.Parameter(torch.randn(1, 1, d_model))
        self.meta_token = nn.Sequential(
            nn.LayerNorm(meta_input_dim),
            nn.Linear(meta_input_dim, d_model),
            nn.ReLU(),
            nn.Linear(d_model, d_model)
        )

        self.pos_embed = nn.Parameter(torch.randn(1, num_patches + 2, d_model))  # [CLS] + [META]
        self.dropout = nn.Dropout(0.3)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=256,
            activation="gelu",
            batch_first=True,
            dropout=0.3
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.head = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1)
        )

    def forward(self, image, meta):
        B = image.size(0)

        x = self.patch_embed(image)                # [B, D, H', W']
        x = x.flatten(2).transpose(1, 2)           # [B, N_patches, D]

        cls_token = self.cls_token.expand(B, 1, -1)
        meta_tok = self.meta_token(meta).unsqueeze(1)

        tokens = torch.cat([cls_token, meta_tok, x], dim=1)  # [B, N+2, D]
        tokens = self.dropout(tokens + self.pos_embed[:, :tokens.size(1)])

        x = self.transformer(tokens)
        return self.head(x[:, 0]).squeeze(1)  # Utilise le [CLS] token


Modèle simple 4)

In [7]:
import torch
import torch.nn as nn

class YouTubeModel_CNN_MetaAttention(nn.Module):
    def __init__(self, meta_input_dim, image_size=128):
        super().__init__()

        # --- 1. CNN image encoder ---
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),  # 64x64
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((4, 4)),  # [B, 64, 4, 4]
            nn.Flatten()  # [B, 1024]
        )
        img_feat_dim = 64 * 4 * 4

        # --- 2. MLP metadata encoder ---
        self.meta_net = nn.Sequential(
            nn.LayerNorm(meta_input_dim),
            nn.Linear(meta_input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 128)
        )

        # --- 3. Attention encoder (image + meta) ---
        self.attention_input_dim = img_feat_dim + 128

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.attention_input_dim,
            nhead=4,
            dim_feedforward=256,
            dropout=0.2,
            activation="gelu",
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=1)

        # --- 4. Head ---
        self.head = nn.Sequential(
            nn.LayerNorm(self.attention_input_dim),
            nn.Linear(self.attention_input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1)
        )

    def forward(self, image, meta):
        img_feat = self.cnn(image)           # [B, 1024]
        meta_feat = self.meta_net(meta)      # [B, 128]

        # Fusion + attention
        combined = torch.cat([img_feat, meta_feat], dim=1).unsqueeze(1)  # [B, 1, D]
        attended = self.transformer(combined)[:, 0]                      # [B, D]

        return self.head(attended).squeeze(1)


Entrainement

In [8]:
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import ReduceLROnPlateau

X_train, X_val, idx_train, idx_val = train_test_split(
    np.arange(len(train_df)), np.arange(len(train_df)), test_size=0.1, random_state=42)

train_dataset = YouTubeDataset(train_df.iloc[idx_train], text_features[idx_train], channel_features[idx_train], year_features[idx_train], y[idx_train], data_path)
val_dataset = YouTubeDataset(train_df.iloc[idx_val], text_features[idx_val], channel_features[idx_val], year_features[idx_val], y[idx_val], data_path)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_1 = YouTubeModel(meta_input_dim=text_features.shape[1] + channel_features.shape[1] + 1).to(device)
model_2 = YouTubeModel_upgrade1(meta_input_dim=text_features.shape[1] + channel_features.shape[1] + 1).to(device)
model_3 = YouTubeModel_LiteAttention(
    meta_input_dim=text_features.shape[1] + channel_features.shape[1] + 1
).to(device)
model_4 = YouTubeModel_CNN_MetaAttention(
    meta_input_dim=text_features.shape[1] + channel_features.shape[1] + 1
).to(device)


#Choix du modèle ici
model = model_4






def entraine(premier_model):

    optimizer = torch.optim.Adam(premier_model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)
    loss_fn = nn.HuberLoss()
    
    from tqdm.notebook import tqdm  # barre pour Jupyter/Kaggle

    for epoch in range(4):
        premier_model.train()
        train_losses = []
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1} [train]")
        for images, metas, targets in pbar:
            images, metas, targets = images.to(device), metas.to(device), targets.to(device)
            preds = premier_model(images, metas)
            loss = loss_fn(preds, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
            pbar.set_postfix({"loss": np.mean(train_losses)})

        # Validation
        premier_model.eval()
        val_losses = []
        with torch.no_grad():
            for images, metas, targets in val_loader:
                images, metas, targets = images.to(device), metas.to(device), targets.to(device)
                preds = premier_model(images, metas)
                val_losses.append(loss_fn(preds, targets).item())
        scheduler.step(np.mean(val_losses))
        print(f" Epoch {epoch+1} done | Train Loss: {np.mean(train_losses):.4f} | Val Loss: {np.mean(val_losses):.4f}")

#Entrainement de tous les modèles.

for model in [model_1,model_2,model_3,model_4]:
    entraine(model)





Epoch 1 [train]:   0%|          | 0/436 [00:00<?, ?it/s]

 Epoch 1 done | Train Loss: 1.5587 | Val Loss: 1.2596


Epoch 2 [train]:   0%|          | 0/436 [00:00<?, ?it/s]

 Epoch 2 done | Train Loss: 1.3576 | Val Loss: 1.1743


Epoch 3 [train]:   0%|          | 0/436 [00:00<?, ?it/s]

 Epoch 3 done | Train Loss: 1.3176 | Val Loss: 1.1691


Epoch 4 [train]:   0%|          | 0/436 [00:00<?, ?it/s]

 Epoch 4 done | Train Loss: 1.2804 | Val Loss: 1.1344


Epoch 1 [train]:   0%|          | 0/436 [00:00<?, ?it/s]

 Epoch 1 done | Train Loss: 1.5457 | Val Loss: 1.2239


Epoch 2 [train]:   0%|          | 0/436 [00:00<?, ?it/s]

 Epoch 2 done | Train Loss: 1.3701 | Val Loss: 1.1532


Epoch 3 [train]:   0%|          | 0/436 [00:00<?, ?it/s]

 Epoch 3 done | Train Loss: 1.3058 | Val Loss: 1.3916


Epoch 4 [train]:   0%|          | 0/436 [00:00<?, ?it/s]

 Epoch 4 done | Train Loss: 1.3028 | Val Loss: 1.3604


Epoch 1 [train]:   0%|          | 0/436 [00:00<?, ?it/s]

 Epoch 1 done | Train Loss: 1.6684 | Val Loss: 1.6860


Epoch 2 [train]:   0%|          | 0/436 [00:00<?, ?it/s]

 Epoch 2 done | Train Loss: 1.4181 | Val Loss: 1.5055


Epoch 3 [train]:   0%|          | 0/436 [00:00<?, ?it/s]

 Epoch 3 done | Train Loss: 1.4230 | Val Loss: 1.4358


Epoch 4 [train]:   0%|          | 0/436 [00:00<?, ?it/s]

 Epoch 4 done | Train Loss: 1.4182 | Val Loss: 1.3987


Epoch 1 [train]:   0%|          | 0/436 [00:00<?, ?it/s]

 Epoch 1 done | Train Loss: 1.5709 | Val Loss: 1.3782


Epoch 2 [train]:   0%|          | 0/436 [00:00<?, ?it/s]

 Epoch 2 done | Train Loss: 1.4939 | Val Loss: 1.5919


Epoch 3 [train]:   0%|          | 0/436 [00:00<?, ?it/s]

 Epoch 3 done | Train Loss: 1.4694 | Val Loss: 1.2892


Epoch 4 [train]:   0%|          | 0/436 [00:00<?, ?it/s]

 Epoch 4 done | Train Loss: 1.4406 | Val Loss: 1.3534


Validation du modèle

In [11]:
from tqdm import tqdm

test_df["text"] = test_df["title"].fillna("") + " " + test_df["description"].fillna("")

text_features_test = vectorizer.transform(test_df["text"]).toarray()
channel_features_test = encoder.transform(test_df[["channel"]])
year_features_test = test_df[["year"]].values



class YouTubeTestDataset(Dataset):
    def __init__(self, df, text_feat, channel_feat, year_feat, data_path):
        self.df = df
        self.text_feat = text_feat
        self.channel_feat = channel_feat
        self.year_feat = year_feat
        self.data_path = data_path

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_id = self.df.iloc[idx]["id"]
        img_path = os.path.join(self.data_path, "test", f"{img_id}.jpg")
        image = Image.open(img_path).convert("RGB")
        image = transform(image)

        meta = np.concatenate([
            self.text_feat[idx],
            self.channel_feat[idx],
            self.year_feat[idx]
        ])
        return image, torch.tensor(meta, dtype=torch.float32)



test_dataset = YouTubeTestDataset(test_df, text_features_test, channel_features_test, year_features_test, data_path)
test_loader = DataLoader(test_dataset, batch_size=64)

model.eval()
predictions = []

with torch.no_grad():
    for images, metas in tqdm(test_loader, desc="🔍 Predicting"):
        images, metas = images.to(device), metas.to(device)
        preds = model(images, metas)
        predictions.extend(preds.cpu().numpy())

views_pred = np.expm1(predictions).astype(int)

submission = pd.DataFrame({
    "ID": test_df["id"],
    "views": views_pred
})

submission.to_csv("/kaggle/working/submission.csv", index=False)




🔍 Predicting: 100%|██████████| 54/54 [00:44<00:00,  1.21it/s]


Nombre de paramètres

In [12]:
for model in [model_1,model_2,model_3,model_4]:
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Nombre total de paramètres : {total_params:,}")

Nombre total de paramètres : 1,495,903
Nombre total de paramètres : 1,322,399
Nombre total de paramètres : 244,319
Nombre total de paramètres : 6,093,727
