In [1]:
# Download the dataset archive from Kaggle into /content.
# --fail makes curl exit with an error on an HTTP failure instead of saving the
# server's error page under the .zip name (which would only surface later at unzip time).
!curl --fail -L -o /content/dataset-mini-mmctr-challenge.zip \
  https://www.kaggle.com/api/v1/datasets/download/othmanehana/dataset-mini-mmctr-challenge

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 2071M  100 2071M    0     0  83.4M      0  0:00:24  0:00:24 --:--:-- 94.0M


In [None]:
# Extract the archive under /content (the cells below read from /content/Data).
# -o overwrites existing files without prompting, so the cell can be re-run non-interactively;
# -q suppresses the per-file listing.
!unzip -o -q /content/dataset-mini-mmctr-challenge.zip -d /content

In [3]:
import pandas as pd

In [4]:
# Absolute locations of the dataset's parquet files under /content/Data.
item_emb_file = "/content/Data/item_emb.parquet"
item_feature_file = "/content/Data/item_feature.parquet"
item_info_file = "/content/Data/item_info.parquet"
item_seq_file = "/content/Data/item_seq.parquet"
test_file = "/content/Data/test.parquet"
train_file = "/content/Data/train.parquet"
valid_file = "/content/Data/valid.parquet"

In [5]:
# Read each parquet file into its own DataFrame.
item_emb_df = pd.read_parquet(item_emb_file)
item_feature_df = pd.read_parquet(item_feature_file)
item_info_df = pd.read_parquet(item_info_file)
item_seq_df = pd.read_parquet(item_seq_file)
test_df = pd.read_parquet(test_file)
train_df = pd.read_parquet(train_file)
valid_df = pd.read_parquet(valid_file)

In [6]:
import torch
from torch.utils.data import Dataset
import numpy as np
import pandas as pd

class CTRDataset(Dataset):
    """Map-style dataset that turns one dataframe row into one CTR sample.

    Each sample contains the user's item history (left-padded with 0 or truncated
    to the most recent ``max_len`` entries), the target item, the looked-up
    embedding rows for both, and the ``likes_level`` / ``views_level`` features.
    """

    def __init__(self, data_df, item_embedding_matrix, max_len=50, is_test=False):
        """
        Args:
            data_df (pd.DataFrame): Must provide the columns 'item_seq', 'item_id',
                'likes_level' and 'views_level', plus 'label' unless ``is_test``.
                The IDs are used directly as row indices into
                ``item_embedding_matrix``.
            item_embedding_matrix (torch.Tensor): Row N is the vector for item
                index N; index 0 is used for padding positions.
            max_len (int): Length every history sequence is padded/truncated to.
            is_test (bool): If True, 'label' is not read and not returned.
        """
        self.data_df = data_df
        self.item_embedding_matrix = item_embedding_matrix
        self.max_len = max_len
        self.is_test = is_test

    def __len__(self):
        return len(self.data_df)

    def _pad_or_truncate(self, seq_ids):
        """Return a plain list of exactly ``max_len`` ints (when ``max_len`` > 0).

        The sequence is first copied into a Python list: parquet columns usually
        come back as numpy arrays, and ``list + ndarray`` is a broadcasting *add*,
        not a concatenation, which would silently drop the padding.
        """
        if seq_ids is None:
            # A missing history is treated as an empty one (all padding).
            seq_ids = []
        recent = [int(item) for item in seq_ids][-self.max_len:]
        return [0] * (self.max_len - len(recent)) + recent

    def __getitem__(self, idx):
        row = self.data_df.iloc[idx]

        # History: padded/truncated IDs, then the embedding lookup -> (max_len, dim).
        seq_tensor = torch.tensor(self._pad_or_truncate(row['item_seq']), dtype=torch.long)
        history_emb = self.item_embedding_matrix[seq_tensor]

        # Target item: a 0-d index tensor selects a single embedding row -> (dim,).
        target_id = torch.tensor(row['item_id'], dtype=torch.long)
        target_emb = self.item_embedding_matrix[target_id]

        output = {
            'history_ids': seq_tensor,
            'history_emb': history_emb,
            'target_id':   target_id,
            'target_emb':  target_emb,
            'likes':       torch.tensor(row['likes_level'], dtype=torch.long),
            'views':       torch.tensor(row['views_level'], dtype=torch.long),
        }

        if not self.is_test:
            # float32 so the label can be fed straight into a BCE-style loss.
            output['label'] = torch.tensor(row['label'], dtype=torch.float32)

        return output

In [9]:
import numpy as np
import torch

# 1. Create a mapping from Raw Item ID -> Matrix Index
# Index 0 is reserved for padding, so real items start at index 1.
unique_items = item_emb_df['item_id'].unique()
item_id_map = {raw_id: i + 1 for i, raw_id in enumerate(unique_items)}
vocab_size = len(unique_items) + 1 # +1 for padding at index 0

print(f"Total items: {len(unique_items)}. Matrix size: {vocab_size}")

# 2. Build the Embedding Tensor
# Initialize with zeros (Row 0 remains zero for padding)
embedding_matrix = np.zeros((vocab_size, 128), dtype=np.float32)

# Fill the matrix.
# The two columns are iterated directly instead of using DataFrame.iterrows(),
# which builds a Series object for every row and is far slower on large frames.
# Rows are still visited in order, so a repeated item_id keeps its last vector.
# Each 'item_emb_d128_e4' entry is assumed to be a length-128 list or array.
for raw_id, vector in zip(item_emb_df['item_id'], item_emb_df['item_emb_d128_e4']):
    idx = item_id_map.get(raw_id)
    if idx is not None:
        embedding_matrix[idx] = vector

# Convert to Torch Tensor (Ready for the Dataset Class)
embedding_tensor = torch.tensor(embedding_matrix)
print("Embedding Tensor created successfully!")

# 3. Helper function to remap your Train/Test Dataframes
def remap_dataframe(df, mapping):
    """Add index-space versions of the ID columns to ``df`` and return it.

    Two columns are written onto the frame that was passed in (it is modified in
    place): 'mapped_item_id' from 'item_id' and 'mapped_item_seq' from 'item_seq'.
    Any raw ID missing from ``mapping`` becomes 0, the padding/unknown index.
    """
    def _to_index(raw_id):
        return mapping.get(raw_id, 0)

    def _to_index_list(raw_seq):
        # Element-wise lookup in pure Python; this can take a while on large frames.
        return [_to_index(raw_id) for raw_id in raw_seq]

    df['mapped_item_id'] = df['item_id'].map(_to_index)
    df['mapped_item_seq'] = df['item_seq'].map(_to_index_list)
    return df

# APPLY THE MAPPING (Assumes you have train_df and valid_df loaded)
# remap_dataframe writes 'mapped_item_id' / 'mapped_item_seq' onto the frame it is
# given and returns that same frame, so the reassignment keeps the original names.
print("Remapping Train Data...")
train_df = remap_dataframe(train_df, item_id_map)
print("Remapping Valid Data...")
valid_df = remap_dataframe(valid_df, item_id_map)

Total items: 45858. Matrix size: 45859
Embedding Tensor created successfully!
Remapping Train Data...
Remapping Valid Data...


In [10]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running on: {device}")

Running on: cuda


In [12]:
import torch
import torch.nn as nn

class HybridCTRTransformer(nn.Module):
    """CTR model that fuses a learnable item-ID embedding with a fixed content vector.

    Every item is represented as ``Embedding(id) + Linear(content_vector)``. The
    history sequence of such representations goes through a Transformer encoder,
    the output at the last position is taken as the user-interest vector, and an
    MLP scores it together with the target item and two static features.
    ``forward`` returns probabilities (a sigmoid is applied), not logits.
    """

    def __init__(self, num_items, pretrained_dim=128, model_dim=128, num_heads=4, num_layers=2,
                 max_len=50):
        """
        Args:
            num_items: Size of the item vocabulary, including padding index 0.
            pretrained_dim: Dimension of the fixed content vectors.
            model_dim: Dimension used inside the Transformer.
            num_heads: Attention heads per encoder layer.
            num_layers: Number of encoder layers.
            max_len: Longest history the positional embedding supports. The
                default of 50 keeps the parameter shape of earlier checkpoints.
        """
        super().__init__()
        self.max_len = max_len

        # --- BLOCK 1: The Hybrid Embedding Layer ---
        # A. Learnable ID embedding; the row for padding index 0 stays zero.
        self.item_id_embedding = nn.Embedding(num_items, model_dim, padding_idx=0)

        # B. Content adapter: projects the fixed content vector into model space.
        self.content_adapter = nn.Linear(pretrained_dim, model_dim)

        # --- BLOCK 2: The Transformer ---
        # Learned positional table, one row per history position.
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, model_dim))

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=model_dim,
            nhead=num_heads,
            dim_feedforward=model_dim*4,
            dropout=0.1,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # --- BLOCK 3: Static Features ---
        # Each table has 20 rows, so likes/views indices must lie in [0, 19].
        self.likes_emb = nn.Embedding(20, 32)
        self.views_emb = nn.Embedding(20, 32)

        # --- BLOCK 4: The Prediction Head ---
        # Input: user interest (model_dim) + target item (model_dim) + likes (32) + views (32)
        total_dim = model_dim * 2 + 32 + 32

        self.mlp = nn.Sequential(
            nn.Linear(total_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )
        self.sigmoid = nn.Sigmoid()

    def get_full_representation(self, item_ids, content_vecs):
        """Return ``item_id_embedding(item_ids) + content_adapter(content_vecs)``.

        ``item_ids`` has shape (...) and ``content_vecs`` shape (..., pretrained_dim);
        the result has shape (..., model_dim).
        """
        id_vec = self.item_id_embedding(item_ids)
        content_vec = self.content_adapter(content_vecs)
        return id_vec + content_vec

    def forward(self, history_ids, history_content, target_id, target_content, likes, views):
        """Score a batch of (history, target item) pairs.

        Args:
            history_ids: (batch, seq) item indices, 0 for padding.
            history_content: (batch, seq, pretrained_dim) content vectors.
            target_id: (batch,) target item indices.
            target_content: (batch, pretrained_dim) target content vectors.
            likes, views: (batch,) static feature indices.

        Returns:
            (batch, 1) tensor of click probabilities.

        Raises:
            ValueError: if ``seq`` is 0 or exceeds the positional embedding length.
        """
        # --- Step 1: fuse ID + content for both the history and the target ---
        history_x = self.get_full_representation(history_ids, history_content)
        target_x = self.get_full_representation(target_id, target_content)

        # --- Step 2: Transformer on history ---
        seq_len = history_x.size(1)
        table_len = self.pos_embedding.size(1)
        if seq_len == 0 or seq_len > table_len:
            raise ValueError(
                f"history length {seq_len} must be between 1 and {table_len}"
            )
        # Use the LAST seq_len positional rows so the most recent item always gets
        # the same position vector regardless of history length. With the full
        # length this is exactly the whole table.
        x = history_x + self.pos_embedding[:, table_len - seq_len:, :]

        # NOTE: no padding mask is applied, so padded positions still take part in
        # attention (their representation is the content adapter's bias).
        transformer_out = self.transformer(x)

        # The last position (the most recent item) serves as the user interest.
        user_interest = transformer_out[:, -1, :]

        # --- Step 3: static features ---
        like_vec = self.likes_emb(likes)
        view_vec = self.views_emb(views)

        # --- Step 4: concatenate & predict ---
        combined = torch.cat([user_interest, target_x, like_vec, view_vec], dim=1)

        logits = self.mlp(combined)
        return self.sigmoid(logits)

In [None]:
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score

BATCH_SIZE = 1024
LEARNING_RATE = 0.001
EPOCHS = 20


# CTRDataset reads the 'item_seq' / 'item_id' columns, so point them at the
# remapped (matrix-index) versions. Note this overwrites the raw IDs in place.
train_df['item_seq'] = train_df['mapped_item_seq']
train_df['item_id'] = train_df['mapped_item_id']
valid_df['item_seq'] = valid_df['mapped_item_seq']
valid_df['item_id'] = valid_df['mapped_item_id']

train_dataset = CTRDataset(train_df, embedding_tensor)
valid_dataset = CTRDataset(valid_df, embedding_tensor)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

# 2. Initialize Model
model = HybridCTRTransformer(num_items=vocab_size, pretrained_dim=128).to(device)
criterion = torch.nn.BCELoss() # Binary Cross Entropy on the model's sigmoid output
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)


def _batch_to_model_inputs(batch, device):
    """Move one collated batch's model inputs to `device`, in forward() argument order."""
    return (
        batch['history_ids'].to(device),   # IDs for the embedding layer
        batch['history_emb'].to(device),   # content vectors for the linear adapter
        batch['target_id'].to(device),
        batch['target_emb'].to(device),
        batch['likes'].to(device),
        batch['views'].to(device),
    )


print(f"Starting Training on {device}...")

# 3. Training Loop
for epoch in range(EPOCHS):
    model.train() # Set to training mode
    total_loss = 0

    for i, batch in enumerate(train_loader):
        inputs = _batch_to_model_inputs(batch, device)
        # (batch,) -> (batch, 1) to match the model's output shape
        label = batch['label'].to(device).unsqueeze(1)

        # Forward Pass
        optimizer.zero_grad()
        prediction = model(*inputs)

        # Calculate Loss
        loss = criterion(prediction, label)

        # Backward Pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Print every 50 batches so you know it's alive
        if i % 50 == 0:
            print(f"Epoch {epoch+1} | Batch {i} | Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} Complete! Average Training Loss: {avg_loss:.4f}")


    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad(): # Disable gradient calculation (saves memory/time)
        for batch in valid_loader:
            preds = model(*_batch_to_model_inputs(batch, device))

            # reshape(-1) always gives a 1-D tensor. A bare squeeze() would turn a
            # final batch of one row into a 0-d tensor, whose tolist() is a float
            # that extend() cannot consume.
            all_preds.extend(preds.reshape(-1).tolist())
            all_labels.extend(batch['label'].reshape(-1).tolist())

    # Calculate AUC over the whole validation set
    try:
        auc = roc_auc_score(all_labels, all_preds)
        print(f"Validation AUC: {auc:.4f}")
    except ValueError as err:
        # Raised e.g. when the validation labels contain only one class.
        print(f"Error calculating validation AUC: {err}")

print("Training Finished!")

Starting Training on cuda...
Epoch 1 | Batch 0 | Loss: 0.7006
Epoch 1 | Batch 50 | Loss: 0.5486
Epoch 1 | Batch 100 | Loss: 0.5631
Epoch 1 | Batch 150 | Loss: 0.5230
Epoch 1 | Batch 200 | Loss: 0.5656
Epoch 1 | Batch 250 | Loss: 0.5706
Epoch 1 | Batch 300 | Loss: 0.5328
Epoch 1 | Batch 350 | Loss: 0.5289
Epoch 1 | Batch 400 | Loss: 0.4918
Epoch 1 | Batch 450 | Loss: 0.5245
Epoch 1 | Batch 500 | Loss: 0.5052
Epoch 1 | Batch 550 | Loss: 0.4909
Epoch 1 | Batch 600 | Loss: 0.4648
Epoch 1 | Batch 650 | Loss: 0.4636
Epoch 1 | Batch 700 | Loss: 0.4692
Epoch 1 | Batch 750 | Loss: 0.4821
Epoch 1 | Batch 800 | Loss: 0.4016
Epoch 1 | Batch 850 | Loss: 0.4201
Epoch 1 | Batch 900 | Loss: 0.4328
Epoch 1 | Batch 950 | Loss: 0.4457
Epoch 1 | Batch 1000 | Loss: 0.4173
Epoch 1 | Batch 1050 | Loss: 0.4018
Epoch 1 | Batch 1100 | Loss: 0.4080
Epoch 1 | Batch 1150 | Loss: 0.4111
Epoch 1 | Batch 1200 | Loss: 0.4386
Epoch 1 | Batch 1250 | Loss: 0.4215
Epoch 1 | Batch 1300 | Loss: 0.4036
Epoch 1 | Batch 1350 |

In [None]:
# Persist only the learned parameters (the state_dict); the path is relative to the working directory.
torch.save(model.state_dict(), "hybrid_ctr_model.pth")
print("Model weights saved successfully.")

In [None]:
# Rebuild the architecture and restore saved parameters into it.
# The model is placed on `device` and the checkpoint is mapped there as well, because
# the later cells move their inputs to `device`; leaving the model on the CPU would
# fail with a device mismatch whenever `device` is CUDA.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model = HybridCTRTransformer(num_items=vocab_size, pretrained_dim=128).to(device)
model.load_state_dict(torch.load("/content/model_parameters(1).pt", map_location=device))

<All keys matched successfully>

In [None]:
# Save the model weights
# NOTE(review): this is the same path the load cell reads from, so running it
# overwrites that checkpoint with whatever weights `model` currently holds.
torch.save(model.state_dict(), "/content/model_parameters(1).pt")

NameError: name 'torch' is not defined

In [None]:
import pandas as pd

def generate_all_item_embeddings(model, pretrained_embedding_tensor, device):
    """Compute the model's fused representation for every row of the content matrix.

    The model is switched to eval mode, then ``model.get_full_representation`` is
    called once with the indices ``0 .. N-1`` and the matching content rows, where
    N is the number of rows in ``pretrained_embedding_tensor``.

    Returns:
        numpy.ndarray: one output row per input row, copied back to the CPU.
    """
    model.eval()

    # Content vectors and their row indices, both placed on the target device.
    row_count = pretrained_embedding_tensor.shape[0]
    content_on_device = pretrained_embedding_tensor.to(device)
    index_ids = torch.arange(row_count, dtype=torch.long, device=device)

    # Pure extraction, so no autograd graph is needed.
    with torch.no_grad():
        fused = model.get_full_representation(index_ids, content_on_device)

    return fused.cpu().numpy()

# --- EXECUTION ---

print("Generating final learned embeddings...")
# 'embedding_tensor' is the pretrained item-vector matrix built from item_emb_df (row 0 = padding)
final_item_vectors = generate_all_item_embeddings(model, embedding_tensor, device)

print(f"Generated vectors shape: {final_item_vectors.shape}")
# Expected shape: (vocab_size, model_dim), i.e. (45859, 128) for the 45859-row matrix built above

In [None]:
# Inspect the learned vector at index 1, the first real item (index 0 is the padding row).
final_item_vectors[1]

In [None]:
# 1. Load the Test Parquet
# NOTE(review): this path differs from `test_file` ("/content/Data/test.parquet"), which
# was used when test_df was first loaded — confirm which file is intended.
test_df = pd.read_parquet("/content/test.parquet")


# 2. Add 'mapped_item_id' / 'mapped_item_seq' using the same raw-ID -> index mapping as train/valid.
print("Remapping Test Data...")
test_df = remap_dataframe(test_df, item_id_map)


Remapping Test Data...


In [None]:
class TestDataset(Dataset):
    """Inference-time dataset: like the training samples but with the row's 'ID' and no label.

    Reads the remapped columns 'mapped_item_seq' / 'mapped_item_id' along with
    'ID', 'likes_level' and 'views_level' from ``data_df``.
    """

    def __init__(self, data_df, item_embedding_matrix, max_len=50):
        """
        Args:
            data_df (pd.DataFrame): Frame providing the columns listed above.
            item_embedding_matrix (torch.Tensor): Row N is the vector for item
                index N; index 0 is used for padding positions.
            max_len (int): Length every history sequence is padded/truncated to.
        """
        self.data_df = data_df
        self.item_embedding_matrix = item_embedding_matrix
        self.max_len = max_len

    def __len__(self):
        return len(self.data_df)

    def _pad_or_truncate(self, seq_ids):
        """Return a plain list of exactly ``max_len`` ints (when ``max_len`` > 0).

        The sequence is copied into a Python list first: with a numpy array,
        ``list + ndarray`` would be a broadcasting add rather than a concatenation.
        """
        if seq_ids is None:
            # A missing history is treated as an empty one (all padding).
            seq_ids = []
        recent = [int(item) for item in seq_ids][-self.max_len:]
        return [0] * (self.max_len - len(recent)) + recent

    def __getitem__(self, idx):
        row = self.data_df.iloc[idx]

        # 1. Competition ID, passed through untouched so it can be written to the CSV.
        comp_id = row['ID']

        # 2. History: left-pad with 0 / keep the most recent max_len items, then look up.
        hist_ids = torch.tensor(self._pad_or_truncate(row['mapped_item_seq']), dtype=torch.long)
        hist_emb = self.item_embedding_matrix[hist_ids]

        # 3. Target: a 0-d index tensor selects a single embedding row.
        target_id = torch.tensor(row['mapped_item_id'], dtype=torch.long)
        target_emb = self.item_embedding_matrix[target_id]

        # 4. Static features
        likes = torch.tensor(row['likes_level'], dtype=torch.long)
        views = torch.tensor(row['views_level'], dtype=torch.long)

        return {
            'comp_id': comp_id,
            'history_ids': hist_ids,
            'history_emb': hist_emb.float(),
            'target_id': target_id,
            'target_emb': target_emb.float(),
            'likes': likes,
            'views': views
        }

In [None]:
# Create Loader
test_dataset = TestDataset(test_df, embedding_tensor)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

# The inputs below are moved to `device`, so make sure the weights live there too
# (a model restored onto the CPU would otherwise fail with a device mismatch).
model.to(device)
model.eval() # Important! Disables Dropout
predictions = []
ids = []

print("Starting Inference...")

with torch.no_grad():
    for batch in test_loader:
        # Move inputs to the model's device
        h_ids = batch['history_ids'].to(device)
        h_emb = batch['history_emb'].to(device)
        t_id  = batch['target_id'].to(device)
        t_emb = batch['target_emb'].to(device)
        likes = batch['likes'].to(device)
        views = batch['views'].to(device)

        # Predict
        # Output shape is (Batch, 1); reshape(-1) flattens it to (Batch,).
        # A bare squeeze() would collapse a final batch of one row to a 0-d tensor,
        # whose tolist() is a float that extend() cannot consume.
        preds = model(h_ids, h_emb, t_id, t_emb, likes, views).reshape(-1)

        # Store results
        predictions.extend(preds.cpu().tolist())
        # Numeric IDs are collated into a tensor; other types (e.g. strings) arrive as a list.
        comp_ids = batch['comp_id']
        ids.extend(comp_ids.tolist() if torch.is_tensor(comp_ids) else list(comp_ids))

print(f"Inference complete. Generated {len(predictions)} predictions.")

Starting Inference...
Inference complete. Generated 379142 predictions.


In [None]:
# Assemble the submission table: the predictions go in 'Task2', while 'Task1'
# and 'Task1&2' are constant-zero placeholder columns.
submission_df = pd.DataFrame({'ID': ids, 'Task2': predictions})
submission_df.insert(1, 'Task1', 0)      # column order: ID, Task1, Task2, ...
submission_df['Task1&2'] = 0             # ... Task1&2 last

# Order rows by ID, then write without the pandas index column.
submission_df = submission_df.sort_values('ID')
submission_df.to_csv("prediction.csv", index=False)

print("prediction.csv created! Good luck with the submission!")

prediction.csv created! Good luck with the submission!
