# The BASELINE model

## Imports

In [None]:
import os
import math
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
from typing import List
from torch import Tensor
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader

## Config

In [None]:
from dataclasses import dataclass, field

@dataclass
class Config:
    """Hyperparameters and runtime settings for the baseline model.

    Every setting is an annotated dataclass field, so each one can be
    overridden per instance, e.g. ``Config(BATCH_SIZE=32)``.
    """

    # --- data / embedding sizes ---
    PLAYERS_SIZE: int = 0
    CONTEXT_LEN: int = 20  # predict the 21st match from the previous 20
    PERFORMANCE_EMBD_DIM: int = 128
    PLAYER_INPUT_DIM: int = 25
    MATCH_INPUT_EMBD: int = 25

    # --- optimisation ---
    NUM_EPOCHS: int = 100
    LEARNING_RATE: float = 1e-3
    BATCH_SIZE: int = 16
    # Resolved when the instance is created: "cuda" if available, otherwise "cpu".
    DEVICE: str = field(default_factory=lambda: "cuda" if torch.cuda.is_available() else "cpu")
    # MODEL_SAVE_PATH: str = "/kaggle/working"
    # BASE_DIR: str = '/kaggle/input/flickr8k/'
    # CROSSATT_NUM_HEADS: int = 8
    CLS_INIT_STD: float = 0.02  # <CLS> token initialised from N(mean=0, std=0.02)
    TEST_DATASET_SIZE: int = 180
    IDLE_DEVICE: str = 'cpu'
    # Annotated so that it is a real dataclass field like the others
    # (presumably the number of gradient-accumulation steps -- TODO confirm).
    ACCUMULATION_STEPS: int = 4

    # TODO: the two properties below were only sketched. A bare ``@property``
    # with no function under it is a SyntaxError, so the decorators stay
    # commented out together with their stubs until they are implemented.
    # @property
    # def transform(self):

    # @property
    # def reverse_transform(self):

In [None]:
# Module-level settings instance. The model classes below read their default
# layer sizes from it in their ``__init__`` signatures, so it must exist before
# those classes are defined.
config = Config()

## Definition of **Losses**

## Data

In [None]:
## INPUT
# Universal Player embedding
# Match Situation Embedding
# Form embedding --> attention to be performed on

#### Data Processing (Normalization)

#### Data Consolidation

In [None]:
## Concatenation of the Input Embedding

#### Generated Training Sequences

#### Create the DataLoader

## Model Architecture

In [None]:
class PlayerEmbedding(nn.Module):
    """Linear projection of raw player features to the performance-embedding width.

    Args:
        in_channels: width of the raw per-player feature vector.
        out_channels: width of the projected embedding.
    """

    def __init__(self, in_channels=config.PLAYER_INPUT_DIM, out_channels=config.PERFORMANCE_EMBD_DIM):
        super().__init__()
        self.proj = nn.Linear(in_features=in_channels, out_features=out_channels)

    def forward(self, x):
        """Project ``x`` along its last dimension.

        x: (B, PLAYER_INPUT_DIM) or flattened (B*T, PLAYER_INPUT_DIM).
        Returns a tensor with the same leading dimensions and a last
        dimension of ``out_channels``.
        """
        projected = self.proj(x)
        return projected

In [None]:
class MatchEmbedding(nn.Module):
    """
    Computes a match-level embedding from team and match information.

    Each team is summarised by the sum of its players' embeddings. The two team
    summaries are concatenated with ``match_info`` and linearly projected.

    Args:
      player_embedding_module: module applied to the raw player features; its output
        width is taken to be config.PERFORMANCE_EMBD_DIM.
      in_channels: width of ``match_info`` (match_info_dim).
      out_channels: width of the returned match embedding.

    Expected input shapes (for T matches):
      team1_players: (B, T, num_team1, PLAYER_INPUT_DIM)
      team2_players: (B, T, num_team2, PLAYER_INPUT_DIM)
      match_info: (B, T, match_info_dim)
    Output:
      (B, T, PERFORMANCE_EMBD_DIM)
    """
    def __init__(self, player_embedding_module, in_channels=config.MATCH_INPUT_EMBD, out_channels=config.PERFORMANCE_EMBD_DIM):
        super().__init__()
        self.player = player_embedding_module
        # forward() feeds the projection [team1_sum ; team2_sum ; match_info], so its
        # input width is two pooled team embeddings plus the match-info features.
        # Sizing it with `in_channels` alone made every forward pass fail with a
        # shape mismatch.
        fused_dim = 2 * config.PERFORMANCE_EMBD_DIM + in_channels
        self.proj = nn.Linear(fused_dim, out_channels)

    def _pool_team(self, team_players):
        """Embed every player of one team and sum over the player axis -> (B, T, emb)."""
        B, T, num_players, _ = team_players.shape
        # reshape (not view): callers pass time-sliced tensors such as x[:, 1:],
        # which are non-contiguous and make .view() raise.
        flat = team_players.reshape(B * T, num_players, -1)  # (B*T, num_players, PLAYER_INPUT_DIM)
        embeds = self.player(flat)                           # (B*T, num_players, emb)
        return embeds.sum(dim=1).reshape(B, T, -1)           # (B, T, emb)

    def forward(self, team1_players, team2_players, match_info):
        """
          1. Get player embeddings using self.player.
          2. Sum embeddings for each team.
          3. Concatenate team representations with match_info.
          4. Project the concatenated vector to obtain the final match embedding.
        """
        team1_sum = self._pool_team(team1_players)
        team2_sum = self._pool_team(team2_players)

        # Concatenate team summaries with match-level info along the last dimension.
        fused = torch.cat([team1_sum, team2_sum, match_info], dim=-1)  # (B, T, 2*emb + match_info_dim)
        match_embedding = self.proj(fused)  # (B, T, out_channels)

        return match_embedding



In [None]:
class PerformanceEmbedding(nn.Module):
    """
    Fuses a player's per-match embedding with the embedding of the match it was played in.

    Per time step the module
      1. embeds the player's raw features,
      2. embeds the match (both teams + match info) with the match embedding module,
      3. concatenates the two vectors (width 2 * PERFORMANCE_EMBD_DIM),
      4. projects the result to ``out_channels``.

    Expected input shapes:
      player_input: (B, T, PLAYER_INPUT_DIM)
      team1_players: (B, T, num_team1, PLAYER_INPUT_DIM)
      team2_players: (B, T, num_team2, PLAYER_INPUT_DIM)
      match_info: (B, T, match_info_dim)

    Output:
      (B, T, PERFORMANCE_EMBD_DIM)
    """
    def __init__(self, player_embedding_module, match_embedding_module, out_channels=config.PERFORMANCE_EMBD_DIM):
        super().__init__()
        self.player_embedding_module = player_embedding_module
        self.match_embedding_module = match_embedding_module
        # Input is [player_emb ; match_emb], each PERFORMANCE_EMBD_DIM wide.
        concat_width = 2 * config.PERFORMANCE_EMBD_DIM
        self.proj = nn.Linear(concat_width, out_channels)

    def forward(self, player_input, team1_players, team2_players, match_info):
        """
        player_input: (B, T, PLAYER_INPUT_DIM) - raw features of one player across T matches.
        team1_players: (B, T, num_team1, PLAYER_INPUT_DIM) - raw features of team1 players per match.
        team2_players: (B, T, num_team2, PLAYER_INPUT_DIM) - raw features of team2 players per match.
        match_info: (B, T, match_info_dim) - extra match information per match.
        Returns the fused embeddings, (B, T, out_channels).
        """
        batch, steps, _ = player_input.shape

        # Flatten (B, T) so the player module sees one row per (sample, match), then restore.
        flat_players = player_input.view(batch * steps, -1)            # (B*T, PLAYER_INPUT_DIM)
        player_emb = self.player_embedding_module(flat_players)        # (B*T, emb)
        player_emb = player_emb.view(batch, steps, -1)                 # (B, T, emb)

        # The match module consumes the time dimension directly.
        match_emb = self.match_embedding_module(team1_players, team2_players, match_info)  # (B, T, emb)

        # Concatenate on the feature axis and project back down.
        return self.proj(torch.cat([player_emb, match_emb], dim=-1))   # (B, T, out_channels)


In [None]:
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        n_heads: number of attention heads; must divide ``d_embed``.
        d_embed: embedding width of the input and of the output.
        in_proj_bias: whether the fused Q/K/V projection has a bias.
        out_proj_bias: whether the output projection has a bias.

    Raises:
        ValueError: if ``d_embed`` is not divisible by ``n_heads``.
    """

    def __init__(self, n_heads, d_embed, in_proj_bias=True, out_proj_bias=True):
        super().__init__()
        # A non-divisible width could never be split across heads in forward();
        # report it at construction time instead of as a view() failure later.
        if d_embed % n_heads != 0:
            raise ValueError(
                f"d_embed ({d_embed}) must be divisible by n_heads ({n_heads})"
            )
        # This combines the Wq, Wk and Wv matrices into one matrix
        self.in_proj = nn.Linear(d_embed, 3 * d_embed, bias=in_proj_bias)
        # This one represents the Wo matrix
        self.out_proj = nn.Linear(d_embed, d_embed, bias=out_proj_bias)
        self.n_heads = n_heads
        self.d_head = d_embed // n_heads  # each head gets an equal slice of the embedding

    def forward(self, x, causal_mask=False):
        """Attend over the sequence axis of ``x``.

        Args:
            x: (Batch_Size, Seq_Len, Dim).
            causal_mask: if True, position i only attends to positions <= i.

        Returns:
            Tensor of shape (Batch_Size, Seq_Len, Dim).
        """
        input_shape = x.shape
        batch_size, sequence_length, _ = input_shape

        # (Batch_Size, Seq_Len, H, Dim / H)
        qkv_shape = (batch_size, sequence_length, self.n_heads, self.d_head)

        # (Batch_Size, Seq_Len, Dim) -> (Batch_Size, Seq_Len, Dim * 3) -> 3 x (Batch_Size, Seq_Len, Dim)
        q, k, v = self.in_proj(x).chunk(3, dim=-1)

        # (Batch_Size, Seq_Len, Dim) -> (Batch_Size, H, Seq_Len, Dim / H)
        q = q.view(qkv_shape).transpose(1, 2)
        k = k.view(qkv_shape).transpose(1, 2)
        v = v.view(qkv_shape).transpose(1, 2)

        # (Batch_Size, H, Seq_Len, Dim / H) @ (Batch_Size, H, Dim / H, Seq_Len) -> (Batch_Size, H, Seq_Len, Seq_Len)
        weight = q @ k.transpose(-1, -2)

        if causal_mask:
            # True strictly above the diagonal = "future" positions. A single
            # (Seq_Len, Seq_Len) mask broadcasts over batch and heads.
            mask = torch.ones(
                sequence_length, sequence_length, dtype=torch.bool, device=weight.device
            ).triu(1)
            weight = weight.masked_fill(mask, float("-inf"))

        # Scale by sqrt(d_k); masked entries stay -inf.
        weight = weight / math.sqrt(self.d_head)

        # torch.softmax is used because `F` (torch.nn.functional) is not imported in this file.
        weight = torch.softmax(weight, dim=-1)

        # (Batch_Size, H, Seq_Len, Seq_Len) @ (Batch_Size, H, Seq_Len, Dim / H) -> (Batch_Size, H, Seq_Len, Dim / H)
        output = weight @ v

        # (Batch_Size, H, Seq_Len, Dim / H) -> (Batch_Size, Seq_Len, H, Dim / H) -> (Batch_Size, Seq_Len, Dim)
        output = output.transpose(1, 2).reshape(input_shape)

        # (Batch_Size, Seq_Len, Dim) -> (Batch_Size, Seq_Len, Dim)
        return self.out_proj(output)


In [None]:
class a_layer(nn.Module):
    """One pre-norm Transformer block: causal self-attention, then a 4x-wide feed-forward.

    Both sub-layers are wrapped in a residual connection.
    """

    def __init__(self, n_head: int, n_embd: int):
        super().__init__()
        # Attention sub-layer: norm -> self attention
        self.layernorm_1 = nn.LayerNorm(n_embd)
        self.attention = SelfAttention(n_head, n_embd)
        # Feed-forward sub-layer: norm -> expand to 4 * n_embd -> contract
        self.layernorm_2 = nn.LayerNorm(n_embd)
        self.linear_1 = nn.Linear(n_embd, 4 * n_embd)
        self.linear_2 = nn.Linear(4 * n_embd, n_embd)

    def forward(self, x):
        """Map (Batch_Size, Seq_Len, Dim) -> (Batch_Size, Seq_Len, Dim)."""
        # --- self-attention sub-layer ---
        attended = self.attention(self.layernorm_1(x), causal_mask=True)
        attended += x  # residual connection around attention

        # --- feed-forward sub-layer ---
        hidden = self.linear_1(self.layernorm_2(attended))  # (Batch_Size, Seq_Len, 4 * Dim)
        hidden = hidden * torch.sigmoid(1.702 * hidden)     # QuickGELU activation
        out = self.linear_2(hidden)                         # (Batch_Size, Seq_Len, Dim)
        out += attended  # residual connection around the feed-forward

        return out

In [None]:
class NextFormPredictor(nn.Module):
    """Predict the next form embedding from previous ones with a causal Transformer decoder.

    A sequence of T performance embeddings (one per match) gets learned positional
    embeddings added and is passed through a stack of causally-masked Transformer
    blocks. When ``target`` is given, the output at step t is combined with the
    player and match embeddings of step t+1 to predict the fantasy score of match
    t+1, and a loss is computed against ``target``.

    Args:
      player_embedding_module: module embedding raw player features.
      match_embedding_module: either a ready match-embedding ``nn.Module`` instance, or
        a class/factory that is called as ``factory(player_embedding_module)`` to build one.
      fantasy_score_prediction_module: called as ``module(pred_perf, player_emb, match_emb)``
        with three (B, T-1, embedding_dim) tensors; expected to output (B, T-1, 1).
      custom_loss: callable ``custom_loss(pred, target)`` returning the loss.
      embedding_dim: width of the performance embeddings.

    Expected input shapes:
      player_input: (B, T, PLAYER_INPUT_DIM)
      team1_players: (B, T, num_team1, PLAYER_INPUT_DIM)
      team2_players: (B, T, num_team2, PLAYER_INPUT_DIM)
      match_info: (B, T, match_info_dim)
      target: (B, T-1) fantasy scores for matches 2..T (optional)

    Output:
      (perf_emb, loss) where perf_emb is (B, T, embedding_dim) and loss is None
      when no target is given.
    """

    def __init__(self, player_embedding_module, match_embedding_module, fantasy_score_prediction_module, custom_loss, embedding_dim=config.PERFORMANCE_EMBD_DIM):
        super().__init__()
        # The constructor used to call `match_embedding_module(player_embedding_module)`
        # (factory style) while forward() called the stored object with tensors
        # (instance style), so one of the two always failed. Normalise to a single
        # instance shared by the token embedding and the loss branch.
        if not isinstance(match_embedding_module, nn.Module):
            match_embedding_module = match_embedding_module(player_embedding_module)

        self.player_embedding_module = player_embedding_module
        self.match_embedding_module = match_embedding_module
        # cls_token = nn.Parameter(torch.normal(mean=0.0, std=config.CLS_INIT_STD, size=(1, 1, embedding_dim)))
        self.token_embedding = PerformanceEmbedding(
            player_embedding_module,
            match_embedding_module
        )
        # One spare position is kept for the (currently disabled) <CLS> token.
        self.pos_embedding = nn.Embedding(config.CONTEXT_LEN+1, embedding_dim)

        self.layers = nn.ModuleList([a_layer(n_head=8, n_embd=embedding_dim) for _ in range(6)])
        self.layernorm = nn.LayerNorm(embedding_dim)
        self.out_proj = nn.Linear(embedding_dim, embedding_dim)
        self.fantasy_score_prediction_module = fantasy_score_prediction_module
        self.custom_loss = custom_loss

    def forward(self, player_input, team1_players, team2_players, match_info, target=None):
        """Run the decoder and, if ``target`` is given, compute the fantasy-score loss.

        All inputs are assumed to share the leading (batch, seq_len) dimensions.
        Returns ``(perf_emb, loss)``; ``loss`` is None when ``target`` is None.
        """
        B, T, _ = player_input.shape     ## B=batch_size, T=Context_len

        ## (B, T, ...) => (B, T, embd)
        x = self.token_embedding(player_input, team1_players, team2_players, match_info)

        # cls_tokens = self.cls_token.expand(B, -1, -1)  ## (batch, 1, embedding_dim)
        # x = torch.cat((cls_tokens, x), dim=1)  ## (batch, T+1, embedding_dim)

        # Build the position ids on the same device as the inputs: config.DEVICE
        # may differ from where this batch/model actually lives.
        positions = torch.arange(T, device=player_input.device)
        x = x + self.pos_embedding(positions)  ## (batch, T, embedding_dim)

        for layer in self.layers:
            x = layer(x)

        x = self.layernorm(x)
        perf_emb = self.out_proj(x)  # (B, T, embedding_dim)
        # cls_val = x[:, 0, :]  ## (batch, embedding_dim) return the <CLS>

        loss = None
        if target is not None:
            # The performance embedding at time t predicts the fantasy score of
            # match t+1, so predictions drop the last step and targets drop the first.
            pred_perf = perf_emb[:, :-1, :]  # (B, T-1, embedding_dim) predicted "next form" embeddings.

            # Ground-truth player embedding for match t+1.
            target_player_embd = self.player_embedding_module(player_input[:, 1:, :])  # (B, T-1, embedding_dim)

            # Ground-truth match embedding for match t+1. Time slices are
            # non-contiguous; .contiguous() keeps modules that flatten with
            # .view() working.
            target_match_embd = self.match_embedding_module(
                team1_players[:, 1:, :, :].contiguous(),
                team2_players[:, 1:, :, :].contiguous(),
                match_info[:, 1:, :].contiguous()
            )  # (B, T-1, embedding_dim)

            # Predict fantasy scores from the predicted next form combined with the
            # target player and match embeddings; expected output is (B, T-1, 1).
            pred_fantasy = self.fantasy_score_prediction_module(pred_perf, target_player_embd, target_match_embd)
            # Loss between predicted fantasy scores (B, T-1) and target.
            loss = self.custom_loss(pred_fantasy.squeeze(-1), target)
        return perf_emb, loss


NameError: name 'nn' is not defined

In [None]:
# class NextFormPredictor(nn.Module):
#     """
#     Predicts a sequence of performance embeddings autoregressively.
#     Then, for loss calculation, for each time step t (from 0 to T-2), it uses the predicted
#     performance embedding at time t to predict the fantasy score for match t+1.
#     The prediction is performed by combining:
#       - The predicted performance embedding from time t (acting as the 'next form'),
#       - The ground-truth player embedding for time t+1, and
#       - The ground-truth match embedding for time t+1.
#     These three are passed through an MLP (fantasy_score_prediction_module) to yield the predicted fantasy score.
#     The loss is computed (e.g., via MSE) between these predictions and the provided target fantasy scores.

#     Expected input shapes:
#       player_input: (B, T, PLAYER_INPUT_DIM)
#       team1_players: (B, T, num_team1, PLAYER_INPUT_DIM)
#       team2_players: (B, T, num_team2, PLAYER_INPUT_DIM)
#       match_info: (B, T, match_info_dim)
#       target: (B, T-1)  (fantasy scores for matches 2..T)
#     Output:
#       performance_embeddings: (B, T, PERFORMANCE_EMBD_DIM)
#       loss: computed fantasy score prediction loss
#     """
#     def __init__(self, player_embedding_module, match_embedding_module, fantasy_score_prediction_module, embedding_dim=config.PERFORMANCE_EMBD_DIM, num_layers=6, n_head=8):
#         super().__init__()
#         self.player_embedding_module = player_embedding_module
#         self.match_embedding_module = match_embedding_module
#         self.token_embedding = PerformanceEmbedding(player_embedding_module, match_embedding_module)
#         self.pos_embedding = nn.Embedding(config.CONTEXT_LEN, embedding_dim)
#         self.layers = nn.ModuleList([a_layer(n_head=n_head, n_embd=embedding_dim) for _ in range(num_layers)])
#         self.layernorm = nn.LayerNorm(embedding_dim)
#         self.out_proj = nn.Linear(embedding_dim, embedding_dim)
#         self.fantasy_score_prediction_module = fantasy_score_prediction_module

#     def forward(self, player_input, team1_players, team2_players, match_info, target=None):
#         """
#         Forward pass.
#           player_input: (B, T, PLAYER_INPUT_DIM)
#           team1_players: (B, T, num_team1, PLAYER_INPUT_DIM)
#           team2_players: (B, T, num_team2, PLAYER_INPUT_DIM)
#           match_info: (B, T, match_info_dim)
#           target: (B, T-1) fantasy scores for matches 2..T (if provided)
#         """
#         B, T, _ = player_input.shape
#         # Obtain a sequence of performance embeddings (B, T, embedding_dim)
#         x = self.token_embedding(player_input, team1_players, team2_players, match_info)
#         # Add positional embeddings: generate indices [0, 1, ..., T-1] and add
#         pos_ids = torch.arange(T, device=player_input.device).unsqueeze(0).expand(B, T)
#         x = x + self.pos_embedding(pos_ids)

#         # Process sequence through transformer layers.
#         for layer in self.layers:
#             x = layer(x)
#         x = self.layernorm(x)
#         # Optionally, project outputs if needed:
#         perf_emb = self.out_proj(x)  # (B, T, embedding_dim)

#         loss = None
#         if target is not None:
#             # For autoregressive fantasy score prediction, we use the performance embedding at time t to predict
#             # the fantasy score for match t+1. Therefore, we shift the sequence:
#             pred_perf = perf_emb[:, :-1, :]  # (B, T-1, embedding_dim) predicted "next form" embeddings.

#             # Get the ground truth player embedding for match t+1:
#             target_player_embd = self.player_embedding_module(player_input[:, 1:, :])  # (B, T-1, embedding_dim)

#             # Get the ground truth match embedding for match t+1:
#             target_match_embd = self.match_embedding_module(
#                 team1_players[:, 1:, :, :],
#                 team2_players[:, 1:, :, :],
#                 match_info[:, 1:, :]
#             )  # (B, T-1, embedding_dim)

#             # Predict fantasy scores from the predicted next form combined with the target player and match embeddings.
#             # The fantasy score prediction module expects three tensors of shape (B, T-1, embedding_dim) and outputs (B, T-1, 1)
#             pred_fantasy = self.fantasy_score_prediction_module(pred_perf, target_player_embd, target_match_embd)
#             # Compute loss (e.g., MSE loss) between predicted fantasy scores and target.
#             loss = F.mse_loss(pred_fantasy.squeeze(-1), target)
#         return perf_emb, loss


In [None]:
# experiment with the dropout layers
class FantasyScorePrediction(nn.Module):
    """MLP that predicts a fantasy score from three concatenated embeddings.

    The inputs are the predicted form embedding, the player embedding and the
    embedding of the next match, each ``embedding_dim`` wide.

    Args:
        embedding_dim: width of each of the three input embeddings.
    """
    def __init__(self, embedding_dim=config.PERFORMANCE_EMBD_DIM):
        super().__init__()
        # Size the first layer from `embedding_dim`; the parameter used to be
        # ignored in favour of the global config value.
        self.proj1 = nn.Linear(3 * embedding_dim, 2048)
        self.proj2 = nn.Linear(2048, 512)
        self.proj3 = nn.Linear(512, 512)
        self.proj4 = nn.Linear(512, 256)
        self.proj5 = nn.Linear(256, 256)
        self.proj6 = nn.Linear(256, 128)
        self.proj7 = nn.Linear(128, 128)
        self.proj8 = nn.Linear(128, 1)

    def forward(self, x, target_player_embd, target_match_embd):
        """Return the predicted score with shape ``(..., 1)``.

        Args:
            x: predicted form embedding, (..., embedding_dim).
            target_player_embd: player embedding, (..., embedding_dim).
            target_match_embd: match embedding, (..., embedding_dim).
        """
        # Concatenate the embeddings along the last dimension
        x = torch.cat([x, target_player_embd, target_match_embd], dim=-1)

        # `F` is not imported in this file; reach gelu through the imported `nn`.
        gelu = nn.functional.gelu
        hidden_layers = (
            self.proj1, self.proj2, self.proj3, self.proj4,
            self.proj5, self.proj6, self.proj7,
        )
        for layer in hidden_layers:
            x = gelu(layer(x))

        # Final layer is linear: the score is an unbounded regression output.
        return self.proj8(x)


In [None]:
def custom_loss(pred_fantasy, target):
    """Placeholder for the custom upper/lower-bound loss.

    TODO: implement the loss. Until then both arguments are ignored and the
    function returns None.
    """
    return None


## Training

## Evaluation

## Inference