In [24]:
import os
import math
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch.nn as nn
from typing import List
from torch import Tensor
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

In [None]:
from dataclasses import dataclass, field

@dataclass
class Config:
    """Hyper-parameters and input dimensions shared by the dataset and model cells.

    Every attribute is type-annotated so that ``@dataclass`` registers it as a
    field; un-annotated attributes are plain class variables that cannot be
    overridden through ``Config(...)`` and do not show up in ``repr``/``fields``.
    """
    PLAYERS_SIZE: int = 0
    CONTEXT_LEN: int = 16  # number of consecutive matches used as context for one sample
    PERFORMANCE_EMBD_DIM: int = 128

    PLAYER_INPUT_DIM: int = 25   # width of the universal player feature vector
    MATCH_INPUT_EMBD: int = 14   # width of the match_info feature vector
    NUM_EPOCHS: int = 100
    LEARNING_RATE: float = 1e-3
    BATCH_SIZE: int = 32
    # Resolved per instance so the check runs when the config is created.
    DEVICE: str = field(default_factory=lambda: "cuda" if torch.cuda.is_available() else "cpu")
    # MODEL_SAVE_PATH: str = "/kaggle/working"
    # BASE_DIR: str = '/kaggle/input/flickr8k/'
    # CROSSATT_NUM_HEADS: int = 8
    CLS_INIT_STD: float = 0.02    # std of the zero-mean init used for the <CLS> token
    TEST_DATASET_SIZE: int = 180
    IDLE_DEVICE: str = 'cpu'
    # The two fields below used to be un-annotated class attributes. They are kept at
    # the end so the positional order of the pre-existing fields is unchanged.
    ACCUMULATION_STEPS: int = 4
    PERFORMANCE_INPUT_DIM: int = 23  # width of the per-match performance feature vector


In [35]:
# Single shared configuration instance; the model cells below read it when their
# classes are defined (as default argument values), so this cell must run first.
config = Config()

## Definition of **Losses**

## Data <br>
 Data Preprocessing

In [27]:
#TODO
class PlayerMatchDataset(Dataset):
    """
    Dataset for autoregressive next-match fantasy-point prediction.

    One item corresponds to one player. For that player a contiguous window of
    (context_len + 1) matches is drawn at random from the player's match file:

      * the first ``context_len`` rows provide the input performance features
        (``match_id``, ``teamname`` and the fantasy-point columns are dropped);
      * rows 2 ... context_len+1 provide the targets
        (``batting_fp``, ``bowling_fp``, ``fielding_fp``, ``total_fp``).

    For every match in the window the opposing team is read from
    ``{match_players_dir}/{match_id}.csv`` (columns ``player_id`` and ``Team``), the
    opponents' universal features are looked up, and the match's row of the
    match-info CSV is attached.

    Returned sample (dict of tensors unless noted):
      - 'player_id'       : the player's id (as stored in the CSV index).
      - 'univ_features'   : (n_univ_features,)
      - 'context_matches' : (context_len, n_performance_features)
      - 'target_scores'   : (context_len, 4)
      - 'team2_players'   : (context_len + 1, max_team_players, n_univ_features),
                            zero-padded / truncated to a fixed number of rows.
      - 'match_info'      : (context_len + 1, n_match_info_columns); an all-zero row
                            is used when a match_id is absent from the match-info CSV.

    Same-team ("team1") features are intentionally not produced.
    """

    # Columns of a player-match file that are never part of the model input.
    EXCLUDE_COLS = ['teamname', 'match_id',
                    'strike_rate_fp', 'batting_fp', 'bowling_fp', 'fielding_fp', 'total_fp']
    # Columns that form the prediction target, in output order.
    TARGET_COLS = ['batting_fp', 'bowling_fp', 'fielding_fp', 'total_fp']

    def __init__(self, universal_player_csv, player_matches_dir, match_players_dir, match_info_csv, context_len=25, transform=None, max_team_players=11):
        """
        Args:
            universal_player_csv (str): Path to the universal player CSV (must contain 'player_id').
            player_matches_dir (str): Directory with one '{player_id}.csv' per player; rows are
                assumed to be already sorted by date.
            match_players_dir (str): Directory with one '{match_id}.csv' per match.
            match_info_csv (str): Path to the CSV holding one row per match (must contain 'match_id').
            context_len (int): Number of context matches used as input.
            transform (callable, optional): Applied to the finished sample dict.
            max_team_players (int): Fixed number of opponent rows per match in 'team2_players'.
                A fixed size is required so that samples can be stacked into a batch.
        """
        self.context_len = context_len
        self.transform = transform
        self.max_team_players = max_team_players

        # Universal player features, indexed by player_id; 'cricinfo_id' is an identifier, not a feature.
        self.univ_df = pd.read_csv(universal_player_csv)
        self.univ_df = self.univ_df.set_index('player_id')
        self.univ_features = self.univ_df.drop(columns=['cricinfo_id'], errors='ignore')

        # Keep only players that have a match file with enough rows for one full window.
        self.player_matches_dir = player_matches_dir
        self.player_ids = []
        self.player_match_data = {}  # player_id -> DataFrame of that player's matches
        for player_id in self.univ_features.index:
            match_file = os.path.join(player_matches_dir, f"{player_id}.csv")
            if not os.path.exists(match_file):
                continue
            df_matches = pd.read_csv(match_file)
            if len(df_matches) >= (self.context_len + 1):
                self.player_ids.append(player_id)
                self.player_match_data[player_id] = df_matches

        self.match_players_dir = match_players_dir

        # Match info, indexed by match_id for direct lookup. The categorical team / toss /
        # winner columns are dropped for now.
        # TODO: derive a "player's team won" flag from them instead of discarding them.
        self.match_info_df = pd.read_csv(match_info_csv)
        self.match_info_df = self.match_info_df.set_index('match_id')
        self.match_info_df = self.match_info_df.drop(
            columns=['team1', 'team2', 'toss_winner', "toss_decision", "winner"], errors='ignore')

    def __len__(self):
        """Number of players that have at least one full window of matches."""
        return len(self.player_ids)

    def _team2_features(self, match_id, player_id):
        """Return the opposing team's universal features as a (max_team_players, n_feat) float32 array."""
        match_players_file = os.path.join(self.match_players_dir, f"{match_id}.csv")
        match_players_df = pd.read_csv(match_players_file)

        own_team = match_players_df.loc[match_players_df['player_id'] == player_id, 'Team']
        if not own_team.empty:
            player_team = own_team.iloc[0]
        else:
            # Player is not listed in this match file: fall back to the first listed team.
            player_team = match_players_df.iloc[0]['Team']

        team2_ids = match_players_df.loc[match_players_df['Team'] != player_team, 'player_id'].tolist()
        # Opponents without (complete) universal features are dropped.
        features = self.univ_features.reindex(team2_ids).dropna().values.astype(float)

        # Pad with zero rows / truncate to a fixed row count so every sample has the same shape.
        padded = np.zeros((self.max_team_players, self.univ_features.shape[1]), dtype=np.float32)
        n_rows = min(features.shape[0], self.max_team_players)
        padded[:n_rows] = features[:n_rows]
        return padded

    def __getitem__(self, idx):
        """
        Build one sample for the player at position ``idx``.

        The match window is drawn with ``np.random.randint``, so repeated calls with the
        same ``idx`` generally return different windows.

        Returns:
            dict with keys 'player_id', 'univ_features', 'context_matches',
            'target_scores', 'team2_players' and 'match_info' (see class docstring),
            passed through ``self.transform`` when one was given.
        """
        # TODO: weight the sub-scores (batting / bowling / fielding) separately from total_fp in the loss.
        player_id = self.player_ids[idx]
        player_univ = self.univ_features.loc[player_id].values.astype(float)

        df_matches = self.player_match_data[player_id]
        total_matches = len(df_matches)
        # start_idx is the 1-based position of the window's first row; the upper bound is exclusive.
        start_idx = np.random.randint(1, total_matches - self.context_len + 1)
        window_df = df_matches.iloc[start_idx - 1 : start_idx + self.context_len]

        # Targets are shifted by one match relative to the inputs.
        target_scores = window_df.iloc[1:self.context_len + 1][self.TARGET_COLS].values.astype(float)
        context_matches = window_df.iloc[:self.context_len].drop(columns=self.EXCLUDE_COLS, errors='ignore')

        team2_players_list = []
        match_info_list = []
        # Iterate the column directly: iterrows() may upcast an integer match_id to float,
        # which would turn the file name into e.g. "123.0.csv".
        for match_id in window_df['match_id'].tolist():
            team2_players_list.append(self._team2_features(match_id, player_id))

            if match_id in self.match_info_df.index:
                match_info_list.append(self.match_info_df.loc[match_id].to_dict())
            else:
                # Unknown match: represented by an all-zero row below.
                match_info_list.append({})

        univ_features = torch.tensor(player_univ, dtype=torch.float32)  # (n_univ_features,)
        context_matches = torch.tensor(context_matches.values.astype(float),
                                       dtype=torch.float32)  # (context_len, n_performance_features)
        target_scores = torch.tensor(target_scores, dtype=torch.float32)  # (context_len, 4)

        # (context_len + 1, max_team_players, n_univ_features)
        team2_players = torch.tensor(np.stack(team2_players_list), dtype=torch.float32)

        # Match info in a fixed column order: (context_len + 1, n_match_info_columns)
        keys = list(self.match_info_df.columns)
        info_arr = np.stack([[d.get(k, 0.0) for k in keys] for d in match_info_list])
        match_info = torch.tensor(info_arr, dtype=torch.float32)

        sample = {
            'player_id': player_id,
            'univ_features': univ_features,
            'context_matches': context_matches,
            'target_scores': target_scores,
            'team2_players': team2_players,
            'match_info': match_info
        }

        if self.transform:
            sample = self.transform(sample)
        return sample


In [28]:
import os
from torch.utils.data import DataLoader

# Root folder of the project data. Set the IPL_FANTASY_DATA_ROOT environment variable to
# run the notebook on another machine; the default reproduces the original local layout.
DATA_ROOT = os.environ.get(
    'IPL_FANTASY_DATA_ROOT',
    r'C:\Users\kumar\IPL_Fantasy_Score_Prediction\Ashu',
)
TEST_DATA_DIR = os.path.join(DATA_ROOT, 'Test_1')

universal_player_csv = os.path.join(DATA_ROOT, 'cleaned_universal_player.csv')
# Directory with one '{player_id}.csv' per player (that player's matches).
player_matches_dir = os.path.join(TEST_DATA_DIR, 'Cleaned_Global_player_csvs')
# Directory with one '{match_id}.csv' per match (the players of that match).
match_players_dir = os.path.join(TEST_DATA_DIR, 'processed_GlobalMatchrecords')
# Single CSV holding the match info of all matches.
match_info_csv = os.path.join(TEST_DATA_DIR, 'cleaned_matchinfo_without_venue_with_updated_match_number.csv')

# Build the dataset; the context length comes from the shared config.
dataset = PlayerMatchDataset(
    universal_player_csv=universal_player_csv,
    player_matches_dir=player_matches_dir,
    match_players_dir=match_players_dir,
    match_info_csv=match_info_csv,
    context_len=config.CONTEXT_LEN
)

# num_workers=0 keeps loading in the main process.
dataloader = DataLoader(dataset, batch_size=config.BATCH_SIZE, shuffle=True, num_workers=0)



In [29]:

# # Iterate through one batch to check the output shapes and values
# Sanity check: pull a single batch and print the shape of every tensor in it.
shape_labels = (
    ("Universal features shape:", 'univ_features'),                        # (batch_size, n_univ_features)
    ("Context matches shape:", 'context_matches'),                         # (batch_size, context_len, n_performance_features)
    ("Target scores shape:", 'target_scores'),                             # (batch_size, context_len, 4)
    ("Number of matches in team2_players (per sample):", 'team2_players'),  # (batch_size, context_len + 1, players, n_univ_features)
    ("Number of matches in match_info (per sample):", 'match_info'),       # (batch_size, context_len + 1, n_match_info_columns)
)
for batch in dataloader:
    # Player ids are collated into a plain list of length batch_size.
    print("Player IDs:", batch['player_id'])
    for label, key in shape_labels:
        print(label, batch[key].shape)
    break  # one batch is enough for the check

Player IDs: ['95fcbfb7', 'b61a3e1a', 'f05fe9b1', '5d77a96a', '3d8feaf8', '69e23303', '8dd02a98', 'ad286dcc', '955e8e86', '791c088d', '3560a786', 'fb0d68b4', '5451a2c1', 'f8dcfe4e', '36a0a88c', 'e635a5dd', 'a9d788e3', '1bdbf53b', '85ebf2be', '3c6ffae8', '6ceb94d0', '07687c15', 'ce2b42ae', 'b93a5d0b', '33ab1f1c', 'bdadf7da', '0d677597', '903560ed', 'f5180fe6', '74bace71', '1b04e02b', 'd8bee9a1']
Universal features shape: torch.Size([32, 25])
Context matches shape: torch.Size([32, 16, 23])
Target scores shape: torch.Size([32, 16, 4])
Number of matches in team2_players (per sample): torch.Size([32, 17, 11, 25])
Number of matches in match_info (per sample): torch.Size([32, 17, 14])


In [30]:
# Number of batches the DataLoader yields per pass over the dataset.
len(dataloader)

74

In [31]:
def list_of_tensors_to_3d(tensor_list):
    """
    Stack equally-shaped tensors and prepend a singleton batch axis.

    Args:
        tensor_list (list of torch.Tensor): Tensors that all share one shape.

    Returns:
        torch.Tensor: Tensor of shape (1, len(tensor_list), *inner_shape). The result
        is 3-D only when the inner tensors are 1-D.
    """
    # Indexing with None inserts the new leading axis in front of the stacked axis.
    return torch.stack(tensor_list, dim=0)[None]


def list_of_dicts_to_tensor(data, key_order=None):
    """
    Turn a list of dictionaries into a tensor of shape (1, len(data), number_of_keys).

    Args:
        data (list): Dictionaries that all contain the keys being extracted; values are
                     numbers or single-element tensors.
        key_order (list, optional): Keys to extract, in output order. Defaults to the
                                    key order of the first dictionary.

    Returns:
        torch.Tensor: One row per dictionary and one column per key, with a leading
        singleton axis.

    Raises:
        ValueError: If ``data`` is empty.
    """
    if not data:
        raise ValueError("The input data list is empty.")

    keys = list(data[0].keys()) if key_order is None else key_order

    rows = []
    for entry in data:
        row = []
        for key in keys:
            cell = entry[key]
            # Single-element tensors are unwrapped to Python scalars.
            row.append(cell.item() if isinstance(cell, torch.Tensor) else cell)
        rows.append(row)

    # Build the (rows, keys) tensor, then add the leading axis.
    return torch.tensor(rows).unsqueeze(0)

In [32]:
import os
# Show the directory against which relative paths in this notebook are resolved.
print("Current working directory:", os.getcwd())


Current working directory: c:\Users\kumar\IPL_Fantasy_Score_Prediction\Ashu


## MODEL **Architecture**

In [33]:
class PlayerEmbedding(nn.Module):
    """Linear projection of raw player features into the performance-embedding space."""

    def __init__(self, in_channels=config.PLAYER_INPUT_DIM, out_channels=config.PERFORMANCE_EMBD_DIM):
        """
        Args:
            in_channels: Width of the raw player feature vector.
            out_channels: Width of the produced embedding.
        """
        super().__init__()
        self.proj = nn.Linear(in_features=in_channels, out_features=out_channels)

    def forward(self, x):
        """Project ``x`` of shape (..., in_channels) to (..., out_channels)."""
        # The linear layer acts on the last axis only, so any leading axes pass through.
        projected = self.proj(x)
        return projected

In [None]:
#! handle cross attention (also should it be applied using team2 and current player)
class MatchEmbedding(nn.Module):
    """
    Builds one embedding per match from both teams' players and the match info.

    Each team is summarised by embedding every player with the shared player
    embedding module and summing over the players; the two team summaries are
    concatenated with the match info and projected linearly.

    Expected input shapes (T matches per sample):
      team1_players: (B, T, num_team1, PLAYER_INPUT_DIM)
      team2_players: (B, T, num_team2, PLAYER_INPUT_DIM)
      match_info:    (B, T, match_info_dim)
    Output:
      (B, T, out_channels)

    TODO: replace the sum pooling with cross attention between the current player and team2.
    """
    def __init__(self, player_embedding_module, in_channels=(2*config.PERFORMANCE_EMBD_DIM + config.MATCH_INPUT_EMBD), out_channels=config.PERFORMANCE_EMBD_DIM):
        super().__init__()
        self.player = player_embedding_module
        self.proj = nn.Linear(in_channels, out_channels)

    def _pooled_team(self, team_players):
        """Embed every player of a team and sum over the player axis -> (B, T, embedding_dim)."""
        batch, steps, n_players, _ = team_players.shape
        per_match = team_players.reshape(batch * steps, n_players, -1)   # (B*T, n_players, PLAYER_INPUT_DIM)
        embedded = self.player(per_match)                                # (B*T, n_players, embedding_dim)
        # NOTE(review): zero-padded player rows are embedded too, so each one adds the
        # projection's bias to the sum — confirm whether padding should be masked out.
        pooled = embedded.sum(dim=1)                                     # (B*T, embedding_dim)
        return pooled.reshape(batch, steps, -1)

    def forward(self, team1_players, team2_players, match_info):
        """
        Args:
            team1_players: (B, T, num_team1, PLAYER_INPUT_DIM) raw player features.
            team2_players: (B, T, num_team2, PLAYER_INPUT_DIM) raw player features.
            match_info: (B, T, match_info_dim) per-match features.

        Returns:
            Tensor of shape (B, T, out_channels).
        """
        team_summaries = [self._pooled_team(team1_players), self._pooled_team(team2_players)]
        # Last axis becomes [team1 summary | team2 summary | match info].
        fused = torch.cat(team_summaries + [match_info], dim=-1)
        return self.proj(fused)



In [None]:
class PerformanceEmbedding(nn.Module):
    """
    Fuses three views of a player's match history into one embedding per match:

      1. a static player embedding (the player's universal features passed through
         ``player_embedding_module``), repeated for every match;
      2. a per-match performance embedding (linear projection of the raw performance
         features);
      3. a per-match match embedding from ``match_embedding_module``.

    The three are concatenated along the feature axis and projected to ``out_channels``.

    Expected input shapes:
      player_input:             (B, PLAYER_INPUT_DIM) — it is flattened to (B, -1) before
                                being embedded, so it carries no time axis.
      player_performance_input: (B, T, PERFORMANCE_INPUT_DIM)
      team1_players:            (B, T, num_team1, PLAYER_INPUT_DIM)
      team2_players:            (B, T, num_team2, PLAYER_INPUT_DIM)
      match_info:               (B, T, match_info_dim)

    Output:
      (B, T, out_channels)
    """
    def __init__(self, player_embedding_module, match_embedding_module, out_channels=config.PERFORMANCE_EMBD_DIM):
        """
        Args:
            player_embedding_module: Module mapping (B, PLAYER_INPUT_DIM) to a player embedding.
            match_embedding_module: Module called as ``(team1_players, team2_players, match_info)``.
            out_channels: Width of the performance projection and of the final embedding.
        """
        super().__init__()
        self.player_embedding_module = player_embedding_module
        self.match_embedding_module = match_embedding_module
        self.performance_proj = nn.Linear(config.PERFORMANCE_INPUT_DIM, out_channels)
        # Input width = player embedding + match embedding + performance embedding.
        # The first two are assumed to be PERFORMANCE_EMBD_DIM wide (the default of the
        # embedding modules); the performance embedding is ``out_channels`` wide. Using
        # 3 * PERFORMANCE_EMBD_DIM here would break for a non-default ``out_channels``.
        self.proj = nn.Linear(2 * config.PERFORMANCE_EMBD_DIM + out_channels, out_channels)

    def forward(self, player_input, player_performance_input, team1_players, team2_players, match_info):
        """
        Args:
            player_input: (B, PLAYER_INPUT_DIM) static features of the player.
            player_performance_input: (B, T, PERFORMANCE_INPUT_DIM) per-match performance features.
            team1_players: (B, T, num_team1, PLAYER_INPUT_DIM) raw features of team1 players per match.
            team2_players: (B, T, num_team2, PLAYER_INPUT_DIM) raw features of team2 players per match.
            match_info: (B, T, match_info_dim) per-match information.

        Returns:
            Tensor of shape (B, T, out_channels).
        """
        B, T, _ = player_performance_input.shape

        # The player's static embedding is identical for every match, so it is broadcast
        # along the time axis (expand creates a view; the concat below materialises it).
        player_emb = self.player_embedding_module(player_input.reshape(B, -1))  # (B, embedding_dim)
        player_emb = player_emb.unsqueeze(1).expand(-1, T, -1)                   # (B, T, embedding_dim)

        # What the player actually did in each match.
        player_performance_emb = self.performance_proj(player_performance_input.reshape(B * T, -1))
        player_performance_emb = player_performance_emb.reshape(B, T, -1)        # (B, T, out_channels)

        # Context of each match (teams + match info).
        match_emb = self.match_embedding_module(team1_players, team2_players, match_info)  # (B, T, embedding_dim)

        combined = torch.cat([player_emb, player_performance_emb, match_emb], dim=-1)
        performance_emb = self.proj(combined)  # (B, T, out_channels)
        # TODO: consider a wider hidden dimension for the fused representation.
        return performance_emb
