In [2]:
import pandas as pd
import numpy as np

# Input locations, relative to the notebook (adjust if your structure differs)
matches_path = "../data/processed/matches_clean.csv"
player_match_path = "../data/processed/player_match_summary.csv"
career_stats_path = "../data/processed/player_career_stats.csv"
recent_form_path = "../data/processed/player_recent_form.csv"

# Three of the tables are read with match_id forced to str; the career table
# is read with pandas' default dtype inference.
matches, player_match, recent = (
    pd.read_csv(csv_path, dtype={"match_id": str})
    for csv_path in (matches_path, player_match_path, recent_form_path)
)
career = pd.read_csv(career_stats_path)

# Quick checks: print the row count of every loaded table
for label, frame in (
    ("Matches:", matches),
    ("Player match rows:", player_match),
    ("Career players:", career),
    ("Recent rows:", recent),
):
    print(label, len(frame))


Matches: 1169
Player match rows: 25018
Career players: 768
Recent rows: 25018


In [8]:
# Write the career table's player column to its own CSV. to_csv is called with
# its default options, so the DataFrame index is written next to each name.
career["player"].to_csv("../data/processed/player.csv")

In [2]:
# Keep the player-name key consistent across tables: trim surrounding whitespace
for frame in (career, recent, player_match):
    frame['player'] = frame['player'].str.strip()

# Small defensive fill: every missing value becomes 0 (done in place)
for frame in (career, recent):
    frame.fillna(0, inplace=True)

# Career lookups: player name -> strength value
career_by_player = career.set_index('player')
bat_strength = career_by_player['batting_strength'].to_dict()
bowl_strength = career_by_player['bowling_strength'].to_dict()
overall_strength = career_by_player['overall_strength'].to_dict()

# Recent-form rows already carry rolling values; keep the last row of each
# (player, match_id) group so every key maps to a single value.
recent_idx = (
    recent
    .sort_values(['player', 'match_id'])
    .groupby(['player', 'match_id'])
    .tail(1)
)

# Recent-form lookups: (player, match_id) -> metric value for that match
recent_by_key = recent_idx.set_index(['player', 'match_id'])
recent_runs_lookup = recent_by_key['recent_runs'].to_dict()
recent_wk_lookup = recent_by_key['recent_wickets'].to_dict()
recent_econ_lookup = recent_by_key['recent_economy'].to_dict()


In [4]:
import yaml
import glob

import os

# match_id (YAML file name without extension) -> {team name -> list of player names}
xi_data = {}

for file in glob.glob("../data/raw_yaml/*.yaml"):
    with open(file, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # Drop only the trailing extension; str.replace(".yaml", "") would also
    # remove any ".yaml" that happens to occur inside the file stem.
    match_id = os.path.splitext(os.path.basename(file))[0]

    # "players" may be absent or explicitly null in the YAML; both mean "no XI recorded"
    players_dict = data["info"].get("players") or {}

    # Player names are stripped so they match the whitespace-trimmed lookup keys
    xi_data[match_id] = {
        team: [p.strip() for p in plist]
        for team, plist in players_dict.items()
    }


def get_playing_xi(match_id, team):
    """
    Return the player list stored in xi_data for `team` in `match_id`.

    `match_id` is converted to str before the lookup. An empty list is
    returned when either the match or the team is not present.
    """
    teams_for_match = xi_data.get(str(match_id), {})
    return teams_for_match.get(team, [])



In [5]:
# Spot-check the first match: print it and the XI size found for each side
sample = matches.iloc[0]
print("Match:", sample['match_id'], sample['team1'], "vs", sample['team2'])
for label, team_col in (("Team1 XI size:", 'team1'), ("Team2 XI size:", 'team2')):
    print(label, len(get_playing_xi(sample['match_id'], sample[team_col])))


Match: 1082591 Sunrisers Hyderabad vs Royal Challengers Bangalore
Team1 XI size: 11
Team2 XI size: 11


In [6]:
def compute_team_strength(match_id, team_name):
    """
    Average the career and recent-form numbers over a team's playing XI.

    The XI is taken from get_playing_xi(match_id, team_name). Career values
    are looked up by stripped player name; recent-form values are looked up
    by (stripped player name, str(match_id)). A missing lookup counts as 0.

    Returns a dict with the keys batting_strength, bowling_strength,
    overall_strength, recent_runs, recent_wickets and recent_economy, each
    holding the mean over the XI. When no XI is found every value is 0.
    """
    xi_names = [player.strip() for player in get_playing_xi(match_id, team_name)]
    if not xi_names:
        return dict.fromkeys(
            (
                "batting_strength",
                "bowling_strength",
                "overall_strength",
                "recent_runs",
                "recent_wickets",
                "recent_economy",
            ),
            0,
        )

    # Career strengths are keyed by player name alone
    career_sources = (
        ("batting_strength", bat_strength),
        ("bowling_strength", bowl_strength),
        ("overall_strength", overall_strength),
    )
    # Recent-form metrics are keyed by (player, match id as text)
    recent_sources = (
        ("recent_runs", recent_runs_lookup),
        ("recent_wickets", recent_wk_lookup),
        ("recent_economy", recent_econ_lookup),
    )
    match_key = str(match_id)

    # Aggregate with a plain mean over the XI
    strength = {
        label: np.mean([table.get(name, 0) for name in xi_names])
        for label, table in career_sources
    }
    for label, table in recent_sources:
        strength[label] = np.mean([table.get((name, match_key), 0) for name in xi_names])
    return strength


In [7]:
# Per-team aggregates returned by compute_team_strength, in output column order
strength_keys = (
    "batting_strength",
    "bowling_strength",
    "overall_strength",
    "recent_runs",
    "recent_wickets",
    "recent_economy",
)

team_rows = []

# One record per match: identifying columns, then team1 features, then team2 features
for mid, t1, t2, venue in zip(
    matches['match_id'], matches['team1'], matches['team2'], matches['venue']
):
    record = {"match_id": mid, "team1": t1, "team2": t2, "venue": venue}
    for prefix, team in (("t1", t1), ("t2", t2)):
        strength = compute_team_strength(mid, team)
        # Columns are named t1_<key> / t2_<key>
        record.update({f"{prefix}_{key}": strength[key] for key in strength_keys})
    team_rows.append(record)


In [8]:
team_strength_path = "../data/processed/team_strength.csv"

# Assemble the per-match records into a table and write it without the index column
team_strength_df = pd.DataFrame(team_rows)
team_strength_df.to_csv(team_strength_path, index=False)
print("Saved:", team_strength_path)

# Preview the first rows as the cell output
team_strength_df.head()


Saved: ../data/processed/team_strength.csv


Unnamed: 0,match_id,team1,team2,venue,t1_batting_strength,t1_bowling_strength,t1_overall_strength,t1_recent_runs,t1_recent_wickets,t1_recent_economy,t2_batting_strength,t2_bowling_strength,t2_overall_strength,t2_recent_runs,t2_recent_wickets,t2_recent_economy
0,1082591,Sunrisers Hyderabad,Royal Challengers Bangalore,"Rajiv Gandhi International Stadium, Uppal",54.761571,26.476365,81.237936,14.854545,0.872727,5.21135,49.469909,18.216197,67.686106,12.0,0.709091,7.144913
1,1082592,Rising Pune Supergiant,Mumbai Indians,Maharashtra Cricket Association Stadium,48.697381,16.300614,64.997995,12.872727,0.6,4.474398,55.563482,26.550081,82.113564,13.872727,0.345455,5.605701
2,1082593,Gujarat Lions,Kolkata Knight Riders,Saurashtra Cricket Association Stadium,48.696867,13.927022,62.62389,12.927273,0.418182,5.912815,55.333629,32.499403,87.833032,8.454545,0.513636,4.730532
3,1082594,Kings XI Punjab,Rising Pune Supergiant,Holkar Cricket Stadium,51.822113,26.790007,78.61212,12.218182,0.554545,5.728582,49.284474,15.997136,65.28161,18.318182,0.554545,4.681617
4,1082595,Royal Challengers Bangalore,Delhi Daredevils,M.Chinnaswamy Stadium,50.909367,19.900393,70.809759,11.981818,0.745455,5.823667,51.456979,24.25894,75.715919,11.236364,0.509091,4.36794
