In [14]:
import os
import yaml
import pandas as pd

raw_path = "../data/raw_yaml/"
player_rows = []
for file in os.listdir(raw_path):
    if not file.endswith(".yaml"):
        continue

    with open(os.path.join(raw_path, file), "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    match_id = file.replace(".yaml", "")
    teams = data["info"]["teams"]

    innings = data["innings"]

    for inning in innings:
        inn_name = list(inning.keys())[0]
        inn_data = inning[inn_name]

        batting_team = inn_data["team"]
        bowling_team = teams[1] if batting_team == teams[0] else teams[0]

        for delivery in inn_data["deliveries"]:
            ball, d = list(delivery.items())[0]

            batter = d.get("batsman")
            bowler = d["bowler"]

            runs_info = d["runs"]
            bat_runs = runs_info["batsman"]
            total_runs = runs_info["total"]
            extras = d.get("extras", {})

            # Detect wides for legal ball calculation
            is_wide = "wides" in extras

            bat_ball = 0 if is_wide else 1
            bowl_ball = 0 if is_wide else 1

            # --------------------------
            # 1️⃣ Batting event
            # --------------------------
            player_rows.append({
                "match_id": match_id,
                "player": batter,
                "team": batting_team,
                "role": "batting",
                "runs": bat_runs,
                "balls": bat_ball,
                "fours": 1 if bat_runs == 4 else 0,
                "sixes": 1 if bat_runs == 6 else 0,
                "dismissed": 0,
                "runs_conceded": None,
                "balls_bowled": None,
                "wicket": None
            })

            # --------------------------
            # 2️⃣ Dismissal event (supports both wicket & wickets formats)
            # --------------------------
            # Case A: wicket (your format)
            if "wicket" in d:
                w = d["wicket"]
                out_player = w.get("player_out")

                player_rows.append({
                    "match_id": match_id,
                    "player": out_player,
                    "team": batting_team,
                    "role": "dismissal",
                    "runs": 0,
                    "balls": 0,
                    "fours": 0,
                    "sixes": 0,
                    "dismissed": 1,
                    "runs_conceded": None,
                    "balls_bowled": None,
                    "wicket": None
                })

            # Case B: wickets list
            elif "wickets" in d:
                for w in d["wickets"]:
                    out_player = w.get("player_out")

                    player_rows.append({
                        "match_id": match_id,
                        "player": out_player,
                        "team": batting_team,
                        "role": "dismissal",
                        "runs": 0,
                        "balls": 0,
                        "fours": 0,
                        "sixes": 0,
                        "dismissed": 1,
                        "runs_conceded": None,
                        "balls_bowled": None,
                        "wicket": None
                    })

            # --------------------------
            # 3️⃣ Bowling event
            # --------------------------
            player_rows.append({
                "match_id": match_id,
                "player": bowler,
                "team": bowling_team,
                "role": "bowling",
                "runs_conceded": total_runs,
                "balls_bowled": bowl_ball,
                "wicket": 1 if ("wicket" in d or "wickets" in d) else 0,
                "runs": None,
                "balls": None,
                "fours": None,
                "sixes": None,
                "dismissed": None
            })

df_players = pd.DataFrame(player_rows)
df_players.head(20)



Unnamed: 0,match_id,player,team,role,runs,balls,fours,sixes,dismissed,runs_conceded,balls_bowled,wicket
0,1082591,DA Warner,Sunrisers Hyderabad,batting,0.0,1.0,0.0,0.0,0.0,,,
1,1082591,TS Mills,Royal Challengers Bangalore,bowling,,,,,,0.0,1.0,0.0
2,1082591,DA Warner,Sunrisers Hyderabad,batting,0.0,1.0,0.0,0.0,0.0,,,
3,1082591,TS Mills,Royal Challengers Bangalore,bowling,,,,,,0.0,1.0,0.0
4,1082591,DA Warner,Sunrisers Hyderabad,batting,4.0,1.0,1.0,0.0,0.0,,,
5,1082591,TS Mills,Royal Challengers Bangalore,bowling,,,,,,4.0,1.0,0.0
6,1082591,DA Warner,Sunrisers Hyderabad,batting,0.0,1.0,0.0,0.0,0.0,,,
7,1082591,TS Mills,Royal Challengers Bangalore,bowling,,,,,,0.0,1.0,0.0
8,1082591,DA Warner,Sunrisers Hyderabad,batting,0.0,0.0,0.0,0.0,0.0,,,
9,1082591,TS Mills,Royal Challengers Bangalore,bowling,,,,,,2.0,0.0,0.0


In [15]:
batting_df = df_players[df_players.role == "batting"]
dismiss_df = df_players[df_players.role == "dismissal"]

bat_sum = batting_df.groupby(
    ["match_id", "player", "team"], as_index=False
).agg({
    "runs": "sum",
    "balls": "sum",
    "fours": "sum",
    "sixes": "sum"
})

out_sum = dismiss_df.groupby(
    ["match_id", "player", "team"], as_index=False
).agg({"dismissed": "sum"})

bat_sum = bat_sum.merge(out_sum, on=["match_id", "player", "team"], how="left")
bat_sum["dismissed"].fillna(0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  bat_sum["dismissed"].fillna(0, inplace=True)


In [16]:
bowling_df = df_players[df_players.role == "bowling"]

bowl_sum = bowling_df.groupby(
    ["match_id", "player", "team"], as_index=False
).agg({
    "runs_conceded": "sum",
    "balls_bowled": "sum",
    "wicket": "sum"
})

bowl_sum["overs"] = (
    bowl_sum["balls_bowled"] // 6 +
    (bowl_sum["balls_bowled"] % 6) / 10
)


In [17]:
player_match_summary = pd.merge(
    bat_sum, bowl_sum,
    on=["match_id", "player", "team"],
    how="outer"
)

player_match_summary.fillna(0, inplace=True)
player_match_summary.head(20)


Unnamed: 0,match_id,player,team,runs,balls,fours,sixes,dismissed,runs_conceded,balls_bowled,wicket,overs
0,1082591,A Choudhary,Royal Challengers Bangalore,6.0,2.0,0.0,1.0,0.0,55.0,25.0,1.0,4.1
1,1082591,A Nehra,Sunrisers Hyderabad,0.0,0.0,0.0,0.0,0.0,42.0,24.0,2.0,4.0
2,1082591,B Kumar,Sunrisers Hyderabad,0.0,0.0,0.0,0.0,0.0,28.0,24.0,2.0,4.0
3,1082591,BCJ Cutting,Sunrisers Hyderabad,16.0,6.0,0.0,2.0,0.0,35.0,22.0,1.0,3.4
4,1082591,Bipul Sharma,Sunrisers Hyderabad,0.0,0.0,0.0,0.0,0.0,4.0,6.0,1.0,1.0
5,1082591,CH Gayle,Royal Challengers Bangalore,32.0,21.0,2.0,3.0,1.0,0.0,0.0,0.0,0.0
6,1082591,DA Warner,Sunrisers Hyderabad,14.0,8.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0
7,1082591,DJ Hooda,Sunrisers Hyderabad,16.0,12.0,0.0,1.0,0.0,7.0,6.0,1.0,1.0
8,1082591,KM Jadhav,Royal Challengers Bangalore,31.0,16.0,4.0,1.0,1.0,0.0,0.0,0.0,0.0
9,1082591,MC Henriques,Sunrisers Hyderabad,52.0,37.0,3.0,2.0,1.0,20.0,13.0,1.0,2.1


In [18]:
player_match_summary.to_csv("../data/processed/player_match_summary.csv", index=False)
print("Saved player_match_summary.csv successfully!")


Saved player_match_summary.csv successfully!
