In [3]:
import os
import yaml
import pandas as pd
import re

raw_path = "../data/raw_yaml/"

matches = []

def extract_season_from_id(match_id):
    """
    Many Cricsheet YAMLs lack season info (None in your output).
    IPL match_id usually starts with year like '1082591' → 2017.
    This function maps the prefix to season year.
    """
    # IPL Match IDs start with year-like pattern
    # We infer season from first 4 digits (approx but works for IPL)
    try:
        year = int(str(match_id)[:4])
        if 2007 < year < 2030:
            return year
        return None
    except:
        return None

for file in os.listdir(raw_path):
    if file.endswith(".yaml"):
        with open(os.path.join(raw_path, file), 'r', encoding="utf-8") as f:
            data = yaml.safe_load(f)

        info = data.get('info', {})
        outcome = info.get("outcome", {})

        # Winner
        winner = outcome.get("winner")

        # Result type & margin
        result_type = None
        result_margin = None

        if "by" in outcome:
            by_info = outcome["by"]
            if "runs" in by_info:
                result_type = "runs"
                result_margin = by_info["runs"]
            elif "wickets" in by_info:
                result_type = "wickets"
                result_margin = by_info["wickets"]
        else:
            # tie, no result, super over, abandoned
            result_type = list(outcome.keys())[0] if outcome else None

        # Season
        season = info.get("season")
        if season is None:
            season = extract_season_from_id(file.replace(".yaml", ""))

        match = {
            "match_id": file.replace(".yaml", ""),
            "season": season,
            "city": info.get("city"),
            "venue": info.get("venue"),
            "team1": info.get("teams", [None, None])[0],
            "team2": info.get("teams", [None, None])[1],
            "toss_winner": info.get("toss", {}).get("winner"),
            "toss_decision": info.get("toss", {}).get("decision"),
            "winner": winner,
            "result_type": result_type,
            "result_margin": result_margin,
            "player_of_match": info.get("player_of_match", [None])[0],
        }

        matches.append(match)

df_matches = pd.DataFrame(matches)
df_matches.head()


Unnamed: 0,match_id,season,city,venue,team1,team2,toss_winner,toss_decision,winner,result_type,result_margin,player_of_match
0,1082591,,Hyderabad,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,runs,35.0,Yuvraj Singh
1,1082592,,Pune,Maharashtra Cricket Association Stadium,Rising Pune Supergiant,Mumbai Indians,Rising Pune Supergiant,field,Rising Pune Supergiant,wickets,7.0,SPD Smith
2,1082593,,Rajkot,Saurashtra Cricket Association Stadium,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,Kolkata Knight Riders,wickets,10.0,CA Lynn
3,1082594,,Indore,Holkar Cricket Stadium,Kings XI Punjab,Rising Pune Supergiant,Kings XI Punjab,field,Kings XI Punjab,wickets,6.0,GJ Maxwell
4,1082595,,Bengaluru,M.Chinnaswamy Stadium,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,Royal Challengers Bangalore,runs,15.0,KM Jadhav


In [2]:
df_matches

Unnamed: 0,match_id,season,city,venue,team1,team2,toss_winner,toss_decision,winner,result_type
0,1082591,,Hyderabad,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,by
1,1082592,,Pune,Maharashtra Cricket Association Stadium,Rising Pune Supergiant,Mumbai Indians,Rising Pune Supergiant,field,Rising Pune Supergiant,by
2,1082593,,Rajkot,Saurashtra Cricket Association Stadium,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,Kolkata Knight Riders,winner
3,1082594,,Indore,Holkar Cricket Stadium,Kings XI Punjab,Rising Pune Supergiant,Kings XI Punjab,field,Kings XI Punjab,by
4,1082595,,Bengaluru,M.Chinnaswamy Stadium,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,Royal Challengers Bangalore,winner
...,...,...,...,...,...,...,...,...,...,...
1164,981011,,Raipur,Shaheed Veer Narayan Singh International Stadium,Delhi Daredevils,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Royal Challengers Bangalore,by
1165,981013,,Bangalore,M Chinnaswamy Stadium,Gujarat Lions,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Royal Challengers Bangalore,by
1166,981015,,Delhi,Feroz Shah Kotla,Sunrisers Hyderabad,Kolkata Knight Riders,Kolkata Knight Riders,field,Sunrisers Hyderabad,by
1167,981017,,Delhi,Feroz Shah Kotla,Gujarat Lions,Sunrisers Hyderabad,Sunrisers Hyderabad,field,Sunrisers Hyderabad,by


In [4]:
df_matches.to_csv("../data/processed/matches_clean.csv", index=False)
