In [1]:
import pandas as pd
import os
import glob
import json

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
def transform_match_info(df_match_info):
    df_match_info.columns = [x[5:].replace(".", "_") for x in df_match_info.columns if "info." in x]
    desired_columns = [x for x in df_match_info.columns if "registry_people" not in x]
    df_match_info = df_match_info[desired_columns].copy()
    prefix = 'players_'
    player_columns = [col for col in df_match_info.columns if col.startswith(prefix)]
    df_match_info['concatenated_players'] = df_match_info[player_columns].apply(lambda x: '_'.join(x.dropna().astype(str)), axis=1)
    df_match_info = df_match_info.drop(columns=player_columns)
    return df_match_info

In [4]:
def get_people_registry(df_match_info):

    def extract_value(row):
        for value in row:
            if pd.notna(value):
                return value

    registry_column_list = list(set([x for x in df_match_info.columns if "info.registry.people" in x]))
    df_people_registry = df_match_info[registry_column_list].copy()
    df_people_registry = df_people_registry.T
    df_people_registry["Id"] = df_people_registry.apply(extract_value, axis=1)
    df_people_registry.reset_index(names=["Name"], inplace=True)
    df_people_registry = df_people_registry[["Id", "Name"]]
    df_people_registry["Name"] = df_people_registry["Name"].apply(lambda x: x.split(".")[-1].strip())
    return df_people_registry

In [5]:
data_dir_path = os.path.join(os.getcwd(), "odis_json")

In [6]:
print(data_dir_path)

/Users/vasav/Documents/Vasav/REPO/python_notebooks/cricket-etl-python/odis_json


In [7]:
json_files = glob.glob(data_dir_path + "/*.json")

In [8]:
dataset_list = []
for json_file in json_files[:3]:
    try:
        with open(json_file, 'r') as json_file:
            data = json.load(json_file)
            df = pd.json_normalize(data)
            dataset_list.append(df)
    except Exception as e:
        print(f"Error reading {json_file}: {e}")

In [9]:
df_combined = pd.concat(dataset_list)

In [10]:
info_column_list = [x for x in df_combined.columns if "info." in x]

In [11]:
df_match_info = df_combined[info_column_list].copy()

In [12]:
df_people_registry = get_people_registry(df_match_info)
df_people_registry

Unnamed: 0,Id,Name
0,4d3097d8,GJ Hopkins
1,f842c2cf,MJ Clarke
2,ea544967,N de Klerk
3,108c4c09,S Ismail
4,d0bb40b8,Poonam Yadav
...,...,...
70,495d42a5,R Ashwin
71,e219f17a,M du Preez
72,43dd4011,UV Gandhe
73,f0f628c7,MM Patel


In [13]:
df_match_info_transformed = transform_match_info(df_match_info)
df_match_info_transformed

Unnamed: 0,balls_per_over,city,dates,event_match_number,event_name,gender,match_type,match_type_number,officials_match_referees,officials_reserve_umpires,officials_tv_umpires,officials_umpires,outcome_by_runs,outcome_winner,overs,player_of_match,season,team_type,teams,toss_decision,toss_winner,venue,outcome_by_wickets,missing,concatenated_players
0,6,Guwahati,[2010-11-28],1,New Zealand in India ODI Series,male,ODI,3070,[RS Mahanama],[VA Kulkarni],[SS Hazare],"[NJ Llong, SK Tarapore]",40.0,India,50,[V Kohli],2010/11,international,"[India, New Zealand]",field,New Zealand,Nehru Stadium,,,"['M Vijay', 'G Gambhir', 'V Kohli', 'Yuvraj Si..."
0,6,Perth,[2009-02-01],1,Chappell-Hadlee Trophy,male,ODI,2811,[RS Mahanama],,[PR Reiffel],"[SA Bucknor, BNJ Oxenford]",,New Zealand,50,[KD Mills],2008/09,international,"[Australia, New Zealand]",bat,Australia,Western Australia Cricket Association Ground,2.0,,"['BB McCullum', 'MJ Guptill', 'PG Fulton', 'LR..."
0,6,Lucknow,[2021-03-09],2,South Africa Women tour of India,female,ODI,1191,[GS Lakshmi],[UV Gandhe],[CK Nandan],"[BK Ravi, K Srinivasan]",,India,50,[J Goswami],2020/21,international,"[South Africa, India]",field,India,Bharat Ratna Shri Atal Bihari Vajpayee Ekana C...,9.0,"[{'powerplays': {'1': ['batting'], '2': ['batt...","['JI Rodrigues', 'S Mandhana', 'PG Raut', 'H K..."


In [14]:
df_combined

Unnamed: 0,innings,meta.data_version,meta.created,meta.revision,info.balls_per_over,info.city,info.dates,info.event.match_number,info.event.name,info.gender,info.match_type,info.match_type_number,info.officials.match_referees,info.officials.reserve_umpires,info.officials.tv_umpires,info.officials.umpires,info.outcome.by.runs,info.outcome.winner,info.overs,info.player_of_match,info.players.India,info.players.New Zealand,info.registry.people.A Nehra,info.registry.people.AJ McKay,info.registry.people.DR Tuffey,info.registry.people.G Gambhir,info.registry.people.GD Elliott,info.registry.people.GJ Hopkins,info.registry.people.JM How,info.registry.people.KD Mills,info.registry.people.KS Williamson,info.registry.people.LRPL Taylor,info.registry.people.M Vijay,info.registry.people.MJ Guptill,info.registry.people.MM Patel,info.registry.people.NJ Llong,info.registry.people.NL McCullum,info.registry.people.R Ashwin,info.registry.people.RS Mahanama,info.registry.people.S Sreesanth,info.registry.people.SB Styris,info.registry.people.SK Raina,info.registry.people.SK Tarapore,info.registry.people.SS Hazare,info.registry.people.V Kohli,info.registry.people.VA Kulkarni,info.registry.people.WP Saha,info.registry.people.YK Pathan,info.registry.people.Yuvraj Singh,info.season,info.team_type,info.teams,info.toss.decision,info.toss.winner,info.venue,info.outcome.by.wickets,info.players.Australia,info.registry.people.BB McCullum,info.registry.people.BJ Haddin,info.registry.people.BNJ Oxenford,info.registry.people.DA Warner,info.registry.people.DJ Hussey,info.registry.people.DL Vettori,info.registry.people.IE O'Brien,info.registry.people.JR Hopes,info.registry.people.JS Patel,info.registry.people.MEK Hussey,info.registry.people.MG Johnson,info.registry.people.MJ Clarke,info.registry.people.NT Broom,info.registry.people.NW Bracken,info.registry.people.PG Fulton,info.registry.people.PR Reiffel,info.registry.people.RT Ponting,info.registry.people.SA Bucknor,info.registry.people.SE Marsh,info.registry.people.SW Tait,info.registry.people.TG Southee,info.missing,info.players.South Africa,info.registry.people.A Khaka,info.registry.people.BK Ravi,info.registry.people.CK Nandan,info.registry.people.DB Sharma,info.registry.people.GS Lakshmi,info.registry.people.H Kaur,info.registry.people.J Goswami,info.registry.people.JI Rodrigues,info.registry.people.K Srinivasan,info.registry.people.L Goodall,info.registry.people.L Lee,info.registry.people.L Wolvaardt,info.registry.people.M Joshi,info.registry.people.M Kapp,info.registry.people.M Raj,info.registry.people.M du Preez,info.registry.people.N Mlaba,info.registry.people.N de Klerk,info.registry.people.PG Raut,info.registry.people.Poonam Yadav,info.registry.people.RS Gayakwad,info.registry.people.S Ismail,info.registry.people.S Luus,info.registry.people.S Mandhana,info.registry.people.S Verma,info.registry.people.T Chetty,info.registry.people.UV Gandhe
0,"[{'team': 'India', 'overs': [{'over': 0, 'deli...",1.0.0,2010-11-29,2,6,Guwahati,[2010-11-28],1,New Zealand in India ODI Series,male,ODI,3070,[RS Mahanama],[VA Kulkarni],[SS Hazare],"[NJ Llong, SK Tarapore]",40.0,India,50,[V Kohli],"[M Vijay, G Gambhir, V Kohli, Yuvraj Singh, SK...","[MJ Guptill, JM How, KS Williamson, LRPL Taylo...",96fd40ae,de3acffe,43936951.0,bb345e0b,c03449e0,4d3097d8,fd8f11e9,7fb32e5b,d027ba9f,b61a3e1a,4b57e452,2be41edb,f0f628c7,573fb985,5673a3fc,495d42a5,57910393.0,6b8eb6e5,57efa3be,1dc12ab9,62a824b4,166e5081,ba607b88,8fe0c4f8,fe11caa6,3c6ffae8,1c914163,2010/11,international,"[India, New Zealand]",field,New Zealand,Nehru Stadium,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,"[{'team': 'Australia', 'overs': [{'over': 0, '...",1.0.0,2013-03-04,1,6,Perth,[2009-02-01],1,Chappell-Hadlee Trophy,male,ODI,2811,[RS Mahanama],,[PR Reiffel],"[SA Bucknor, BNJ Oxenford]",,New Zealand,50,[KD Mills],,"[BB McCullum, MJ Guptill, PG Fulton, LRPL Tayl...",,,,,c03449e0,,,7fb32e5b,,b61a3e1a,,2be41edb,,,,,57910393.0,,,,,,,,,,,2008/09,international,"[Australia, New Zealand]",bat,Australia,Western Australia Cricket Association Ground,2.0,"[SE Marsh, DA Warner, RT Ponting, MJ Clarke, D...",b8a55852,b0c772ee,c1add349,dcce6f09,fd835ab3,d7c6af50,78f34e15,2.498e+166,f5821615,48fd7349,f5180fe6,f842c2cf,a354c917,4c61a0f9,10a2a1ff,c9354f29,7d415ea5,2ad22b51,508a1ea7,addfb70e,13c35c9e,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,"[{'team': 'South Africa', 'overs': [{'over': 0...",1.0.0,2021-03-10,1,6,Lucknow,[2021-03-09],2,South Africa Women tour of India,female,ODI,1191,[GS Lakshmi],[UV Gandhe],[CK Nandan],"[BK Ravi, K Srinivasan]",,India,50,[J Goswami],"[JI Rodrigues, S Mandhana, PG Raut, H Kaur, S ...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,2020/21,international,"[South Africa, India]",field,India,Bharat Ratna Shri Atal Bihari Vajpayee Ekana C...,9.0,,,,,,,,,,,,,,,,,,,,,,,"[{'powerplays': {'1': ['batting'], '2': ['batt...","[L Lee, L Wolvaardt, S Luus, L Goodall, M du P...",0d518f89,ff8c3cfb,012de9c4,201fef33,ba8d5615,53cd8da6,84f1ae7f,cb08b611,042a8b69,0bb32e98,7b01ce83,e60f81c9,47fb8d80,63e3b6b3,ec2269dc,e219f17a,686362af,ea544967,4958ad5f,d0bb40b8,237ec05b,108c4c09,6d76bc23,5d2eda89,185e6c0c,53a5767d,43dd4011


In [15]:
%%markdown
## Performing ball by ball analysis

## Performing ball by ball analysis


In [16]:
df_combined = df_combined[["info.match_type", "info.match_type_number", "innings"]]

In [17]:
df_combined

Unnamed: 0,info.match_type,info.match_type_number,innings
0,ODI,3070,"[{'team': 'India', 'overs': [{'over': 0, 'deli..."
0,ODI,2811,"[{'team': 'Australia', 'overs': [{'over': 0, '..."
0,ODI,1191,"[{'team': 'South Africa', 'overs': [{'over': 0..."


In [18]:
df_combined = df_combined.explode('innings', ignore_index=True)

In [19]:
df_combined["team"] = df_combined["innings"].apply(lambda x: x.get("team"))
df_combined["overs_list"] = df_combined["innings"].apply(lambda x: x.get("overs"))
df_combined["power_play"] = df_combined["innings"].apply(lambda x: x.get("powerplays"))
df_combined = df_combined.explode('overs_list', ignore_index=True)
df_combined["over"] = df_combined["overs_list"].apply(lambda x: x.get("over"))
df_combined["over"] = df_combined["over"].apply(lambda x: x + 1)
df_combined["deliveries_list"] = df_combined["overs_list"].apply(lambda x: x.get("deliveries"))
df_combined = df_combined.explode('deliveries_list', ignore_index=True)

In [20]:
df_combined["batter"] = df_combined["deliveries_list"].apply(lambda x: x.get("batter"))
df_combined["bowler"] =df_combined["deliveries_list"].apply(lambda x: x.get("bowler"))
df_combined["non_striker"] =df_combined["deliveries_list"].apply(lambda x: x.get("non_striker"))
df_combined["runs"] =df_combined["deliveries_list"].apply(lambda x: x.get("runs"))
df_combined["wickets"] =df_combined["deliveries_list"].apply(lambda x: x.get("wickets"))

In [21]:
df_combined["batter_runs"] =df_combined["runs"].apply(lambda x: x.get("batter"))
df_combined["extra_runs"] =df_combined["runs"].apply(lambda x: x.get("extras"))
df_combined["total_runs"] =df_combined["runs"].apply(lambda x: x.get("total"))

In [22]:
df_combined.drop("runs", axis=1, inplace=True)
df_combined.drop("deliveries_list", axis=1, inplace=True)
df_combined = df_combined.explode('wickets', ignore_index=True)

In [23]:
df_combined["wicket_kind"] =df_combined["wickets"].apply(lambda x: None if x is None else x.get("kind"))
df_combined["wicket_player_out"] =df_combined["wickets"].apply(lambda x: None if x is None else x.get("player_out"))
df_combined["wicket_fielders"] =df_combined["wickets"].apply(lambda x: None if x is None else x.get("fielders"))

In [24]:
df_combined.drop("wickets", axis=1, inplace=True)

In [25]:
df_combined.drop("innings", axis=1, inplace=True)
df_combined.drop("overs_list", axis=1, inplace=True)

In [26]:
df_combined["over_ball_no"] = df_combined.groupby(["info.match_type_number","team","over"]).cumcount() + 1

In [27]:
df_combined['over_ball_no_str'] = df_combined['over'].astype(str) + '.' + df_combined['over_ball_no'].astype(str)

In [29]:
df_combined

Unnamed: 0,info.match_type,info.match_type_number,team,power_play,over,batter,bowler,non_striker,batter_runs,extra_runs,total_runs,wicket_kind,wicket_player_out,wicket_fielders,over_ball_no,over_ball_no_str
0,ODI,3070,India,"[{'from': 0.1, 'to': 9.6, 'type': 'mandatory'}...",1,M Vijay,KD Mills,G Gambhir,0,0,0,,,,1,1.1
1,ODI,3070,India,"[{'from': 0.1, 'to': 9.6, 'type': 'mandatory'}...",1,M Vijay,KD Mills,G Gambhir,0,0,0,,,,2,1.2
2,ODI,3070,India,"[{'from': 0.1, 'to': 9.6, 'type': 'mandatory'}...",1,M Vijay,KD Mills,G Gambhir,0,0,0,,,,3,1.3
3,ODI,3070,India,"[{'from': 0.1, 'to': 9.6, 'type': 'mandatory'}...",1,M Vijay,KD Mills,G Gambhir,1,0,1,,,,4,1.4
4,ODI,3070,India,"[{'from': 0.1, 'to': 9.6, 'type': 'mandatory'}...",1,G Gambhir,KD Mills,M Vijay,0,0,0,,,,5,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1608,ODI,1191,India,"[{'from': 0.1, 'to': 9.6, 'type': 'mandatory'}]",28,PG Raut,N Mlaba,S Mandhana,1,0,1,,,,6,28.6
1609,ODI,1191,India,"[{'from': 0.1, 'to': 9.6, 'type': 'mandatory'}]",29,PG Raut,N de Klerk,S Mandhana,0,0,0,,,,1,29.1
1610,ODI,1191,India,"[{'from': 0.1, 'to': 9.6, 'type': 'mandatory'}]",29,PG Raut,N de Klerk,S Mandhana,1,0,1,,,,2,29.2
1611,ODI,1191,India,"[{'from': 0.1, 'to': 9.6, 'type': 'mandatory'}]",29,S Mandhana,N de Klerk,PG Raut,4,0,4,,,,3,29.3


In [30]:
df_test1 = df_combined[df_combined["info.match_type_number"] == 3070]

In [31]:
df_test1[["team", "batter", "batter_runs"]].groupby(["team", "batter"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,batter_runs
team,batter,Unnamed: 2_level_1
India,A Nehra,0
India,G Gambhir,38
India,M Vijay,29
India,MM Patel,1
India,R Ashwin,0
India,S Sreesanth,4
India,SK Raina,13
India,V Kohli,105
India,WP Saha,4
India,YK Pathan,29


In [32]:
df_combined.to_csv("temp.csv", index=False)