In [1]:
import pandas as pd
import numpy as np
import os
import glob
import json
import duckdb

In [2]:
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None

In [27]:
def get_players(wickets_dict):
    fielders = wickets_dict.get("fielders")
    if fielders:
        players_list = []
        for item in fielders:
            players_list.append(item.get('name'))
        return ",".join(players_list)
    else:
        return None

In [44]:
def transform_ball_by_ball_details(df_combined: pd.DataFrame) -> pd.DataFrame:
    cols_to_drop = []
    for col in df_combined.columns:
        if 'info.registry.' in col or 'info.players.' in col:
            cols_to_drop.append(col)
    df_combined.drop(cols_to_drop, axis=1, inplace=True)
    df_combined.columns = [col[5:] if "info." in col else col for col in df_combined.columns]
    df_combined['date'] = df_combined['dates'].apply(lambda x: (x[0]))
    df_combined['event.match_number'].fillna(df_combined['event.stage'], inplace=True)
    df_combined['event.match_number'] = df_combined['event.match_number'].astype(str).str.rstrip('.0')
    df_combined = df_combined.explode('innings', ignore_index=True)
    df_combined["team"] = df_combined["innings"].apply(lambda x: x.get("team"))
    df_combined["overs_list"] = df_combined["innings"].apply(lambda x: x.get("overs"))
    df_combined["power_play"] = df_combined["innings"].apply(lambda x: x.get("powerplays"))
    df_combined = df_combined.explode('overs_list', ignore_index=True)
    df_combined["over"] = df_combined["overs_list"].apply(lambda x: x.get("over"))
    df_combined["over"] = df_combined["over"].apply(lambda x: x + 1)
    df_combined["deliveries_list"] = df_combined["overs_list"].apply(lambda x: x.get("deliveries"))
    df_combined = df_combined.explode('deliveries_list', ignore_index=True)
    df_combined["batter"] = df_combined["deliveries_list"].apply(lambda x: x.get("batter"))
    df_combined["bowler"] =df_combined["deliveries_list"].apply(lambda x: x.get("bowler"))
    df_combined["non_striker"] =df_combined["deliveries_list"].apply(lambda x: x.get("non_striker"))
    df_combined["runs"] =df_combined["deliveries_list"].apply(lambda x: x.get("runs"))
    df_combined["wickets"] =df_combined["deliveries_list"].apply(lambda x: x.get("wickets"))
    df_combined["extra_desc"] = df_combined["deliveries_list"].apply(lambda x: ",".join(x.get("extras").keys()) if x.get("extras") else None)
    df_combined["wicket_kind"] = df_combined["deliveries_list"].apply(lambda x: x.get("wickets")[0].get("kind") if x.get("wickets") else None)
    df_combined["player_out"] = df_combined["deliveries_list"].apply(lambda x: x.get("wickets")[0].get("player_out") if x.get("wickets") else None)
    df_combined["wicket_fielders"] = df_combined["deliveries_list"].apply(lambda x: get_players(x.get("wickets")[0]) if x.get("wickets") else None)
    df_combined["batter_runs"] =df_combined["runs"].apply(lambda x: x.get("batter"))
    df_combined["extra_runs"] =df_combined["runs"].apply(lambda x: x.get("extras"))
    df_combined["total_runs"] =df_combined["runs"].apply(lambda x: x.get("total"))
    df_combined.drop("runs", axis=1, inplace=True)
    df_combined.drop("deliveries_list", axis=1, inplace=True)
    df_combined = df_combined.explode('wickets', ignore_index=True)
    df_combined.drop("wickets", axis=1, inplace=True)
    df_combined.drop("innings", axis=1, inplace=True)
    df_combined.drop("overs_list", axis=1, inplace=True)
    df_combined["over_ball_no"] = df_combined.groupby(["season","event.match_number","team","over"]).cumcount() + 1
    df_combined['over_ball_no_str'] = df_combined['over'].astype(str) + '.' + df_combined['over_ball_no'].astype(str)
    df_combined = df_combined[['season','event.match_number', 'date', 'team', 'power_play', 'over', 'batter', 'bowler', 'non_striker',
       'batter_runs', 'extra_runs', 'total_runs', 'over_ball_no','over_ball_no_str','extra_desc', 'wicket_kind', 'player_out',
       'wicket_fielders']]
    df_combined.rename({'event.match_number': 'event_match_number'}, axis=1, inplace=True)
    return df_combined

In [45]:
json_files = glob.glob("./ipl_male_json/" + "*.json")

In [46]:
dataset_list = []
for json_file in json_files:
    json_file_name = os.path.basename(json_file) #1359475
    file_name, file_extension = os.path.splitext(json_file_name)
    if int(file_name) >= 0: #1359475
        try:
            with open(json_file, 'r') as json_file:
                data = json.load(json_file)
                df = pd.json_normalize(data)
                dataset_list.append(df)
        except Exception as e:
            print(f"Error reading {json_file}: {e}")

In [47]:
df_combined = pd.concat(dataset_list)

In [51]:
df_ball_by_ball = transform_ball_by_ball_details(df_combined)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_combined['event.match_number'].fillna(df_combined['event.stage'], inplace=True)


In [52]:
df_ball_by_ball.to_csv("ball_by_ball_ipl.csv", index=False)

In [53]:
df_ball_by_ball.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245308 entries, 0 to 245307
Data columns (total 18 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   season              245308 non-null  object
 1   event_match_number  245308 non-null  object
 2   date                245308 non-null  object
 3   team                245308 non-null  object
 4   power_play          245147 non-null  object
 5   over                245308 non-null  int64 
 6   batter              245308 non-null  object
 7   bowler              245308 non-null  object
 8   non_striker         245308 non-null  object
 9   batter_runs         245308 non-null  int64 
 10  extra_runs          245308 non-null  int64 
 11  total_runs          245308 non-null  int64 
 12  over_ball_no        245308 non-null  int64 
 13  over_ball_no_str    245308 non-null  object
 14  extra_desc          13153 non-null   object
 15  wicket_kind         12143 non-null   object
 16  pl