In [3]:
import pandas as pd

In [5]:
import glob 
import os 

# Loading in Data
file_pattern = "*.csv"
all_files = glob.glob(file_pattern)

dfs = []
for filename in all_files:
    df = pd.read_csv(filename)
    df['match_filename'] = os.path.basename(filename)
    dfs.append(df)

full_data = pd.concat(dfs, axis = 0, ignore_index = True)

In [6]:
full_data

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,match_filename
0,592397,2013/14,2013-11-21,"Brisbane Cricket Ground, Woolloongabba",1,0.1,Australia,England,CJL Rogers,DA Warner,...,,,,,,,,,,592397.csv
1,592397,2013/14,2013-11-21,"Brisbane Cricket Ground, Woolloongabba",1,0.2,Australia,England,CJL Rogers,DA Warner,...,,,,,,,,,,592397.csv
2,592397,2013/14,2013-11-21,"Brisbane Cricket Ground, Woolloongabba",1,0.3,Australia,England,CJL Rogers,DA Warner,...,,,,,,,,,,592397.csv
3,592397,2013/14,2013-11-21,"Brisbane Cricket Ground, Woolloongabba",1,0.4,Australia,England,CJL Rogers,DA Warner,...,,,,,,,,,,592397.csv
4,592397,2013/14,2013-11-21,"Brisbane Cricket Ground, Woolloongabba",1,0.5,Australia,England,CJL Rogers,DA Warner,...,,,,,,,,,,592397.csv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103070,592400,2013/14,2013-12-26,Melbourne Cricket Ground,4,51.1,Australia,England,SR Watson,MJ Clarke,...,,,,,,,,,,592400.csv
103071,592400,2013/14,2013-12-26,Melbourne Cricket Ground,4,51.2,Australia,England,SR Watson,MJ Clarke,...,,,,,,,,,,592400.csv
103072,592400,2013/14,2013-12-26,Melbourne Cricket Ground,4,51.3,Australia,England,MJ Clarke,SR Watson,...,,,,,,,,,,592400.csv
103073,592400,2013/14,2013-12-26,Melbourne Cricket Ground,4,51.4,Australia,England,SR Watson,MJ Clarke,...,,,,,,,,,,592400.csv


In [8]:
# Creating new columns in dataset
# Converting to datetime
full_data['start_date'] = pd.to_datetime(full_data['start_date'])
full_data = full_data.sort_values(by = ['start_date', 'innings', 'ball'])
full_data =  full_data.reset_index(drop = True)

# Creating year column 
full_data['year'] = full_data['start_date'].dt.year

# New column of total runs 
full_data['total_runs'] = full_data['runs_off_bat'] + full_data['extras']

# If a wicket has happened its classed as 1 
wickets = ['bowled', 'caught', 'lbw', 'stumped', 'caught and bowled', 'hit wicket']
full_data['wicket'] = full_data['wicket_type'].apply(lambda x: 1 if x in wickets else 0)

# Removing NaN 
full_data['wides'] = full_data['wides'].fillna('0')
full_data['noballs'] = full_data['noballs'].fillna('0')
full_data['byes'] = full_data['byes'].fillna('0')
full_data['legbyes'] = full_data['legbyes'].fillna('0')
full_data['penalty'] = full_data['penalty'].fillna('0')
full_data['wicket_type'] = full_data['wicket_type'].fillna('none')
full_data['player_dismissed'] = full_data['player_dismissed'].fillna('none')
full_data['other_wicket_type'] = full_data['other_wicket_type'].fillna('none')
full_data['other_player_dismissed'] = full_data['other_player_dismissed'].fillna('none')

# Splitting ball in over number and ball number 
full_data['over_num'] = full_data['ball'].astype(str).str.split('.').str[0].astype(float)
full_data['ball_num'] = full_data['ball'].astype(str).str.split('.').str[1].astype(float)

# Determing amount of runs each batter got in each innings 
full_data['player_match_runs'] = full_data.groupby(['match_id', 'innings', 'striker'])['runs_off_bat'].transform('sum')

# Determining amount of runs of each innings 
full_data['run_totals'] = full_data.groupby(['match_id', 'innings'])['total_runs'].transform('sum')

# Determining amount of runs scored by each team in entire game  
full_data['total_scored'] = full_data.groupby(['match_id','batting_team'])['total_runs'].transform('sum')

full_data

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,other_player_dismissed,match_filename,year,total_runs,wicket,over_num,ball_num,player_match_runs,run_totals,total_scored
0,64012,2002/03,2002-12-26,Melbourne Cricket Ground,1,0.1,Australia,England,JL Langer,ML Hayden,...,none,64012.csv,2002,0,0,0.0,1.0,250,551,658
1,64012,2002/03,2002-12-26,Melbourne Cricket Ground,1,0.2,Australia,England,JL Langer,ML Hayden,...,none,64012.csv,2002,3,0,0.0,2.0,250,551,658
2,64012,2002/03,2002-12-26,Melbourne Cricket Ground,1,0.3,Australia,England,ML Hayden,JL Langer,...,none,64012.csv,2002,4,0,0.0,3.0,102,551,658
3,64012,2002/03,2002-12-26,Melbourne Cricket Ground,1,0.4,Australia,England,ML Hayden,JL Langer,...,none,64012.csv,2002,0,0,0.0,4.0,102,551,658
4,64012,2002/03,2002-12-26,Melbourne Cricket Ground,1,0.5,Australia,England,ML Hayden,JL Langer,...,none,64012.csv,2002,0,0,0.0,5.0,102,551,658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103070,1455611,2025/26,2025-11-21,Perth Stadium,4,27.4,Australia,England,M Labuschagne,SPD Smith,...,none,1455611.csv,2025,4,0,27.0,4.0,51,205,337
103071,1455611,2025/26,2025-11-21,Perth Stadium,4,27.5,Australia,England,M Labuschagne,SPD Smith,...,none,1455611.csv,2025,0,0,27.0,5.0,51,205,337
103072,1455611,2025/26,2025-11-21,Perth Stadium,4,27.6,Australia,England,M Labuschagne,SPD Smith,...,none,1455611.csv,2025,6,0,27.0,6.0,51,205,337
103073,1455611,2025/26,2025-11-21,Perth Stadium,4,28.1,Australia,England,SPD Smith,M Labuschagne,...,none,1455611.csv,2025,0,0,28.0,1.0,2,205,337
