In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Display plots inline
%matplotlib inline


In [3]:
# Raw-data directories, given relative to the notebook's working directory:
# the first holds the T20 CSVs, the second the IPL CSVs.
t20_path, ipl_path = "../data/raw/t20s_csv2", "../data/raw/ipl_csv2"


In [7]:
# Load only delivery (match) files — skip *_info.csv files
def list_delivery_files(folder):
    """Return the sorted paths of the delivery CSVs found directly in `folder`.

    A file qualifies when its name ends in '.csv' but not in '_info.csv'.
    The result is sorted because os.listdir() returns entries in arbitrary
    order; sorting keeps the row order of the combined frame identical on
    every run and every machine.
    """
    return sorted(
        os.path.join(folder, name)
        for name in os.listdir(folder)
        if name.endswith('.csv') and not name.endswith('_info.csv')
    )


def combine_deliveries(files, label):
    """Read every CSV in `files` and stack them into one DataFrame.

    The per-file indexes are discarded (ignore_index=True), so the result
    carries a fresh 0..n-1 index.

    Raises:
        ValueError: if `files` is empty. pd.concat would raise ValueError too,
            but with a message that does not say which dataset is missing.
    """
    if not files:
        raise ValueError(f"No delivery CSV files found for {label}")
    # NOTE(review): dtypes are inferred independently for each file, so a
    # column could end up with mixed types after concat — confirm whether an
    # explicit dtype mapping is wanted here.
    return pd.concat((pd.read_csv(path) for path in files), ignore_index=True)


t20_files = list_delivery_files(t20_path)
ipl_files = list_delivery_files(ipl_path)

# Read and combine
t20_df = combine_deliveries(t20_files, t20_path)
ipl_df = combine_deliveries(ipl_files, ipl_path)


In [9]:
# Report (rows, columns) for each combined frame, then preview the T20 data.
for shape_label, frame in (("T20 Shape:", t20_df), ("IPL Shape:", ipl_df)):
    print(shape_label, frame.shape)

# Last expression of the cell, so the notebook renders the first five rows.
t20_df.head(5)


T20 Shape: (929433, 22)
IPL Shape: (271256, 22)


Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.1,Australia,Sri Lanka,AJ Finch,M Klinger,...,0,,,,,,,,,
1,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.2,Australia,Sri Lanka,AJ Finch,M Klinger,...,0,,,,,,,,,
2,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.3,Australia,Sri Lanka,AJ Finch,M Klinger,...,0,,,,,,,,,
3,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.4,Australia,Sri Lanka,M Klinger,AJ Finch,...,0,,,,,,,,,
4,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.5,Australia,Sri Lanka,M Klinger,AJ Finch,...,0,,,,,,,,,


In [11]:
# Print the column names of both frames so their schemas can be compared.
for columns_label, frame in (("T20 Columns:", t20_df), ("IPL Columns:", ipl_df)):
    print(columns_label, list(frame.columns))


T20 Columns: ['match_id', 'season', 'start_date', 'venue', 'innings', 'ball', 'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler', 'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes', 'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type', 'other_player_dismissed']
IPL Columns: ['match_id', 'season', 'start_date', 'venue', 'innings', 'ball', 'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler', 'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes', 'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type', 'other_player_dismissed']


In [13]:
# Count the null entries in every column of each dataset.
t20_null_counts = t20_df.isna().sum()
ipl_null_counts = ipl_df.isna().sum()

# sep="\n" puts the heading on its own line, followed by the per-column counts.
print("Missing values in T20 dataset:", t20_null_counts, sep="\n")
print("\nMissing values in IPL dataset:", ipl_null_counts, sep="\n")


Missing values in T20 dataset:
match_id                       0
season                         0
start_date                     0
venue                          0
innings                        0
ball                           0
batting_team                   0
bowling_team                   0
striker                        0
non_striker                    0
bowler                         0
runs_off_bat                   0
extras                         0
wides                     890058
noballs                   924329
byes                      925329
legbyes                   917678
penalty                   929418
wicket_type               877846
player_dismissed          877846
other_wicket_type         929431
other_player_dismissed    929431
dtype: int64

Missing values in IPL dataset:
match_id                       0
season                         0
start_date                     0
venue                          0
innings                        0
ball                           0


In [17]:
# Descriptive statistics for the numeric columns of the T20 frame.
numeric_summary = t20_df.describe()
print(numeric_summary)

# Row counts per value. Column names follow the dataset's schema
# ('striker', 'bowler', 'wicket_type').
striker_counts = t20_df['striker'].value_counts()
bowler_counts = t20_df['bowler'].value_counts()
dismissal_counts = t20_df['wicket_type'].value_counts()

# Only the five most frequent strikers and bowlers are shown; dismissal
# types are printed in full.
print("\nTop 5 Strikers:", striker_counts.head(5))
print("Top 5 Bowlers:", bowler_counts.head(5))
print("Dismissal Types:", dismissal_counts)


           match_id        innings           ball   runs_off_bat  \
count  9.294330e+05  929433.000000  929433.000000  929433.000000   
mean   1.220879e+06       1.462068       9.224927       1.017566   
std    2.879734e+05       0.500281       5.627455       1.448749   
min    2.110280e+05       1.000000       0.100000       0.000000   
25%    1.182913e+06       1.000000       4.400000       0.000000   
50%    1.320202e+06       1.000000       9.100000       1.000000   
75%    1.415744e+06       2.000000      14.100000       1.000000   
max    1.482093e+06       4.000000      19.900000       7.000000   

              extras         wides      noballs         byes       legbyes  \
count  929433.000000  39375.000000  5104.000000  4104.000000  11755.000000   
mean        0.080085      1.201092     1.010188     1.743908      1.255040   
std         0.361209      0.721731     0.184374     1.187292      0.746917   
min         0.000000      1.000000     1.000000     1.000000      1.000000 

In [19]:
# Save processed datasets
processed_dir = "../data/processed"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first (a no-op when it is already there).
os.makedirs(processed_dir, exist_ok=True)

# index=False: the frames carry only a positional index, which is not worth
# persisting as an extra column.
t20_df.to_csv(f"{processed_dir}/t20s_combined.csv", index=False)
ipl_df.to_csv(f"{processed_dir}/ipl_combined.csv", index=False)
