In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly
import zipfile
import os

### Loading the datasets

In [2]:
# Read the two CSV exports in one pass:
#   df1 <- ball-by-ball file, df2 <- matches file (per the file names).
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Notebook/IPL_Ball_by_Ball_2008_2022.csv',
        '../Notebook/IPL_Matches_2008_2022.csv',
    )
)

In [3]:
# Report (rows, columns) for each loaded frame, one line per frame.
for caption, frame in (("Dataframe 1 shape:", df1), ("Dataframe 2 shape:", df2)):
    print(caption, frame.shape)

Dataframe 1 shape: (225954, 17)
Dataframe 2 shape: (950, 20)


### Exploratory data analysis

In [4]:
# Peek at three randomly chosen rows of the matches frame (unseeded, so rows vary per run).
df2.sample(n=3)

Unnamed: 0,ID,City,Date,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,method,Player_of_Match,Team1Players,Team2Players,Umpire1,Umpire2
415,980935,Hyderabad,2016-04-23,2016,18,Sunrisers Hyderabad,Kings XI Punjab,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,field,N,Sunrisers Hyderabad,Wickets,5.0,,Mustafizur Rahman,"['DA Warner', 'S Dhawan', 'AP Tare', 'EJG Morg...","['M Vijay', 'M Vohra', 'SE Marsh', 'DA Miller'...",AK Chaudhary,CK Nandan
65,1304055,Mumbai,2022-04-02,2022,9,Rajasthan Royals,Mumbai Indians,"Dr DY Patil Sports Academy, Mumbai",Mumbai Indians,field,N,Rajasthan Royals,Runs,23.0,,JC Buttler,"['JC Buttler', 'YBK Jaiswal', 'D Padikkal', 'S...","['Ishan Kishan', 'RG Sharma', 'Anmolpreet Sing...",Nitin Menon,PG Pathak
250,1175359,Jaipur,2019-03-25,2019,4,Kings XI Punjab,Rajasthan Royals,Sawai Mansingh Stadium,Rajasthan Royals,field,N,Kings XI Punjab,Runs,14.0,,CH Gayle,"['KL Rahul', 'CH Gayle', 'MA Agarwal', 'SN Kha...","['AM Rahane', 'JC Buttler', 'SV Samson', 'SPD ...",C Shamshuddin,KN Ananthapadmanabhan


**D/L** refers to the Duckworth-Lewis method, a mathematical formula used to adjust targets in limited-overs cricket matches that are interrupted by weather. A match may be decided by this method when weather conditions significantly disrupt play.

In [5]:
# Remove the columns treated as unnecessary; df2 is modified in place,
# so re-running this cell without reloading df2 raises KeyError.
unneeded_columns = ['ID', 'Date', 'Team1Players', 'Team2Players', 'method']
df2.drop(columns=unneeded_columns, inplace=True)

In [6]:
# Checking duplicated values
# Count rows of df2 that are exact repeats of an earlier row (all columns equal).
duplicate_Cnt = df2.duplicated().sum()
if duplicate_Cnt > 0:
    # Drop from df2 itself -- the frame the count was computed on. No frame
    # named `df` is defined in the visible cells (only df1 and df2), so referencing it here
    # would raise NameError as soon as a duplicate appeared.
    df2.drop_duplicates(inplace=True)
    print(f"{duplicate_Cnt} duplicate values removed")
else:
    print("There are no duplicate values")

There are no duplicate values


In [7]:
# Collect the columns of df2 that contain at least one missing value,
# then print the percentage of missing entries for each (2 decimal places).
null_counts = df2.isnull().sum()
nan_cols = null_counts[null_counts != 0].index.tolist()

for col in nan_cols:
    missing_pct = df2[col].isnull().mean()*100
    print(f"{col} : ", np.round(missing_pct, 2))

City :  5.37
SuperOver :  0.42
WinningTeam :  0.42
Margin :  1.89
Player_of_Match :  0.42


In [8]:
# Univariate mode imputation: each listed column has its missing entries
# replaced by that column's most frequent value (first mode if tied).
# NOTE(review): this also assigns a WinningTeam / Player_of_Match / Margin to
# rows that had none -- confirm that is intended for those rows.
mode_imputed_columns = ['City','SuperOver','WinningTeam','Margin','Player_of_Match']
for col in mode_imputed_columns:
    most_frequent = df2[col].mode()[0]
    df2[col] = df2[col].fillna(most_frequent)

In [10]:
# Fixing the near-duplicate team name: 'Rising Pune Supergiant' and
# 'Rising Pune Supergiants' are collapsed into the single spelling
# 'Rising Pune Supergiants'.
# The replacement is applied to every column that holds a team name, not just
# Team1/Team2, so TossWinner and WinningTeam stay comparable with them.
team_name_columns = ['Team1', 'Team2', 'TossWinner', 'WinningTeam']
for team_col in team_name_columns:
    # Series.replace with two scalars only swaps exact, whole-value matches.
    df2[team_col] = df2[team_col].replace('Rising Pune Supergiant','Rising Pune Supergiants')

In [11]:
# Spot-check four random rows of df2 after the cleaning steps above (unseeded sample).
df2.sample(n=4)

Unnamed: 0,City,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Umpire1,Umpire2
690,Mumbai,2012,12,Mumbai Indians,Rajasthan Royals,Wankhede Stadium,Rajasthan Royals,field,N,Mumbai Indians,Runs,27.0,KA Pollard,Aleem Dar,BNJ Oxenford
556,Hyderabad,2013,72,Sunrisers Hyderabad,Kolkata Knight Riders,"Rajiv Gandhi International Stadium, Uppal",Kolkata Knight Riders,bat,N,Sunrisers Hyderabad,Wickets,5.0,PA Patel,Asad Rauf,S Asnani
99,Sharjah,2021,35,Royal Challengers Bangalore,Chennai Super Kings,Sharjah Cricket Stadium,Chennai Super Kings,field,N,Chennai Super Kings,Wickets,6.0,DJ Bravo,AK Chaudhary,Nitin Menon
899,Kolkata,2007/08,52,Kolkata Knight Riders,Kings XI Punjab,Eden Gardens,Kings XI Punjab,bat,N,Kolkata Knight Riders,Wickets,3.0,Umar Gul,SJ Davis,I Shivram
