In [479]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from scipy import stats
import warnings

warnings.filterwarnings(action='ignore')

In [480]:
balls=pd.read_csv('./each_ball_records.csv')
matches=pd.read_csv('./each_match_records.csv')

### Balls Data column details
- Match no: Match Number
- ballnumber: Ball count number for each match
- outcome: Outcome of the ball (1,2,3,4,5,6,n,W,wd)
- batter: Batsmen name
- bowler: Boweler name

In [481]:
balls.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17858 entries, 0 to 17857
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   match_no    17858 non-null  int64  
 1   ballnumber  17858 non-null  int64  
 2   inningno    17858 non-null  int64  
 3   over        17858 non-null  float64
 4   outcome     17858 non-null  object 
 5   batter      17858 non-null  object 
 6   bowler      17858 non-null  object 
 7   comment     17858 non-null  object 
 8   score       17858 non-null  int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 1.2+ MB


In [482]:
balls.head()

Unnamed: 0,match_no,ballnumber,inningno,over,outcome,batter,bowler,comment,score
0,1,1,1,0.1,0,Devon Conway,Mohammed Shami,"Mohammed Shami to Devon Conway, no run,",0
1,1,2,1,0.2,1lb,Devon Conway,Mohammed Shami,"Mohammed Shami to Devon Conway, 1 leg bye,",1
2,1,3,1,0.3,0,Ruturaj Gaikwad,Mohammed Shami,"Mohammed Shami to Ruturaj Gaikwad, no run,",0
3,1,4,1,0.4,1,Ruturaj Gaikwad,Mohammed Shami,"Mohammed Shami to Ruturaj Gaikwad, 1 run,",1
4,1,5,1,0.5,0,Devon Conway,Mohammed Shami,"Mohammed Shami to Devon Conway, no run,",0


In [483]:
balls.outcome.unique()

array(['0', '1lb', '1', '4', 'w', '6', '1nb', '4lb', '2', '1b', '1wd',
       '2nb', '5nb', '7nb', '3', '5wd', '4b', '2wd', '5', '3wd', '2lb',
       '3nb', '2b'], dtype=object)

In [484]:
matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74 entries, 0 to 73
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   season          74 non-null     int64  
 1   date            74 non-null     object 
 2   match_number    74 non-null     int64  
 3   match_type      74 non-null     object 
 4   venue           74 non-null     object 
 5   location        74 non-null     object 
 6   team1           74 non-null     object 
 7   team2           74 non-null     object 
 8   toss_won        74 non-null     object 
 9   toss_decision   74 non-null     object 
 10  umpire1         74 non-null     object 
 11  umpire2         74 non-null     object 
 12  reserve_umpire  74 non-null     object 
 13  match_referee   74 non-null     object 
 14  winner          74 non-null     object 
 15  winner_runs     40 non-null     float64
 16  winner_wickets  33 non-null     float64
 17  man_of_match    73 non-null     objec

In [485]:
balls.head(2)

Unnamed: 0,match_no,ballnumber,inningno,over,outcome,batter,bowler,comment,score
0,1,1,1,0.1,0,Devon Conway,Mohammed Shami,"Mohammed Shami to Devon Conway, no run,",0
1,1,2,1,0.2,1lb,Devon Conway,Mohammed Shami,"Mohammed Shami to Devon Conway, 1 leg bye,",1


In [486]:
balls[(balls.batter.str.contains('Cam')) & (balls.ballnumber.between(1,37))]

Unnamed: 0,match_no,ballnumber,inningno,over,outcome,batter,bowler,comment,score
979,5,16,1,2.4,1,Cameron Green,Mohammed Siraj,"Mohammed Siraj to Cameron Green, 1 run,",1
982,5,19,1,3.1,4,Cameron Green,Reece Topley,"Reece Topley to Cameron Green, Four,",4
983,5,20,1,3.2,0,Cameron Green,Reece Topley,"Reece Topley to Cameron Green, no run,",0
984,5,21,1,3.3,w,Cameron Green,Reece Topley,"Reece Topley to Cameron Green, no run, ...",0
2685,12,27,1,4.2,1,Cameron Green,Mitchell Santner,"Mitchell Santner to Cameron Green, 1 run,",1
...,...,...,...,...,...,...,...,...,...
17211,72,35,1,5.5,1,Cameron Green,Krunal Pandya,"Krunal Pandya to Cameron Green, 1 run,",1
17537,73,6,2,0.6,4,Cameron Green,Mohammed Shami,"Mohammed Shami to Cameron Green, Four,",4
17541,73,10,2,1.4,1wd,Cameron Green,Hardik Pandya,"Hardik Pandya to Cameron Green, 1 wide,",1
17542,73,11,2,1.4,0,Cameron Green,Hardik Pandya,"Hardik Pandya to Cameron Green, no run,",0


In [487]:
balls.columns

Index(['match_no', 'ballnumber', 'inningno', 'over', 'outcome', 'batter',
       'bowler', 'comment', 'score'],
      dtype='object')

In [488]:
matches.head(2)

Unnamed: 0,season,date,match_number,match_type,venue,location,team1,team2,toss_won,toss_decision,umpire1,umpire2,reserve_umpire,match_referee,winner,winner_runs,winner_wickets,man_of_match
0,2023,31-03-2023,1,Group,Narendra Modi Stadium,Ahmedabad,Chennai Super Kings,Gujarat Titans,Gujarat Titans,field,Nitin Menon,HAS Khalid,A Bengeri,J Srinath,Gujarat Titans,,5.0,Rashid Khan
1,2023,01-04-2023,2,Group,Punjab Cricket Association IS Bindra Stadium,Chandigarh,Punjab Kings,Kolkata Knight Riders,Kolkata Knight Riders,field,BNJ Oxenford,YC Barde,PM Joshi,M Nayyar,Punjab Kings,7.0,,Arshdeep Singh


In [489]:
matches[(matches.team1 == 'Royal Challengers Bangalore') | (matches.team2 == 'Royal Challengers Bangalore')].shape

(14, 18)

In [490]:
matches[matches.match_number == 74]

Unnamed: 0,season,date,match_number,match_type,venue,location,team1,team2,toss_won,toss_decision,umpire1,umpire2,reserve_umpire,match_referee,winner,winner_runs,winner_wickets,man_of_match
73,2023,29-05-2023,74,Final,Narendra Modi Stadium,Ahmedabad,Chennai Super Kings,Gujarat Titans,Chennai Super Kings,field,Nitin Menon,RJ Tucker,J Madanagopal,J Srinath,Chennai Super Kings,,5.0,DP Conway


### Data Cleaning

In [491]:
raw_batsmen_data=balls[['match_no',  'inningno', 'over',  'batter', 'score','outcome']]

In [492]:
# raw_batsmen_data=raw_batsmen_data.merge(right=innings_counter,on='batter')
raw_batsmen_data.loc[(raw_batsmen_data['outcome']!='w'),'outcome']=0 #Batter is NOT OUT
raw_batsmen_data.loc[(raw_batsmen_data['outcome']=='w'),'outcome']=1 #Batter is OUT
raw_batsmen_data['batter_key']=raw_batsmen_data['batter']

In [493]:
raw_batsmen_data

Unnamed: 0,match_no,inningno,over,batter,score,outcome,batter_key
0,1,1,0.1,Devon Conway,0,0,Devon Conway
1,1,1,0.2,Devon Conway,1,0,Devon Conway
2,1,1,0.3,Ruturaj Gaikwad,0,0,Ruturaj Gaikwad
3,1,1,0.4,Ruturaj Gaikwad,1,0,Ruturaj Gaikwad
4,1,1,0.5,Devon Conway,0,0,Devon Conway
...,...,...,...,...,...,...,...
17853,74,2,14.2,Shivam Dube,1,0,Shivam Dube
17854,74,2,14.3,Ravindra Jadeja,1,0,Ravindra Jadeja
17855,74,2,14.4,Shivam Dube,1,0,Shivam Dube
17856,74,2,14.5,Ravindra Jadeja,6,0,Ravindra Jadeja


### Data Pre-Processing

In [494]:
BT_score_over=raw_batsmen_data.groupby(['over','batter','match_no']).agg({'score':'sum','batter_key':'count','outcome':'sum'}).reset_index().rename(columns={'batter_key':'balls_faced','outcome':'OUTs'})

#### Collating all the ball data to overwise.

In [495]:
Over_score=pd.DataFrame()
for over in range(0,20):
    for ball in range(1,7):
        temp=pd.DataFrame()
        v=float(str(over)+"."+str(ball))
        temp=BT_score_over[BT_score_over['over']==v]
        temp['over_number']=over+1
        Over_score=pd.concat([Over_score,temp])

Over_score.drop('over',axis=1,inplace=True)
temp=Over_score.groupby(['over_number','batter','match_no']).agg({'score':'sum','balls_faced':'sum','OUTs':'sum'}).reset_index()
temp.sort_values(['over_number','score'],ascending=[True,False],inplace=True)
temp.reset_index(drop=True,inplace=True)
Over_score=temp

In [496]:
Over_score.head(2)

Unnamed: 0,over_number,batter,match_no,score,balls_faced,OUTs
0,1,Yashasvi Jaiswal,56,26,6,0
1,1,Yashasvi Jaiswal,11,20,6,0


## Openers

In [497]:
PP_Avg_Strike=Over_score[Over_score['over_number'].between(1,6)]

In [498]:
innings_counter=PP_Avg_Strike[['batter','match_no']]
innings_counter.drop_duplicates(inplace=True)
innings_counter=innings_counter.groupby('batter').agg({'match_no':'count'}).rename(columns={'match_no':'innings_played'})
innings_counter.reset_index(inplace=True)

In [499]:
PP_Avg_Strike=PP_Avg_Strike.merge(right=innings_counter,on='batter')

In [500]:
PP_Avg_Strike=PP_Avg_Strike.groupby('batter').agg({'score':'sum','balls_faced':'sum','innings_played':'max','OUTs':'sum'}).reset_index()
PP_Avg_Strike['strike_rate']=round((PP_Avg_Strike['score']/PP_Avg_Strike['balls_faced'])*100,3)
PP_Avg_Strike.loc[PP_Avg_Strike['OUTs'] == 0,'OUTs']=1 #To tacle division by zero in average caluculation
PP_Avg_Strike['batter_average']=PP_Avg_Strike['score']/PP_Avg_Strike['OUTs']

In [501]:
PP_Avg_Strike.head()

Unnamed: 0,batter,score,balls_faced,innings_played,OUTs,strike_rate,batter_average
0,Abhishek Sharma,124,91,8,5,136.264,24.8
1,Abishek Porel,1,1,1,1,100.0,1.0
2,Aiden Markram,18,30,7,1,60.0,18.0
3,Ajinkya Rahane,130,60,6,1,216.667,130.0
4,Aman Hakim Khan,4,5,1,1,80.0,4.0


### Removing Outliers

In [502]:
z=np.abs(stats.zscore(PP_Avg_Strike.innings_played))
PP_Avg_Strike['z_score']=z
PP_Avg_Strike=PP_Avg_Strike[PP_Avg_Strike.z_score>1]
PP_Avg_Strike.head()

Unnamed: 0,batter,score,balls_faced,innings_played,OUTs,strike_rate,batter_average,z_score
13,Cameron Green,180,100,11,3,180.0,60.0,1.118784
16,David Warner,300,210,14,4,142.857,75.0,1.744191
19,Devon Conway,324,236,15,4,137.288,81.0,1.952661
21,Faf du Plessis,356,210,14,3,169.524,118.666667,1.744191
27,Ishan Kishan,344,236,15,6,145.763,57.333333,1.952661


In [503]:
fig=px.scatter(PP_Avg_Strike,x='batter_average',y='strike_rate',color='batter',size='score',text='batter')
fig.update_layout(
    title=dict(text="Best Batsmen in the POWER PLAY in IPL 2023 min 7 Innings", font=dict(size=20),  yref='paper')
)

In [504]:
px.histogram(PP_Avg_Strike,x='batter',y='strike_rate')

## Boundary hitters

In [505]:
power_hitters=balls[['batter','outcome']]
power_hitters=power_hitters[~power_hitters.outcome.str.contains('4b')]
power_hitters=power_hitters[(power_hitters.outcome.str.contains('7n')) | (power_hitters.outcome.str.contains('6')) | (power_hitters.outcome.str.contains('4'))  | (power_hitters.outcome.str.contains('5nb'))]
power_hitters.loc[power_hitters['outcome'] == '7nb','outcome']=6
power_hitters.loc[power_hitters['outcome'] == '5nb','outcome']=4
power_hitters.loc[power_hitters['outcome'] == '4lb','outcome']=4
power_hitters.outcome.astype('int')
power_hitters=power_hitters.groupby('batter').agg({'outcome':'count'}).reset_index(drop=False).rename(columns={'outcome':'boundaries_hit'})

In [506]:
ball_counter=balls[['batter','ballnumber']]
ball_counter=ball_counter.groupby('batter').agg({'ballnumber':'count'}).reset_index(drop=False).rename(columns={'ballnumber':'balls_faced'})
pow_inning_counter=balls[['batter','inningno','match_no']]
pow_inning_counter=pow_inning_counter.drop_duplicates()
pow_inning_counter=pow_inning_counter.groupby('batter').agg({'match_no':'count'}).reset_index(drop=False).rename(columns={'match_no':'innings_played'})

In [507]:
power_hitters=power_hitters.merge(pow_inning_counter,on='batter')
power_hitters=power_hitters.merge(ball_counter,on='batter')

power_hitters=power_hitters.sort_values('boundaries_hit',ascending=False)
power_hitters.reset_index(drop=True,inplace=True)

power_hitters['boundaries_probability'] = power_hitters['boundaries_hit']/power_hitters['balls_faced']

In [508]:
power_hitters[power_hitters.boundaries_probability.between(0.05,0.2)]

Unnamed: 0,batter,boundaries_hit,innings_played,balls_faced,boundaries_probability
2,Faf du Plessis,95,14,475,0.200000
3,Devon Conway,93,15,481,0.193347
5,Virat Kohli,81,14,469,0.172708
6,David Warner,77,14,401,0.192020
7,Ruturaj Gaikwad,76,15,408,0.186275
...,...,...,...,...,...
142,Umesh Yadav,1,5,20,0.050000
143,Arshad Khan,1,3,15,0.066667
144,Harshal Patel,1,2,6,0.166667
145,Noor Ahmad,1,3,9,0.111111


In [478]:
# z=np.abs(stats.zscore(power_hitters.balls_faced))
# power_hitters['z_score']=z
# # PP_Avg_Strike=PP_Avg_Strike[PP_Avg_Strike.z_score>1]
# power_hitters[power_hitters.z_score <1.05]

Unnamed: 0,batter,boundaries_hit,innings_played,balls_faced,boundaries_probability,z_score
10,Glenn Maxwell,60,14,225,0.266667,0.879790
15,Prabhsimran Singh,57,14,242,0.235537,1.022686
21,Nicholas Pooran,49,15,214,0.228972,0.787328
23,Tilak Varma,49,11,220,0.222727,0.837762
25,Sanju Samson,48,14,231,0.207792,0.930224
...,...,...,...,...,...,...
142,Umesh Yadav,1,5,20,0.050000,0.843365
143,Arshad Khan,1,3,15,0.066667,0.885394
144,Harshal Patel,1,2,6,0.166667,0.961044
145,Noor Ahmad,1,3,9,0.111111,0.935827


In [358]:
px.box(power_hitters,'innings_played')

In [354]:
px.histogram(power_hitters,x='innings_played',y='batter')

In [351]:
power_hitters.sort_values('six_probability',ascending=False).head(10)

Unnamed: 0,batter,sixes_hit,innings_played,balls_faced,six_probability
78,David Wiese,3,3,11,0.272727
91,Umran Malik,2,1,8,0.25
101,Mark Wood,1,2,5,0.2
33,Rashid Khan,13,9,66,0.19697
75,Glenn Phillips,4,5,22,0.181818
114,Harshal Patel,1,2,6,0.166667
112,kes,1,1,6,0.166667
45,MS Dhoni,10,12,63,0.15873
3,Glenn Maxwell,31,14,225,0.137778
1,Shivam Dube,35,14,277,0.126354


In [509]:
px.scatter(power_hitters,x='balls_faced',y='boundaries_hit',text='batter')