# Indian Premier League 2008-2019
**Ball-by-ball Indian Premier League (IPL) cricket dataset**

## 1. Importing Libraries

In [7]:
import sys #access to system parameters
print("Python version: {}". format(sys.version))

import numpy as np
print("NumPy version: {}". format(np.__version__))

import pandas as pd 
print("pandas version: {}". format(pd.__version__))

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
print("matplotlib version: {}". format(mpl.__version__))

import seaborn as sns
print("seaborn version: {}". format(sns.__version__))

import sklearn
print("scikit-learn version: {}". format(sklearn.__version__))

#misc libraries
# import random
# import time


#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)

import os
print('Dataset file {}'.format(os.listdir('./datasets/')))

#Visualization
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8

Python version: 3.7.1 (default, Dec 14 2018, 19:28:38) 
[GCC 7.3.0]
NumPy version: 1.15.4
pandas version: 0.23.4
matplotlib version: 3.0.2
seaborn version: 0.9.0
scikit-learn version: 0.20.1
-------------------------
Dataset file ['matches.csv', 'deliveries.csv']


## Data Dictionary

**Matches**
1. id - Unique Id for a match
2. season - Year of the match
3. city - City where the match took place
4. date - Date of match (DD/MM/YY)
5. team1 - Team batting first
6. team2 - Team batting second
7. toss_winner - Toss winner
8. toss_decision - Toss decision - bat/field
9. result - Match result
0. dl_applied - Is Duckworth Lewis (DL) rule applied
1. winner - Winner of the match
2. win_by_runs - Win by runs
3. win_by_wickets - Win by wickets
4. player_of_match - Player of the match award (Man of the match)
5. venue - Match venue
6. umpire1- Umpire 1
7. umpire2 - Umpire 2
8. umpire3 - Umpire 3

**Deliveries**
1. match_id - Unique Identifier for a match
2. inningMatch innings - 1st innings/2nd innings
3. batting_team - Name of the batting team
4. bowling_team - Name of the bowling team
5. over - Current over
6. ball - Current ball of the over
7. batsman - Name of the batsman on strike
8. non_striker - Name of the batsman on non-striker's end
9. bowler - Name of the bowler
10. is_super_over - Is this a super-over (0 or 1)
11. wide_runs - Runs given as wide
12. bye_runs - Runs given as bye
13. legbye_runs - Runs given as leg-bye
14. noball_runs-Runs given as no-ball
15. penalty_runs - Runs given as penalty
16. batsman_runs - Runs scored by the batsman
18. extra_runs - Total extra runs (Wide, Bye, No-ball, Penalty)
19. total_runs - Totalruns from the ball (extra_runs, batsman_runs)
20. player_dismissed - Name of the player dismissed (If out)
21. dismissal_kind - How the player was dismissed (If out)
22. fielderFielder -  involved in the dismissal (If any)

In [33]:
# enter the file(csv, xls, etc.)
match_path = './datasets/matches.csv'
deliveries_path = './datasets/deliveries.csv'

In [34]:
# loading the dataset/sheet in the df variable
match_data = pd.read_csv(match_path)
deliveries_data = pd.read_csv(deliveries_path)

In [35]:
#To display maximum columns
pd.set_option('display.max_columns',50)

In [36]:
match_data.head()

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3
0,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,
1,2,2017,Pune,2017-04-06,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi,
2,3,2017,Rajkot,2017-04-07,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan,
3,4,2017,Indore,2017-04-08,Rising Pune Supergiant,Kings XI Punjab,Kings XI Punjab,field,normal,0,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin,
4,5,2017,Bangalore,2017-04-08,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,normal,0,Royal Challengers Bangalore,15,0,KM Jadhav,M Chinnaswamy Stadium,,,


In [37]:
deliveries_data.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,wide_runs,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,0,0,0,,,
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,0,0,0,,,
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,4,0,4,,,
3,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,4,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,0,0,0,,,
4,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,5,DA Warner,S Dhawan,TS Mills,0,2,0,0,0,0,0,2,2,,,


In [38]:
match_data.columns

Index(['id', 'season', 'city', 'date', 'team1', 'team2', 'toss_winner',
       'toss_decision', 'result', 'dl_applied', 'winner', 'win_by_runs',
       'win_by_wickets', 'player_of_match', 'venue', 'umpire1', 'umpire2',
       'umpire3'],
      dtype='object')

In [39]:
deliveries_data.columns

Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batsman', 'non_striker', 'bowler', 'is_super_over', 'wide_runs',
       'bye_runs', 'legbye_runs', 'noball_runs', 'penalty_runs',
       'batsman_runs', 'extra_runs', 'total_runs', 'player_dismissed',
       'dismissal_kind', 'fielder'],
      dtype='object')

In [40]:
# Checking the number of rows and columns
print('Number of rows and column in Match datatset',format(match_data.shape))
print('-'*80)
print('Number of rows and column in Deliveries datatset',format(deliveries_data.shape))

Number of rows and column in Match datatset (756, 18)
--------------------------------------------------------------------------------
Number of rows and column in Deliveries datatset (179078, 21)


In [41]:
match_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,756.0,1792.178571,3464.478148,1.0,189.75,378.5,567.25,11415.0
season,756.0,2013.444444,3.366895,2008.0,2011.0,2013.0,2016.0,2019.0
dl_applied,756.0,0.025132,0.15663,0.0,0.0,0.0,0.0,1.0
win_by_runs,756.0,13.283069,23.471144,0.0,0.0,0.0,19.0,146.0
win_by_wickets,756.0,3.350529,3.387963,0.0,0.0,4.0,6.0,10.0


In [42]:
deliveries_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
match_id,179078.0,1802.252957,3472.322805,1.0,190.0,379.0,567.0,11415.0
inning,179078.0,1.482952,0.502074,1.0,1.0,1.0,2.0,5.0
over,179078.0,10.162488,5.677684,1.0,5.0,10.0,15.0,20.0
ball,179078.0,3.615587,1.806966,1.0,2.0,4.0,5.0,9.0
is_super_over,179078.0,0.000452,0.021263,0.0,0.0,0.0,0.0,1.0
wide_runs,179078.0,0.036721,0.251161,0.0,0.0,0.0,0.0,5.0
bye_runs,179078.0,0.004936,0.11648,0.0,0.0,0.0,0.0,4.0
legbye_runs,179078.0,0.021136,0.194908,0.0,0.0,0.0,0.0,5.0
noball_runs,179078.0,0.004183,0.070492,0.0,0.0,0.0,0.0,5.0
penalty_runs,179078.0,5.6e-05,0.016709,0.0,0.0,0.0,0.0,5.0


In [43]:
match_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756 entries, 0 to 755
Data columns (total 18 columns):
id                 756 non-null int64
season             756 non-null int64
city               749 non-null object
date               756 non-null object
team1              756 non-null object
team2              756 non-null object
toss_winner        756 non-null object
toss_decision      756 non-null object
result             756 non-null object
dl_applied         756 non-null int64
winner             752 non-null object
win_by_runs        756 non-null int64
win_by_wickets     756 non-null int64
player_of_match    752 non-null object
venue              756 non-null object
umpire1            754 non-null object
umpire2            754 non-null object
umpire3            119 non-null object
dtypes: int64(5), object(13)
memory usage: 106.4+ KB


In [44]:
deliveries_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179078 entries, 0 to 179077
Data columns (total 21 columns):
match_id            179078 non-null int64
inning              179078 non-null int64
batting_team        179078 non-null object
bowling_team        179078 non-null object
over                179078 non-null int64
ball                179078 non-null int64
batsman             179078 non-null object
non_striker         179078 non-null object
bowler              179078 non-null object
is_super_over       179078 non-null int64
wide_runs           179078 non-null int64
bye_runs            179078 non-null int64
legbye_runs         179078 non-null int64
noball_runs         179078 non-null int64
penalty_runs        179078 non-null int64
batsman_runs        179078 non-null int64
extra_runs          179078 non-null int64
total_runs          179078 non-null int64
player_dismissed    8834 non-null object
dismissal_kind      8834 non-null object
fielder             6448 non-null object
dtype

## Checking Null Values

In [45]:
# Checking null values in the match dataset
match_data.isnull().sum()

id                   0
season               0
city                 7
date                 0
team1                0
team2                0
toss_winner          0
toss_decision        0
result               0
dl_applied           0
winner               4
win_by_runs          0
win_by_wickets       0
player_of_match      4
venue                0
umpire1              2
umpire2              2
umpire3            637
dtype: int64

In [46]:
# Checking null values in the deliveries dataset
deliveries_data.isnull().sum()

match_id                 0
inning                   0
batting_team             0
bowling_team             0
over                     0
ball                     0
batsman                  0
non_striker              0
bowler                   0
is_super_over            0
wide_runs                0
bye_runs                 0
legbye_runs              0
noball_runs              0
penalty_runs             0
batsman_runs             0
extra_runs               0
total_runs               0
player_dismissed    170244
dismissal_kind      170244
fielder             172630
dtype: int64

In [47]:
print(f'Match Data has total {match_data.isnull().sum().sum()} null values')
print(f' Data has total {deliveries_data.isnull().sum().sum()} null values')

Train Data has total 656 null values
Test Data has total 513118 null values


### Single Variate Analysis

**Let's understand the data**

In [21]:
# Let's understand the Item Identifier
# getcolumninfo(train_data,'Item_Identifier')
train_data['Item_Identifier'].head()

0    FDA15
1    DRC01
2    FDN15
3    FDX07
4    NCD19
Name: Item_Identifier, dtype: object

In [22]:
len(train_data['Item_Identifier'].unique())

1559

In [23]:
train_data.skew()

Item_Weight                  0.082426
Item_Visibility              1.167091
Item_MRP                     0.127202
Outlet_Establishment_Year   -0.396641
Item_Outlet_Sales            1.177531
dtype: float64

In [24]:
getcolumninfo(train_data,'Item_Identifier')

Unnamed: 0,Details
column_name,Item_Identifier
total_length,8523
min,DRA12
max,NCZ54
unique_count,1559


In [25]:
# Item Weight
train_data['Item_Weight'].head()

0     9.30
1     5.92
2    17.50
3    19.20
4     8.93
Name: Item_Weight, dtype: float64

In [26]:
# drawdistchart(train_data,)
# sns.distplot(train_data,'Item_Weight')
# get_class_counts(train_data,'Item_Weight')
sns.

SyntaxError: invalid syntax (<ipython-input-26-87837a39b573>, line 4)

In [None]:
train_data.columns