## *We can predict an IPL cricket team's first-innings score based on the venue, batting team, bowling team, the current runs and wickets at the current over, and the runs and wickets in the last 5 overs.*

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the IPL dataset from ipl.csv (path is relative to the working directory)
# and preview the first rows.
df = pd.read_csv("ipl.csv")
df.head()

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


## Data preprocessing

### - Drop unnecessary columns

In [3]:
# Remove the match id, date and striker/non-striker columns; none of them is used as a feature.
df = df.drop(columns=['mid', 'date', 'striker', 'non-striker'])

In [4]:
# Preview the frame again to confirm the four columns are gone.
df.head()

Unnamed: 0,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,total
0,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,222
1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,222
2,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,222
3,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,222
4,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,222


### - Handle missing value

In [5]:
# Print dtypes and non-null counts per column to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76014 entries, 0 to 76013
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   venue           76014 non-null  object 
 1   bat_team        76014 non-null  object 
 2   bowl_team       76014 non-null  object 
 3   batsman         76014 non-null  object 
 4   bowler          76014 non-null  object 
 5   runs            76014 non-null  int64  
 6   wickets         76014 non-null  int64  
 7   overs           76014 non-null  float64
 8   runs_last_5     76014 non-null  int64  
 9   wickets_last_5  76014 non-null  int64  
 10  total           76014 non-null  int64  
dtypes: float64(1), int64(5), object(5)
memory usage: 6.4+ MB


#### As shown above, there are no missing values

In [6]:
# List the distinct batting teams, to decide which franchises to keep.
df['bat_team'].unique()

array(['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
       'Mumbai Indians', 'Deccan Chargers', 'Kings XI Punjab',
       'Royal Challengers Bangalore', 'Delhi Daredevils',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant'], dtype=object)

### - Feature selection

In [7]:
# Teams to keep: rows involving any other franchise are filtered out afterwards.
recent_team = [
    'Kolkata Knight Riders',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Mumbai Indians',
    'Kings XI Punjab',
    'Royal Challengers Bangalore',
    'Delhi Daredevils',
    'Sunrisers Hyderabad',
]

In [8]:
# Keep a row only when both the batting side and the bowling side are in recent_team.
batting_is_recent = df['bat_team'].isin(recent_team)
bowling_is_recent = df['bowl_team'].isin(recent_team)
df = df[batting_is_recent & bowling_is_recent]

In [9]:
# (rows, columns) remaining after the team filter.
df.shape

(53811, 11)

In [10]:
# Preview the filtered frame.
df.head()

Unnamed: 0,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,total
0,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,222
1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,222
2,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,222
3,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,222
4,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,222


### - One hot encoding for categorical variable

In [11]:
#handle categorical variable
# One-hot encode venue, batting team and bowling team and append the indicator
# columns to df (the join is on the shared row index).
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 would otherwise emit
# booleans (True/False), which no longer matches the 0/1 tables shown in this notebook.
data = df.join(pd.get_dummies(df[['venue','bat_team','bowl_team']], dtype=int))

### - Drop unnecessary columns

In [12]:
# The original text columns are now redundant (encoded) or unused (batsman, bowler).
data = data.drop(columns=['venue', 'bat_team', 'bowl_team', 'batsman', 'bowler'])

In [13]:
# Lift the column-display limit so wide frames are rendered with every column.
pd.options.display.max_columns = None

In [14]:
# Inspect the column names produced by the one-hot encoding.
data.columns

Index(['runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5', 'total',
       'venue_Barabati Stadium', 'venue_Brabourne Stadium',
       'venue_Buffalo Park', 'venue_De Beers Diamond Oval',
       'venue_Dr DY Patil Sports Academy',
       'venue_Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
       'venue_Dubai International Cricket Stadium', 'venue_Eden Gardens',
       'venue_Feroz Shah Kotla',
       'venue_Himachal Pradesh Cricket Association Stadium',
       'venue_Holkar Cricket Stadium',
       'venue_JSCA International Stadium Complex', 'venue_Kingsmead',
       'venue_M Chinnaswamy Stadium', 'venue_MA Chidambaram Stadium, Chepauk',
       'venue_Maharashtra Cricket Association Stadium',
       'venue_New Wanderers Stadium', 'venue_Newlands',
       'venue_OUTsurance Oval',
       'venue_Punjab Cricket Association IS Bindra Stadium, Mohali',
       'venue_Punjab Cricket Association Stadium, Mohali',
       'venue_Rajiv Gandhi International Stadium, Uppal',
      

In [15]:
# Preview the encoded frame (numeric columns first, indicator columns after).
data.head()

Unnamed: 0,runs,wickets,overs,runs_last_5,wickets_last_5,total,venue_Barabati Stadium,venue_Brabourne Stadium,venue_Buffalo Park,venue_De Beers Diamond Oval,venue_Dr DY Patil Sports Academy,venue_Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium,venue_Dubai International Cricket Stadium,venue_Eden Gardens,venue_Feroz Shah Kotla,venue_Himachal Pradesh Cricket Association Stadium,venue_Holkar Cricket Stadium,venue_JSCA International Stadium Complex,venue_Kingsmead,venue_M Chinnaswamy Stadium,"venue_MA Chidambaram Stadium, Chepauk",venue_Maharashtra Cricket Association Stadium,venue_New Wanderers Stadium,venue_Newlands,venue_OUTsurance Oval,"venue_Punjab Cricket Association IS Bindra Stadium, Mohali","venue_Punjab Cricket Association Stadium, Mohali","venue_Rajiv Gandhi International Stadium, Uppal","venue_Sardar Patel Stadium, Motera",venue_Sawai Mansingh Stadium,venue_Shaheed Veer Narayan Singh International Stadium,venue_Sharjah Cricket Stadium,venue_Sheikh Zayed Stadium,venue_St George's Park,venue_Subrata Roy Sahara Stadium,venue_SuperSport Park,venue_Wankhede Stadium,bat_team_Chennai Super Kings,bat_team_Delhi Daredevils,bat_team_Kings XI Punjab,bat_team_Kolkata Knight Riders,bat_team_Mumbai Indians,bat_team_Rajasthan Royals,bat_team_Royal Challengers Bangalore,bat_team_Sunrisers Hyderabad,bowl_team_Chennai Super Kings,bowl_team_Delhi Daredevils,bowl_team_Kings XI Punjab,bowl_team_Kolkata Knight Riders,bowl_team_Mumbai Indians,bowl_team_Rajasthan Royals,bowl_team_Royal Challengers Bangalore,bowl_team_Sunrisers Hyderabad
0,1,0,0.1,1,0,222,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
1,1,0,0.2,1,0,222,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
2,2,0,0.2,2,0,222,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
3,2,0,0.3,2,0,222,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
4,2,0,0.4,2,0,222,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0


In [16]:
#Rearranging the columns for an ease purpose: one-hot indicator columns first,
#then the numeric features, with the target ('total') last.
# The order is derived from the frame itself instead of a hand-typed list of every
# indicator name, so the cell neither raises KeyError when a venue/team is absent
# from the data nor silently drops an indicator that is missing from the list.
numeric_cols = ['runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5', 'total']
# Indicator columns keep their existing relative order (venue_*, bat_team_*, bowl_team_*).
dummy_cols = [col for col in data.columns if col not in numeric_cols]
data = data[dummy_cols + numeric_cols]

In [17]:
# Preview the frame in its new column order.
data.head()

Unnamed: 0,venue_Barabati Stadium,venue_Brabourne Stadium,venue_Buffalo Park,venue_De Beers Diamond Oval,venue_Dr DY Patil Sports Academy,venue_Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium,venue_Dubai International Cricket Stadium,venue_Eden Gardens,venue_Feroz Shah Kotla,venue_Himachal Pradesh Cricket Association Stadium,venue_Holkar Cricket Stadium,venue_JSCA International Stadium Complex,venue_Kingsmead,venue_M Chinnaswamy Stadium,"venue_MA Chidambaram Stadium, Chepauk",venue_Maharashtra Cricket Association Stadium,venue_New Wanderers Stadium,venue_Newlands,venue_OUTsurance Oval,"venue_Punjab Cricket Association IS Bindra Stadium, Mohali","venue_Punjab Cricket Association Stadium, Mohali","venue_Rajiv Gandhi International Stadium, Uppal","venue_Sardar Patel Stadium, Motera",venue_Sawai Mansingh Stadium,venue_Shaheed Veer Narayan Singh International Stadium,venue_Sharjah Cricket Stadium,venue_Sheikh Zayed Stadium,venue_St George's Park,venue_Subrata Roy Sahara Stadium,venue_SuperSport Park,venue_Wankhede Stadium,bat_team_Chennai Super Kings,bat_team_Delhi Daredevils,bat_team_Kings XI Punjab,bat_team_Kolkata Knight Riders,bat_team_Mumbai Indians,bat_team_Rajasthan Royals,bat_team_Royal Challengers Bangalore,bat_team_Sunrisers Hyderabad,bowl_team_Chennai Super Kings,bowl_team_Delhi Daredevils,bowl_team_Kings XI Punjab,bowl_team_Kolkata Knight Riders,bowl_team_Mumbai Indians,bowl_team_Rajasthan Royals,bowl_team_Royal Challengers Bangalore,bowl_team_Sunrisers Hyderabad,runs,wickets,overs,runs_last_5,wickets_last_5,total
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0.1,1,0,222
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0.2,1,0,222
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,2,0,0.2,2,0,222
3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,2,0,0.3,2,0,222
4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,2,0,0.4,2,0,222


In [18]:
# Discard every delivery bowled before over 5.0.
# Stated rationale: the opening overs are powerplay overs, so they are ignored.
# NOTE(review): in the preview rows runs_last_5 equals runs for the first balls, so the
# five-over features presumably only become meaningful from over 5 onward - confirm.
from_over_five = data['overs'] >= 5.0
data = data.loc[from_over_five]

In [19]:
# Target: the final innings total. Features: every remaining column.
y = data['total']
X = data.drop(columns=['total'])

## Train test split

In [20]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state fixes the shuffle for reproducibility.
# NOTE(review): rows are split individually, yet every row of one innings carries the same
# 'total' (the preview rows all show 222). Rows from the same innings can therefore land in
# both train and test, which may inflate test scores - consider splitting by match instead.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=20)

## Model selection

### 1) Multiple-linear regression

In [21]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares baseline with default settings; fitted later by select_model.
linear_model = LinearRegression()

### 2) Ridge regression (using hyperparameter tuning)

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

# Candidate regularisation strengths, from effectively unregularised (1e-15) up to 40.
# NOTE(review): the near-zero alphas presumably account for some of the solver warnings
# emitted when the search is fitted - confirm before keeping them in the grid.
parameters = {
    'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35, 40],
}
r = Ridge()
# 5-fold cross-validated search over alpha; nothing is fitted until .fit() is called.
ridge_reg = GridSearchCV(estimator=r, param_grid=parameters, cv=5)

### 3) Lasso regression (using hyperparameter tuning)

In [23]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Same alpha grid as used for Ridge (this rebinds the notebook-level `parameters`).
# NOTE(review): the near-zero alphas presumably account for the convergence warnings
# emitted when the search is fitted - confirm before keeping them in the grid.
parameters = {
    'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35, 40],
}
lasso = Lasso()
# 5-fold cross-validated search over alpha; nothing is fitted until .fit() is called.
lasso_regressor = GridSearchCV(estimator=lasso, param_grid=parameters, cv=5)

### 4) Decision tree regression

In [24]:
from sklearn.tree import DecisionTreeRegressor
# Unpruned regression tree with default hyper-parameters.
# random_state is fixed because scikit-learn permutes the features at every split, so
# ties between equally good splits are otherwise broken differently from run to run.
# With a seed, the fitted tree (and the predictions compared later) is reproducible.
# The value matches the seed used for the train/test split.
Decision_reg = DecisionTreeRegressor(random_state=20)

In [25]:
# Accumulators filled by select_model: one training score and one test score per model.
training, testing = [], []

In [26]:
def select_model(model):
    """Fit ``model`` on the training split and record its train/test scores.

    The estimator is fitted in place on the notebook-level ``X_train``/``y_train``.
    Its ``score`` on the test split is appended to the global ``testing`` list, then
    its ``score`` on the training split is appended to the global ``training`` list.

    Returns the ``(testing, training)`` lists themselves (not copies).
    """
    model.fit(X_train, y_train)
    # Test split first, then training split - same order as the appends to each list.
    for scores, features, target in ((testing, X_test, y_test),
                                     (training, X_train, y_train)):
        scores.append(model.score(features, target))
    return testing, training

In [27]:
# Fit every candidate and record its scores, in the same order as the 'Model'
# labels used for the score table.
model_list = [linear_model, ridge_reg, lasso_regressor, Decision_reg]
# Empty the score lists in place first, so that re-running this cell does not append a
# second set of scores (which would break the four-row score table built afterwards).
training.clear()
testing.clear()
for model in model_list:
    select_model(model)

  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


In [49]:
# One row per model, pairing each label with the scores collected by select_model.
# For these regressors `score` is the R^2 coefficient, despite the "accuracy" column names.
score_df = pd.DataFrame({
    'Model': ['Linear', 'Ridge', 'Lasso', 'Decision-tree'],
    'Training_accuracy': training,
    'Testing_accuracy': testing,
})

In [50]:
# Display the comparison table.
score_df

Unnamed: 0,Model,Training_accuracy,Testing_accuracy
0,Linear,0.678325,0.668358
1,Ridge,0.678321,0.668351
2,Lasso,0.678321,0.668347
3,Decision-tree,0.999963,0.907869


### As we can see, among all the regressors the Decision tree gave the best result, so we would choose the Decision tree for prediction
### However, one-hot encoding categorical variables with high cardinality can cause inefficiency in tree-based ensembles. The algorithm gives continuous variables more importance than the dummy variables, which obscures the order of feature importance and results in poorer performance.
### So here we use any algorithm except the Decision tree, because the Decision tree's prediction does not respond to the categorical inputs (venue, batting team and bowling team), as shown below

In [29]:
# 31 venue indicators with only position 0 (the first venue column) switched on.
ven = [int(pos == 0) for pos in range(31)]
print("Veneu is 'Barabati Stadium' so one hot encoding of this stadium is ",ven)

Veneu is 'Barabati Stadium' so one hot encoding of this stadium is  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [30]:
# Eight team indicators each: batting side at position 0, bowling side at position 7.
bet_team = [int(pos == 0) for pos in range(8)]
bowl_team = [int(pos == 7) for pos in range(8)]

In [31]:
# Show the two team vectors ("Betting" in the messages refers to the batting side).
print("Betting team is 'Chennai Super Kings' so one hot encodding is ",bet_team)
print("Bowling team is 'Sunrisers Hyderabad' so one hot encodding is ",bowl_team)

Betting team is 'Chennai Super Kings' so one hot encodding is  [1, 0, 0, 0, 0, 0, 0, 0]
Bowling team is 'Sunrisers Hyderabad' so one hot encodding is  [0, 0, 0, 0, 0, 0, 0, 1]


In [32]:
# Current match situation; each value is a one-element list so it can be concatenated
# with the one-hot vectors.
run, wicket, over = [50], [1], [10]
run_last_5_over, wicket_lat_5_over = [10], [3]

In [33]:
# Flatten all pieces into a single feature row: venue, batting team, bowling team,
# then runs, wickets, overs, runs_last_5, wickets_last_5.
pred_lists = [
    *ven, *bet_team, *bowl_team,
    *run, *wicket, *over, *run_last_5_over, *wicket_lat_5_over,
]

In [34]:
#convert list into array because to make prediction
# Wrapping pred_lists in an outer list yields a 2-D array with a single row,
# i.e. one sample for predict().
ipl_data = np.array([pred_lists])

In [35]:
# Use Decision tree algo
# NOTE(review): the model was fitted on a DataFrame but is queried with a bare ndarray,
# so the values in ipl_data must follow the column order of X exactly.
op = Decision_reg.predict(ipl_data)

In [36]:
# Report an asymmetric band around the prediction: 10 runs below to 5 runs above.
lower, uper = op - 10, op + 5

In [37]:
# lower/uper are arrays, so [0] picks the single prediction; {:.0f} rounds to whole runs.
print("Team score will be in between the range of {:.0f} to {:.0f}".format(lower[0],uper[0]))

Team score will in between 161 to 176


### Now if we change the venue, batting team and bowling team, there is no effect on the output

In [38]:
# Try different categorical value for Decision tree algo
# Same match situation as before; the venue is now position 10 and the batting team
# position 5, while the bowling team stays at position 7.
ven = [int(pos == 10) for pos in range(31)]
bet_team = [int(pos == 5) for pos in range(8)]
bowl_team = [int(pos == 7) for pos in range(8)]
run, wicket, over = [50], [1], [10]
run_last_5_over, wicket_lat_5_over = [10], [3]
pred_lists = [
    *ven, *bet_team, *bowl_team,
    *run, *wicket, *over, *run_last_5_over, *wicket_lat_5_over,
]
ipl_data = np.array([pred_lists])
op = Decision_reg.predict(ipl_data)
lower, uper = op - 10, op + 5
print("Team score will be in between the range of {:.0f} to {:.0f}".format(lower[0],uper[0]))

Team score will in between 161 to 176


### As we can see, there is no change in the result

### Here we'll take Ridge algo

In [39]:
# Same input as the last Decision-tree query, now scored with the tuned Ridge model.
ven = [0] * 31
ven[10] = 1
bet_team = [0] * 8
bet_team[5] = 1
bowl_team = [0] * 8
bowl_team[7] = 1
run, wicket, over = [50], [1], [10]
run_last_5_over, wicket_lat_5_over = [10], [3]
pred_lists = [
    *ven, *bet_team, *bowl_team,
    *run, *wicket, *over, *run_last_5_over, *wicket_lat_5_over,
]
ipl_data = np.array([pred_lists])
op = ridge_reg.predict(ipl_data)
lower, uper = op - 10, op + 5
print("Team score will be in between the range of {:.0f} to {:.0f}".format(lower[0],uper[0]))

Team score will in between 143 to 158


### Now let's change categorical data and see result

In [40]:
# Ridge again with different categories: venue and batting team both at position 0,
# bowling team unchanged at position 7; the numeric match situation is unchanged.
ven = [0] * 31
ven[0] = 1
bet_team = [0] * 8
bet_team[0] = 1
bowl_team = [0] * 8
bowl_team[7] = 1
run, wicket, over = [50], [1], [10]
run_last_5_over, wicket_lat_5_over = [10], [3]
pred_lists = [
    *ven, *bet_team, *bowl_team,
    *run, *wicket, *over, *run_last_5_over, *wicket_lat_5_over,
]
ipl_data = np.array([pred_lists])
op = ridge_reg.predict(ipl_data)
lower, uper = op - 10, op + 5
print("Team score will be in between the range of {:.0f} to {:.0f}".format(lower[0],uper[0]))

Team score will in between 144 to 159


### As we can see, the categorical variables do have an impact on the result

### -  Save model

In [51]:
import joblib
# Persist the fitted Ridge grid-search object (the whole GridSearchCV wrapper) to a file
# in the working directory; joblib.dump returns the list of file names it wrote.
joblib.dump(ridge_reg,"IPL_Score_Predictor.pkl")

['IPL_Score_Predictor.pkl']