In [1]:
# Importing Necessary Libraries
import pandas as pd
import numpy as np

In [2]:
# Loading the dataset
data = pd.read_csv('data.csv')

# **Data Analysis**

In [3]:
data.head()

Unnamed: 0,mid,date,venue,batting_team,bowling_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


In [4]:
# Describing Numerical Values of the Dataset
data.describe()

Unnamed: 0,mid,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
count,76014.0,76014.0,76014.0,76014.0,76014.0,76014.0,76014.0,76014.0,76014.0
mean,308.62774,74.889349,2.415844,9.783068,33.216434,1.120307,24.962283,8.869287,160.901452
std,178.156878,48.823327,2.015207,5.772587,14.914174,1.053343,20.079752,10.795742,29.246231
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67.0
25%,154.0,34.0,1.0,4.6,24.0,0.0,10.0,1.0,142.0
50%,308.0,70.0,2.0,9.6,34.0,1.0,20.0,5.0,162.0
75%,463.0,111.0,4.0,14.6,43.0,2.0,35.0,13.0,181.0
max,617.0,263.0,10.0,19.6,113.0,7.0,175.0,109.0,263.0


In [5]:
# Number of Unique Values in each column
data.nunique()

mid               617
date              442
venue              35
batting_team       14
bowling_team       14
batsman           411
bowler            329
runs              252
wickets            11
overs             140
runs_last_5       102
wickets_last_5      8
striker           155
non-striker        88
total             138
dtype: int64

In [6]:
# Datatypes of all Columns
data.dtypes

mid                 int64
date               object
venue              object
batting_team       object
bowling_team       object
batsman            object
bowler             object
runs                int64
wickets             int64
overs             float64
runs_last_5         int64
wickets_last_5      int64
striker             int64
non-striker         int64
total               int64
dtype: object

# **Data Cleaning**

Removing Unwanted Columns

In [7]:
# Names of all columns
data.columns

Index(['mid', 'date', 'venue', 'batting_team', 'bowling_team', 'batsman',
       'bowler', 'runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5',
       'striker', 'non-striker', 'total'],
      dtype='object')

In [8]:
# Removing unwanted columns
columns_to_remove = ['mid', 'date' , 'venue', 'batsman', 'bowler', 'striker', 'non-striker']
print('Before removing unwanted columns: {}'.format(data.shape))
data.drop(labels=columns_to_remove, axis=1, inplace=True)
print('After removing unwanted columns: {}'.format(data.shape))


Before removing unwanted columns: (76014, 15)
After removing unwanted columns: (76014, 8)


In [9]:
consistent_teams = ['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
                    'Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
                    'Delhi Daredevils', 'Sunrisers Hyderabad']

In [10]:
print('Before Removing Inconsistent Teams : {}'.format(data.shape))
data = data[(data['batting_team'].isin(consistent_teams)) & (data['bowling_team'].isin(consistent_teams))]
print('After Removing Irrelevant Columns : {}'.format(data.shape))

Before Removing Inconsistent Teams : (76014, 8)
After Removing Irrelevant Columns : (53811, 8)


In [11]:
#teams after removing
data['batting_team'].unique()

array(['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
       'Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
       'Delhi Daredevils', 'Sunrisers Hyderabad'], dtype=object)

In [12]:
# Removing the first 5 overs data in every match
print('Before removing first 5 overs data: {}'.format(data.shape))
data = data[data['overs']>=5.0]
print('After removing first 5 overs data: {}'.format(data.shape))

Before removing first 5 overs data: (53811, 8)
After removing first 5 overs data: (40108, 8)


In [32]:
#from seaborn import heatmap
#heatmap(data=data.co(), annot=True)

# **Data Preprocessing and Encoding**

Performing Label Encoding

In [14]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder 
label = LabelEncoder()
for col in ['batting_team', 'bowling_team']:
  data[col] = label.fit_transform(data[col])
data.head()

Unnamed: 0,batting_team,bowling_team,runs,wickets,overs,runs_last_5,wickets_last_5,total
32,3,6,61,0,5.1,59,0,222
33,3,6,61,1,5.2,59,1,222
34,3,6,61,1,5.3,59,1,222
35,3,6,61,1,5.4,59,1,222
36,3,6,61,1,5.5,58,1,222


Performing One Hot Encoding and Column Transformation

In [15]:
from sklearn.compose import ColumnTransformer
columnTransformer = ColumnTransformer([('encoder', 
                                        OneHotEncoder(), 
                                        [0, 1])], 
                                      remainder='passthrough')

In [16]:
data = np.array(columnTransformer.fit_transform(data))

In [17]:
# Rearranging the columns
cols = ['bat_team_Chennai Super Kings', 'bat_team_Delhi Daredevils', 'bat_team_Kings XI Punjab',
              'bat_team_Kolkata Knight Riders', 'bat_team_Mumbai Indians', 'bat_team_Rajasthan Royals',
              'bat_team_Royal Challengers Bangalore', 'bat_team_Sunrisers Hyderabad',
              'bowl_team_Chennai Super Kings', 'bowl_team_Delhi Daredevils', 'bowl_team_Kings XI Punjab',
              'bowl_team_Kolkata Knight Riders', 'bowl_team_Mumbai Indians', 'bowl_team_Rajasthan Royals',
              'bowl_team_Royal Challengers Bangalore', 'bowl_team_Sunrisers Hyderabad',
              'overs', 'runs', 'wickets', 'runs_last_5', 'wickets_last_5', 'total']
df = pd.DataFrame(data, columns=cols)

In [18]:
# Visualizing the Encoded Data
df.head()

Unnamed: 0,bat_team_Chennai Super Kings,bat_team_Delhi Daredevils,bat_team_Kings XI Punjab,bat_team_Kolkata Knight Riders,bat_team_Mumbai Indians,bat_team_Rajasthan Royals,bat_team_Royal Challengers Bangalore,bat_team_Sunrisers Hyderabad,bowl_team_Chennai Super Kings,bowl_team_Delhi Daredevils,...,bowl_team_Mumbai Indians,bowl_team_Rajasthan Royals,bowl_team_Royal Challengers Bangalore,bowl_team_Sunrisers Hyderabad,overs,runs,wickets,runs_last_5,wickets_last_5,total
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,61.0,0.0,5.1,59.0,0.0,222.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,61.0,1.0,5.2,59.0,1.0,222.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,61.0,1.0,5.3,59.0,1.0,222.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,61.0,1.0,5.4,59.0,1.0,222.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,61.0,1.0,5.5,58.0,1.0,222.0


# **Train and Test Splits**

In [19]:
features = df.drop(['total'], axis=1)
labels = df['total']

In [20]:
# Perform 80 : 20 Train-Test split
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.20, shuffle=True)
print(f"Training Set : {train_features.shape}\nTesting Set : {test_features.shape}")

Training Set : (32086, 21)
Testing Set : (8022, 21)


# **Algorithms**
Training and Testing on different Machine Learning Algorithms

In [21]:
# Keeping track of model perfomances
models = dict()

## Random Forest Regression Implementation

In [22]:
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor()
# Train Model
forest.fit(train_features, train_labels)

RandomForestRegressor()

In [23]:
# Evaluate Model
train_score_forest = str(forest.score(train_features, train_labels)*100)
test_score_forest = str(forest.score(test_features, test_labels)*100)
print(f'Train Score : {train_score_forest[:5]}%\nTest Score : {test_score_forest[:5]}%')
models["forest"] = test_score_forest

Train Score : 99.05%
Test Score : 93.41%


In [24]:
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, accuracy_score
print("---- Random Forest Regression - Model Evaluation ----")
print("Mean Absolute Error (MAE): {}".format(mae(test_labels, forest.predict(test_features))))
print("Mean Squared Error (MSE): {}".format(mse(test_labels, forest.predict(test_features))))
print("Root Mean Squared Error (RMSE): {}".format(np.sqrt(mse(test_labels, forest.predict(test_features)))))

---- Random Forest Regression - Model Evaluation ----
Mean Absolute Error (MAE): 4.50549497809595
Mean Squared Error (MSE): 59.347495327008744
Root Mean Squared Error (RMSE): 7.7037325581180935


### From above we can see that  Random Forest performed the best, closely followed by Decision Tree and Linear Regression. So we will be choosing Random Forest for the Final model.

Prediction Function

In [25]:
def predict_score(batting_team, bowling_team, runs, wickets, overs, runs_last_5, wickets_last_5, model=forest):
  prediction_array = []
  # Batting Team
  if batting_team == 'Chennai Super Kings':
    prediction_array = prediction_array + [1,0,0,0,0,0,0,0]
  elif batting_team == 'Delhi Daredevils':
    prediction_array = prediction_array + [0,1,0,0,0,0,0,0]
  elif batting_team == 'Kings XI Punjab':
    prediction_array = prediction_array + [0,0,1,0,0,0,0,0]
  elif batting_team == 'Kolkata Knight Riders':
    prediction_array = prediction_array + [0,0,0,1,0,0,0,0]
  elif batting_team == 'Mumbai Indians':
    prediction_array = prediction_array + [0,0,0,0,1,0,0,0]
  elif batting_team == 'Rajasthan Royals':
    prediction_array = prediction_array + [0,0,0,0,0,1,0,0]
  elif batting_team == 'Royal Challengers Bangalore':
    prediction_array = prediction_array + [0,0,0,0,0,0,1,0]
  elif batting_team == 'Sunrisers Hyderabad':
    prediction_array = prediction_array + [0,0,0,0,0,0,0,1]

  # Bowling Team
  if bowling_team == 'Chennai Super Kings':
    prediction_array = prediction_array + [1,0,0,0,0,0,0,0]
  elif bowling_team == 'Delhi Daredevils':
    prediction_array = prediction_array + [0,1,0,0,0,0,0,0]
  elif bowling_team == 'Kings XI Punjab':
    prediction_array = prediction_array + [0,0,1,0,0,0,0,0]
  elif bowling_team == 'Kolkata Knight Riders':
    prediction_array = prediction_array + [0,0,0,1,0,0,0,0]
  elif bowling_team == 'Mumbai Indians':
    prediction_array = prediction_array + [0,0,0,0,1,0,0,0]
  elif bowling_team == 'Rajasthan Royals':
    prediction_array = prediction_array + [0,0,0,0,0,1,0,0]
  elif bowling_team == 'Royal Challengers Bangalore':
    prediction_array = prediction_array + [0,0,0,0,0,0,1,0]
  elif bowling_team == 'Sunrisers Hyderabad':
    prediction_array = prediction_array + [0,0,0,0,0,0,0,1]

  prediction_array = prediction_array + [runs, wickets, overs, runs_last_5, wickets_last_5]
  prediction_array = np.array([prediction_array])
  predict = model.predict(prediction_array)
  return int(predict[0])

# Predictions

In [26]:
batting_team='Kolkata Knight Riders'
bowling_team='Royal Challengers Bangalore'
final_score = predict_score(batting_team, bowling_team, overs=9.1, runs=78, wickets=1, runs_last_5=24, wickets_last_5=1)
print("The final predicted score (range): {} to {}".format(final_score-10, final_score+10))

The final predicted score (range): 196 to 216


  "X does not have valid feature names, but"


In [27]:
batting_team='Chennai Super Kings'
bowling_team='Mumbai Indians'
final_score = predict_score(batting_team, bowling_team, overs=18.6, runs=168, wickets=5, runs_last_5=66, wickets_last_5=2)
print("The final predicted score (range): {} to {}".format(final_score-10, final_score+10))

The final predicted score (range): 175 to 195


  "X does not have valid feature names, but"


In [28]:
batting_team='Delhi Daredevils'
bowling_team='Chennai Super Kings'
final_score = predict_score(batting_team, bowling_team,  overs=10.2, runs=68, wickets=3, runs_last_5=29, wickets_last_5=1)
print("The final predicted score (range): {} to {}".format(final_score-10, final_score+10))

The final predicted score (range): 137 to 157


  "X does not have valid feature names, but"


In [29]:
batting_team = 'Kolkata Knight Riders';
bowlling_team = 'Mumbai Indians';
final_score = predict_score(batting_team,bowling_team,overs=7.5,runs=36,wickets=5,runs_last_5=23,wickets_last_5=4)
print("The final predicted score (range): {} to {}".format(final_score-10, final_score+10))

  "X does not have valid feature names, but"


The final predicted score (range): 106 to 126


In [30]:
batting_team = 'Mumbai Indians'
bowling_team = 'Kings XI Punjab'
final_score = predict_score(batting_team,bowling_team,overs=14.1,runs=136,wickets=4,runs_last_5=50,wickets_last_5=0)
print("The final predicted score (range): {} to {}".format(final_score-10, final_score+10))

  "X does not have valid feature names, but"


The final predicted score (range): 182 to 202
