In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the ODI data from a CSV file in the working directory.
odi_df = pd.read_csv('odi.csv')

In [3]:
# Preview the first rows of the freshly loaded frame.
odi_df.head()

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2006-06-13,"Civil Service Cricket Club, Stormont",England,Ireland,ME Trescothick,DT Johnston,0,0,0.1,0,0,0,0,301
1,1,2006-06-13,"Civil Service Cricket Club, Stormont",England,Ireland,ME Trescothick,DT Johnston,0,0,0.2,0,0,0,0,301
2,1,2006-06-13,"Civil Service Cricket Club, Stormont",England,Ireland,ME Trescothick,DT Johnston,4,0,0.3,4,0,0,0,301
3,1,2006-06-13,"Civil Service Cricket Club, Stormont",England,Ireland,ME Trescothick,DT Johnston,6,0,0.4,6,0,0,0,301
4,1,2006-06-13,"Civil Service Cricket Club, Stormont",England,Ireland,ME Trescothick,DT Johnston,6,0,0.5,6,0,0,0,301


In [4]:
# (rows, columns) of the raw frame, before any cleaning.
odi_df.shape

(350899, 15)

In [5]:
# Number of distinct venues in the raw data.
odi_df['venue'].unique().shape

(136,)

In [6]:
# --- Data Cleaning ---
# Remove the match identifier, date and player-level columns; they are not
# used by any later step.
columns_to_remove = ['mid', 'date', 'batsman', 'bowler', 'striker', 'non-striker']
# Rebind instead of dropping in place, and skip labels that are already gone,
# so re-running this cell on the already-cleaned frame is a no-op rather than
# a KeyError.
odi_df = odi_df.drop(columns=columns_to_remove, errors='ignore')

In [7]:
# Confirm that the removed columns no longer appear in the frame.
odi_df.head()

Unnamed: 0,venue,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5,total
0,"Civil Service Cricket Club, Stormont",England,Ireland,0,0,0.1,0,0,301
1,"Civil Service Cricket Club, Stormont",England,Ireland,0,0,0.2,0,0,301
2,"Civil Service Cricket Club, Stormont",England,Ireland,4,0,0.3,4,0,301
3,"Civil Service Cricket Club, Stormont",England,Ireland,6,0,0.4,6,0,301
4,"Civil Service Cricket Club, Stormont",England,Ireland,6,0,0.5,6,0,301


In [8]:
# List every distinct batting team present in the data.
odi_df["bat_team"].unique()

array(['England', 'Pakistan', 'Sri Lanka', 'Australia', 'South Africa',
       'New Zealand', 'Bangladesh', 'West Indies', 'India', 'Zimbabwe',
       'Ireland', 'Scotland', 'Kenya', 'Bermuda', 'Netherlands', 'Canada',
       'Asia XI', 'Afghanistan', 'United Arab Emirates', 'Hong Kong',
       'Papua New Guinea'], dtype=object)

In [9]:
# Teams kept for modelling; rows involving any other side are filtered out.
odi_teams = [
    'England',
    'Pakistan',
    'Sri Lanka',
    'Australia',
    'South Africa',
    'New Zealand',
    'Bangladesh',
    'West Indies',
    'India',
    'Afghanistan',
]

In [10]:
# Keep only the rows in which both the batting side and the bowling side
# belong to odi_teams.
batting_side_kept = odi_df['bat_team'].isin(odi_teams)
bowling_side_kept = odi_df['bowl_team'].isin(odi_teams)
odi_df = odi_df[batting_side_kept & bowling_side_kept]

In [11]:
# Shape after restricting both team columns to odi_teams.
odi_df.shape

(263661, 9)

In [30]:
# Venues that remain in the frame at this point.
odi_df['venue'].unique()

array(['The Rose Bowl', 'Trent Bridge', 'Edgbaston', "Lord's",
       'Kennington Oval', 'Riverside Ground', 'Old Trafford',
       'Headingley', 'Brisbane Cricket Ground, Woolloongabba',
       'Docklands Stadium', 'Sydney Cricket Ground', 'Adelaide Oval',
       'Western Australia Cricket Association Ground', 'Bellerive Oval',
       'Jade Stadium', 'Westpac Stadium', 'Rawalpindi Cricket Stadium',
       'Gaddafi Stadium', 'Multan Cricket Stadium', 'National Stadium',
       'Shaheed Chandu Stadium', 'Chittagong Divisional Stadium',
       'Narayanganj Osmani Stadium', 'Queenstown Events Centre',
       'McLean Park', 'Eden Park', 'Feroz Shah Kotla',
       'Nahar Singh Stadium', 'Nehru Stadium, Fatorda', 'Keenan Stadium',
       'Maharani Usharaje Trust Cricket Ground', 'Newlands',
       "St George's Park", 'Kingsmead', 'New Wanderers Stadium',
       'R Premadasa Stadium', 'Sinhalese Sports Club Ground',
       'Sabina Park, Kingston', 'Warner Park, Basseterre',
       "Queen's Pa

In [13]:
# Print the distinct values left in each team column, batting side first.
for team_column in ('bat_team', 'bowl_team'):
    print(odi_df[team_column].unique())

['England' 'Pakistan' 'Sri Lanka' 'Australia' 'South Africa' 'New Zealand'
 'Bangladesh' 'West Indies' 'India' 'Afghanistan']
['Pakistan' 'England' 'Sri Lanka' 'South Africa' 'Australia' 'New Zealand'
 'India' 'West Indies' 'Bangladesh' 'Afghanistan']


In [14]:
# Drop the first 5 overs of data in every match: only rows whose 'overs'
# value is at least 5.0 are kept.
past_first_five_overs = odi_df['overs'].ge(5.0)
odi_df = odi_df[past_first_five_overs]

In [15]:
# Check the first remaining rows after the overs filter.
odi_df.head()

Unnamed: 0,venue,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5,total
347,The Rose Bowl,England,Pakistan,29,1,5.1,28,0,271
348,The Rose Bowl,England,Pakistan,30,1,5.2,25,0,271
349,The Rose Bowl,England,Pakistan,30,1,5.3,24,0,271
350,The Rose Bowl,England,Pakistan,34,1,5.4,28,0,271
351,The Rose Bowl,England,Pakistan,35,1,5.5,29,0,271


In [16]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target. The target is selected by name
# ('total', currently the last column of odi_df) rather than by position, so
# the split stays correct if the column order ever changes.
TARGET_COLUMN = 'total'
X = odi_df.drop(columns=TARGET_COLUMN)
y = odi_df[TARGET_COLUMN]

# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the split is reproducible.
# NOTE(review): this is a row-level split, so rows that come from the same
# match (and therefore carry the same 'total') can land in both the training
# and the test set, which may make the held-out score optimistic. A per-match
# split would need the 'mid' column that is dropped during cleaning. Confirm
# whether that matters for this analysis.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [17]:
# Look at the source frame again; the split cell above only reads from it.
odi_df.head()

Unnamed: 0,venue,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5,total
347,The Rose Bowl,England,Pakistan,29,1,5.1,28,0,271
348,The Rose Bowl,England,Pakistan,30,1,5.2,25,0,271
349,The Rose Bowl,England,Pakistan,30,1,5.3,24,0,271
350,The Rose Bowl,England,Pakistan,34,1,5.4,28,0,271
351,The Rose Bowl,England,Pakistan,35,1,5.5,29,0,271


In [18]:
# Inspect the training features (every column except 'total').
X_train

Unnamed: 0,venue,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5
30269,"National Cricket Stadium, St George's",Australia,New Zealand,342,6,49.5,60,1
320435,Rangiri Dambulla International Stadium,Sri Lanka,Australia,144,6,34.5,20,2
73353,Iqbal Stadium,South Africa,Pakistan,191,8,47.5,26,3
55824,Andhra Cricket Association-Visakhapatnam Distr...,Sri Lanka,India,124,5,26.3,34,1
139715,Pallekele International Cricket Stadium,New Zealand,Pakistan,163,4,39.6,20,0
...,...,...,...,...,...,...,...,...
336501,Headingley,England,South Africa,264,5,43.4,38,0
6120,Bellerive Oval,Sri Lanka,South Africa,181,2,36.3,24,1
304609,Sophia Gardens,England,Sri Lanka,151,3,27.4,26,1
337394,Clontarf Cricket Club Ground,Bangladesh,New Zealand,34,0,7.3,26,0


In [19]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [20]:
from sklearn.linear_model import LinearRegression

In [21]:
# Learn the category levels of the three categorical columns from the whole
# frame (not only the training split); the fitted categories are reused when
# the pipeline's encoder is built.
categorical_columns = ['venue', 'bat_team', 'bowl_team']
ohe = OneHotEncoder()
ohe.fit(odi_df[categorical_columns])

In [22]:
# One-hot encode the categorical columns using the category levels already
# learned by `ohe`; every other column is passed through unchanged.
fixed_category_encoder = OneHotEncoder(categories=ohe.categories_)
column_trans = make_column_transformer(
    (fixed_category_encoder, ['venue', 'bat_team', 'bowl_team']),
    remainder='passthrough',
)

In [23]:
# Chain the column transformer and a linear regression into one estimator so
# that encoding and fitting/prediction happen in a single call.
lr = LinearRegression()
pipe = make_pipeline(column_trans,lr)

In [24]:
# Fit the whole pipeline (encoder + regressor) on the training split.
pipe.fit(X_train, y_train)

In [25]:
# Predict the target for the held-out rows.
y_pred = pipe.predict(X_test)

In [26]:
# R^2 of the predictions against the true held-out totals.
r2_score(y_test,y_pred)

0.6363470224913299

In [27]:
# Show the predicted totals for comparison with y_test.
y_pred

array([263.54750972, 255.96953397, 198.91871437, ..., 277.06965802,
       273.12358646, 239.76795612])

In [28]:
# Show the true totals of the held-out rows.
y_test

117175    214
151309    303
46919     213
67521     290
110640    321
         ... 
103950    309
180764    280
59886     253
257511    262
145878    229
Name: total, Length: 47168, dtype: int64

In [29]:
# # Creating a pickle file for the fitted regression pipeline
# import pickle
# filename = 'odi_model.pkl'
# with open(filename, 'wb') as model_file:
#     pickle.dump(pipe, model_file)