In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read 't20.csv' (path is relative to the working directory) into a DataFrame.
t20_df = pd.read_csv(filepath_or_buffer='t20.csv')

In [3]:
# Preview the first five rows to check the columns and value formats.
t20_df.head(n=5)

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2005-06-13,The Rose Bowl,England,Australia,ME Trescothick,B Lee,0,0,0.1,0,0,0,0,179
1,1,2005-06-13,The Rose Bowl,England,Australia,ME Trescothick,B Lee,1,0,0.2,1,0,1,0,179
2,1,2005-06-13,The Rose Bowl,England,Australia,GO Jones,B Lee,1,0,0.3,1,0,1,0,179
3,1,2005-06-13,The Rose Bowl,England,Australia,GO Jones,B Lee,1,0,0.4,1,0,1,0,179
4,1,2005-06-13,The Rose Bowl,England,Australia,GO Jones,B Lee,1,0,0.5,1,0,1,0,179


In [4]:
# Dimensions of the loaded frame as (rows, columns).
t20_df.shape

(180777, 15)

In [5]:
# Shape of the array of distinct venue values, i.e. how many venues appear.
pd.unique(t20_df['venue']).shape

(114,)

In [6]:
# --- Data Cleaning ---
# Drop the columns that are not used as model inputs.
columns_to_remove = ['mid', 'date', 'batsman', 'bowler', 'striker', 'non-striker']
# Rebind instead of mutating in place, and tolerate columns that are already
# gone, so re-running this cell does not raise a KeyError.
t20_df = t20_df.drop(columns=columns_to_remove, errors='ignore')

In [7]:
# List every distinct batting side present in the data.
pd.unique(t20_df["bat_team"])

array(['England', 'Australia', 'South Africa', 'Sri Lanka', 'West Indies',
       'Kenya', 'Pakistan', 'India', 'New Zealand', 'Bangladesh',
       'Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
       'Mumbai Indians', 'Deccan Chargers', 'Kings XI Punjab',
       'Royal Challengers Bangalore', 'Delhi Daredevils', 'Bermuda',
       'Scotland', 'Ireland', 'Afghanistan', 'Zimbabwe', 'Canada',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Brisbane Heat',
       'Melbourne Stars', 'Adelaide Strikers', 'Hobart Hurricanes',
       'Melbourne Renegades', 'Sydney Sixers', 'Perth Scorchers',
       'Sydney Thunder', 'Netherlands', 'Sunrisers Hyderabad', 'Nepal',
       'United Arab Emirates', 'Hong Kong', 'Papua New Guinea', 'Oman',
       'Nottinghamshire', 'Worcestershire', 'Somerset', 'Surrey',
       'Lancashire', 'Middlesex', 'Derbyshire', 'Leicestershire',
       'Warwickshire', 'Glamorgan', 'Durham', 'Sussex', 'Hampshire',
       'Yorkshire', 'Essex', 'Kent', 'Glo

In [8]:
# Teams to keep; used to filter both the batting and the bowling side.
t20_teams = [
    'England',
    'Australia',
    'South Africa',
    'Sri Lanka',
    'West Indies',
    'Pakistan',
    'India',
    'New Zealand',
    'Bangladesh',
    'Afghanistan',
]

In [9]:
# Keep only rows where both the batting and the bowling side are in t20_teams.
batting_side_ok = t20_df['bat_team'].isin(t20_teams)
bowling_side_ok = t20_df['bowl_team'].isin(t20_teams)
t20_df = t20_df[batting_side_ok & bowling_side_ok]

In [10]:
# Current dimensions of t20_df as (rows, columns), to see the effect of the filtering.
t20_df.shape

(43920, 9)

In [26]:
# List the distinct venues that remain in the frame.
pd.unique(t20_df['venue'])

array(['The Rose Bowl', 'Eden Park', 'New Wanderers Stadium',
       'County Ground', 'Brisbane Cricket Ground, Woolloongabba',
       'Sydney Cricket Ground', 'Kennington Oval', 'Newlands',
       'Kingsmead', 'Western Australia Cricket Association Ground',
       'Melbourne Cricket Ground', 'Old Trafford', 'Brabourne Stadium',
       'Jade Stadium', 'Gymkhana Club Ground', "St George's Park",
       'Kensington Oval, Bridgetown', 'National Stadium',
       'SuperSport Park', "Queen's Park Oval, Port of Spain",
       'Trent Bridge', "Lord's", 'Maple Leaf North-West Ground',
       'Westpac Stadium', 'Seddon Park', 'AMI Stadium',
       'R Premadasa Stadium', 'Dubai International Cricket Stadium',
       'Warner Park, Basseterre', 'Bellerive Oval', 'Providence Stadium',
       'Beausejour Stadium, Gros Islet', 'Edgbaston', 'Sophia Gardens',
       'Vidarbha Cricket Association Stadium, Jamtha',
       'Punjab Cricket Association Stadium, Mohali',
       'Sir Vivian Richards Stadium, N

In [12]:
# Discard every row recorded before over 5.0, keeping rows with overs >= 5.0.
t20_df = t20_df.loc[t20_df['overs'].ge(5.0)]

In [13]:
# Features are every column except the last one; the target is the last column.
X, y = t20_df.iloc[:, :-1], t20_df.iloc[:, -1]

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): rows are shuffled individually, so rows from one match can end up
# in both the training and the test set - this presumably inflates the test score.
# Consider splitting per match instead; confirm against the data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
# Helper encoder fitted on the three categorical columns of the whole frame;
# it is used only to collect the category lists.
ohe = OneHotEncoder()
ohe.fit(t20_df.loc[:, ['venue', 'bat_team', 'bowl_team']])

In [17]:
# One-hot encode the three categorical columns using the category lists held by
# `ohe`; every other column is passed through unchanged.
# handle_unknown='ignore' encodes a value outside those lists as all zeros
# instead of raising at transform time (e.g. an unseen venue at prediction).
column_trans = make_column_transformer(
    (OneHotEncoder(categories=ohe.categories_, handle_unknown='ignore'),
     ['venue', 'bat_team', 'bowl_team']),
    remainder='passthrough',
)

In [18]:
# Linear regression estimator with default settings.
lr = LinearRegression()
# Chain the column transformer and the regressor so both are fitted and applied together.
pipe = make_pipeline(column_trans,lr)

In [19]:
# Fit the whole pipeline (encoding + regression) on the training split.
pipe.fit(X=X_train, y=y_train)

In [20]:
# Predict the target for the held-out rows.
y_pred = pipe.predict(X=X_test)

In [21]:
# Coefficient of determination (R^2) of the predictions on the held-out rows.
r2_score(y_true=y_test, y_pred=y_pred)

0.7391531397474085

In [22]:
# # Creating a pickle file for the fitted regression pipeline
# import pickle
# filename = 't20i_model.pkl'
# pickle.dump(pipe, open(filename, 'wb'))