In [None]:
! pip install -U pandas==1.5.3

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Every CSV lives in the same Kaggle input directory; keep that path in one place.
DATA_DIR = "/kaggle/input/ipl-data-set"

match = pd.read_csv(f"{DATA_DIR}/matches.csv")
delivery = pd.read_csv(f"{DATA_DIR}/deliveries.csv")
most_run_strikerate = pd.read_csv(f"{DATA_DIR}/most_runs_average_strikerate.csv")
# NOTE: `teams` is rebound to a plain list of franchise names in a later cell.
teams = pd.read_csv(f"{DATA_DIR}/teams.csv")
home_away = pd.read_csv(f"{DATA_DIR}/teamwise_home_and_away.csv")

In [None]:
# preview the first rows of the matches table
match.head()

In [None]:
# (rows, columns) of the matches table
match.shape

In [None]:
# preview the first rows of the deliveries table
delivery.head()

In [None]:
# (rows, columns) of the deliveries table
delivery.shape

In [None]:
# Total runs per (match_id, inning). Select `total_runs` before aggregating so only
# that column is summed, instead of summing every column and discarding the rest.
total_score_df = (
    delivery.groupby(['match_id', 'inning'])['total_runs']
    .sum()
    .reset_index()
)

In [None]:
# preview the per-match, per-innings run totals
total_score_df.head()

In [None]:
# keep only the rows whose `inning` is 1
total_score_df = total_score_df.loc[total_score_df['inning'].eq(1)]

In [None]:
# attach the innings total to each match row
# (match.id pairs with total_score_df.match_id; inner join, which is the default)
match_df = pd.merge(
    match,
    total_score_df[['match_id', 'total_runs']],
    how='inner',
    left_on='id',
    right_on='match_id',
)

In [None]:
# list the distinct values that appear in the team1 column
match_df['team1'].unique()

In [None]:
# Franchise names used further down to restrict which matches are kept.
# NOTE: this rebinds `teams`, replacing the DataFrame read from teams.csv earlier.
teams = [
    'Sunrisers Hyderabad',
    'Mumbai Indians',
    'Royal Challengers Bangalore',
    'Kolkata Knight Riders',
    'Kings XI Punjab',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Delhi Capitals'
]

In [None]:
# map former franchise names onto their replacements in both team columns
for team_col in ('team1', 'team2'):
    for former_name, current_name in (
        ('Delhi Daredevils', 'Delhi Capitals'),
        ('Deccan Chargers', 'Sunrisers Hyderabad'),
    ):
        match_df[team_col] = match_df[team_col].str.replace(former_name, current_name)

In [None]:
# keep a match only when both of its sides appear in the `teams` list
both_sides_listed = match_df['team1'].isin(teams) & match_df['team2'].isin(teams)
match_df = match_df[both_sides_listed]

In [None]:
# (rows, columns) remaining after the team filter
match_df.shape

In [None]:
# keep only the matches whose dl_applied flag equals 0
match_df = match_df.loc[match_df['dl_applied'].eq(0)]

In [None]:
# narrow the frame to the four columns carried into the per-delivery table
needed_columns = ['match_id', 'city', 'winner', 'total_runs']
match_df = match_df.loc[:, needed_columns]

In [None]:
# join every delivery onto its match row (inner join on match_id)
delivery_df = match_df.merge(delivery, on='match_id')

# The franchise renames applied to team1/team2 never reach the columns that are
# actually modelled: batting_team / bowling_team come from `delivery` and `winner`
# is carried over from the matches table unchanged. Normalise all three here
# (exact-value replacement) so one franchise is not split across two category
# values and batting_team == winner comparisons stay consistent.
franchise_renames = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Deccan Chargers': 'Sunrisers Hyderabad',
}
for team_col in ('batting_team', 'bowling_team', 'winner'):
    delivery_df[team_col] = delivery_df[team_col].replace(franchise_renames)

In [None]:
# preview the merged match + delivery rows
delivery_df.head()

In [None]:
# keep only innings 2; .copy() yields an independent frame so the column
# assignments in the following cells do not write to a slice of the merged frame
# (which triggers SettingWithCopyWarning)
delivery_df = delivery_df[delivery_df['inning'] == 2].copy()

In [None]:
# (rows, columns) of the innings-2 deliveries
delivery_df.shape

In [None]:
# running score per match: select total_runs_y before cumsum so only that column is
# accumulated, rather than cumulating every column of the frame and picking one
delivery_df['current_score'] = delivery_df.groupby('match_id')['total_runs_y'].cumsum()
# runs still needed, measured against the match-level total (total_runs_x)
# NOTE(review): a chase is normally won at total + 1, so this may be off by one
# relative to the real target -- confirm the intended definition.
delivery_df['runs_left'] = delivery_df['total_runs_x'] - delivery_df['current_score']

In [None]:
# deliveries remaining: 126 - (6 * over + ball)
# NOTE(review): the 126 offset (rather than 120) presumably reflects a 1-based `over` column -- confirm
delivery_df['balls_left'] = 126 - delivery_df['over'].mul(6).add(delivery_df['ball'])

In [None]:
# inspect the frame now that current_score, runs_left and balls_left have been added
delivery_df

In [None]:
#wickets left
# 1 where a dismissal is recorded on the delivery, else 0. Built from a boolean mask
# instead of fillna/apply so that re-running this cell on the already converted 0/1
# column leaves it unchanged (the fillna/apply version turned every row into 1 on a
# second run, because the integer 0 is not equal to the string "0").
dismissal = delivery_df['player_dismissed']
delivery_df['player_dismissed'] = (dismissal.notna() & dismissal.astype(str).ne("0")).astype('int')
# dismissals so far within each match; cumsum only the one column that is needed
wickets = delivery_df.groupby('match_id')['player_dismissed'].cumsum().values
delivery_df['wickets'] = 10 - wickets
delivery_df.head()

In [None]:
# preview the last rows to check the end-of-frame values
delivery_df.tail()

In [None]:
# current run rate: score * 6 divided by (120 - balls_left)
delivery_df['crr'] = delivery_df['current_score'].mul(6).div(120 - delivery_df['balls_left'])

In [None]:
# preview the frame with the crr column added
delivery_df.head()

In [None]:
# required run rate: runs_left * 6 divided by balls_left
delivery_df['rrr'] = delivery_df['runs_left'].mul(6).div(delivery_df['balls_left'])

In [None]:
def result(row):
    """Return 1 when the row's batting_team equals its winner, otherwise 0."""
    batting_side_won = row['batting_team'] == row['winner']
    if batting_side_won:
        return 1
    return 0

In [None]:
# label every delivery: 1 if the batting side is the match winner, else 0.
# A vectorised comparison replaces the row-by-row apply; a missing winner compares
# unequal and yields 0, exactly as the per-row check does.
delivery_df['result'] = (delivery_df['batting_team'] == delivery_df['winner']).astype(int)

In [None]:
# preview the frame with the result label added
delivery_df.head()

In [None]:
# modelling table: team/city context, match-state features, and the label as the last column
model_columns = [
    'batting_team', 'bowling_team', 'city',
    'runs_left', 'balls_left', 'wickets',
    'total_runs_x', 'crr', 'rrr',
    'result',
]
final_df = delivery_df.loc[:, model_columns]

In [None]:
# preview the modelling table
final_df.head()

In [None]:
# shuffle all rows; the fixed seed keeps the row order -- and therefore the
# seeded train/test split that follows -- identical from one run to the next
final_df = final_df.sample(frac=1, random_state=1)

In [None]:
# discard every row that has a missing value in any column
final_df = final_df.dropna()

In [None]:
# rrr divides by balls_left, so rows with no balls left carry an infinite rate and
# must go. Negative counts are dropped as well: 126 - (over*6 + ball) goes below
# zero if the final over records more than six deliveries, and `!= 0` would let
# those rows (with a negative rrr) through.
final_df = final_df[final_df['balls_left'] > 0]

In [None]:
# preview the cleaned, shuffled modelling table
final_df.head()

In [None]:
# the last column is the label; everything before it is a feature
label_column = final_df.columns[-1]
X_train = final_df.drop(columns=label_column)
y_train = final_df[label_column]

In [None]:
from sklearn.model_selection import train_test_split

# Split straight from final_df instead of from X_train / y_train: rebinding those
# names from themselves means a second run of this cell would split the already
# split training set again. On a fresh top-to-bottom run the inputs are the same
# features/label the previous cell builds (all columns but the last / the last column).
X_train, X_test, y_train, y_test = train_test_split(
    final_df.iloc[:, :-1],
    final_df.iloc[:, -1],
    test_size=0.2,
    random_state=1,
)

In [None]:
# inspect the training features
X_train

In [None]:
# inspect the training labels
y_train

In [None]:
#handling nominal(categorical) data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC

# One-hot encode the three categorical columns (dropping the first level of each)
# and pass the remaining columns through untouched.
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` to `sparse_output` and 1.4
# removed the old name, so try the new keyword first and fall back for older releases.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:  # scikit-learn < 1.2 only accepts `sparse`
    encoder = OneHotEncoder(sparse=False, drop='first')

trf = ColumnTransformer([('trf', encoder, ['batting_team', 'bowling_team', 'city'])], remainder='passthrough')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.pipeline import Pipeline

In [None]:
# pipe = Pipeline(steps=[
#     ('step1',trf),
#     ('step2',LogisticRegression(solver='liblinear'))
# ])

In [None]:
# pipe = Pipeline(steps=[
#     ('step1',trf),
#     ('step2',RandomForestClassifier())
# ])

In [None]:
# pipe = Pipeline(steps=[
#     ('step1',trf),
#     ('step2',SVC( probability=True))
# ])

In [None]:
# preprocessing + decision tree as one estimator; random_state pins the tree's
# internal randomness so repeated fits on the same data give the same model
# NOTE(review): a fully grown tree tends to return 0/1 from predict_proba -- consider
# one of the commented-out estimators if graded win probabilities are needed.
pipe = Pipeline(steps=[
    ('step1', trf),
    ('step2', tree.DecisionTreeClassifier(random_state=1)),
])

In [None]:
# fit the encoder and the classifier on the training split
pipe.fit(X_train,y_train)

In [None]:
# predicted labels for the held-out split
y_pred = pipe.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# share of held-out rows whose predicted label matches the true one
# NOTE(review): the split is made per delivery row, so rows from one match can sit in
# both train and test -- this figure may be optimistic; confirm with a per-match split.
accuracy_score(y_test,y_pred)

In [None]:
# class probabilities predicted for the held-out row at position 10
pipe.predict_proba(X_test)[10]