In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

### Importing data

In [2]:
squads = pd.read_csv('IPL 2020 Squads.csv', engine = 'python')
deliveries = pd.read_csv('Traning Deliveries Matches IPL 2008-2019.csv' ,engine = 'python')
qualifier = pd.read_csv('Qualifiers IPL 2008-2019.csv', engine = 'python')
test = pd.read_csv('Testset Matches IPL 2020.csv',engine = 'python')
train = pd.read_csv('Training Matches IPL 2008-2019.csv', engine = 'python')

### Data pre-processing

Dropping unwanted parameters from deliveries.

In [3]:
deliveries.drop(["over","ball","non_striker","is_super_over","wide_runs","bye_runs",
                 "legbye_runs","noball_runs","penalty_runs","extra_runs",
                 "total_runs","dismissal_kind"],axis = 1,inplace = True)

In [4]:
deliveries["player_dismissed"] = deliveries["player_dismissed"].fillna(0)
deliveries["player_dismissed"] = np.where(deliveries["player_dismissed"] == 0 ,0,1)
deliveries["catch"] = deliveries["fielder"]
deliveries["catch"] = deliveries["catch"].fillna(0)
deliveries["catch"] = np.where(deliveries["catch"] == 0 ,0,1)

In [5]:
deliveries["batsman_extra_score"] = deliveries["batsman_runs"]
deliveries["batsman_extra_score"] = deliveries["batsman_extra_score"].replace(1,0)
deliveries["batsman_extra_score"] = deliveries["batsman_extra_score"].replace(2,0)
deliveries["batsman_extra_score"] = deliveries["batsman_extra_score"].replace(3,0)
deliveries["batsman_extra_score"] = deliveries["batsman_extra_score"].replace(5,0)
deliveries["batsman_extra_score"] = deliveries["batsman_extra_score"].replace(4,1)
deliveries["batsman_extra_score"] = deliveries["batsman_extra_score"].replace(6,2)

Checking all teams

In [6]:
teams = train.team1.unique()
for i in teams:
    print(i)

Kolkata Knight Riders
Chennai Super Kings
Rajasthan Royals
Mumbai Indians
Deccan Chargers
Kings XI Punjab
Royal Challengers Bangalore
Delhi Daredevils
Kochi Tuskers Kerala
Pune Warriors
Sunrisers Hyderabad
Rising Pune Supergiants
Gujarat Lions
Rising Pune Supergiant
Delhi Capitals


In [7]:
squads["Player_ipl_team"].unique()

array(['CSK', 'RCB', 'DC', 'MI', 'KKR', 'RR', 'KXIP', 'SRH'], dtype=object)

In [8]:
teams.sort()

team_abv = []
for t in teams:
    new = t[0]
    if t == "Kings XI Punjab":
        team_abv.append("KXIP")
    if t == "Sunrisers Hyderabad":
        team_abv.append("SRH")
    else:
        for i in range(len(t)):
            if t[i] == " ":
                new = new + t[i+1]
        team_abv.append(new)

l1 = ["team1" , "team2","toss_winner","winner"]
l2 = ["batting_team","bowling_team"]
for i in range(len(teams)):
    for l in l1:
        train[l] = train[l].replace(teams[i],team_abv[i])
    for l in l2:
        deliveries[l] = deliveries[l].replace(teams[i],team_abv[i])

In [9]:
batsman = deliveries.batsman.unique()
batsman.sort()
bowler = deliveries.bowler.unique()
bowler.sort()

corrected_batsman = []
for b in batsman:
    new = b[0]
    for i in range(len(b)):
        if b[i] == " ":
            new = new + b[i:]
    corrected_batsman.append(new)

corrected_bowler = []
for b in bowler:
    new = b[0]
    for i in range(len(b)):
        if b[i] == " ":
            new = new + b[i:]
    corrected_bowler.append(new)
    
for i in range(len(batsman)):
    deliveries["batsman"] = deliveries["batsman"].replace(batsman[i],corrected_batsman[i])

for i in range(len(bowler)):
    deliveries["bowler"] = deliveries["bowler"].replace(bowler[i],corrected_bowler[i])

In [10]:
for j in range(squads.shape[0]):
    p = squads["Player_name"][j]
    tem = p[0]+" "
    for i in range(len(p)):
        if p[i] == " ":
            tem += p[i+1:]
        if p[i] == "(":
            tem = tem[:i-2]
            
    squads["Player_name"][j] = tem

Cleaning train data

In [11]:
train["toss_decision"] = np.where(train["toss_decision"] == "bat",1,0)

In [12]:
train["team1_toss_win"] = train["team1"]
train["team1_toss_win"] = np.where(train["team1"] == train ["toss_winner"],1,0)

train["team1_bat"] = train["team1"]
train["team1_bat"] = np.where(train["team1_toss_win"] == train["toss_decision"],1,0)

train["team1_win"] = train["team1"]
train["team1_win"] = np.where(train["team1"] == train ["winner"],1,0)

Assigning values to each team

In [13]:
encoder= preprocessing.LabelEncoder()
train["team1"]=encoder.fit_transform(train["team1"])
train["team2"]=encoder.fit_transform(train["team2"])


### Defining function for calculating Player_value

In [14]:
def Player_value(player,match):
    df_1 = deliveries.query("match_id == '%s'"%match).query("batsman == '%s'"%player)
    score = sum(df_1["batsman_runs"])
    if score == 0:
        score += -2
    if score >= 50 and score < 100:
        score += 8
    if score >= 100:
        score += 16
    score += sum(df_1["batsman_extra_score"])
    
    df_2 = deliveries.query("match_id == '%s'"%match).query("bowler == '%s'"%player)
    if sum(df_2["player_dismissed"]) == 4:
        score += 8
    if sum(df_2["player_dismissed"]) >= 5:
        score += 16
    score += sum(df_2["player_dismissed"])*25
    
    df_3 = deliveries.query("match_id == '%s'"%match).query("fielder == '%s'"%player)
    score += sum(df_3["catch"])*8
    
    if df_1.shape[0] == 0 and df_2.shape[0] == 0 and df_3.shape[0] == 0:
        return 0
    
    return score

### Creating Dataset for teams' scores

In [16]:
match_id = deliveries.match_id.unique()

Scores = pd.DataFrame(columns = ["id","team1_bats_score","team1_bowl_score","team2_bats_score","team2_bowl_score"])

for i in match_id:
    bowler_1 = deliveries.query("match_id == '%s'"%i).query("inning == 2").bowler.unique()
    bowler_2 = deliveries.query("match_id == '%s'"%i).query("inning == 1").bowler.unique()
    bats_1 = deliveries.query("match_id == '%s'"%i).query("inning == 1").batsman.unique()
    bats_2 = deliveries.query("match_id == '%s'"%i).query("inning == 2").batsman.unique()
    
    team1_bowl_score = 0
    team2_bowl_score = 0
    team1_bats_score = 0
    team2_bats_score = 0
    
    for p in bowler_1:
        team1_bowl_score += Player_value(p,i)
        
    for p in bowler_2:
        team2_bowl_score += Player_value(p,i)
    
    for p in bats_1:
        team1_bats_score += Player_value(p,i)
        
    for p in bats_2:
        team2_bats_score += Player_value(p,i)
        
    Scores = Scores.append({"id":i,"team1_bats_score":team1_bats_score,
                            "team1_bowl_score":team1_bowl_score,
                            "team2_bats_score":team2_bats_score,
                            "team2_bowl_score":team2_bowl_score},  ignore_index = True)

Selecting useful parameters from train data

In [17]:
train = train[["id","team1","team2","team1_toss_win","team1_bat","team1_win"]]

Merging teams' scores and train data

In [54]:
TEMP = Scores.merge(train,how='left', on = 'id')

In [55]:
TEMP.drop(["id"],axis = 1,inplace = True)

In [20]:
TEMP = TEMP.astype(float)

### Training Models

Finding the higly correlated features

In [56]:
correlated_features = set()
correlation_matrix = TEMP.drop('team1_win', axis=1).corr()

for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.9:
            column = correlation_matrix.columns[i]
            correlated_features.add(column)
            

correlation_matrix

Unnamed: 0,team1,team2,team1_toss_win,team1_bat
team1,1.0,-0.110777,-0.131155,-0.076954
team2,-0.110777,1.0,-0.003184,-0.076191
team1_toss_win,-0.131155,-0.003184,1.0,-0.064383
team1_bat,-0.076954,-0.076191,-0.064383,1.0


Spliting data for training

In [97]:
X = TEMP[["team1","team2","team1_toss_win","team1_bat","team1_bats_score","team1_bowl_score","team2_bats_score","team2_bowl_score"]]
target = TEMP["team1_win"]
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=0,shuffle=True)

Checking for the best Model

In [98]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of Logistic Regression Classifier on test set: {:.4f}'.format(logreg.score(X_test, y_test)))

#Decision Tree Classifier
dtree=DecisionTreeClassifier()
dtree.fit(X_train,y_train)
y_pred = dtree.predict(X_test)
print('Accuracy of Decision Tree Classifier on test set: {:.4f}'.format(dtree.score(X_test, y_test)))

#SVM
svm=SVC()
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
print('Accuracy of SVM Classifier on test set: {:.4f}'.format(svm.score(X_test, y_test)))

#Random Forest Classifier
randomForest= RandomForestClassifier(n_estimators=100)
randomForest.fit(X_train,y_train)
y_pred = randomForest.predict(X_test)
print('Accuracy of Random Forest Classifier on test set: {:.4f}'.format(randomForest.score(X_test, y_test)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy of Logistic Regression Classifier on test set: 0.7895
Accuracy of Decision Tree Classifier on test set: 0.7500
Accuracy of SVM Classifier on test set: 0.8092
Accuracy of Random Forest Classifier on test set: 0.8158


We can see both Logistic and SVM model are good. But we choose SVM classifier.

# Test

In [24]:
test = test[["id","team1","team2"]]

Defining team1 toss win with 0.5 probability and decision of toss with 0.5 probability

In [25]:
test["team1_toss_win"] = np.random.randint(0,2,test.shape[0])
test["team1_bat"] = np.random.randint(0,2,test.shape[0])

Calculating expected Scores for each team

In [78]:
teams2020 = squads["Player_ipl_team"].unique()
Expected = pd.DataFrame(columns = ["team","avg_bats_score","avg_bowl_score"])

for t in teams2020:
    team_bats_avg = 0
    team_bowl_avg = 0
    temp = squads.query("Player_ipl_team == '%s'"%t)
    for i in temp["Player_name"]:
        player_bowl_value = 0
        player_bats_value = 0
        bats_matches = deliveries.query("batsman == '%s'"%i).match_id.unique()
        bowler_matches = deliveries.query("bowler == '%s'"%i).match_id.unique()
        
        for m in bowler_matches:
            player_bowl_value += Player_value(i,m)**2
        
        for m in bats_matches:
            player_bats_value += Player_value(i,m)**2
        
        if len(bats_matches) != 0:
            team_bats_avg += player_bats_value // len(bats_matches)
            
            
        if len(bowler_matches) != 0:
            team_bowl_avg += player_bowl_value // len(bowler_matches)
            
        else: 
            team_bats_avg += 20
            team_bowl_avg += 20
            
    Expected = Expected.append({"team": t,"avg_bats_score":(team_bats_avg)**(1/2),"avg_bowl_score":(team_bowl_avg)**(1/2)},ignore_index=True)

In [81]:
test["team1_bats_score"] = test["team1"]
test["team1_bowl_score"] = test["team2"]
test["team2_bats_score"] = test["team1"]
test["team2_bowl_score"] = test["team2"]

for i in range(Expected.shape[0]):
    test["team1_bats_score"] = test["team1_bats_score"].replace(Expected["team"][i],Expected["avg_bats_score"][i])
    test["team2_bats_score"] = test["team2_bats_score"].replace(Expected["team"][i],Expected["avg_bats_score"][i])
    test["team1_bowl_score"] = test["team1_bowl_score"].replace(Expected["team"][i],Expected["avg_bowl_score"][i])
    test["team2_bowl_score"] = test["team2_bowl_score"].replace(Expected["team"][i],Expected["avg_bowl_score"][i])

In [82]:
test

Unnamed: 0,id,team1,team2,team1_toss_win,team1_bat,team1_bats_score,team2_bowl_score,team2_bats_score,team1_bowl_score
0,1216492,4,0,0,1,4,0,4,0
1,1216493,1,3,0,1,1,3,1,3
2,1216534,7,5,0,1,7,5,7,5
3,1216496,6,0,0,0,6,0,6,0
4,1216508,2,4,0,0,2,4,2,4
5,1216510,3,5,0,0,3,5,3,5
6,1216539,0,1,1,0,0,1,0,1
7,1216545,2,7,1,1,2,7,2,7
8,1216527,6,3,0,0,6,3,6,3
9,1216547,5,4,1,1,5,4,5,4


Assigning values to each team as above (train data)

In [83]:
test["team1"]=encoder.fit_transform(test["team1"])
test["team2"]=encoder.fit_transform(test["team2"])

Selecting parameters from test data

In [84]:
predictor = test[["team1","team2","team1_toss_win","team1_bat","team1_bats_score","team1_bowl_score","team2_bats_score","team2_bowl_score"]]

Predicting data from SVM classifier

In [85]:
y_pred_svm = svm.predict(predictor)
y_pred_log = logreg.predict(predictor)
y_pred_dtree = dtree.predict(predictor)
y_pred_rforest = randomForest.predict(predictor)

In [86]:
Submission_svm = pd.DataFrame(test["id"])
Submission_log = pd.DataFrame(test["id"])
Submission_dtree = pd.DataFrame(test["id"])
Submission_rforest = pd.DataFrame(test["id"])

In [87]:
Submission_svm["winner"] = y_pred_svm
Submission_log["winner"] = y_pred_log
Submission_dtree["winner"] = y_pred_dtree
Submission_rforest["winner"] = y_pred_rforest

In [88]:
Submission_svm["winner"] = Submission_svm["winner"].replace(0,2)
Submission_log["winner"] = Submission_log["winner"].replace(0,2)
Submission_dtree["winner"] = Submission_dtree["winner"].replace(0,2)
Submission_rforest["winner"] = Submission_rforest["winner"].replace(0,2)

In [89]:
Submission_svm = Submission_svm.astype(int)
Submission_log = Submission_log.astype(int)
Submission_dtree = Submission_dtree.astype(int)
Submission_rforest = Submission_rforest.astype(int)

In [90]:
Submission_svm.to_csv("Submission_svm.csv", index = False)
Submission_log.to_csv("Submission_log.csv", index = False)
Submission_dtree.to_csv("Submission_dtree.csv", index = False)
Submission_rforest.to_csv("Submission_rforest.csv", index = False)

Reversing assigned value  to team names

In [91]:
test["team1"] = encoder.inverse_transform(test["team1"])
test["team2"] = encoder.inverse_transform(test["team2"])

In [92]:
test["winner"] = test["predict"]
for i in range(test.shape[0]):
    if test["predict"][i] == 1:
        test["winner"][i] = int(1)
    else:
        test["winner"][i] = int(2)

KeyError: 'predict'

Submission ( match id and it's winner)

In [None]:
Submission_svm = test[["id","winner"]]

In [None]:
Submission_svm = Submission_svm.astype(int)

In [None]:
Submission.to_csv("Submission_svm.csv", index = False)