In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.metrics import accuracy_score


import warnings
warnings.filterwarnings('ignore')

In [2]:
# Match results have been scraped from https://www.mykhel.com/kabaddi/pro-kabaddi-results/ for all the seasons
match_results = pd.read_csv('match_results.csv')

In [3]:
match_results.head()

Unnamed: 0,Team_1,Team_2,Team1_Score,Team2_Score,Team1_Raid_Points,Team2_Raid_Points,Team1_Tackle_Points,Team2_Tackle_Points,Team1_AllOut_Points,Team2_AllOut_Points,Team1_Extra_Points,Team2_Extra_Points,Season,Winner
0,U Mumba,Jaipur Pink Panthers,45,31,28,25,12,1,4,2,0,0,1,1
1,Dabang Delhi,Bengaluru Bulls,28,47,15,23,13,13,0,6,0,0,1,2
2,Bengaluru Bulls,Puneri Paltan,40,37,20,20,14,11,4,2,1,1,1,1
3,U Mumba,Bengal Warriors,36,25,24,17,9,7,2,0,0,0,1,1
4,Puneri Paltan,Dabang Delhi,31,35,22,23,7,10,2,2,0,0,1,2


In [4]:
# Convert teams into one-hot encoding
match_results_lr = pd.get_dummies(data=match_results,
                                        columns =['Team_1','Team_2'], 
                                        prefix=['Team_1','Team_2'])
match_results_lr.columns

Index(['Team1_Score', 'Team2_Score', 'Team1_Raid_Points', 'Team2_Raid_Points',
       'Team1_Tackle_Points', 'Team2_Tackle_Points', 'Team1_AllOut_Points',
       'Team2_AllOut_Points', 'Team1_Extra_Points', 'Team2_Extra_Points',
       'Season', 'Winner', 'Team_1_Bengal Warriors', 'Team_1_Bengaluru Bulls',
       'Team_1_Dabang Delhi', 'Team_1_Gujarat Fortune Giants',
       'Team_1_Haryana Steelers', 'Team_1_Jaipur Pink Panthers',
       'Team_1_Patna Pirates', 'Team_1_Puneri Paltan',
       'Team_1_Tamil Thalaivas', 'Team_1_Telugu Titans', 'Team_1_U Mumba',
       'Team_1_UP Yoddha', 'Team_2_Bengal Warriors', 'Team_2_Bengaluru Bulls',
       'Team_2_Dabang Delhi', 'Team_2_Gujarat Fortune Giants',
       'Team_2_Haryana Steelers', 'Team_2_Jaipur Pink Panthers',
       'Team_2_Patna Pirates', 'Team_2_Puneri Paltan',
       'Team_2_Tamil Thalaivas', 'Team_2_Telugu Titans', 'Team_2_U Mumba',
       'Team_2_UP Yoddha'],
      dtype='object')

In [5]:
match_results_lr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 632 entries, 0 to 631
Data columns (total 36 columns):
Team1_Score                      632 non-null int64
Team2_Score                      632 non-null int64
Team1_Raid_Points                632 non-null int64
Team2_Raid_Points                632 non-null int64
Team1_Tackle_Points              632 non-null int64
Team2_Tackle_Points              632 non-null int64
Team1_AllOut_Points              632 non-null int64
Team2_AllOut_Points              632 non-null int64
Team1_Extra_Points               632 non-null int64
Team2_Extra_Points               632 non-null int64
Season                           632 non-null int64
Winner                           632 non-null int64
Team_1_Bengal Warriors           632 non-null uint8
Team_1_Bengaluru Bulls           632 non-null uint8
Team_1_Dabang Delhi              632 non-null uint8
Team_1_Gujarat Fortune Giants    632 non-null uint8
Team_1_Haryana Steelers          632 non-null uint8
Team_1_Jaip

In [6]:
#match_results_df = match_results_lr.drop(['Season'],axis=1)
match_results_df = match_results_lr
match_results_df.columns

Index(['Team1_Score', 'Team2_Score', 'Team1_Raid_Points', 'Team2_Raid_Points',
       'Team1_Tackle_Points', 'Team2_Tackle_Points', 'Team1_AllOut_Points',
       'Team2_AllOut_Points', 'Team1_Extra_Points', 'Team2_Extra_Points',
       'Season', 'Winner', 'Team_1_Bengal Warriors', 'Team_1_Bengaluru Bulls',
       'Team_1_Dabang Delhi', 'Team_1_Gujarat Fortune Giants',
       'Team_1_Haryana Steelers', 'Team_1_Jaipur Pink Panthers',
       'Team_1_Patna Pirates', 'Team_1_Puneri Paltan',
       'Team_1_Tamil Thalaivas', 'Team_1_Telugu Titans', 'Team_1_U Mumba',
       'Team_1_UP Yoddha', 'Team_2_Bengal Warriors', 'Team_2_Bengaluru Bulls',
       'Team_2_Dabang Delhi', 'Team_2_Gujarat Fortune Giants',
       'Team_2_Haryana Steelers', 'Team_2_Jaipur Pink Panthers',
       'Team_2_Patna Pirates', 'Team_2_Puneri Paltan',
       'Team_2_Tamil Thalaivas', 'Team_2_Telugu Titans', 'Team_2_U Mumba',
       'Team_2_UP Yoddha'],
      dtype='object')

In [7]:
match_results_df.head()
match_results_df.describe()

Unnamed: 0,Team1_Score,Team2_Score,Team1_Raid_Points,Team2_Raid_Points,Team1_Tackle_Points,Team2_Tackle_Points,Team1_AllOut_Points,Team2_AllOut_Points,Team1_Extra_Points,Team2_Extra_Points,...,Team_2_Dabang Delhi,Team_2_Gujarat Fortune Giants,Team_2_Haryana Steelers,Team_2_Jaipur Pink Panthers,Team_2_Patna Pirates,Team_2_Puneri Paltan,Team_2_Tamil Thalaivas,Team_2_Telugu Titans,Team_2_U Mumba,Team_2_UP Yoddha
count,632.0,632.0,632.0,632.0,632.0,632.0,632.0,632.0,632.0,632.0,...,632.0,632.0,632.0,632.0,632.0,632.0,632.0,632.0,632.0,632.0
mean,31.925633,32.063291,17.881329,17.93038,9.549051,9.506329,2.234177,2.39557,1.568038,1.621835,...,0.093354,0.050633,0.060127,0.093354,0.099684,0.096519,0.060127,0.094937,0.088608,0.063291
std,7.174252,7.665084,5.8643,5.925172,3.415375,3.518566,1.916374,1.960428,1.42152,1.394248,...,0.291159,0.219421,0.23791,0.291159,0.299815,0.295536,0.23791,0.29336,0.284402,0.243679
min,15.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,26.0,14.0,14.0,7.0,7.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,32.0,32.0,18.0,18.0,9.0,9.0,2.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,36.0,37.0,22.0,22.0,12.0,12.0,4.0,4.0,2.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,69.0,67.0,44.0,36.0,20.0,20.0,10.0,10.0,7.0,6.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
X = match_results_df.drop(['Team1_Score','Team2_Score','Winner'],axis=1)

y = match_results_df['Winner']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.30, 
                                                    random_state = 99)
X_train.head()

Unnamed: 0,Team1_Raid_Points,Team2_Raid_Points,Team1_Tackle_Points,Team2_Tackle_Points,Team1_AllOut_Points,Team2_AllOut_Points,Team1_Extra_Points,Team2_Extra_Points,Season,Team_1_Bengal Warriors,...,Team_2_Dabang Delhi,Team_2_Gujarat Fortune Giants,Team_2_Haryana Steelers,Team_2_Jaipur Pink Panthers,Team_2_Patna Pirates,Team_2_Puneri Paltan,Team_2_Tamil Thalaivas,Team_2_Telugu Titans,Team_2_U Mumba,Team_2_UP Yoddha
509,16,19,7,14,0,6,2,2,6,1,...,0,0,0,0,0,0,0,0,0,1
572,6,9,14,15,0,0,1,0,7,0,...,0,0,0,0,0,0,0,1,0,0
404,18,25,11,2,4,0,6,3,6,0,...,0,0,0,0,0,0,0,0,0,0
247,13,17,4,11,0,2,1,1,5,0,...,0,0,0,0,0,0,0,0,0,1
26,19,15,13,7,4,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
'''
Apply Logistic Regression on the historical match results data set - matches that happened till 01-October-2019. 
the Winner team is the target - the classification being 2 (team 2),1(team 1),0(draw)
This model will be used to predict the result of future matches.
''' 
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train,y_train)

predict_train_lr = lr.predict(X_train)

# Accuray Score on train dataset
accuracy_train_lr = accuracy_score(y_train,predict_train_lr)
print('accuracy_score on train dataset : ', accuracy_train_lr)

# predict the target on the test dataset
predict_test_lr = lr.predict(X_test)

# Accuracy Score on test dataset
accuracy_test_lr = accuracy_score(y_test,predict_test_lr)
print('accuracy_score on test dataset : ', accuracy_test_lr)

accuracy_score on train dataset :  0.920814479638009
accuracy_score on test dataset :  0.8947368421052632


In [10]:
predict_test_lr

array([1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 0, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 1,
       0, 2, 2, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 1,
       1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2, 1, 1, 2, 2, 2, 0, 2,
       2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2,
       2, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 2, 1, 2,
       1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2,
       2, 1, 2, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 1,
       1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 2, 2], dtype=int64)

In [11]:
predict_proba_test = lr.predict_proba(X_test)

In [12]:
np.around(predict_proba_test,2)

array([[0.08, 0.92, 0.  ],
       [0.17, 0.83, 0.  ],
       [0.12, 0.  , 0.88],
       [0.43, 0.56, 0.  ],
       [0.07, 0.  , 0.93],
       [0.04, 0.  , 0.96],
       [0.15, 0.85, 0.  ],
       [0.19, 0.8 , 0.01],
       [0.29, 0.3 , 0.41],
       [0.08, 0.  , 0.92],
       [0.59, 0.03, 0.39],
       [0.13, 0.87, 0.  ],
       [0.04, 0.26, 0.69],
       [0.15, 0.  , 0.85],
       [0.02, 0.  , 0.98],
       [0.15, 0.  , 0.85],
       [0.08, 0.92, 0.  ],
       [0.03, 0.  , 0.97],
       [0.08, 0.01, 0.91],
       [0.03, 0.  , 0.97],
       [0.18, 0.78, 0.04],
       [0.56, 0.03, 0.41],
       [0.09, 0.91, 0.  ],
       [0.25, 0.7 , 0.05],
       [0.02, 0.98, 0.  ],
       [0.04, 0.95, 0.02],
       [0.12, 0.88, 0.  ],
       [0.02, 0.98, 0.  ],
       [0.18, 0.82, 0.  ],
       [0.51, 0.45, 0.04],
       [0.04, 0.  , 0.96],
       [0.05, 0.  , 0.95],
       [0.18, 0.02, 0.8 ],
       [0.21, 0.79, 0.  ],
       [0.09, 0.91, 0.  ],
       [0.23, 0.  , 0.77],
       [0.1 , 0.9 , 0.  ],
 

In [13]:
''' 
FUNDAMENTAL ASSUMPTION - For future matches, the performance from last 7 matches against each parameter 
(raid points, tackle points, all out points & extra points) for each team is taken and their average 
is used as independent variables for the future match.
'''
match_results_mod = match_results_df
def get_last_5(parameter,team):
    df_last_5 = match_results_mod.loc[(match_results_mod['Team_1_'+team]==1) | 
                                      (match_results_mod['Team_2_'+team]==1)].iloc[-7:]
    s1 = df_last_5.loc[df_last_5['Team_1_'+team]==1]['Team1_'+parameter].sum()
    s2 = df_last_5.loc[df_last_5['Team_2_'+team]==1]['Team2_'+parameter].sum()
    new_value_param = (int((s1+s2)/7))
    return new_value_param

In [14]:
'''
Return the dataframe for both teams in future fixture with values for independent variables 
calculated as average of last 7 matches. This dataframe will be passed to Logistic Regression model 
trained with data to date
''' 
def get_new_match_results(team1,team2):
    
    new_df = pd.DataFrame(columns = match_results_mod.columns)
    #print(new_df.shape)
    new_df.loc[0,'Team_1_'+team1] = 1
    new_df.loc[0,'Team1_Raid_Points'] = get_last_5('Raid_Points',team1)
    new_df.loc[0,'Team1_Tackle_Points'] = get_last_5('Tackle_Points',team1)
    new_df.loc[0,'Team1_AllOut_Points'] = get_last_5('AllOut_Points',team1)
    new_df.loc[0,'Team1_Extra_Points'] = get_last_5('Extra_Points',team1)
    new_df.loc[0,'Team1_Score'] = new_df.loc[0,['Team1_Raid_Points','Team1_Tackle_Points','Team1_AllOut_Points','Team1_Extra_Points']].sum()
    
    #new_df.loc[0,'Team1_Score'] = s1
    
    new_df.loc[0,'Team_2_'+team2] = 1
    new_df.loc[0,'Team2_Raid_Points'] = get_last_5('Raid_Points',team2)
    new_df.loc[0,'Team2_Tackle_Points'] = get_last_5('Tackle_Points',team2)
    new_df.loc[0,'Team2_AllOut_Points'] = get_last_5('AllOut_Points',team2)
    new_df.loc[0,'Team2_Extra_Points'] = get_last_5('Extra_Points',team2)
    new_df.loc[0,'Team2_Score'] = new_df.loc[0,['Team2_Raid_Points','Team2_Tackle_Points','Team2_AllOut_Points','Team2_Extra_Points']].sum()
    new_df.loc[0,'Season'] = 7
    #new_df.loc[0,'Team2_Score'] = s2
    
    new_df = new_df.fillna(0)
    #print(new_df)
    return new_df


In [15]:
'''
This is how the averaged out values would look like.
The results are not very promising and cannot compete with the random match results, 
but due to lack of any other relationsip, a 'law of average' (although not very statistically correct) can be applied
to predict on the match outcome.
''' 
new_df = get_new_match_results('U Mumba','Bengaluru Bulls')
new_df

Unnamed: 0,Team1_Score,Team2_Score,Team1_Raid_Points,Team2_Raid_Points,Team1_Tackle_Points,Team2_Tackle_Points,Team1_AllOut_Points,Team2_AllOut_Points,Team1_Extra_Points,Team2_Extra_Points,...,Team_2_Dabang Delhi,Team_2_Gujarat Fortune Giants,Team_2_Haryana Steelers,Team_2_Jaipur Pink Panthers,Team_2_Patna Pirates,Team_2_Puneri Paltan,Team_2_Tamil Thalaivas,Team_2_Telugu Titans,Team_2_U Mumba,Team_2_UP Yoddha
0,32,35,19,24,9,8,2,2,2,1,...,0,0,0,0,0,0,0,0,0,0


In [16]:
'''
 Get the fixtures - future matches after 01-Oct-2019
 Run the model on each of the match to predict the winner/draw.
 Reiterating - Average of last 7 matches is dynamically taken after every match is updated in the data frame.
'''

fixtures_df = pd.read_csv('Fixtures.csv')
winning_team = []
draw_teams = []
for i in range(0,len(fixtures_df)):
    team1 = fixtures_df.iloc[i,0]
    team2 = fixtures_df.iloc[i,1]
    new_df = get_new_match_results(team1,team2)
    
    X_test = new_df.drop(['Team1_Score','Team2_Score','Winner'],axis=1)
    predict_test_lr = lr.predict(X_test)
    
    winner = ''
    if predict_test_lr[0] == 1:
        winner=team1
        winning_team.append(winner)
    elif predict_test_lr[0] ==2:
        winner = team2
        winning_team.append(winner)
    else:
        winner = 'Draw'
        draw_teams.append(team1)
        draw_teams.append(team2)
    
    print('Match between: '+team1 +" vs. "+ team2)
    print('Winner: '+winner)
    
    print('------------------------------------\n')
    match_results_mod = pd.concat([match_results_mod,new_df])

winning_team    

Match between: U Mumba vs. Patna Pirates
Winner: Patna Pirates
------------------------------------

Match between: Haryana Steelers vs. Bengaluru Bulls
Winner: Haryana Steelers
------------------------------------

Match between: Telugu Titans vs. Puneri Paltan
Winner: Puneri Paltan
------------------------------------

Match between: Jaipur Pink Panthers vs. Bengaluru Bulls
Winner: Bengaluru Bulls
------------------------------------

Match between: Haryana Steelers vs. Telugu Titans
Winner: Draw
------------------------------------

Match between: UP Yoddha vs. Dabang Delhi
Winner: Dabang Delhi
------------------------------------

Match between: Gujarat Fortune Giants vs. Patna Pirates
Winner: Patna Pirates
------------------------------------

Match between: Bengal Warriors vs. Patna Pirates
Winner: Bengal Warriors
------------------------------------

Match between: UP Yoddha vs. Puneri Paltan
Winner: Draw
------------------------------------

Match between: Telugu Titans vs. Guj

['Patna Pirates',
 'Haryana Steelers',
 'Puneri Paltan',
 'Bengaluru Bulls',
 'Dabang Delhi',
 'Patna Pirates',
 'Bengal Warriors',
 'Telugu Titans',
 'Jaipur Pink Panthers',
 'Bengal Warriors',
 'Haryana Steelers',
 'Dabang Delhi']

In [17]:
'''
Get the data for team points till date (01-Oct-2019)
Add the points calculated according to the result for the future matches to build the final points table.
5 points for the winner. 
3 points each for the teams for a draw.
'''
team_points_short = pd.read_csv('team_points_short.csv')
for i in range(0,len(team_points_short)):
    f_points = 5*winning_team.count(team_points_short.iloc[i,0])
    d_points = 3*draw_teams.count(team_points_short.iloc[i,0])
    team_points_short.iloc[i,1] = team_points_short.iloc[i,1]+f_points+d_points

team_points_short = team_points_short.sort_values(by=['Points'],ascending=False)
team_points_short = team_points_short.reset_index().drop('index',axis=1)
team_points_short

# Final points table as here under.

Unnamed: 0,Team,Points
0,Dabang Delhi,87
1,Bengal Warriors,83
2,Haryana Steelers,73
3,UP Yoddha,67
4,Bengaluru Bulls,66
5,Jaipur Pink Panthers,57
6,U Mumba,54
7,Puneri Paltan,50
8,Patna Pirates,50
9,Telugu Titans,50


In [18]:
print('League Winner : '+team_points_short.iloc[0,0])

# predict the league winner

League Winner : Dabang Delhi


In [19]:
# Eliminator 1
# Eliminator match between Team finishing 3rd and 6th
'''
Use the same logistic regression model trained with earlier data . it is not retrained with the latest data.
Only used to predict the outcome for the match passed in.
'''
team1_f = team_points_short.iloc[2,0]
team2_f = team_points_short.iloc[5,0]

new_df_el1 = get_new_match_results(team1_f,team2_f)
    
X_test_el1 = new_df_el1.drop(['Team1_Score','Team2_Score','Winner'],axis=1)
predict_test_lr_el1 = lr.predict(X_test_el1)
predict_proba_test_lr_el1 = lr.predict_proba(X_test_el1)

winner_el1 = ''
chance_of_win = 0

if predict_proba_test_lr_el1[0][1]>predict_proba_test_lr_el1[0][2]:
    winner_el1=team1_f
    chance_of_win = predict_proba_test_lr_el1[0][1]
else:
    winner_el1 = team2_f
    chance_of_win = predict_proba_test_lr_el1[0][2]

print('Eliminator 1 : Match between : '+team1_f +" vs. "+ team2_f)
print('\n')
print('Eliminator 1 Winner : '+winner_el1+': chance of win '+str(int(100*chance_of_win))+'%')


match_results_mod = pd.concat([match_results_mod,new_df_el1])

Eliminator 1 : Match between : Haryana Steelers vs. Jaipur Pink Panthers


Eliminator 1 Winner : Haryana Steelers: chance of win 70%


In [20]:
# Eliminator 2: Team finishing 4th vs Team finishing 5th in the league matches
'''
Use the same logistic regression model trained with earlier data . it is not retrained with the latest data.
Only used to predict the outcome for the match passed in.
'''
team1_f_el2 = team_points_short.iloc[3,0]
team2_f_el2 = team_points_short.iloc[4,0]

new_df_el2 = get_new_match_results(team1_f_el2,team2_f_el2)
    
X_test_el2 = new_df_el2.drop(['Team1_Score','Team2_Score','Winner'],axis=1)
predict_test_lr_el2 = lr.predict(X_test_el2)
predict_proba_test_lr_el2 = lr.predict_proba(X_test_el2)

winner_el2 = ''
chance_of_win_el2 = 0

if predict_proba_test_lr_el2[0][1] > predict_proba_test_lr_el2[0][2]:
    winner_el2=team1_f_el2
    chance_of_win_el2 = predict_proba_test_lr_el2[0][1]
else:
    winner_el2 = team2_f_el2
    chance_of_win_el2 = predict_proba_test_lr_el2[0][2]

print('Eliminator 2 : Match between '+team1_f_el2 +" vs. "+ team2_f_el2)
print('\n')
print('Eliminator 2 Winner : '+winner_el2+': chance of win '+str(int(100*chance_of_win_el2))+'%')


match_results_mod = pd.concat([match_results_mod,new_df_el2])

# Predict Eliminator 2 winner

Eliminator 2 : Match between UP Yoddha vs. Bengaluru Bulls


Eliminator 2 Winner : UP Yoddha: chance of win 29%


In [21]:
# Semifinal 1: League winner vs. Eliminator 1 Winner
'''
Use the same logistic regression model trained with earlier data . it is not retrained with the latest data.
Only used to predict the outcome for the match passed in.
The moving average has already been accounting for the previous 7 performances of the participating team
'''


team1_f_sf1 = team_points_short.iloc[0,0]
team2_f_sf1 = winner_el1

new_df_sf1 = get_new_match_results(team1_f_sf1,team2_f_sf1)
    
X_test_sf1 = new_df_sf1.drop(['Team1_Score','Team2_Score','Winner'],axis=1)
predict_test_lr_sf1 = lr.predict(X_test_sf1)
predict_proba_test_lr_sf1 = lr.predict_proba(X_test_sf1)

winner_sf1 = ''
chance_of_win_sf1 = 0

if predict_proba_test_lr_sf1[0][1]>predict_proba_test_lr_sf1[0][2]:
    winner_sf1=team1_f_sf1
    chance_of_win_sf1 = predict_proba_test_lr_sf1[0][1]
else:
    winner_sf1 = team2_f_sf1
    chance_of_win_sf1 = predict_proba_test_lr_sf1[0][2]

print('Semi Final 1 : Match between: '+team1_f_sf1 +" vs. "+ team2_f_sf1)
print('\n')
print('Semi Final 1 Winner : '+winner_sf1+': chance of win '+str(int(100*chance_of_win_sf1))+'%')

match_results_mod = pd.concat([match_results_mod,new_df_sf1])

# Semifinal 1 winner prediction

Semi Final 1 : Match between: Dabang Delhi vs. Haryana Steelers


Semi Final 1 Winner : Dabang Delhi: chance of win 96%


In [22]:
# Semifinal 2 - 2nd position in League matches vs Eliminator 2 winner
'''
Use the same logistic regression model trained with earlier data . it is not retrained with the latest data.
Only used to predict the outcome for the match passed in.
The moving average has already been accounting for the previous 7 performances of the participating team
'''

team1_f_sf2 = team_points_short.iloc[1,0]
team2_f_sf2 = winner_el2

new_df_sf2 = get_new_match_results(team1_f_sf2,team2_f_sf2)
    
X_test_sf2 = new_df_sf2.drop(['Team1_Score','Team2_Score','Winner'],axis=1)
predict_test_lr_sf2 = lr.predict(X_test_sf2)
predict_proba_test_lr_sf2 = lr.predict_proba(X_test_sf2)

winner_sf2 = ''
chance_of_win_sf2 = 0

if predict_proba_test_lr_sf2[0][1] > predict_proba_test_lr_sf2[0][2]:
    winner_sf2=team1_f_sf2
    chance_of_win_sf2 = predict_proba_test_lr_sf2[0][1]
else:
    winner_sf2 = team2_f_sf2
    chance_of_win_sf2 = predict_proba_test_lr_sf2[0][2]

print('Semi Final 2 : Match between : '+team1_f_sf2 +" vs. "+ team2_f_sf2)
print('\n')
print('Winner : '+winner_sf2+': chance of win '+str(int(100*chance_of_win_sf2))+'%')

match_results_mod = pd.concat([match_results_mod,new_df_sf2])

Semi Final 2 : Match between : Bengal Warriors vs. UP Yoddha


Winner : Bengal Warriors: chance of win 87%


In [23]:
# Final between SF1 winner and SF2 winner
'''
Use the same logistic regression model trained with earlier data . it is not retrained with the latest data.
Only used to predict the outcome for the match passed in.
The moving average has already been accounting for the previous 7 performances of the participating team
'''

team1_f_f = winner_sf1
team2_f_f = winner_sf2

new_df_f = get_new_match_results(team1_f_f,team2_f_f)
    
X_test_f = new_df_f.drop(['Team1_Score','Team2_Score','Winner'],axis=1)
predict_test_lr_f = lr.predict(X_test_f)
predict_proba_test_lr_f = lr.predict_proba(X_test_f)

winner_f = ''
chance_of_win_f = 0

if predict_proba_test_lr_f[0][1]>predict_proba_test_lr_f[0][2]:
    winner_f=team1_f_f
    chance_of_win_f = predict_proba_test_lr_f[0][1]
else:
    winner_f = team2_f_f
    chance_of_win_f = predict_proba_test_lr_f[0][2]

print('Final : '+team1_f_f +" vs. "+ team2_f_f)
print('\n')
print('Final Winner : '+winner_f+': chance of win '+str(int(100*chance_of_win_f))+'%')

match_results_mod = pd.concat([match_results_mod,new_df_f])

# Winner of the Tournament

Final : Dabang Delhi vs. Bengal Warriors


Final Winner : Dabang Delhi: chance of win 78%


In [24]:
# this is the final dataset for the season including results of all previous seasons.
match_results_mod.Season.value_counts()

6    138
5    138
7    137
4     60
3     60
2     60
1     60
Name: Season, dtype: int64

#### Get the teams with highest Raid Points & Tackle Points

In [25]:
df_raid_tackle_points = pd.DataFrame(columns=['Team','Raid_Points','Tackle_Points'])

teams = ['Bengal Warriors','Dabang Delhi','Gujarat Fortune Giants','Haryana Steelers','Jaipur Pink Panthers',
         'Bengaluru Bulls','Puneri Paltan','Tamil Thalaivas','Telugu Titans','U Mumba','UP Yoddha','Patna Pirates']
i=0
for team in teams:

    df_Points = match_results_mod.loc[((match_results_mod['Team_1_'+team]==1) | (match_results_mod['Team_2_'+team]==1)) & (match_results_mod['Season']==7)]
    raid_1 = df_Points.loc[df_Points['Team_1_'+team]==1]['Team1_Raid_Points'].sum()
    raid_2 = df_Points.loc[df_Points['Team_2_'+team]==1]['Team2_Raid_Points'].sum()
    Raid_points = int(raid_1+raid_2)

    tackle_1 = df_Points.loc[df_Points['Team_1_'+team]==1]['Team1_Tackle_Points'].sum()
    tackle_2 = df_Points.loc[df_Points['Team_2_'+team]==1]['Team2_Tackle_Points'].sum()
    Tackle_points = int(tackle_1+tackle_2)

    df_raid_tackle_points.loc[i,'Team'] = team
    df_raid_tackle_points.loc[i,'Raid_Points'] = Raid_points
    df_raid_tackle_points.loc[i,'Tackle_Points'] = Tackle_points
    
    i=i+1

df_raid_tackle_points

Unnamed: 0,Team,Raid_Points,Tackle_Points
0,Bengal Warriors,527,229
1,Dabang Delhi,526,222
2,Gujarat Fortune Giants,354,223
3,Haryana Steelers,490,241
4,Jaipur Pink Panthers,387,250
5,Bengaluru Bulls,493,224
6,Puneri Paltan,399,256
7,Tamil Thalaivas,381,184
8,Telugu Titans,430,215
9,U Mumba,382,212


In [26]:
max_raids = df_raid_tackle_points.Raid_Points.max()
max_tackles = df_raid_tackle_points.Tackle_Points.max()

print('Team with Most Successful Raid Points : '+df_raid_tackle_points.loc[df_raid_tackle_points['Raid_Points']==max_raids].Team.values[0])
print('Team with Most Successful Tackle Points : '+df_raid_tackle_points.loc[df_raid_tackle_points['Tackle_Points']==max_tackles].Team.values[0])

Team with Most Successful Raid Points : Bengal Warriors
Team with Most Successful Tackle Points : UP Yoddha


##### Super performance total - There is no data available per match-wise for Super_raids and Super-tackles. Hence current SPT is sorted and assumed to be highest

In [27]:
# Scraped from https://www.prokabaddi.com/stats for the teams

team_data = pd.read_csv('team_data_clean.csv')
team_data.fillna(0)
team_data_df = team_data.loc[:,['Team','Matches_Played','Super_raid','Super_tackles','All_outs_inflicted',
                                'All_outs_conceded','Season','SPT']]

max_spt =  team_data_df.loc[team_data_df['Season']=='7'].SPT.max()
#print(max_spt)
print('Team with highest super-performance total : '+team_data_df.loc[team_data_df['SPT']==max_spt].Team.values[0])
print('\n')

Team with highest super-performance total : Bengal Warriors




In [28]:
# Data scraped from https://www.prokabaddi.com/stats for all seasons for all the players in the Prokabaddi league
players_data = pd.read_csv('players_data_1.csv')
players_data.fillna(0)
players_data.head()

Unnamed: 0,Name,Profile,Team,Matches_Played,Total_points,Successful_raids,Raid_points,Successful_tackles,Tackle_points,Avg_raid_points,Avg_tackle_points,Do_or_Die_raid_points,Super_raids,Super_tackles,Super_10s,High_5s,Season
0,pardeep narwal,Raider,Patna Pirates,104,1108.0,836.0,1101.0,6.0,7.0,10.69,0.07,136.0,50.0,1.0,57.0,,All
1,rahul chaudhari,Raider,Tamil Thalaivas,119,1001.0,775.0,942.0,54.0,59.0,7.92,0.5,164.0,24.0,5.0,40.0,,All
2,deepak niwas hooda,All Rounder,Jaipur Pink Panthers,123,943.0,698.0,856.0,79.0,87.0,6.96,0.71,166.0,20.0,8.0,31.0,4.0,All
3,ajay thakur,Raider,Tamil Thalaivas,115,811.0,640.0,790.0,21.0,21.0,6.87,0.18,163.0,23.0,,29.0,,All
4,maninder singh,Raider,Bengal Warriors,78,727.0,576.0,718.0,8.0,9.0,9.21,0.12,102.0,21.0,1.0,32.0,,All


In [29]:
'''
The data set contains stats for each player scraped from prokabaddi by going into each of the player's profile
'''
players_stats = pd.read_csv('players_stats.csv')
players_stats.rename(columns={'player':'Name'},inplace=True)
players_stats.fillna(0)
players_stats.head()

Unnamed: 0,Name,matches_played,total_points_earned,Most points_in_a_match,not_out_percent,Total_Raids,Successful_Raids_percent,no_of_super_raids,super_10s,total_raid_points,avg_raid_points_per_match,no_of_super_tackles,high_5s,total_tackle_points,avg_tackle_per_match,total_tackles,tackle_strike_rate,season
0,pawan kumar sehrawat,74,601,29,72.46%,868,65.43,18,25,568,7.67,3,0,33,0.4,87,37.93%,All
1,pawan kumar sehrawat,19,256,29,74.05%,343,70.55,6,13,242,12.73,1,0,13,0.66,27,48.14%,7
2,pawan kumar sehrawat,24,282,22,77.33%,375,72.26,12,13,271,11.29,0,0,11,0.45,40,27.50%,6
3,pawan kumar sehrawat,9,10,4,45.83%,24,37.5,0,0,9,1.0,0,0,1,0.11,2,50%,5
4,pawan kumar sehrawat,10,11,4,48.48%,33,33.33,0,0,11,1.1,0,0,0,0.0,1,0,4


In [30]:
'''
The players data has been sourced from prokabaddi
steps -- go to https://www.prokabaddi.com/stats --> Player tab --> Gather all the data [Total_points,Successful_raids,Successful_tackles,Raid_points, Tackle_points...] 
for each of the players for all the seasons

The following function returns the number of Successful raids, successful tackles, super raids, super tackles 
this data to be merged with player stats.
'''
def f(row):
    array_d=[]
    raids = players_data.loc[(players_data['Name']==row['Name']) & (players_data['Season']==row['season'])].Successful_raids.values
    if len(raids) >0 :
        array_d.append(raids[0])
    else:
        array_d.append(0)
        
    tackles = players_data.loc[(players_data['Name']==row['Name']) & (players_data['Season']==row['season'])].Successful_tackles.values
    if len(tackles) >0 :
        array_d.append(tackles[0])
    else:
        array_d.append(0)
        
    s_raids = players_data.loc[(players_data['Name']==row['Name']) & (players_data['Season']==row['season'])].Super_raids.values
    if len(s_raids) >0 :
        array_d.append(s_raids[0])
    else:
        array_d.append(0)
        
    s_tackles = players_data.loc[(players_data['Name']==row['Name']) & (players_data['Season']==row['season'])].Super_tackles.values
    if len(s_tackles) >0 :
        array_d.append(s_tackles[0])
    else:
        array_d.append(0)
    
    team = players_data.loc[(players_data['Name']==row['Name']) & (players_data['Season']==row['season'])].Team.values
    if len(team) >0 :
        array_d.append(team[0])
    else:
        array_d.append('No Team')
        
    profile = players_data.loc[(players_data['Name']==row['Name']) & (players_data['Season']==row['season'])].Profile.values
    if len(profile) >0 :
        array_d.append(profile[0])
    else:
        array_d.append('All Rounder')
    
    return array_d
    

In [31]:
# merge the 2 datasets to get the details for each player
players_stats[['Successful_raids','Successful_tackles','Super_raids','Super_tackles','Team','Profile']] = players_stats.apply(f,axis=1,result_type='expand')
#players_stats['Successful_raids'] = players_stats.apply(f,axis=1)
players_stats = players_stats.fillna(0)
players_stats.rename(columns={'Successful_Raids_percent':'Success_Raids_points_percent'},inplace=True)
players_stats.head()

Unnamed: 0,Name,matches_played,total_points_earned,Most points_in_a_match,not_out_percent,Total_Raids,Success_Raids_points_percent,no_of_super_raids,super_10s,total_raid_points,...,avg_tackle_per_match,total_tackles,tackle_strike_rate,season,Successful_raids,Successful_tackles,Super_raids,Super_tackles,Team,Profile
0,pawan kumar sehrawat,74,601,29,72.46%,868,65.43,18,25,568,...,0.4,87,37.93%,All,445.0,31.0,19.0,3.0,Bengaluru Bulls,Raider
1,pawan kumar sehrawat,19,256,29,74.05%,343,70.55,6,13,242,...,0.66,27,48.14%,7,187.0,13.0,6.0,1.0,Bengaluru Bulls,Raider
2,pawan kumar sehrawat,24,282,22,77.33%,375,72.26,12,13,271,...,0.45,40,27.50%,6,209.0,11.0,12.0,0.0,Bengaluru Bulls,Raider
3,pawan kumar sehrawat,9,10,4,45.83%,24,37.5,0,0,9,...,0.11,2,50%,5,5.0,1.0,0.0,0.0,Gujarat Fortune Giants,Raider
4,pawan kumar sehrawat,10,11,4,48.48%,33,33.33,0,0,11,...,0.0,1,0,4,8.0,0.0,0.0,0.0,Bengaluru Bulls,Raider


In [32]:
# Filter out the aggregated stats for the player
players_stats_f = players_stats.loc[~(players_stats['season']=='All')]

In [33]:
'''
As per the given problem statement, SUCCESSFUL RAID percent is to be calculated as
(No. of Successful Raids)/(No. of Total Raids) percentage
'''
def success_raid_percent(row):
    if row['Total_Raids']>0:
        return int(100*row['Successful_raids']/row['Total_Raids'])
    else:
        return 0
players_stats_f['Successful_Raid_percent'] = players_stats_f.apply(success_raid_percent,axis=1)

In [34]:
'''
As per the given problem statement, SUCCESSFUL TACKLES percent is to be calculated as
(No. of Successful Tackles)/(Total No. of Tackles) percentage
'''

def success_tackle_percent(row):
    if row['total_tackles']>0:
        return int(100*row['Successful_tackles']/row['total_tackles'])
    else:
        return 0
players_stats_f['Successful_Tackle_percent'] = players_stats_f.apply(success_tackle_percent,axis=1)

In [35]:
players_stats_f.head()

Unnamed: 0,Name,matches_played,total_points_earned,Most points_in_a_match,not_out_percent,Total_Raids,Success_Raids_points_percent,no_of_super_raids,super_10s,total_raid_points,...,tackle_strike_rate,season,Successful_raids,Successful_tackles,Super_raids,Super_tackles,Team,Profile,Successful_Raid_percent,Successful_Tackle_percent
1,pawan kumar sehrawat,19,256,29,74.05%,343,70.55,6,13,242,...,48.14%,7,187.0,13.0,6.0,1.0,Bengaluru Bulls,Raider,54,48
2,pawan kumar sehrawat,24,282,22,77.33%,375,72.26,12,13,271,...,27.50%,6,209.0,11.0,12.0,0.0,Bengaluru Bulls,Raider,55,27
3,pawan kumar sehrawat,9,10,4,45.83%,24,37.5,0,0,9,...,50%,5,5.0,1.0,0.0,0.0,Gujarat Fortune Giants,Raider,20,50
4,pawan kumar sehrawat,10,11,4,48.48%,33,33.33,0,0,11,...,0,4,8.0,0.0,0.0,0.0,Bengaluru Bulls,Raider,24,0
5,pawan kumar sehrawat,13,53,7,64.54%,110,40.9,1,0,45,...,47.05%,3,36.0,6.0,1.0,2.0,Bengaluru Bulls,Raider,32,35


In [36]:
players_stats_f.columns

Index(['Name', 'matches_played', 'total_points_earned',
       'Most points_in_a_match', 'not_out_percent', 'Total_Raids',
       'Success_Raids_points_percent', 'no_of_super_raids', 'super_10s',
       'total_raid_points', 'avg_raid_points_per_match', 'no_of_super_tackles',
       'high_5s', 'total_tackle_points', 'avg_tackle_per_match',
       'total_tackles', 'tackle_strike_rate', 'season', 'Successful_raids',
       'Successful_tackles', 'Super_raids', 'Super_tackles', 'Team', 'Profile',
       'Successful_Raid_percent', 'Successful_Tackle_percent'],
      dtype='object')

In [37]:
# clean the data, remove % from the data and convert them into float
players_stats_f['not_out_percent'] = players_stats_f['not_out_percent'].apply(lambda x: x.replace('%',''))
players_stats_f['tackle_strike_rate'] = players_stats_f['tackle_strike_rate'].apply(lambda x: x.replace('%',''))

In [38]:
players_stats_f['not_out_percent'].astype(float)
players_stats_f['tackle_strike_rate'].astype(float)

1       48.14
2       27.50
3       50.00
4        0.00
5       47.05
7        0.00
8        0.00
9        0.00
10      33.33
11      38.46
12       0.00
14      33.00
15      41.66
17       0.00
18       0.00
19      15.38
20       0.00
21      36.84
23      40.00
24      27.77
25       0.00
26       0.00
28      60.00
29      33.33
31      42.30
32      31.57
33      41.17
34      14.28
35      33.33
36      59.01
        ...  
658      0.00
660     33.33
661     52.17
663      0.00
664      0.00
666      0.00
667      0.00
669     28.57
670     33.33
672     33.33
673     33.33
675     66.66
677     40.00
678     47.61
679     27.27
680     40.47
681     34.54
683    100.00
684     27.77
685      0.00
686    100.00
688      0.00
690      0.00
692     12.50
694      0.00
696     25.00
698      0.00
700     50.00
702     25.00
704      0.00
Name: tackle_strike_rate, Length: 540, dtype: float64

In [39]:
#['Name', 'matches_played', 'total_points_earned','Most points_in_a_match', 'not_out_percent', 'Total_Raids',
#       'Success_Raids_points_percent', 'no_of_super_raids', 'super_10s','total_raid_points', 'avg_raid_points_per_match', 
#       'no_of_super_tackles','high_5s', 'total_tackle_points', 'avg_tackle_per_match','total_tackles',
#        'tackle_strike_rate', 'season', 'Successful_raids','Successful_tackles', 'Super_raids', 'Super_tackles', 
#        'Team', 'Profile','Successful_Raid_percent', 'Successful_Tackle_percent']
#players_stats_f_raids = players_stats_f.loc[(players_stats_f['Successful_Raid_percent']>0) & (players_stats_f['Successful_Raid_percent']<80)][['Name','matches_played','total_points_earned','Most points_in_a_match','not_out_percent','no_of_super_raids','avg_raid_points_per_match','season','Successful_Raid_percent']]

'''
For predicting the Successful raid percentage, filter the raiders & all rounders and 
create the data frame with percentage between 0-95. A few observations have 100 percent but they are one match-winders.
'''
players_stats_f_raids = players_stats_f.loc[(players_stats_f['Profile'].isin(['Raider','All Rounder'])) 
                                            & (players_stats_f['matches_played']>7) 
                                            & (players_stats_f['Successful_Raid_percent']>0) 
                                            & (players_stats_f['Successful_Raid_percent']<95)][['Name','Profile','matches_played','Successful_raids','Total_Raids','Success_Raids_points_percent','no_of_super_raids','Most points_in_a_match','avg_raid_points_per_match','season','Successful_Raid_percent']]

#players_stats_f_raids = players_stats_f.loc[players_stats_f['Successful_Raid_percent']>0][['Name','matches_played','avg_raid_points_per_match','season','Successful_Raid_percent']]
players_stats_f_raids

Unnamed: 0,Name,Profile,matches_played,Successful_raids,Total_Raids,Success_Raids_points_percent,no_of_super_raids,Most points_in_a_match,avg_raid_points_per_match,season,Successful_Raid_percent
1,pawan kumar sehrawat,Raider,19,187.0,343,70.55,6,29,12.73,7,54
2,pawan kumar sehrawat,Raider,24,209.0,375,72.26,12,22,11.29,6,55
3,pawan kumar sehrawat,Raider,9,5.0,24,37.50,0,4,1.00,5,20
4,pawan kumar sehrawat,Raider,10,8.0,33,33.33,0,4,1.10,4,24
5,pawan kumar sehrawat,Raider,13,36.0,110,40.90,1,7,3.46,3,32
7,pardeep narwal,Raider,19,190.0,383,63.44,12,26,12.78,7,49
8,pardeep narwal,Raider,21,185.0,391,59.59,6,27,11.09,6,47
9,pardeep narwal,Raider,26,271.0,586,62.96,18,34,14.19,5,46
10,pardeep narwal,Raider,16,100.0,263,49.80,3,18,8.18,4,38
11,pardeep narwal,Raider,16,83.0,187,62.03,10,24,7.25,3,44


In [40]:
'''
Dataset from season 1 to 6 will be used to train the model
Dataset for season 7 will be used to predict the highest SUCCESSFUL RAID percentage.
'''
players_stats_f_raids1_6 = players_stats_f_raids.loc[players_stats_f_raids['season']!='7']
players_stats_f_raids1_7 = players_stats_f_raids.loc[players_stats_f_raids['season']=='7']



In [41]:
'''
This is a Regression problem as the output is a continuous variable - successul raid/tackle percentage
Use various ML algorithms and check the r2 score and MAE (mean absolute error) for each of those.
Pick the model with highest r2 score or lowest MAE
'''
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn import metrics

y = players_stats_f_raids1_6.loc[:, 'Successful_Raid_percent']
X = players_stats_f_raids1_6.drop(['Name','Successful_Raid_percent','season','Profile'],axis=1)


# split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.3, 
                                                    random_state = 1)
scaler = StandardScaler()
train_scaled = scaler.fit_transform(X_train)
test_scaled = scaler.transform(X_test)

ridge = Ridge(alpha=0.001)
tree_model = DecisionTreeRegressor()
rf_model = RandomForestRegressor(n_estimators=25,criterion='mae')
knn = KNeighborsRegressor()

ridge.fit(train_scaled, y_train)
tree_model.fit(train_scaled, y_train)
rf_model.fit(train_scaled, y_train)
knn.fit(train_scaled, y_train)

y_train_pred = ridge.predict(train_scaled)
print('mae: ridge: train: ',mean_absolute_error(y_true=y_train, y_pred=y_train_pred))
print('r2: ridge: train: ',r2_score(y_true=y_train, y_pred=y_train_pred))
y_test_pred = tree_model.predict(test_scaled)
print('mae: ridge: test: ',mean_absolute_error(y_true=y_test, y_pred=y_test_pred))
print('r2: ridge: test: ',r2_score(y_true=y_test, y_pred=y_test_pred))

y_train_pred = tree_model.predict(train_scaled)
print('mae: decision tree: train: ',mean_absolute_error(y_true=y_train, y_pred=y_train_pred))
print('r2: decision tree: train: ',r2_score(y_true=y_train, y_pred=y_train_pred))
y_test_pred = tree_model.predict(test_scaled)
print('mae: decision tree: test: ',mean_absolute_error(y_true=y_test, y_pred=y_test_pred))
print('r2: decision tree: test: ',r2_score(y_true=y_test, y_pred=y_test_pred))

y_train_pred = rf_model.predict(train_scaled)
print('mae: random forest: train: ',mean_absolute_error(y_true=y_train, y_pred=y_train_pred))
print('r2: random forest: train: ',r2_score(y_true=y_train, y_pred=y_train_pred))
y_test_pred = rf_model.predict(test_scaled)
print('mae: random forest: test: ',mean_absolute_error(y_true=y_test, y_pred=y_test_pred))
print('r2: random forest: test: ',r2_score(y_true=y_test, y_pred=y_test_pred))
data_comp =[]
for true_val, pred_val in zip(y_test,y_test_pred):
    temp=[]
    temp.append(true_val)
    temp.append(pred_val)
    data_comp.append(temp)
    
rf_prediction = pd.DataFrame(data_comp,columns=['True_Value','Predicted_Value'])

y_train_pred = knn.predict(train_scaled)
print('mae: knn: train: ',mean_absolute_error(y_true=y_train, y_pred=y_train_pred))
print('r2: knn: train: ',r2_score(y_true=y_train, y_pred=y_train_pred))
y_test_pred = knn.predict(test_scaled)
print('mae: knn: test: ',mean_absolute_error(y_true=y_test, y_pred=y_test_pred))
print('r2: knn: test: ',r2_score(y_true=y_test, y_pred=y_test_pred))



mae: ridge: train:  1.905527167503767
r2: ridge: train:  0.9249247009225243
mae: ridge: test:  3.2857142857142856
r2: ridge: test:  0.7685835927387867
mae: decision tree: train:  0.0
r2: decision tree: train:  1.0
mae: decision tree: test:  3.2857142857142856
r2: decision tree: test:  0.7685835927387867
mae: random forest: train:  0.9903571428571428
r2: random forest: train:  0.9817695636213195
mae: random forest: test:  2.625306122448979
r2: random forest: test:  0.8729034883347775
mae: knn: train:  2.4892857142857143
r2: knn: train:  0.8832749669321804
mae: knn: test:  2.6979591836734698
r2: knn: test:  0.8391144914048544


#### RandomForestRegressor is the best model and we will use it to predict the highest SUCCESSFUL Raid percentage and the corresponding player using Season 7 data.

In [42]:
rf_prediction.loc[rf_prediction['True_Value'] == rf_prediction['True_Value'].max()]

Unnamed: 0,True_Value,Predicted_Value
47,52,48.88


In [43]:
rf_prediction.loc[rf_prediction['Predicted_Value'] == rf_prediction['Predicted_Value'].max()]

Unnamed: 0,True_Value,Predicted_Value
26,46,50.56


In [44]:
players_stats_f_raids1_7.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 1 to 550
Data columns (total 11 columns):
Name                            55 non-null object
Profile                         55 non-null object
matches_played                  55 non-null int64
Successful_raids                55 non-null float64
Total_Raids                     55 non-null int64
Success_Raids_points_percent    55 non-null float64
no_of_super_raids               55 non-null int64
Most points_in_a_match          55 non-null int64
avg_raid_points_per_match       55 non-null float64
season                          55 non-null object
Successful_Raid_percent         55 non-null int64
dtypes: float64(3), int64(5), object(3)
memory usage: 5.2+ KB


In [45]:
players_stats_f_raids1_6.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 161 entries, 2 to 684
Data columns (total 11 columns):
Name                            161 non-null object
Profile                         161 non-null object
matches_played                  161 non-null int64
Successful_raids                161 non-null float64
Total_Raids                     161 non-null int64
Success_Raids_points_percent    161 non-null float64
no_of_super_raids               161 non-null int64
Most points_in_a_match          161 non-null int64
avg_raid_points_per_match       161 non-null float64
season                          161 non-null object
Successful_Raid_percent         161 non-null int64
dtypes: float64(3), int64(5), object(3)
memory usage: 15.1+ KB


In [46]:
X_to_predict = players_stats_f_raids1_7.drop(['Name','Successful_Raid_percent','season','Profile'],axis=1)
pred_scaled = scaler.transform(X_to_predict)
y_true_7 = players_stats_f_raids1_7.loc[:, 'Successful_Raid_percent']
y_pred_7 = rf_model.predict(pred_scaled)
print('mae: random forest: test: ',mean_absolute_error(y_true=y_true_7, y_pred=y_pred_7))
data_comp =[]
for true_val, pred_val in zip(y_true_7,y_pred_7):
    temp=[]
    temp.append(true_val)
    temp.append(pred_val)
    data_comp.append(temp)
    
rf_prediction = pd.DataFrame(data_comp,columns=['True_Value','Predicted_Value'])


mae: random forest: test:  4.617454545454546


In [47]:
players_stats_f_raids1_7_n = players_stats_f_raids1_7.reset_index(drop=True)
players_stats_f_raids1_7_n.head()

Unnamed: 0,Name,Profile,matches_played,Successful_raids,Total_Raids,Success_Raids_points_percent,no_of_super_raids,Most points_in_a_match,avg_raid_points_per_match,season,Successful_Raid_percent
0,pawan kumar sehrawat,Raider,19,187.0,343,70.55,6,29,12.73,7,54
1,pardeep narwal,Raider,19,190.0,383,63.44,12,26,12.78,7,49
2,naveen kumar,Raider,20,201.0,408,62.74,2,19,12.8,7,49
3,maninder singh,Raider,18,161.0,284,61.61,6,19,9.72,7,56
4,vikash kandola,Raider,14,119.0,284,48.59,5,13,9.85,7,41


In [48]:
rf_prediction_f = pd.concat([rf_prediction,players_stats_f_raids1_7_n['Name']],axis=1,ignore_index =True)
rf_prediction_f = rf_prediction_f.rename(columns={0:'True_Value',1:'Predicted_Value',2:'Name'})
rf_prediction_f.head()

Unnamed: 0,True_Value,Predicted_Value,Name
0,54,50.08,pawan kumar sehrawat
1,49,49.8,pardeep narwal
2,49,50.44,naveen kumar
3,56,46.32,maninder singh
4,41,38.8,vikash kandola


In [49]:
print(rf_prediction_f.loc[rf_prediction_f['True_Value'] == rf_prediction_f['True_Value'].max()])
print(rf_prediction_f.loc[rf_prediction_f['Predicted_Value'] == rf_prediction_f['Predicted_Value'].max()])
print('\n')
print('Player with highest SUCCESSFUL RAID percentage: ',rf_prediction_f.loc[rf_prediction_f['Predicted_Value'] == rf_prediction_f['Predicted_Value'].max()]['Name'].values[0])

    True_Value  Predicted_Value                    Name
5           60            46.80  siddharth sirish desai
27          60            45.92     prashanth kumar rai
43          60            33.00             sonu jaglan
   True_Value  Predicted_Value          Name
2          49            50.44  naveen kumar


Player with highest SUCCESSFUL RAID percentage:  naveen kumar


#### player with the highest SUCCESSFUL RAID percentage: Naveen Kumar

In [50]:
rf_prediction_f.loc[rf_prediction_f['Predicted_Value'] > 45]

Unnamed: 0,True_Value,Predicted_Value,Name
0,54,50.08,pawan kumar sehrawat
1,49,49.8,pardeep narwal
2,49,50.44,naveen kumar
3,56,46.32,maninder singh
5,60,46.8,siddharth sirish desai
27,60,45.92,prashanth kumar rai


In [51]:
players_stats_f_raids1_7_n.head(5)

Unnamed: 0,Name,Profile,matches_played,Successful_raids,Total_Raids,Success_Raids_points_percent,no_of_super_raids,Most points_in_a_match,avg_raid_points_per_match,season,Successful_Raid_percent
0,pawan kumar sehrawat,Raider,19,187.0,343,70.55,6,29,12.73,7,54
1,pardeep narwal,Raider,19,190.0,383,63.44,12,26,12.78,7,49
2,naveen kumar,Raider,20,201.0,408,62.74,2,19,12.8,7,49
3,maninder singh,Raider,18,161.0,284,61.61,6,19,9.72,7,56
4,vikash kandola,Raider,14,119.0,284,48.59,5,13,9.85,7,41


In [52]:
players_stats_f.columns

Index(['Name', 'matches_played', 'total_points_earned',
       'Most points_in_a_match', 'not_out_percent', 'Total_Raids',
       'Success_Raids_points_percent', 'no_of_super_raids', 'super_10s',
       'total_raid_points', 'avg_raid_points_per_match', 'no_of_super_tackles',
       'high_5s', 'total_tackle_points', 'avg_tackle_per_match',
       'total_tackles', 'tackle_strike_rate', 'season', 'Successful_raids',
       'Successful_tackles', 'Super_raids', 'Super_tackles', 'Team', 'Profile',
       'Successful_Raid_percent', 'Successful_Tackle_percent'],
      dtype='object')

In [53]:
'''
For predicting the Successful raid percentage, filter the defenders & all rounders and 
create the data frame with percentage between 0-90. A few observations have 100 percent but they are one match-winders.
'''

players_stats_f_tackles = players_stats_f.loc[(players_stats_f['Profile'].isin(['Defender','All Rounder']))
                                              & (players_stats_f['matches_played']>7) 
                                              & (players_stats_f['Successful_Tackle_percent']>0) 
                                              & (players_stats_f['Successful_Tackle_percent']<90)][['Name','matches_played','Profile','total_tackle_points','total_tackles','Successful_tackles','tackle_strike_rate','avg_tackle_per_match','season','Successful_Tackle_percent']]

players_stats_f_tackles

Unnamed: 0,Name,matches_played,Profile,total_tackle_points,total_tackles,Successful_tackles,tackle_strike_rate,avg_tackle_per_match,season,Successful_Tackle_percent
31,deepak niwas hooda,18,All Rounder,11,26,10.0,42.30,0.50,7,38
32,deepak niwas hooda,22,All Rounder,12,38,11.0,31.57,0.50,6,28
33,deepak niwas hooda,24,All Rounder,14,34,13.0,41.17,0.54,5,38
34,deepak niwas hooda,16,All Rounder,4,28,4.0,14.28,0.25,4,14
35,deepak niwas hooda,12,All Rounder,7,21,7.0,33.33,0.58,3,33
36,deepak niwas hooda,15,All Rounder,36,61,32.0,59.01,2.13,2,52
37,deepak niwas hooda,14,All Rounder,2,22,2.0,9.09,0.14,1,9
57,rohit gulia,18,All Rounder,5,17,8.0,29.41,0.27,7,47
58,rohit gulia,19,All Rounder,3,19,3.0,15.78,0.15,6,15
59,rohit gulia,19,All Rounder,3,20,3.0,15,0.15,5,15


In [54]:
# Segregate the Season 7 data from others. Season 1-6 data will be used to train the model
# Season 7 data will be used to predict the SUCCESSFUL Tackle percentage for each player.

players_stats_f_tackles1_6 = players_stats_f_tackles.loc[players_stats_f_tackles['season']!='7']
players_stats_f_tackles1_7 = players_stats_f_tackles.loc[players_stats_f_tackles['season']=='7']

In [55]:
'''
This is a Regression problem as the output is a continuous variable - successul raid/tackle percentage
Use various ML algorithms and check the r2 score and MAE (mean absolute error) for each of those.
Pick the model with highest r2 score or lowest MAE
'''
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn import metrics

y = players_stats_f_tackles1_6.loc[:, 'Successful_Tackle_percent']
X = players_stats_f_tackles1_6.drop(['Name','Successful_Tackle_percent','season','Profile'],axis=1)


# split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.3, 
                                                    random_state = 10)
scaler = StandardScaler()
train_scaled = scaler.fit_transform(X_train)
test_scaled = scaler.transform(X_test)

ridge = Ridge(alpha=0.001)
tree_model = DecisionTreeRegressor(criterion='mae')
rf_model = RandomForestRegressor(n_estimators=25,criterion='mae')
knn = KNeighborsRegressor()

ridge.fit(train_scaled, y_train)
tree_model.fit(train_scaled, y_train)
rf_model.fit(train_scaled, y_train)
knn.fit(train_scaled, y_train)

y_train_pred = ridge.predict(train_scaled)
print('mae: ridge: train: ',mean_absolute_error(y_true=y_train, y_pred=y_train_pred))
print('r2: ridge: train: ',r2_score(y_true=y_train, y_pred=y_train_pred))
y_test_pred = tree_model.predict(test_scaled)
print('mae: ridge: test: ',mean_absolute_error(y_true=y_test, y_pred=y_test_pred))
print('r2: ridge: test: ',r2_score(y_true=y_test, y_pred=y_test_pred))

y_train_pred = tree_model.predict(train_scaled)
print('mae: decision tree: train: ',mean_absolute_error(y_true=y_train, y_pred=y_train_pred))
print('r2: decision tree: train: ',r2_score(y_true=y_train, y_pred=y_train_pred))
y_test_pred = tree_model.predict(test_scaled)
print('mae: decision tree: test: ',mean_absolute_error(y_true=y_test, y_pred=y_test_pred))
print('r2: decision tree: test: ',r2_score(y_true=y_test, y_pred=y_test_pred))

y_train_pred = rf_model.predict(train_scaled)
print('mae: random forest: train: ',mean_absolute_error(y_true=y_train, y_pred=y_train_pred))
print('r2: random forest: train: ',r2_score(y_true=y_train, y_pred=y_train_pred))
y_test_pred = rf_model.predict(test_scaled)
print('mae: random forest: test: ',mean_absolute_error(y_true=y_test, y_pred=y_test_pred))
print('r2: random forest: test: ',r2_score(y_true=y_test, y_pred=y_test_pred))
data_comp =[]
for true_val, pred_val in zip(y_test,y_test_pred):
    temp=[]
    temp.append(true_val)
    temp.append(pred_val)
    data_comp.append(temp)
    
rf_prediction_tackle = pd.DataFrame(data_comp,columns=['True_Value','Predicted_Value'])

y_train_pred = knn.predict(train_scaled)
print('mae: knn: train: ',mean_absolute_error(y_true=y_train, y_pred=y_train_pred))
print('r2: knn: train: ',r2_score(y_true=y_train, y_pred=y_train_pred))
y_test_pred = knn.predict(test_scaled)
print('mae: knn: test: ',mean_absolute_error(y_true=y_test, y_pred=y_test_pred))
print('r2: knn: test: ',r2_score(y_true=y_test, y_pred=y_test_pred))

mae: ridge: train:  0.99759343759795
r2: ridge: train:  0.9841425323802815
mae: ridge: test:  3.782608695652174
r2: ridge: test:  0.8342618965445312
mae: decision tree: train:  0.0
r2: decision tree: train:  1.0
mae: decision tree: test:  3.782608695652174
r2: decision tree: test:  0.8342618965445312
mae: random forest: train:  1.0373831775700932
r2: random forest: train:  0.9822608111280307
mae: random forest: test:  2.6217391304347832
r2: random forest: test:  0.9125045174062935
mae: knn: train:  2.973831775700935
r2: knn: train:  0.889059714351379
mae: knn: test:  3.2391304347826075
r2: knn: test:  0.8867607492727625


#### RandomForestClassifier performs the best

In [56]:
rf_prediction_tackle.loc[rf_prediction_tackle['True_Value'] == rf_prediction_tackle['True_Value'].max()]

Unnamed: 0,True_Value,Predicted_Value
36,60,56.32


In [57]:
rf_prediction_tackle.loc[rf_prediction_tackle['Predicted_Value'] == rf_prediction_tackle['Predicted_Value'].max()]

Unnamed: 0,True_Value,Predicted_Value
7,53,58.96


In [58]:
'''
Predict for Season 7 for all the players (defenders and all rounders) - SUCCESSFUL TACKLE percentage.
'''
X_to_predict = players_stats_f_tackles1_7.drop(['Name','Successful_Tackle_percent','season','Profile'],axis=1)
pred_scaled = scaler.transform(X_to_predict)
y_true_7 = players_stats_f_tackles1_7.loc[:, 'Successful_Tackle_percent']
y_pred_7 = rf_model.predict(pred_scaled)
print('mae: random forest: test: ',mean_absolute_error(y_true=y_true_7, y_pred=y_pred_7))
data_comp =[]
for true_val, pred_val in zip(y_true_7,y_pred_7):
    temp=[]
    temp.append(true_val)
    temp.append(pred_val)
    data_comp.append(temp)
    
rf_prediction_tackle = pd.DataFrame(data_comp,columns=['True_Value','Predicted_Value'])

mae: random forest: test:  4.676969696969697


In [59]:
players_stats_f_tackles1_7_n = players_stats_f_tackles1_7.reset_index(drop=True)
players_stats_f_tackles1_7_n = players_stats_f_tackles1_7_n.sort_values(by='Successful_Tackle_percent',ascending=False)
players_stats_f_tackles1_7_n.head()

Unnamed: 0,Name,matches_played,Profile,total_tackle_points,total_tackles,Successful_tackles,tackle_strike_rate,avg_tackle_per_match,season,Successful_Tackle_percent
25,parvesh bhainswal,18,Defender,40,76,46.0,52.63,2.11,7,60
6,fazel atrachali,17,Defender,55,92,54.0,59.78,3.05,7,58
19,nitesh kumar,17,Defender,43,75,44.0,57.33,2.23,7,58
4,sumit,17,Defender,58,106,60.0,54.71,3.41,7,56
7,sandeep narwal,17,All Rounder,40,65,37.0,61.53,2.05,7,56


In [60]:
rf_prediction_tackle_f = pd.concat([rf_prediction_tackle,players_stats_f_tackles1_7_n['Name']],axis=1,ignore_index =True)
rf_prediction_tackle_f = rf_prediction_tackle_f.rename(columns={0:'True_Value',1:'Predicted_Value',2:'Name'})
rf_prediction_tackle_f = rf_prediction_tackle_f.sort_values(by='Predicted_Value',ascending = False)
rf_prediction_tackle_f.head()

Unnamed: 0,True_Value,Predicted_Value,Name
5,54,58.36,vishal bhardwaj
6,58,56.6,fazel atrachali
27,53,54.6,vishal
7,56,54.24,sandeep narwal
10,48,52.24,nitin rawal


In [61]:
#print(rf_prediction_tackle_f.loc[rf_prediction_tackle_f['True_Value'] == rf_prediction_tackle_f['True_Value'].max()])
print(rf_prediction_tackle_f.loc[rf_prediction_tackle_f['Predicted_Value'] == rf_prediction_tackle_f['Predicted_Value'].max()])
print('\n')
print('Player with Highest SUCCESSFUL RAID percentage: ',rf_prediction_tackle_f.loc[rf_prediction_tackle_f['Predicted_Value'] == rf_prediction_tackle_f['Predicted_Value'].max()]['Name'].values[0])

   True_Value  Predicted_Value             Name
5          54            58.36  vishal bhardwaj


Player with Highest SUCCESSFUL RAID percentage:  vishal bhardwaj


In [62]:
rf_prediction_tackle_f.loc[rf_prediction_tackle_f['Predicted_Value'] > 48]

Unnamed: 0,True_Value,Predicted_Value,Name
5,54,58.36,vishal bhardwaj
6,58,56.6,fazel atrachali
27,53,54.6,vishal
7,56,54.24,sandeep narwal
10,48,52.24,nitin rawal
19,58,51.6,nitesh kumar
9,54,51.56,baldev singh
4,56,51.0,sumit
36,54,49.72,amit hooda
3,53,49.52,sandeep kumar dhull


#### player with the highest SUCCESSFUL TACKLE percentage: Vishal Bhardwaj 