In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn


In [51]:
# Load data of fixtures
# fixtures = pd.read_csv('dataset/matches.csv')

# Load ball to ball dataset
deliveries = pd.read_csv('worldcup2023.csv')

In [52]:
deliveries = deliveries[['start_date', 'match_id', 'venue', 'innings', 'ball', 'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler', 'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes', 'penalty', 'wicket_type', 'player_dismissed']]
deliveries.head()

Unnamed: 0,start_date,match_id,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed
0,2023-10-05,1384392,"Narendra Modi Stadium, Ahmedabad",1,0.1,England,New Zealand,JM Bairstow,DJ Malan,TA Boult,0,0,,,,,,,
1,2023-10-05,1384392,"Narendra Modi Stadium, Ahmedabad",1,0.2,England,New Zealand,JM Bairstow,DJ Malan,TA Boult,6,0,,,,,,,
2,2023-10-05,1384392,"Narendra Modi Stadium, Ahmedabad",1,0.3,England,New Zealand,JM Bairstow,DJ Malan,TA Boult,1,0,,,,,,,
3,2023-10-05,1384392,"Narendra Modi Stadium, Ahmedabad",1,0.4,England,New Zealand,DJ Malan,JM Bairstow,TA Boult,1,0,,,,,,,
4,2023-10-05,1384392,"Narendra Modi Stadium, Ahmedabad",1,0.5,England,New Zealand,JM Bairstow,DJ Malan,TA Boult,4,0,,,,,,,


In [53]:
# total teams
teams = np.append(deliveries['batting_team'], deliveries['bowling_team'])
teams = np.unique(teams)

player_teamwise = pd.DataFrame(columns=['player', 'team'])
for team in teams:
    temp = np.array([])
    temp = np.append(temp, deliveries[deliveries['batting_team'] == team]['striker'])
    temp = np.append(temp, deliveries[deliveries['batting_team'] == team]['non_striker'])
    temp = np.append(temp, deliveries[deliveries['bowling_team'] == team]['bowler'])
    temp = np.unique(temp)
    # pd.DataFrame({'player': temp, 'team': team})
    player_teamwise = pd.concat([player_teamwise, pd.DataFrame({'player': temp, 'team': team})], ignore_index=True)


In [54]:
# Batting stats
players = player_teamwise['player']

runs = np.array([])
sr = np.array([])


for player in players:
    matches_bats = deliveries[deliveries['striker'] == player]
    most_runs = matches_bats['runs_off_bat'].sum()
    strike_rate = (most_runs / len(matches_bats)) * 100

    runs = np.append(runs, most_runs)
    sr = np.append(sr, strike_rate)
    # print(player, most_runs, strike_rate)

print(len(runs))
print(len(sr))
player_teamwise['most_runs'] = runs
player_teamwise['strike_rate'] = sr

player_teamwise

150
150


  strike_rate = (most_runs / len(matches_bats)) * 100


Unnamed: 0,player,team,most_runs,strike_rate
0,Azmatullah Omarzai,Afghanistan,353.0,95.405405
1,Fazalhaq Farooqi,Afghanistan,2.0,22.222222
2,Hashmatullah Shahidi,Afghanistan,310.0,70.776256
3,Ibrahim Zadran,Afghanistan,376.0,73.725490
4,Ikram Alikhil,Afghanistan,89.0,86.407767
...,...,...,...,...
145,MD Shanaka,Sri Lanka,80.0,96.385542
146,MDKJ Perera,Sri Lanka,149.0,97.385621
147,P Nissanka,Sri Lanka,332.0,87.368421
148,PVD Chameera,Sri Lanka,6.0,15.384615


In [55]:
# Bowling stats
players = player_teamwise['player']

eco = np.array([])
avg = np.array([])
sr = np.array([])

for player in players:
    matches_bowls = deliveries[deliveries['bowler'] == player]
    balls_bowled = len(matches_bowls)
    wickets = matches_bowls['wicket_type'].count()
    runs_given = matches_bowls['runs_off_bat'].sum() + matches_bowls['extras'].sum()

    overs_bowled = len(matches_bowls)/6
    economy = runs_given/overs_bowled
    bowling_average = runs_given/wickets
    bowling_strike_rate = len(matches_bowls)/wickets

    eco = np.append(eco, economy)
    avg = np.append(avg, bowling_average)
    sr = np.append(sr, bowling_strike_rate)

player_teamwise['economy'] = eco
player_teamwise['avg'] = avg
player_teamwise['sr'] = sr

  economy = runs_given/overs_bowled
  bowling_average = runs_given/wickets
  bowling_strike_rate = len(matches_bowls)/wickets
  bowling_average = runs_given/wickets
  bowling_strike_rate = len(matches_bowls)/wickets


In [56]:
player_teamwise

Unnamed: 0,player,team,most_runs,strike_rate,economy,avg,sr
0,Azmatullah Omarzai,Afghanistan,353.0,95.405405,6.919149,38.714286,33.571429
1,Fazalhaq Farooqi,Afghanistan,2.0,22.222222,5.542169,32.857143,35.571429
2,Hashmatullah Shahidi,Afghanistan,310.0,70.776256,,,
3,Ibrahim Zadran,Afghanistan,376.0,73.725490,,,
4,Ikram Alikhil,Afghanistan,89.0,86.407767,,,
...,...,...,...,...,...,...,...
145,MD Shanaka,Sri Lanka,80.0,96.385542,5.818182,inf,inf
146,MDKJ Perera,Sri Lanka,149.0,97.385621,,,
147,P Nissanka,Sri Lanka,332.0,87.368421,,,
148,PVD Chameera,Sri Lanka,6.0,15.384615,6.347368,100.500000,95.000000


In [57]:
player_teamwise.to_csv('player_stats.csv', index=False)

Performance of 4 (semi-finalists) teams in previous matches

In [58]:
from sklearn import preprocessing
id_encoder = preprocessing.LabelEncoder()

icc_data = []
teams = player_teamwise['team'].unique()
teams[2]

'Bangladesh'

In [60]:
icc_data = []
for team in teams:
    indian_matches = deliveries[deliveries['bowling_team'] == team]
    matches = indian_matches['match_id'].unique()
    scores = pd.DataFrame(columns=np.append(['player'], matches))
    ############################################################################################
    first_row = [team]
    icc_data.append(first_row)
    ############################################################################################
    indian_squad = player_teamwise[player_teamwise['team'] == team]
    print(indian_squad)
    for player in indian_squad['player']:
        temp = []
        player_on_strike = indian_matches[indian_matches['bowler'] == player]
        scores['player'] = player
        records = player_on_strike.groupby('match_id').sum()['runs_off_bat']
        records += player_on_strike.groupby('match_id').sum()['extras']
        
        records = records
        print(player)
        print(records)
        temp.append(player)
        for match in matches:
            if match in records.keys():
                temp.append(records[match])
            else:
                temp.append('-')
        icc_data.append(temp)

                  player         team  most_runs  strike_rate    economy  \
0     Azmatullah Omarzai  Afghanistan      353.0    95.405405   6.919149   
1       Fazalhaq Farooqi  Afghanistan        2.0    22.222222   5.542169   
2   Hashmatullah Shahidi  Afghanistan      310.0    70.776256        NaN   
3         Ibrahim Zadran  Afghanistan      376.0    73.725490        NaN   
4          Ikram Alikhil  Afghanistan       89.0    86.407767        NaN   
5          Mohammad Nabi  Afghanistan       55.0    71.428571   4.129032   
6       Mujeeb Ur Rahman  Afghanistan       51.0   127.500000   5.532000   
7      Najibullah Zadran  Afghanistan        7.0    33.333333        NaN   
8          Naveen-ul-Haq  Afghanistan       16.0    64.000000   6.650602   
9             Noor Ahmad  Afghanistan       26.0    81.250000   4.600000   
10    Rahmanullah Gurbaz  Afghanistan      280.0    96.551724        NaN   
11           Rahmat Shah  Afghanistan      320.0    74.592075  12.000000   
12          

  records = player_on_strike.groupby('match_id').sum()['wicket_type']


KeyError: 'wicket_type'

In [None]:
for d in icc_data:
    print(d)

In [61]:
from sklearn import preprocessing
id_encoder = preprocessing.LabelEncoder()

# indian_matches = deliveries[deliveries['batting_team'] == 'Afghanistan']
indian_matches = deliveries[deliveries['bowling_team'] == 'India']
matches = indian_matches['match_id'].unique()

scores = pd.DataFrame(columns=np.append(['player'], matches))
scores.head()
############################################################################################
first_row = ['player', 1384396, 1384400, 1384403, 1384408, 1384412, 1384420, 1384424, 1384428]
main_data = []
main_data.append(first_row)
############################################################################################

In [62]:

indian_squad = player_teamwise[player_teamwise['team'] == 'India']
for player in indian_squad['player']:
    temp = []
    player_on_strike = indian_matches[indian_matches['bowler'] == player]
    scores['player'] = player
    records = player_on_strike.groupby('match_id').sum()['runs_off_bat']
    records += player_on_strike.groupby('match_id').sum()['extras']
    
    records = records
    print(player)
    print(records)
    
    temp.append(player)
    for match in matches:
        if match in records.keys():
            temp.append(records[match])
        else:
            temp.append('-')
    main_data.append(temp)

print(scores)

HH Pandya
match_id
1384396    28
1384400    45
1384403    34
1384408     8
Name: runs_off_bat, dtype: int64
Ishan Kishan
Series([], Name: runs_off_bat, dtype: int64)
JJ Bumrah
match_id
1384396    36
1384400    40
1384403    20
1384408    41
1384412    45
1384420    32
1384424     8
1384428    14
Name: runs_off_bat, dtype: int64
KL Rahul
Series([], Name: runs_off_bat, dtype: int64)
Kuldeep Yadav
match_id
1384396    42
1384400    40
1384403    35
1384408    47
1384412    76
1384420    24
1384424     3
1384428     7
Name: runs_off_bat, dtype: int64
Mohammed Shami
match_id
1384412    59
1384420    22
1384424    19
1384428    18
Name: runs_off_bat, dtype: int64
Mohammed Siraj
match_id
1384396    26
1384400    78
1384403    52
1384408    61
1384412    45
1384420    35
1384424    21
1384428    11
Name: runs_off_bat, dtype: int64
R Ashwin
match_id
1384396    34
Name: runs_off_bat, dtype: int64
RA Jadeja
match_id
1384396    33
1384400    38
1384403    38
1384408    38
1384412    48
1384420    1

  records = player_on_strike.groupby('match_id').sum()['runs_off_bat']
  records += player_on_strike.groupby('match_id').sum()['extras']
  records = player_on_strike.groupby('match_id').sum()['runs_off_bat']
  records += player_on_strike.groupby('match_id').sum()['extras']
  records = player_on_strike.groupby('match_id').sum()['runs_off_bat']
  records += player_on_strike.groupby('match_id').sum()['extras']
  records = player_on_strike.groupby('match_id').sum()['runs_off_bat']
  records += player_on_strike.groupby('match_id').sum()['extras']
  records = player_on_strike.groupby('match_id').sum()['runs_off_bat']
  records += player_on_strike.groupby('match_id').sum()['extras']
  records = player_on_strike.groupby('match_id').sum()['runs_off_bat']
  records += player_on_strike.groupby('match_id').sum()['extras']
  records = player_on_strike.groupby('match_id').sum()['runs_off_bat']
  records += player_on_strike.groupby('match_id').sum()['extras']
  records = player_on_strike.groupby('mat

In [63]:
for d in main_data:
    print(d)


['player', 1384396, 1384400, 1384403, 1384408, 1384412, 1384420, 1384424, 1384428]
['HH Pandya', 28, 45, 34, 8, '-', '-', '-', '-']
['Ishan Kishan', '-', '-', '-', '-', '-', '-', '-', '-']
['JJ Bumrah', 36, 40, 20, 41, 45, 32, 8, 14]
['KL Rahul', '-', '-', '-', '-', '-', '-', '-', '-']
['Kuldeep Yadav', 42, 40, 35, 47, 76, 24, 3, 7]
['Mohammed Shami', '-', '-', '-', '-', 59, 22, 19, 18]
['Mohammed Siraj', 26, 78, 52, 61, 45, 35, 21, 11]
['R Ashwin', 34, '-', '-', '-', '-', '-', '-', '-']
['RA Jadeja', 33, 38, 38, 38, 48, 16, 4, 33]
['RG Sharma', '-', '-', '-', '-', '-', '-', '-', '-']
['SA Yadav', '-', '-', '-', '-', '-', '-', '-', '-']
['SN Thakur', '-', 31, 12, 59, '-', '-', '-', '-']
['SS Iyer', '-', '-', '-', '-', '-', '-', '-', '-']
['Shubman Gill', '-', '-', '-', '-', '-', '-', '-', '-']
['V Kohli', '-', '-', '-', 2, '-', '-', '-', '-']


In [None]:
# save main_data to csv using pandas
df = pd.DataFrame(icc_data)
df.to_csv('india_batting.csv', index=False)


In [None]:
from sklearn.preprocessing import LabelEncoder

team_encoder = LabelEncoder()

deliveries['batting_team'] = team_encoder.fit_transform(deliveries['batting_team'])
deliveries['bowling_team'] = team_encoder.fit_transform(deliveries['bowling_team'])


Label Encodings of playing team as per the dataset

[0 1 2 3 4 5 6 7 8 9]

['Afghanistan' 'Australia' 'Bangladesh' 'England' 'India' 'Netherlands'
 'New Zealand' 'Pakistan' 'South Africa' 'Sri Lanka']

In [None]:
# Labels encoded to the teams

teams = deliveries['batting_team'].unique()
print(np.sort(teams))
print(team_encoder.inverse_transform(np.sort(teams)))

Label Encodings of players

In [None]:
players = np.append(deliveries['striker'].unique(), np.append(deliveries['non_striker'].unique(), deliveries['bowler'].unique()))
players = np.unique(players)
# print(len(players))

player_encoder = LabelEncoder()
player_encoder.fit(players)

deliveries['striker'] = player_encoder.transform(deliveries['striker'])
deliveries['non_striker'] = player_encoder.transform(deliveries['non_striker'])
deliveries['bowler'] = player_encoder.transform(deliveries['bowler'])

Label Encodings of Stadiums

In [None]:
grounds = deliveries['venue'].unique()

ground_encoder = LabelEncoder()
ground_encoder.fit(grounds)

deliveries['venue'] = ground_encoder.transform(deliveries['venue'])

In [None]:
deliveries.head()

In [None]:
deliveries = deliveries[['venue', 'innings', 'ball', 'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler', 'runs_off_bat']]
deliveries.head()