In [1]:
import json
import csv
import re
import os
import pandas as pd
import numpy as np
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
import holoviews as hv
from holoviews import opts
hv.extension('bokeh', 'matplotlib')
import hvplot.pandas
import panel as pn
import param
pn.extension()

from math import pi
from bokeh.palettes import Category20c, Category20
from bokeh.plotting import figure
from bokeh.transform import cumsum

from geopy.geocoders import Nominatim

In [2]:
ball = pd.read_csv('data\\ball_by_ball.csv')
column_types = {
    'over': 'int',            
    'batter_runs': 'int',          
    'extras_runs': 'int',
    'total_runs': 'int'}
ball = ball.astype(column_types)
ball.sort_values(by='over', inplace=True)
ball['score'] = ball['total_runs'].cumsum()
ball['partnership'] = ball.apply(lambda row: set([row['batter'], row['non_striker']]), axis=1) #fix score

In [10]:
match = pd.read_csv('data\\matches.csv')
match['start_date'] = match['dates'].apply(lambda x: x[:10])
#match['teams'] = match['teams'].apply(lambda x: set(x.split(', ')))
match['start_date'] = pd.to_datetime(match['start_date'])
match['start_date'] = match['start_date'].dt.date
match['players_team1'] = match['players_team1'].apply(lambda x: x.split(', ') if isinstance(x, str) else x)
match['players_team2'] = match['players_team2'].apply(lambda x: x.split(', ') if isinstance(x, str) else x)
match['players'] = match.apply(lambda x : set(x['players_team1']) | set(x['players_team1']),axis = 1)


In [11]:
print(match.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 820 entries, 0 to 819
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   balls_per_over   820 non-null    int64 
 1   city             820 non-null    object
 2   dates            820 non-null    object
 3   event_name       820 non-null    object
 4   match_number     820 non-null    object
 5   gender           820 non-null    object
 6   match_type       820 non-null    object
 7   venue            820 non-null    object
 8   season           820 non-null    object
 9   team_type        820 non-null    object
 10  toss_decision    820 non-null    object
 11  toss_winner      820 non-null    object
 12  outcome_result   820 non-null    object
 13  player_of_match  810 non-null    object
 14  team_1           820 non-null    object
 15  team_2           820 non-null    object
 16  players_team1    820 non-null    object
 17  players_team2    820 non-null    ob

In [12]:
def get_all_players_by_team(team):
    players = set()
    players.update(match.loc[match['team_2'] == team, 'players_team2'].explode())
    players.update(match.loc[match['team_1'] == team, 'players_team1'].explode())
    return players

In [13]:
def get_summary_freq(player):
    df = ball[ball['batter']==player]
    if df.empty:
         freq_by_ball = {'1':0,'2':0,'3':0,'4':0,'5':0,'6':0}
    else :
        freq_by_ball = df['total_runs'].value_counts().to_dict()

    return freq_by_ball
    

In [68]:
def get_runs_stats(player):
    df = ball[ball['batter']==player]
    if df.empty:
        match_scores = [0]
        freq_by_ball = {'0':0,'1':0,'2':0,'3':0,'4':0,'5':0,'6':0,'7':0}
    else :
        match_scores = df.groupby(['match_id','innings']).agg(total =('batter_runs','sum'))
        freq_by_ball = df['total_runs'].value_counts().to_dict()

    innings = len(df.groupby(['match_id','innings']))
    matches = len(df.groupby(['match_id']))
    
    total = sum(freq_by_ball.values())
    # prob_by_ball = {key : value/total for key, value in freq_by_ball.items()}
    stats = {'balls_played' : total,
             'frequency' : freq_by_ball,
             'mean_runs' : match_scores.mean().iloc[0],
             'highest_runs' : match_scores.max().iloc[0],
             'lowest_runs' : match_scores.min().iloc[0],
             'innings' : innings,
             'matches' : matches} #include total runs, fixes matches played

    return stats
    

In [71]:
def create_batsman_df(team):
    players = get_all_players_by_team(team)
    df = pd.DataFrame({'players': list(players)})
    
    #df = df[df['total_balls_played']>=300]
    
    # df['total_runs'] = df['summary'].apply(lambda x: sum({key: key*value for key, value in x.items()}.values()))

    df['player_details'] = df['players'].apply(get_runs_stats)
    
    details_df = pd.json_normalize(df['player_details'])
    df = pd.concat([df.drop(columns=['player_details']), details_df], axis=1)
    
    print(df.head(10))
    print(df.describe())
    print(df.info())
    return df


In [72]:
ind = create_batsman_df('India')

        players  balls_played  mean_runs  highest_runs  lowest_runs  innings  \
0        SS Das           105  28.000000            28           28        1   
1     RG Sharma          7436  38.468468           212            0      111   
2      SA Yadav            20   8.000000             8            8        1   
3      A Mishra          1108  20.903226            84            0       31   
4      MM Patel           131   4.727273            15            0       11   
5    VVS Laxman         10968  41.945736           200            0      129   
6    D Padikkal           103  65.000000            65           65        1   
7      R Ashwin          6359  23.351351           124            0      148   
8      SK Raina          1449  24.774194           120            0       31   
9  Ishan Kishan            91  26.000000            52            1        3   

   matches  frequency.0  frequency.1  frequency.4  frequency.2  frequency.6  \
0        1           89         10.0    

lets try two approaches

in my first approach i will calculate p of shot played by each batsmen and ball delivered by bowler independent of each other

in my second approach i will calculate p of shot played by a batsman for a given bowler

In [27]:
batsman =  ind.copy(deep =True)


In [28]:
len(ind)

88

In [31]:
def create_bowler_df(team):
    players = get_all_players_by_team(team)
    df = pd.DataFrame({'players': list(players)})
    df['matches_bowled'] = df['players'].apply(lambda x: ball[ball['bowler']==x]['match_id'].nunique())
    df['overs_bowled'] = df['players'].apply(lambda x: len(ball[ball['bowler']==x].groupby(['match_id','over'])))
    print(df.head())
    print(df.describe())
    print(df.info())
    return df
    #introduce innings

In [32]:
bowlers = create_bowler_df('India')

          players  matches_bowled  overs_bowled
0    SR Tendulkar              40           215
1         NV Ojha               0             0
2  Mohammed Shami              64          1721
3    Yuvraj Singh              23           144
4         P Kumar               6           233
       matches_bowled  overs_bowled
count       88.000000     88.000000
mean        12.738636    333.193182
std         21.868728    718.668226
min          0.000000      0.000000
25%          0.000000      0.000000
50%          3.000000     57.000000
75%         14.000000    218.250000
max        104.000000   3999.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88 entries, 0 to 87
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   players         88 non-null     object
 1   matches_bowled  88 non-null     int64 
 2   overs_bowled    88 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 2.2+ KB
None
