# Simulator for VKholi

## Grab data of an ODI from espn cricinfo

In [138]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np

def extract_batting_data(series_id, match_id):

    URL = 'https://www.espncricinfo.com/series/'+ str(series_id) + '/scorecard/' + str(match_id)
    page = requests.get(URL)
    bs = BeautifulSoup(page.content, "html.parser")

    table_body=bs.find_all('tbody')
    batsmen_df = pd.DataFrame(columns=["Name","Desc","Runs", "Balls", "4s", "6s", "SR", "Team"])
    for i, table in enumerate(table_body[0:4:2]):
        rows = table.find_all('tr')
        for row in rows[::2]:
            cols=row.find_all('td')
            cols=[x.text.strip() for x in cols]
            if cols[0] == 'Extras':
                continue
            if len(cols) > 7:
                batsmen_df = batsmen_df.append(pd.Series(
                [re.sub(r"\W+", ' ', cols[0].split("(c)")[0]).strip(), cols[1], 
                cols[2], cols[3], cols[5], cols[6], cols[7], i+1], 
                index=batsmen_df.columns ), ignore_index=True)
            else:
                batsmen_df = batsmen_df.append(pd.Series(
                [re.sub(r"\W+", ' ', cols[0].split("(c)")[0]).strip(), cols[1], 
                0, 0, 0, 0, 0, i+1], index = batsmen_df.columns), ignore_index=True)
                
    for i in range(2):
        dnb_row = bs.find_all("tfoot")[i].find_all("div")
        for c in dnb_row:
            dnb_cols = c.find_all('span')
            dnb = [x.text.strip().split("(c)")[0] for x in dnb_cols]
            dnb = filter(lambda item: item, [re.sub(r"\W+", ' ', x).strip() for x in dnb])
            for dnb_batsman in dnb:
                batsmen_df = batsmen_df.append(pd.Series([dnb_batsman, "DNB", 0, 0, 0, 0, 0, i+1], index = batsmen_df.columns), ignore_index =True)

    return batsmen_df
    
df = extract_batting_data(series_id = 19846, match_id = 1223957)

## Finding the probabilities of runs scored such as 4s , 6s ,3s, 2s
- Probability of 3s scored by assuming the remaning runs that are divisible by 3 are 3s
- Scores of 1 and 2 were compounded together as 1s and 2s 
- Score for 1s and 2s are assumed as the remaining scores after 3s , 4s , 6s have been calculated.

In [139]:
df.loc[df['Name'] == 'V Kohli','6s'] = 1
df.head()

Unnamed: 0,Name,Desc,Runs,Balls,4s,6s,SR,Team
0,S Dhawan,c Agar b Abbott,16,27,2,0,59.25,1
1,Shubman Gill,lbw b Agar,33,39,3,1,84.61,1
2,V Kohli,c †Carey b Hazlewood,63,78,5,1,80.76,1
3,SS Iyer,c Labuschagne b Zampa,19,21,2,0,90.47,1
4,KL Rahul,lbw b Agar,5,11,0,0,45.45,1


In [140]:
df['p_4s'] = df['4s'].astype(int)/ df['Balls'].astype(int)
df['p_6s'] = df['6s'].astype(int)/ df['Balls'].astype(int)

df['n_3s'] = (df['Runs'].astype(int) - (df['4s'].astype(int)*4 + df['6s'].astype(int)*6)) //3
df['p_3s'] = df['n_3s'].astype(int)/df['Balls'].astype(int)

df['n_1s_2s'] =df['Runs'].astype(int)-( df['4s'].astype(int)*4 + df['6s'].astype(int)*6 + df['n_3s'].astype(int)*3)
df['2s'] = df['n_1s_2s'].astype(int)/df['Balls'].astype(int)

In [141]:
df = df[df['Name'] == 'V Kohli']

Manipulated the data to add a six total , since kolhi did not hit any six this particular match

## Conditional probability (when u > v)

- Since I have taken only one line of data , I have calculated conditional probabilities prior 
- Example 
    - P( scoring 1s and 2s / 3s,4s,6s ) = p(scoring 1s and 2s , 3s , 4s , 6s ) / p(scoring 3s,4s,6s) = 8.660090095472535e-09

In [142]:
p_2s = 0.012821 # These values were calculated from the table 
p_3s = 0.179487
p_4s = 0.064103
p_6s = 0.012821
p_all = P_1sand2s*p_3s*p_4s*p_6s #P(AnBnCnD)

In [143]:
def bayes(c1,c2,c3): # Simple function to calculate the Conditional probabilities
    p = p_all / c1*c2*c3
    return p
X= [bayes(p_3s,p_4s,p_6s), #P(1s and 2s / 4s,6s,3s)
bayes(p_1sand2s,p_3s,p_4s), #P(6s / 1s and 2s,6s,3s)
bayes(p_1sand2s,p_4s,p_6s), #P(3s / 1s and 2s , 4s,6s)
bayes(p_1sand2s,p_3s,p_6s)] #P(4s /1s and 2s , 3s,6s)
X

[8.660090095472535e-09,
 1.6972452708675887e-06,
 1.2123653310709608e-07,
 3.39459644911991e-07]

In [156]:
# Since there wasn't any data for calculating # wides , It was assumed 
v  = 0.1  #probability of wide or no ball = # wide balls to a player / total balls to a player

## Assumption for 1 runs

- It was assumed that the probability of scoring 1 run is all 1 - sum of p(scoring all other kind of runs)

In [162]:
s1 = 1- (df['2s'][2]+df['p_3s'][2]+df['p_4s'][2]+df['p_6s'][2])  # P(scoring 1 run)

In [174]:
df['2s'][2],df['p_3s'][2],df['p_4s'][2],df['p_6s'][2]

(0.01282051282051282,
 0.15384615384615385,
 0.0641025641025641,
 0.01282051282051282)

In [178]:
# for i in range(100):
#     print(np.random.multinomial(1,[df['2s'][2],df['p_3s'][2],df['p_4s'][2],df['p_6s'][2]]))

## Scoring Algo:
- I used Multinomial Distribution and recorded the runs as per the paper .
- for wickets I assumed that virat's strike rate is normally distributed with mean 60 for odis and Standard deviation of 5. 
- I assumed that virat gets out if he scored more that his average strike rate.
- I Could apply wicket of virat kolhi based on his current matches average so far

In [221]:
def uni(u,v,R,wickets):
    if u < v : # signals wide or no ball
        y = np.random.multinomial(1,[df['2s'][2],df['p_3s'][2],df['p_4s'][2],df['p_6s'][2]])
        R += 1 + s1 +2*y[0]+3*y[1]+4*y[2]+6*y[3]
    else:
        R += s1 +2*X[0] + 3*X[2] + 4*X[3] + 6*X[1]
    return R

In [280]:
wickets = 0
R = 0
runs = 0 
avg = [] # for strike rate
n = 150 # number of balls
sr = 1000 # assumed value 
for i in range(n):
    if i == np.random.randint(0,n) : # Random number between 0 to n
        mean = sum(avg)/i
        sr = np.random.normal(mean,5) # Strike rate for this match 
    if wickets == 10:
        xb = 0
    else:
        u = np.random.uniform(0,1) 
        runs += uni(u,v,R,wickets) 
        avg.append(runs)
    print('Ball ',i+1,' :',int(runs))
    if runs > sr: #Wickets are based on his strike rate
        wickets += 1
        print('Virat Kohli out for :', int(runs))
        print('Score :', int(runs) , ', wickets :' , 1)
        break
else:
    print('Score by Kohli :', int(runs), '*', 'from ',n,'balls')
# print(u)

Ball  1  : 0
Ball  2  : 5
Ball  3  : 6
Ball  4  : 7
Ball  5  : 7
Ball  6  : 8
Ball  7  : 16
Ball  8  : 17
Ball  9  : 17
Ball  10  : 18
Ball  11  : 19
Ball  12  : 20
Ball  13  : 20
Ball  14  : 21
Ball  15  : 22
Ball  16  : 23
Ball  17  : 23
Ball  18  : 31
Ball  19  : 32
Ball  20  : 33
Ball  21  : 33
Ball  22  : 34
Ball  23  : 35
Ball  24  : 36
Ball  25  : 36
Ball  26  : 41
Ball  27  : 42
Ball  28  : 43
Ball  29  : 43
Ball  30  : 44
Ball  31  : 45
Ball  32  : 46
Ball  33  : 46
Ball  34  : 47
Ball  35  : 48
Ball  36  : 49
Ball  37  : 49
Ball  38  : 50
Ball  39  : 51
Ball  40  : 52
Ball  41  : 58
Ball  42  : 58
Ball  43  : 59
Ball  44  : 60
Ball  45  : 61
Ball  46  : 61
Ball  47  : 62
Ball  48  : 63
Ball  49  : 64
Ball  50  : 64
Ball  51  : 65
Ball  52  : 66
Ball  53  : 67
Ball  54  : 67
Ball  55  : 68
Ball  56  : 69
Ball  57  : 70
Ball  58  : 70
Ball  59  : 71
Ball  60  : 72
Ball  61  : 73
Ball  62  : 80
Ball  63  : 81
Ball  64  : 82
Ball  65  : 83
Ball  66  : 83
Ball  67  : 84
Ball  68  