# A class for an Innings
The working notebook for a class that, given a particular match, can produce tables for each innings in that match, separating the batting stats, fall of wickets and bowling figures.

In [1]:
# Imports
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import re

In [2]:
# Scorecard page for the Afghanistan vs West Indies only Test (2019-20).
windies = 'https://www.espncricinfo.com/series/19309/scorecard/1193500/afghanistan-vs-west-indies-only-test-west-indies-in-india-2019-20'
# Use a timeout so a stalled connection cannot hang the notebook, and raise on
# HTTP errors so an error page is never parsed as if it were a scorecard.
windies_response = requests.get(windies, timeout=30)
windies_response.raise_for_status()
windies_soup = BeautifulSoup(windies_response.text, features="html.parser")

The page that we are looking at is broken up into tabs that fold and hide for each innings. We can identify these by looking for a particular class found in the html: `sub-module scorecard`.   

This means that if a game is in progress, it won't necessarily have 4 innings tabs, but completed games may.

In [3]:
# Collect every <article> whose class attribute is the scorecard marker,
# then show how many were found.
scorecard_modules = windies_soup.find_all(
    name='article',
    attrs={"class": "sub-module scorecard"},
)
len(scorecard_modules)

4

I'm going to create a class for an innings. We will be able to use this again later when working up to a match level, but I think it's best that we just clean the data one innings at a time for now.

In [4]:
class Innings():
    '''One innings of a match, parsed from a single scorecard tab.

    Parameters
    ----------
    raw_html : parsed HTML node for one innings tab. The notebook passes the
        elements returned by ``find_all('article', ...)`` on a BeautifulSoup
        document; only ``find``, ``find_all`` and ``.text`` are used here.

    Attributes
    ----------
    raw_html : the node passed in, kept for the table-building methods.
    title : text of the first ``h2`` in the tab.
    total : text of the second ``div`` inside the first ``wrap total`` row
        of the batting section.
    '''

    # One fall-of-wicket entry, e.g. '3-155 Ross Taylor'.
    _FOW_PATTERN = re.compile(r'^([0-9]+)-([0-9]+) (.*)$')
    _FOW_PREFIX = 'Fall of wickets: '
    _FOW_COLUMNS = ['wicket', 'runs', 'out_batsman', 'overs', 'partnership']

    def __init__(self, raw_html):
        self.raw_html = raw_html
        self.title = raw_html.find('h2').text
        self.total = raw_html.find('div', {"class": "scorecard-section batsmen"}).find_all('div', {'class':'wrap total'})[0].find_all('div')[1].text

    def _batsmen_rows(self):
        '''Return every ``flex-row`` div inside the batting section.'''
        batsmen_section = self.raw_html.find('div', {"class": "scorecard-section batsmen"})
        return batsmen_section.find_all('div', {"class": "flex-row"})

    def batting(self):
        '''Using a raw scorecard tab, find the batting details.

        Returns
        -------
        pandas.DataFrame indexed by batting position (starting at 1) with the
        scraped text columns plus boolean ``is_out``, ``is_keeper`` and
        ``is_captain`` flags. The keeper/captain markers and surrounding
        whitespace are removed from ``batsman``.
        '''
        # Keep only the rows that actually contain batting information.
        candidates = (row.find('div', {"class": "wrap batsmen"}) for row in self._batsmen_rows())
        batting = [details for details in candidates if details is not None]

        inningsdf = pd.DataFrame([self.clean_batting_row(row) for row in batting])

        # Scorecards can contain an extra column: minutes
        if len(inningsdf.columns) == 8:
            summary_columns = ['batsman', 'how_out', 'runs',
                               'balls_faced', 'minutes', 'fours', 'sixes',
                               'strike_rate']
        else:
            summary_columns = ['batsman', 'how_out', 'runs',
                               'balls_faced', 'fours', 'sixes',
                               'strike_rate']

        inningsdf.columns = summary_columns

        # Generate batting position as the index
        inningsdf.index = inningsdf.index + 1

        # Add some flags about the players. The markers are matched as literal
        # text (regex=False) so the parentheses in '(c)' need no escaping.
        inningsdf['is_out'] = inningsdf['how_out'].str.strip().str.lower() != 'not out'
        inningsdf['is_keeper'] = inningsdf['batsman'].str.contains('†', regex=False)
        inningsdf['is_captain'] = inningsdf['batsman'].str.contains('(c)', regex=False)

        # Clean the visual indicators of captaincy and wicket keeper from the
        # name column, then trim the whitespace the markers leave behind.
        inningsdf['batsman'] = (
            inningsdf['batsman']
            .str.replace('†', '', regex=False)
            .str.replace('(c)', '', regex=False)
            .str.strip()
        )

        return inningsdf

    def bowling(self):
        '''Build the bowling figures table for this innings.

        Returns
        -------
        pandas.DataFrame with one row per ``tr`` in the bowling table body,
        named by the ``th`` header texts, with the blank-named column(s)
        dropped. All values are the scraped strings.
        '''
        bowling_section = self.raw_html.find('div', {'class':'scorecard-section bowling'})
        bowling_headers = [th.text for th in bowling_section.find('thead').find_all('th')]
        bowling_rows = [
            [td.text for td in tr.find_all('td')]
            for tr in bowling_section.find('tbody').find_all('tr')
        ]
        bowling_body = pd.DataFrame(bowling_rows)
        bowling_body.columns = bowling_headers
        # Return a new frame rather than mutating in place.
        return bowling_body.drop(columns=[''])

    def clean_batting_row(self, row):
        '''Flatten one batting row into ``[name, dismissal, *numeric cells]`` (all text).'''
        clean = [row.find('div', {"class": "cell batsmen"}).text]
        clean.append(row.find('div', {"class":"cell commentary"}).text)
        clean = clean + [cell.text for cell in row.find_all('div', {"class": "cell runs"})]
        return clean

    @classmethod
    def _split_fow(cls, entry):
        '''Split a string like '3-155 Ross Taylor' into [wicket, runs, batsman].

        Raises
        ------
        ValueError if ``entry`` does not have the ``<wicket>-<runs> <name>`` shape.
        '''
        fow_match = cls._FOW_PATTERN.search(entry)
        if fow_match is None:
            raise ValueError('Unrecognised fall of wicket entry: %r' % (entry,))
        return list(fow_match.groups())

    def fall_of_wickets(self):
        '''Build the fall-of-wickets table, including each partnership.

        Returns
        -------
        pandas.DataFrame with string columns ``wicket``, ``runs``,
        ``out_batsman`` and ``overs`` plus an integer ``partnership`` column
        (runs added since the previous wicket). An empty frame with the same
        columns is returned when the tab has no fall-of-wickets line.
        '''
        dnb_nodes = [row.find('div', {"class": "wrap dnb"}) for row in self._batsmen_rows()]
        # Select the node by its text prefix rather than by position, in case
        # other rows share the 'wrap dnb' class.
        fow_text = next(
            (node.text for node in dnb_nodes
             if node is not None and self._FOW_PREFIX in node.text),
            None,
        )
        if fow_text is None:
            return pd.DataFrame(columns=self._FOW_COLUMNS)

        # 'a-b (Name, x ov), c-d (Name, y ov)' -> [['a-b Name', 'x ov'], ...]
        clean_fow = fow_text.strip().replace(self._FOW_PREFIX, '').split('), ')
        cleaner_fow = [entry.replace('(', '').replace(')', '').split(', ') for entry in clean_fow]

        fow_df = pd.DataFrame(
            [self._split_fow(parts[0]) for parts in cleaner_fow],
            columns=['wicket', 'runs', 'out_batsman'],
        )
        fow_df['overs'] = [parts[1].replace(' ov', '') for parts in cleaner_fow]

        # The first partnership is simply the score at the first wicket;
        # keep the whole column as integers.
        runs = fow_df['runs'].astype(int)
        fow_df['partnership'] = runs.diff().fillna(runs).astype(int)
        return fow_df

In [5]:
# Print the title of every innings tab that was found, rather than assuming
# exactly four: a match in progress can have fewer.
for module in scorecard_modules:
    print(Innings(module).title)

Afghanistan 1st Innings
West Indies 1st Innings
Afghanistan 2nd Innings
West Indies 2nd Innings


In [6]:
# Batting table for the first scorecard tab of the West Indies match page.
Innings(scorecard_modules[0]).batting()

Unnamed: 0,batsman,how_out,runs,balls_faced,minutes,fours,sixes,strike_rate,is_out,is_keeper,is_captain
1,Ibrahim Zadran,c Holder b Cornwall,17,48,48,2,0,35.41,True,False,False
2,Javed Ahmadi,c Brooks b Warrican,39,81,105,5,1,48.14,True,False,False
3,Ihsanullah,c Hope b Cornwall,24,46,68,3,0,52.17,True,False,False
4,Rahmat Shah,c Holder b Cornwall,4,11,17,1,0,36.36,True,False,False
5,Asghar Afghan,c †Dowrich b Cornwall,4,16,26,0,0,25.0,True,False,False
6,Nasir Jamal,c Hope b Cornwall,2,15,10,0,0,13.33,True,False,False
7,Afsar Zazai,lbw b Cornwall,32,70,99,3,0,45.71,True,True,False
8,Rashid Khan,c Cornwall b Holder,1,15,20,0,0,6.66,True,False,True
9,Amir Hamza,c †Dowrich b Holder,34,84,92,5,0,40.47,True,False,False
10,Yamin Ahmadzai,c Warrican b Cornwall,18,22,29,3,1,81.81,True,False,False


In [7]:
# Scorecard page for the New Zealand vs England 2nd Test (2019-20).
england = 'https://www.espncricinfo.com/series/19297/scorecard/1187672/new-zealand-vs-england-2nd-test-england-in-new-zealand-2019-20'
# Use a timeout so a stalled connection cannot hang the notebook, and raise on
# HTTP errors so an error page is never parsed as if it were a scorecard.
england_response = requests.get(england, timeout=30)
england_response.raise_for_status()
england_soup = BeautifulSoup(england_response.text, features="html.parser")
scorecards = england_soup.find_all('article', {"class": "sub-module scorecard"})

In [8]:
# Batting table for the first scorecard tab of the New Zealand vs England page.
Innings(scorecards[0]).batting()

Unnamed: 0,batsman,how_out,runs,balls_faced,minutes,fours,sixes,strike_rate,is_out,is_keeper,is_captain
1,JA Raval,c Root b Broad,5,21,28,0,0,23.8,True,False,False
2,TWM Latham,b Broad,105,172,249,16,0,61.04,True,False,False
3,KS Williamson,c Root b Woakes,4,20,30,0,0,20.0,True,False,True
4,LRPL Taylor,c Root b Woakes,53,100,155,8,0,53.0,True,False,False
5,HM Nicholls,c Broad b Curran,16,48,64,3,0,33.33,True,False,False
6,BJ Watling,c Burns b Broad,55,192,263,7,0,28.64,True,True,False
7,DJ Mitchell,c Archer b Broad,73,159,250,8,1,45.91,True,False,False
8,MJ Santner,c Woakes b Archer,23,39,59,1,2,58.97,True,False,False
9,TG Southee,c †Pope b Woakes,18,16,23,3,0,112.5,True,False,False
10,MJ Henry,not out,5,7,19,1,0,71.42,False,False,False


In [9]:
# Build each Innings once per tab (the constructor parses the HTML) and print
# its title alongside its total line.
for scorecard in scorecards:
    innings = Innings(scorecard)
    print("%s: %s" % (innings.title, innings.total))

New Zealand 1st Innings: 375 all out (129.1 Overs, RR: 2.9, 572 minutes)
England 1st Innings: 476 all out (162.5 Overs, RR: 2.92, 706 minutes)
New Zealand 2nd Innings: 241/2 (75 Overs, RR: 3.21, 318 minutes)


## Fall of wickets

If you look at the scorecard page then you can see that the fall of wickets is found on its own line. This is something that I would like to extract and use to calculate the partnerships; there is a summary tab that we can obtain this through, but getting it from two sources could provide us with a good method of checking our working.

In [10]:
def split_fow(row):
    '''Split a simple fall of wicket string into the 3 main parts.

    Parameters
    ----------
    row : str such as '3-155 Ross Taylor' (wicket number, a hyphen, the team
        score, a space, then the dismissed batsman's name).

    Returns
    -------
    list of three strings: [wickets, runs, batsman].

    Raises
    ------
    ValueError if ``row`` does not match the expected shape.
    '''
    # Raw string: the pattern needs no escaping of '-' and avoids Python's
    # invalid-escape-sequence warning.
    fow_match = re.search(r'^([0-9]+)-([0-9]+) (.*)$', row)
    if fow_match is None:
        raise ValueError('Unrecognised fall of wicket entry: %r' % (row,))
    wickets, runs, batsman = fow_match.groups()
    return [wickets, runs, batsman]

Loop through the fall of wicket row that is part of the scorecard and clean it to make it available for putting into a `pd.DataFrame`. 

In [11]:
# Gather the 'wrap dnb' node from each batting row that has one.
fall_of_wickets = [
    node
    for node in (
        row.find('div', {"class": "wrap dnb"})
        for row in scorecards[0]
        .find('div', {"class": "scorecard-section batsmen"})
        .find_all('div', {"class": "flex-row"})
    )
    if node is not None
]
# Drop the label, then break the line into one chunk per wicket.
clean_fow = (
    fall_of_wickets[0]
    .text
    .replace('Fall of wickets: ', '')
    .split('), ')
)
# Remove the brackets and separate 'score name' from the over it fell in.
cleaner_fow = [
    entry.replace('(', '').replace(')', '').split(', ')
    for entry in clean_fow
]
cleaner_fow

[['1-16 Jeet Raval', '6.6 ov'],
 ['2-39 Kane Williamson', '13.6 ov'],
 ['3-155 Ross Taylor', '47.4 ov'],
 ['4-182 Tom Latham', '55.6 ov'],
 ['5-191 Henry Nicholls', '63.1 ov'],
 ['6-315 BJ Watling', '116.5 ov'],
 ['7-330 Daryl Mitchell', '120.4 ov'],
 ['8-357 Tim Southee', '125.1 ov'],
 ['9-375 Mitchell Santner', '128.6 ov'],
 ['10-375 Neil Wagner', '129.1 ov']]

In [12]:
# One row per wicket: wicket number, team score and dismissed batsman (as text).
fow_df = pd.DataFrame([split_fow(x[0]) for x in cleaner_fow], columns = ['wicket', 'runs', 'out_batsman'])
fow_df['overs'] = [x[1].replace(' ov', '') for x in cleaner_fow]
# Partnership = runs added since the previous wicket. The first partnership is
# the score at the first wicket; fill it from the integer series so the column
# stays numeric instead of mixing a string with floats.
fow_runs = fow_df['runs'].astype(int)
fow_df['partnership'] = fow_runs.diff().fillna(fow_runs).astype(int)
fow_df

Unnamed: 0,wicket,runs,out_batsman,overs,partnership
0,1,16,Jeet Raval,6.6,16
1,2,39,Kane Williamson,13.6,23
2,3,155,Ross Taylor,47.4,116
3,4,182,Tom Latham,55.6,27
4,5,191,Henry Nicholls,63.1,9
5,6,315,BJ Watling,116.5,124
6,7,330,Daryl Mitchell,120.4,15
7,8,357,Tim Southee,125.1,27
8,9,375,Mitchell Santner,128.6,18
9,10,375,Neil Wagner,129.1,0


We can then wrap that logic into a method that we can place inside our `Innings` class. 

In [13]:
# Same fall-of-wickets table, now produced by the Innings method.
Innings(scorecards[0]).fall_of_wickets()

Unnamed: 0,wicket,runs,out_batsman,overs,partnership
0,1,16,Jeet Raval,6.6,16
1,2,39,Kane Williamson,13.6,23
2,3,155,Ross Taylor,47.4,116
3,4,182,Tom Latham,55.6,27
4,5,191,Henry Nicholls,63.1,9
5,6,315,BJ Watling,116.5,124
6,7,330,Daryl Mitchell,120.4,15
7,8,357,Tim Southee,125.1,27
8,9,375,Mitchell Santner,128.6,18
9,10,375,Neil Wagner,129.1,0


## Bowling figures 
Now that we have done the batting section of the scorecard, we can work on the bowling segment, and then we can completely recreate the scorecard using the scraper and our `Innings` class.

In [14]:
# Locate the bowling table within the first scorecard tab.
bowling_section = scorecard_modules[0].find('div', {'class':'scorecard-section bowling'})
# Header texts come from the <th> cells of the table head.
bowling_headers = [
    header_cell.text
    for header_cell in bowling_section.find('thead').find_all('th')
]
# One list of cell texts per <tr> of the table body.
bowling_body = pd.DataFrame([
    [cell.text for cell in table_row.find_all('td')]
    for table_row in bowling_section.find('tbody').find_all('tr')
])
bowling_body.columns = bowling_headers
# Discard the column(s) whose header is blank.
bowling_body = bowling_body.drop(columns=[''])
bowling_body

Unnamed: 0,Bowling,O,M,R,W,Econ,WD,NB
0,KAJ Roach,8.0,1,33,0,4.12,0,0
1,JO Holder,17.0,10,22,2,1.29,0,0
2,RRS Cornwall,25.3,5,75,7,2.94,0,0
3,JA Warrican,13.0,1,35,1,2.69,0,0
4,RL Chase,5.0,0,10,0,2.0,0,0


This does everything that we need to be able to print the scorecard out using the class. 

In [15]:
# Bowling figures for the first scorecard tab, produced by the Innings method.
Innings(scorecards[0]).bowling()

Unnamed: 0,Bowling,O,M,R,W,Econ,WD,NB
0,SCJ Broad,28.0,7,73,4,2.6,0,0
1,JC Archer,28.0,8,75,1,2.67,0,0
2,CR Woakes,31.0,6,83,3,2.67,0,0
3,SM Curran,23.1,7,63,2,2.71,0,0
4,JE Root,3.0,0,14,0,4.66,0,0
5,BA Stokes,13.0,5,36,0,2.76,0,0
6,JL Denly,3.0,0,13,0,4.33,0,0


## Print the whole scorecard 

In [21]:
# Recreate the full scorecard: for every innings tab print the title, the
# batting table, the total line and the bowling table, followed by a blank
# line to separate consecutive innings.
for innings in scorecards:
    # Parse the tab once and reuse the object for all four pieces of output.
    innings_object = Innings(innings)
    print(innings_object.title)
    print(innings_object.batting())
    print(innings_object.total)
    print(innings_object.bowling())
    print()

New Zealand 1st Innings
           batsman            how_out runs balls_faced minutes fours sixes  \
1         JA Raval     c Root b Broad    5          21      28     0     0   
2       TWM Latham            b Broad  105         172     249    16     0   
3   KS Williamson     c Root b Woakes    4          20      30     0     0   
4      LRPL Taylor    c Root b Woakes   53         100     155     8     0   
5      HM Nicholls   c Broad b Curran   16          48      64     3     0   
6      BJ Watling     c Burns b Broad   55         192     263     7     0   
7      DJ Mitchell   c Archer b Broad   73         159     250     8     1   
8       MJ Santner  c Woakes b Archer   23          39      59     1     2   
9       TG Southee   c †Pope b Woakes   18          16      23     3     0   
10        MJ Henry            not out    5           7      19     1     0   
11        N Wagner  c Sibley b Curran    0           1       1     0     0   

   strike_rate  is_out  is_keeper  is_c