# Building a class for a player
I think that it would be very useful for us to wrap the functionality of the analysis into a class; one that we can just pass the link to the page and then we have access to all their details. 

In [1]:
# Required imports
import requests
import pandas as pd
from bs4 import BeautifulSoup

### Initial workings
This notebook contains the workings of getting to the class, and I will save a version of the class in a `.py` file ([here](../CricketPlayer.py)) as well so we can easily import that into other notebooks later.  

The final version of the class is kept in that final class file linked above. 

In [2]:
class CricketPlayer:
    def __init__(self, innings_by_innings_link):
        self.link = innings_by_innings_link
        self.soup = BeautifulSoup(requests.get(innings_by_innings_link).text, features="html.parser")
        
    def view_raw_html(self):
        return(self.soup)
    
    def raw_innings(self):
        for caption in self.soup.find_all('caption'):
            if caption.get_text() == 'Innings by innings list':
                main_table = caption.find_parent('table', {'class': 'engineTable'})
                
        columns = [header.get_text() for header in main_table.find('thead').find_all('tr')[0].find_all('th')]
        rows = []

        for innings in [row for row in main_table.find('tbody').find_all('tr')]:
            rows.append([stat.get_text() for stat in innings.find_all('td')])
            
        return(pd.DataFrame(rows, columns=columns))
    
    def innings(self):
        raw_innings = self.raw_innings()
        raw_innings['Opposition'] = raw_innings['Opposition'].str.replace('v ', '')
        raw_innings.replace('-', np.nan, inplace=True)
        raw_innings.columns = raw_innings.columns.str.lower().str.replace(' ', '_')
        raw_innings['is_out'] = raw_innings.score.astype('str').apply(lambda x: False if x in ['nan', 'DNB'] else False if '*' in x else True)
        raw_innings['did_bowl'] = raw_innings.overs.astype('str').apply(lambda x: False if x in ['nan', 'DNB'] else True)
        raw_innings['did_bat'] = raw_innings.score.str.replace('*', '').astype('str').apply(lambda x: True if x.isnumeric() else False)
        raw_innings['score'] = raw_innings['score'].str.replace('*', '')
        return(raw_innings[['inns', 'score', 'did_bat', 'is_out', 'overs', 'conc', 'wkts', 'did_bowl', 'ct', 'st', 'opposition', 'ground', 'start_date']])
    
    def batting_summary(self):
        innings = self.innings()
        total_at_bats = innings.did_bat.sum()
        dismissals = innings.is_out.sum()
        total_runs = innings.score[innings.did_bat].dropna().astype('int').sum()
        return(pd.DataFrame({'Innings': total_at_bats, 
                             'Dismissals': dismissals, 
                             'Total Runs': total_runs, 
                             'Average': round(total_runs/dismissals, 4)}, index=['Overall']))

In [3]:
virat_kohli = 'http://stats.espncricinfo.com/ci/engine/player/253802.html?class=1;template=results;type=allround;view=innings'
steve_smith = 'http://stats.espncricinfo.com/ci/engine/player/267192.html?class=1;template=results;type=allround;view=innings'
kane_williamson = 'http://stats.espncricinfo.com/ci/engine/player/277906.html?class=1;template=results;type=allround;view=innings'
joe_root = 'http://stats.espncricinfo.com/ci/engine/player/303669.html?class=1;template=results;type=allround;view=innings'

In [4]:
root = CricketPlayer(joe_root)

In [5]:
innings = root.raw_innings()
innings

Unnamed: 0,Inns,Score,Overs,Conc,Wkts,Ct,St,Unnamed: 8,Opposition,Ground,Start Date,Unnamed: 12
0,1,73,-,-,-,-,-,,v India,Nagpur,13 Dec 2012,Test # 2066
1,2,-,1.0,5,0,0,0,,v India,Nagpur,13 Dec 2012,Test # 2066
2,3,20*,-,-,-,-,-,,v India,Nagpur,13 Dec 2012,Test # 2066
3,1,4,-,-,-,-,-,,v New Zealand,Dunedin,6 Mar 2013,Test # 2077
4,2,-,5.0,8,0,0,0,,v New Zealand,Dunedin,6 Mar 2013,Test # 2077
5,3,0,-,-,-,-,-,,v New Zealand,Dunedin,6 Mar 2013,Test # 2077
6,1,10,-,-,-,-,-,,v New Zealand,Wellington,14 Mar 2013,Test # 2080
7,2,-,1.0,6,0,0,0,,v New Zealand,Wellington,14 Mar 2013,Test # 2080
8,3,-,2.0,12,0,0,0,,v New Zealand,Wellington,14 Mar 2013,Test # 2080
9,1,-,2.0,5,0,0,0,,v New Zealand,Auckland,22 Mar 2013,Test # 2084


In [6]:
# Remove 'v ' from the beginning of each opposition
innings['Opposition'] = innings['Opposition'].str.replace('v ', '')

In [7]:
# Remove the empty columns that are used for spacing and links on the website 
innings.drop('', axis=1, inplace = True)
innings.head()

Unnamed: 0,Inns,Score,Overs,Conc,Wkts,Ct,St,Opposition,Ground,Start Date
0,1,73,-,-,-,-,-,India,Nagpur,13 Dec 2012
1,2,-,1.0,5,0,0,0,India,Nagpur,13 Dec 2012
2,3,20*,-,-,-,-,-,India,Nagpur,13 Dec 2012
3,1,4,-,-,-,-,-,New Zealand,Dunedin,6 Mar 2013
4,2,-,5.0,8,0,0,0,New Zealand,Dunedin,6 Mar 2013


I think we want to replace the blank '-' with `np.nan` to neaten up the output. 

In [8]:
import numpy as np
innings.replace('-', np.nan, inplace=True)

In [9]:
innings.head()

Unnamed: 0,Inns,Score,Overs,Conc,Wkts,Ct,St,Opposition,Ground,Start Date
0,1,73,,,,,,India,Nagpur,13 Dec 2012
1,2,,1.0,5.0,0.0,0.0,0.0,India,Nagpur,13 Dec 2012
2,3,20*,,,,,,India,Nagpur,13 Dec 2012
3,1,4,,,,,,New Zealand,Dunedin,6 Mar 2013
4,2,,5.0,8.0,0.0,0.0,0.0,New Zealand,Dunedin,6 Mar 2013


Clean up the column names. 

In [10]:
innings.columns = innings.columns.str.lower().str.replace(' ', '_')

In [11]:
innings.head()

Unnamed: 0,inns,score,overs,conc,wkts,ct,st,opposition,ground,start_date
0,1,73,,,,,,India,Nagpur,13 Dec 2012
1,2,,1.0,5.0,0.0,0.0,0.0,India,Nagpur,13 Dec 2012
2,3,20*,,,,,,India,Nagpur,13 Dec 2012
3,1,4,,,,,,New Zealand,Dunedin,6 Mar 2013
4,2,,5.0,8.0,0.0,0.0,0.0,New Zealand,Dunedin,6 Mar 2013


It will be easier if we have a flag for working out whether the batsman got out or not. 

In [12]:
innings['is_out'] = innings.score.astype('str').apply(lambda x: np.nan if x == 'nan' else False if '*' in x else True)

In [13]:
innings.head()

Unnamed: 0,inns,score,overs,conc,wkts,ct,st,opposition,ground,start_date,is_out
0,1,73,,,,,,India,Nagpur,13 Dec 2012,True
1,2,,1.0,5.0,0.0,0.0,0.0,India,Nagpur,13 Dec 2012,
2,3,20*,,,,,,India,Nagpur,13 Dec 2012,False
3,1,4,,,,,,New Zealand,Dunedin,6 Mar 2013,True
4,2,,5.0,8.0,0.0,0.0,0.0,New Zealand,Dunedin,6 Mar 2013,


Add a flag to indicate whether the player bowled in that particular innings. 

In [14]:
innings['did_bowl'] = innings.overs.astype('str').apply(lambda x: False if x in ['nan', 'DNB'] else True)

In [15]:
innings[['inns', 'score', 'is_out', 'overs', 'conc', 
         'wkts', 'did_bowl', 'ct', 'st', 'opposition', 'ground', 'start_date']].head()

Unnamed: 0,inns,score,is_out,overs,conc,wkts,did_bowl,ct,st,opposition,ground,start_date
0,1,73,True,,,,False,,,India,Nagpur,13 Dec 2012
1,2,,,1.0,5.0,0.0,True,0.0,0.0,India,Nagpur,13 Dec 2012
2,3,20*,False,,,,False,,,India,Nagpur,13 Dec 2012
3,1,4,True,,,,False,,,New Zealand,Dunedin,6 Mar 2013
4,2,,,5.0,8.0,0.0,True,0.0,0.0,New Zealand,Dunedin,6 Mar 2013


Cleaned up `raw_innings()` so that we have another function that returns a clean `pd.DataFrame`. 

In [16]:
innings['did_bat'] = innings.score.str.replace('*', '').astype('str').apply(lambda x: True if x.isnumeric() else False)

We can check that this correctly filters out the 'DNB' values. 

In [17]:
innings[innings.score == 'DNB']

Unnamed: 0,inns,score,overs,conc,wkts,ct,st,opposition,ground,start_date,is_out,did_bowl,did_bat
89,4,DNB,,,,,,West Indies,St George's,21 Apr 2015,True,False,False
153,4,DNB,,,,,,Sri Lanka,Chester-le-Street,27 May 2016,True,False,False
226,4,DNB,,,,,,West Indies,Lord's,7 Sep 2017,True,False,False
327,3,DNB,,,,,,New Zealand,Mount Maunganui,21 Nov 2019,True,False,False


In [18]:
# Innings that he didn't bat 
root.innings()[np.invert(root.innings().did_bat)].shape

(168, 13)

In [19]:
# Innings that he did bat 
root.innings()[root.innings().did_bat].shape

(160, 13)

In [20]:
# Total shape of data
root.innings().shape

(328, 13)

In [21]:
not_outs = root.innings()[(root.innings().did_bat) & np.invert(root.innings().is_out)]

We can join back into the raw scraping from the website to check that our not out flag is working correctly. 

In [22]:
root.raw_innings().iloc[not_outs.index]

Unnamed: 0,Inns,Score,Overs,Conc,Wkts,Ct,St,Unnamed: 8,Opposition,Ground,Start Date,Unnamed: 12
2,3,20*,-,-,-,-,-,,v India,Nagpur,13 Dec 2012,Test # 2066
32,4,13*,-,-,-,-,-,,v Australia,Manchester,1 Aug 2013,Test # 2092
44,4,26*,-,-,-,-,-,,v Australia,Brisbane,21 Nov 2013,Test # 2103
57,1,200*,-,-,-,-,-,,v Sri Lanka,Lord's,12 Jun 2014,Test # 2124
66,2,154*,-,-,-,-,-,,v India,Nottingham,9 Jul 2014,Test # 2128
80,2,149*,-,-,-,-,-,,v India,The Oval,15 Aug 2014,Test # 2137
87,2,182*,-,-,-,-,-,,v West Indies,St George's,21 Apr 2015,Test # 2158
113,4,38*,-,-,-,-,-,,v Australia,Birmingham,29 Jul 2015,Test # 2173
123,4,33*,-,-,-,-,-,,v Pakistan,Abu Dhabi,13 Oct 2015,Test # 2180
142,4,4*,-,-,-,-,-,,v South Africa,Johannesburg,14 Jan 2016,Test # 2199


In [23]:
root.raw_innings().iloc[root.innings()[root.innings().did_bat].index]

Unnamed: 0,Inns,Score,Overs,Conc,Wkts,Ct,St,Unnamed: 8,Opposition,Ground,Start Date,Unnamed: 12
0,1,73,-,-,-,-,-,,v India,Nagpur,13 Dec 2012,Test # 2066
2,3,20*,-,-,-,-,-,,v India,Nagpur,13 Dec 2012,Test # 2066
3,1,4,-,-,-,-,-,,v New Zealand,Dunedin,6 Mar 2013,Test # 2077
5,3,0,-,-,-,-,-,,v New Zealand,Dunedin,6 Mar 2013,Test # 2077
6,1,10,-,-,-,-,-,,v New Zealand,Wellington,14 Mar 2013,Test # 2080
10,2,45,-,-,-,-,-,,v New Zealand,Auckland,22 Mar 2013,Test # 2084
12,4,29,-,-,-,-,-,,v New Zealand,Auckland,22 Mar 2013,Test # 2084
13,1,40,-,-,-,-,-,,v New Zealand,Lord's,16 May 2013,Test # 2088
15,3,71,-,-,-,-,-,,v New Zealand,Lord's,16 May 2013,Test # 2088
17,1,104,-,-,-,-,-,,v New Zealand,Leeds,24 May 2013,Test # 2089


We can use the clean data, with our defined flags to generate a summary data frame for the entire innings set. 

In [24]:
root.batting_summary()

Unnamed: 0,Innings,Dismissals,Total Runs,Average
Overall,160,148,7045,47.6014
