# Outline for cricket scorecard scraper

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np

In [2]:
url = 'http://www.howstat.com/cricket/Statistics/Matches/MatchScorecard_ODI.asp?MatchCode='
page = '1619'

source = requests.get(f'{url}{page}').text
soup = BeautifulSoup(source, 'lxml')

## Getting all of the data that repeats

In [3]:
repeatables = np.array([item.text.strip() for item in soup.find_all(class_="TextBlack8")])
date, location, result, rr_inn_1, rr_inn_2 = repeatables[[0,1,4,6,8]]

rr_inn_1 = float(rr_inn_1.split('@')[1].split('rpo')[0].strip())
rr_inn_2 = float(rr_inn_2.split('@')[1].split('rpo')[0].strip())

In [4]:
date, location, result, rr_inn_1, rr_inn_2

('2nd January, 2000',
 'Eden Park, Auckland',
 'New Zealand won by 3 wickets [Duckworth-Lewis]',
 5.36,
 5.54)

In [5]:
repeatables_2 = np.array([item.text.strip() for item in soup.find_all(class_="TextBlackBold8")])
team_1, team_1_rr_wicks, team_1_runs, team_2, team_2_rr_wicks, team_2_runs = repeatables_2[[7, 15, 16, 25, 33, 34]]

try: 
    team_1_wicks_lost = int(team_1_rr_wicks.split('\r')[0].split('wickets')[0].strip())
except ValueError:
    team_1_wicks_lost = 10
team_1_rr = float(team_1_rr_wicks.split('@')[1].split('rpo')[0].strip())
team_1_runs = int(team_1_runs)

team_2 = team_2.split('\xa0')[0]
try: 
    team_2_wicks_lost = int(team_2_rr_wicks.split('\r')[0].split('wickets')[0].strip())
except ValueError:
    team_2_wicks_lost = 10
team_2_rr = float(team_2_rr_wicks.split('@')[1].split('rpo')[0].strip())
team_2_runs = int(team_2_runs)

team_1, team_1_rr, team_1_wicks_lost, team_1_runs, team_2, team_2_rr, team_2_wicks_lost, team_2_runs

('West Indies', 5.36, 7, 268, 'New Zealand', 5.54, 7, 250)

In [6]:
# Determining the winner of each match (0-loss, 1-win, 2-draw)
if result.split(" ")[0] == "Sri":
        winner = "Sri Lanka"
elif result.split(" ")[0] == "New":
    winner = "New Zealand"
elif result.split(" ")[0] == "South":
    winner = "South Africa"
elif result.split(" ")[0] == "West":
    winner = "West Indies"
else:
    winner = result.split(" ")[0]

if winner == team_1:
    team_1_win = 1
    team_2_win = 0
elif winner == team_2:
    team_1_win = 0
    team_2_win = 1
else:
    team_1_win = 2
    team_2_win = 2

## Getting all the non-repeatable data

In [7]:
# Making a list of the players
players = np.array([item.text for item in soup.find_all(class_="LinkOff")])
players = players[3:-2]
players

array(['S L Campbell', 'R D Jacobs', 'B C Lara', 'S Chanderpaul',
       'R L Powell', 'F A Rose', 'N O Perry', 'J C Adams', 'M V Dillon',
       'R D King', 'C A Walsh', 'D J Nash', 'C L Cairns', "S B O'Connor",
       'C Z Harris', 'D L Vettori', 'N J Astle', 'C M Spearman',
       'N J Astle', 'S P Fleming', 'C D McMillan', 'C L Cairns',
       'C Z Harris', 'R G Twose', 'A C Parore', 'D J Nash', 'D L Vettori',
       "S B O'Connor", 'C A Walsh', 'F A Rose', 'R D King', 'M V Dillon',
       'N O Perry'], dtype='<U31')

In [8]:
players_team_1 = list(players[0:11])
players_team_1

['S L Campbell',
 'R D Jacobs',
 'B C Lara',
 'S Chanderpaul',
 'R L Powell',
 'F A Rose',
 'N O Perry',
 'J C Adams',
 'M V Dillon',
 'R D King',
 'C A Walsh']

In [9]:
players_team_2 = [x for x in players if x not in players_team_1]
players_team_2 = list(set(players_team_2))
players_team_2

['C M Spearman',
 'A C Parore',
 'N J Astle',
 'D J Nash',
 'R G Twose',
 'C D McMillan',
 'C Z Harris',
 "S B O'Connor",
 'D L Vettori',
 'C L Cairns',
 'S P Fleming']

In [10]:
# Grabbing all of the data from the html
tds = [item.text.strip() for item in soup.find('table').find_all('table')[4].find_all('table')[1].find_all('td')]
tds = np.array(list(map(lambda x: x.replace('\x86', '').replace('*',''),tds)))

In [11]:
# Grabing all of the player scorecard info
player_scorecards = []
used_index = []
for player in players:
    indices = np.where(tds==player)[0]
    if len(indices)==1:
        player_scorecards.append([tds[indices][0], tds[indices+1][0], 
                                  tds[indices+2][0], tds[indices+3][0], 
                                  tds[indices+4][0], tds[indices+5][0], 
                                  tds[indices+6][0]])
        used_index.append(indices[0])
    elif len(indices)==2 and indices[0] not in used_index:
        player_scorecards.append([tds[indices][0], tds[indices+1][0], 
                                  tds[indices+2][0], tds[indices+3][0], 
                                  tds[indices+4][0], tds[indices+5][0], 
                                  tds[indices+6][0]])
        used_index.append(indices[0])
    else:
        player_scorecards.append([tds[indices][1], tds[indices+1][1], 
                                  tds[indices+2][1], tds[indices+3][1], 
                                  tds[indices+4][1], tds[indices+5][1], 
                                  tds[indices+6][1]])

# Appending the date, location, and result for each player
for item in player_scorecards:
    item.append(date)
    item.append(location)
    item.append(result)

In [12]:
# Isolating the batters and bowlers from the scorecard
batters = []
bowlers = []

for item in player_scorecards:
    try:
        float(item[1])
        bowlers.append(item)
    except ValueError:
        batters.append(item)

# Adding the repeat data info for the batsmen
for batter in batters[0:11]:
    batter.append(1)
    batter.append(team_1)
    batter.append(team_1_rr)
    batter.append(team_1_wicks_lost)
    batter.append(team_1_runs)
    batter.append(team_1_win)

for batter in batters[11:]:
    batter.append(2)
    batter.append(team_2)
    batter.append(team_2_rr)
    batter.append(team_2_wicks_lost)
    batter.append(team_2_runs)
    batter.append(team_2_win)

In [13]:
# Removing wickets taken as % of team wickets
for x in bowlers:
    del x[6]

# Adding team name to bowler
for bowler in bowlers:
    if bowler[0] in players_team_1:
        bowler.append(2)
        bowler.append(team_1)
        bowler.append(team_2_rr)
        bowler.append(team_2_wicks_lost)
        bowler.append(team_2_runs)
        bowler.append(team_1_win)
    elif bowler[0] in players_team_2:
        bowler.append(1)
        bowler.append(team_2)
        bowler.append(team_1_rr)
        bowler.append(team_1_wicks_lost)
        bowler.append(team_1_runs)
        bowler.append(team_2_win)

In [14]:
batters_columns = ['Player', 'Dismissal', 'Runs', 'BallsFaced', 'Fours', 'Sixes', 
                   'StrikeRate', 'Date', 'Location', 'Result', 'Innings', 
                   'Team' , 'BatSideRR', 'BatSideWicksLost', 'BatSideScore', 'Win']
batters_df = pd.DataFrame(batters, columns=batters_columns)
batters_df

Unnamed: 0,Player,Dismissal,Runs,BallsFaced,Fours,Sixes,StrikeRate,Date,Location,Result,Innings,Team,BatSideRR,BatSideWicksLost,BatSideScore,Win
0,S L Campbell,st Parore b Vettori,51.0,67.0,6.0,0.0,76.12,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],1,West Indies,5.36,7,268,0
1,R D Jacobs,lbw b Harris,65.0,61.0,7.0,2.0,106.56,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],1,West Indies,5.36,7,268,0
2,B C Lara,c Nash b O'Connor,76.0,81.0,3.0,2.0,93.83,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],1,West Indies,5.36,7,268,0
3,S Chanderpaul,run out,11.0,40.0,0.0,0.0,27.5,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],1,West Indies,5.36,7,268,0
4,R L Powell,c Twose b Nash,35.0,30.0,3.0,2.0,116.67,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],1,West Indies,5.36,7,268,0
5,F A Rose,c McMillan b O'Connor,2.0,6.0,0.0,0.0,33.33,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],1,West Indies,5.36,7,268,0
6,N O Perry,not out,2.0,5.0,0.0,0.0,40.0,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],1,West Indies,5.36,7,268,0
7,J C Adams,c Fleming b Astle,0.0,1.0,0.0,0.0,0.0,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],1,West Indies,5.36,7,268,0
8,M V Dillon,not out,13.0,9.0,2.0,0.0,144.44,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],1,West Indies,5.36,7,268,0
9,R D King,,,,,,,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],1,West Indies,5.36,7,268,0


In [15]:
bowlers_columns = ['Player', 'Overs', 'Maidens', 'Runs', 'Wickets', 'EconRate', 
                   'Date', 'Location', 'Result', 'Innings', 'Team', 
                   'BatSideRR', 'BatSideWicketsLost', 'BatSideScore', 'Win']
bowlers_df = pd.DataFrame(bowlers, columns=bowlers_columns)
bowlers_df

Unnamed: 0,Player,Overs,Maidens,Runs,Wickets,EconRate,Date,Location,Result,Innings,Team,BatSideRR,BatSideWicketsLost,BatSideScore,Win
0,D J Nash,8.0,0,62,1,7.75,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],1,New Zealand,5.36,7,268,1
1,C L Cairns,5.0,0,29,0,5.8,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],1,New Zealand,5.36,7,268,1
2,S B O'Connor,8.0,0,62,2,7.75,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],1,New Zealand,5.36,7,268,1
3,C Z Harris,10.0,0,40,1,4.0,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],1,New Zealand,5.36,7,268,1
4,D L Vettori,10.0,1,28,1,2.8,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],1,New Zealand,5.36,7,268,1
5,N J Astle,9.0,0,37,1,4.11,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],1,New Zealand,5.36,7,268,1
6,C A Walsh,10.0,0,48,1,4.8,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],2,West Indies,5.54,7,250,0
7,F A Rose,9.0,0,71,3,7.89,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],2,West Indies,5.54,7,250,0
8,R D King,8.1,3,24,3,2.94,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],2,West Indies,5.54,7,250,0
9,M V Dillon,9.0,1,41,0,4.56,"2nd January, 2000","Eden Park, Auckland",New Zealand won by 3 wickets [Duckworth-Lewis],2,West Indies,5.54,7,250,0
