# Outline for cricket scorecard scraper

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np

In [2]:
page = '1627'

source = requests.get(f'http://www.howstat.com/cricket/Statistics/Matches/MatchScorecard_ODI.asp?MatchCode={page}').text

soup = BeautifulSoup(source, 'lxml')

## Getting all of the data that repeats

In [3]:
repeatables = np.array([item.text.strip() for item in soup.find_all(class_="TextBlack8")])
date, location, result, rr_inn_1, rr_inn_2 = repeatables[[0,1,4,6,8]]
rr_inn_1 = float(rr_inn_1.split('@')[1].split('rpo')[0].strip())
rr_inn_2 = float(rr_inn_2.split('@')[1].split('rpo')[0].strip())

In [4]:
date, location, result, rr_inn_1, rr_inn_2

('12th January, 2000',
 'Melbourne Cricket Ground, Melbourne',
 'Australia won by 28 runs',
 5.38,
 4.82)

In [5]:
repeatables_2 = np.array([item.text.strip() for item in soup.find_all(class_="TextBlackBold8")])
team_1, team_1_rr_wicks, team_1_runs, team_2, team_2_rr_wicks, team_2_runs = repeatables_2[[7, 15, 16, 25, 33, 34]]

team_1_wicks_lost = int(team_1_rr_wicks.split('\r')[0].split('wickets')[0].strip())
team_1_rr = float(team_1_rr_wicks.split('@')[1].split('rpo')[0].strip())
team_1_runs = int(team_1_runs)

team_2 = team_2.split('\xa0')[0]
team_2_wicks_lost = int(team_2_rr_wicks.split('\r')[0].split('wickets')[0].strip())
team_2_rr = float(team_2_rr_wicks.split('@')[1].split('rpo')[0].strip())
team_2_runs = int(team_2_runs)

team_1, team_1_rr, team_1_wicks_lost, team_1_runs, team_2, team_2_rr, team_2_wicks_lost, team_2_runs

('Australia', 5.38, 7, 269, 'India', 4.82, 6, 241)

In [6]:
# Determining the winner of each match (0-loss, 1-win, 2-draw)
winner = result.split(" ")[0]

if winner == team_1:
    team_1_win = 1
    team_2_win = 0
elif winner == team_2:
    team_1_win = 0
    team_2_win = 1
else:
    team_1_win = 2
    team_2_win = 2

## Getting all the non-repeatable data

In [7]:
# Making a list of the players
players = np.array([item.text for item in soup.find_all(class_="LinkOff")])
players = players[3:-2]
players

array(['M E Waugh', 'A C Gilchrist', 'R T Ponting', 'M G Bevan',
       'S R Waugh', 'D R Martyn', 'A Symonds', 'S Lee', 'D W Fleming',
       'B Lee', 'G D McGrath', 'J Srinath', 'A B Agarkar', 'B K V Prasad',
       'S R Tendulkar', 'A Kumble', 'R R Singh', 'S C Ganguly',
       'V V S Laxman', 'S C Ganguly', 'S S Dighe', 'S R Tendulkar',
       'R Dravid', 'R R Singh', 'J J Martin', 'A B Agarkar', 'A Kumble',
       'J Srinath', 'B K V Prasad', 'G D McGrath', 'D W Fleming', 'B Lee',
       'S Lee', 'A Symonds', 'D R Martyn'], dtype='<U31')

In [8]:
players_team_1 = list(players[0:11])
players_team_1

['M E Waugh',
 'A C Gilchrist',
 'R T Ponting',
 'M G Bevan',
 'S R Waugh',
 'D R Martyn',
 'A Symonds',
 'S Lee',
 'D W Fleming',
 'B Lee',
 'G D McGrath']

In [9]:
players_team_2 = [x for x in players if x not in players_team_1]
players_team_2 = list(set(players_team_2))
players_team_2

['J Srinath',
 'A B Agarkar',
 'S S Dighe',
 'S R Tendulkar',
 'S C Ganguly',
 'B K V Prasad',
 'R Dravid',
 'V V S Laxman',
 'R R Singh',
 'A Kumble',
 'J J Martin']

In [10]:
# Grabbing all of the data from the html
tds = [item.text.strip() for item in soup.find('table').find_all('table')[4].find_all('table')[1].find_all('td')]
tds = np.array(list(map(lambda x: x.replace('\x86', '').replace('*',''),tds)))

In [11]:
# Grabing all of the player scorecard info
player_scorecards = []
used_index = []
for player in players:
    indices = np.where(tds==player)[0]
    if len(indices)==1:
        player_scorecards.append([tds[indices][0], tds[indices+1][0], 
                                  tds[indices+2][0], tds[indices+3][0], 
                                  tds[indices+4][0], tds[indices+5][0], 
                                  tds[indices+6][0]])
        used_index.append(indices[0])
    elif len(indices)==2 and indices[0] not in used_index:
        player_scorecards.append([tds[indices][0], tds[indices+1][0], 
                                  tds[indices+2][0], tds[indices+3][0], 
                                  tds[indices+4][0], tds[indices+5][0], 
                                  tds[indices+6][0]])
        used_index.append(indices[0])
    else:
        player_scorecards.append([tds[indices][1], tds[indices+1][1], 
                                  tds[indices+2][1], tds[indices+3][1], 
                                  tds[indices+4][1], tds[indices+5][1], 
                                  tds[indices+6][1]])

# Appending the date, location, and result for each player
for item in player_scorecards:
    item.append(date)
    item.append(location)
    item.append(result)

In [12]:
# Isolating the batters and bowlers from the scorecard
batters = []
bowlers = []

for item in player_scorecards:
    try:
        float(item[1])
        bowlers.append(item)
    except ValueError:
        batters.append(item)

# Adding the repeat data info for the batsmen
for batter in batters[0:11]:
    batter.append(1)
    batter.append(team_1)
    batter.append(team_1_rr)
    batter.append(team_1_wicks_lost)
    batter.append(team_1_runs)
    batter.append(team_1_win)

for batter in batters[11:]:
    batter.append(2)
    batter.append(team_2)
    batter.append(team_2_rr)
    batter.append(team_2_wicks_lost)
    batter.append(team_2_runs)
    batter.append(team_2_win)

In [13]:
# Removing wickets taken as % of team wickets
for x in bowlers:
    del x[6]

# Adding team name to bowler
for bowler in bowlers:
    if bowler[0] in players_team_1:
        bowler.append(2)
        bowler.append(team_1)
        bowler.append(team_2_rr)
        bowler.append(team_2_wicks_lost)
        bowler.append(team_2_runs)
        bowler.append(team_1_win)
    elif bowler[0] in players_team_2:
        bowler.append(1)
        bowler.append(team_2)
        bowler.append(team_1_rr)
        bowler.append(team_1_wicks_lost)
        bowler.append(team_1_runs)
        bowler.append(team_2_win)

In [14]:
batters_columns = ['Player', 'Dismissal', 'Runs', 'BallsFaced', 'Fours', 'Sixes', 
                   'StrikeRate', 'Date', 'Location', 'Result', 'Innings', 
                   'Team' , 'BatSideRR', 'BatSideWicksLost', 'BatSideScore', 'Win']
batters_df = pd.DataFrame(batters, columns=batters_columns)
batters_df

Unnamed: 0,Player,Dismissal,Runs,BallsFaced,Fours,Sixes,StrikeRate,Date,Location,Result,Innings,Team,BatSideRR,BatSideWicksLost,BatSideScore,Win
0,M E Waugh,c Laxman b Agarkar,7.0,15.0,1.0,0.0,46.67,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,1,Australia,5.38,7,269,1
1,A C Gilchrist,c Laxman b Srinath,3.0,2.0,0.0,0.0,150.0,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,1,Australia,5.38,7,269,1
2,R T Ponting,c Tendulkar b Srinath,115.0,121.0,9.0,1.0,95.04,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,1,Australia,5.38,7,269,1
3,M G Bevan,c Agarkar b Singh,41.0,54.0,3.0,0.0,75.93,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,1,Australia,5.38,7,269,1
4,S R Waugh,run out,23.0,31.0,3.0,0.0,74.19,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,1,Australia,5.38,7,269,1
5,D R Martyn,c Tendulkar b Kumble,30.0,45.0,0.0,0.0,66.67,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,1,Australia,5.38,7,269,1
6,A Symonds,run out,3.0,4.0,0.0,0.0,75.0,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,1,Australia,5.38,7,269,1
7,S Lee,not out,22.0,15.0,0.0,1.0,146.67,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,1,Australia,5.38,7,269,1
8,D W Fleming,not out,14.0,13.0,0.0,0.0,107.69,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,1,Australia,5.38,7,269,1
9,B Lee,,,,,,,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,1,Australia,5.38,7,269,1


In [15]:
bowlers_columns = ['Player', 'Overs', 'Maidens', 'Runs', 'Wickets', 'EconRate', 
                   'Date', 'Location', 'Result', 'Innings', 'Team', 
                   'BatSideRR', 'BatSideWicketsLost', 'BatSideScore', 'Win']
bowlers_df = pd.DataFrame(bowlers, columns=bowlers_columns)
bowlers_df

Unnamed: 0,Player,Overs,Maidens,Runs,Wickets,EconRate,Date,Location,Result,Innings,Team,BatSideRR,BatSideWicketsLost,BatSideScore,Win
0,J Srinath,10.0,0,52,2,5.2,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,1,India,5.38,7,269,0
1,A B Agarkar,9.0,0,47,1,5.22,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,1,India,5.38,7,269,0
2,B K V Prasad,10.0,0,52,0,5.2,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,1,India,5.38,7,269,0
3,S R Tendulkar,3.0,0,23,0,7.67,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,1,India,5.38,7,269,0
4,A Kumble,10.0,0,57,1,5.7,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,1,India,5.38,7,269,0
5,R R Singh,3.0,0,19,1,6.33,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,1,India,5.38,7,269,0
6,S C Ganguly,5.0,0,16,0,3.2,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,1,India,5.38,7,269,0
7,G D McGrath,10.0,1,32,1,3.2,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,2,Australia,4.82,6,241,1
8,D W Fleming,10.0,1,39,1,3.9,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,2,Australia,4.82,6,241,1
9,B Lee,10.0,0,49,0,4.9,"12th January, 2000","Melbourne Cricket Ground, Melbourne",Australia won by 28 runs,2,Australia,4.82,6,241,1
