In [1]:
import re

from bs4 import BeautifulSoup
import requests as r
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

## Match results

Fetching the results of all 45 matches of the ICC Men's T20 World Cup using BeautifulSoup.

In [2]:
# Download the tournament results page and parse it into a BeautifulSoup tree.
url = 'https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id=14450;type=tournament'
response = r.get(url)
page = response.text
soup = BeautifulSoup(page, 'html.parser')

In [3]:
# Every <tr class="data1"> on the results page is treated as one match.
match_results = []
matches = soup.find_all('tr', class_="data1")

for match in matches:
    match_data = []
    for cell in match.find_all('td'):
        cell_text = cell.string
        if cell_text is None:
            # Cells without a single text node are recorded with a placeholder
            match_data.append('null')
        else:
            match_data.append(cell_text.strip())
            # Right after the 7th value is stored, also keep the href of that cell's link
            if len(match_data) == 7:
                match_data.append(cell.find('a', class_='data-link')['href'])
    match_results.append(match_data)
print(f"Number of matches: {len(match_results)}")

Number of matches: 45


In [5]:
# Turn the scraped rows into a DataFrame; the 8th column is the scorecard href captured while parsing
result_columns = ['Team 1', 'Team 2', 'Winner', 'Margin', 'Ground', 'Match Date', 'Scorecard', 'Match link']
match_results_df = pd.DataFrame(match_results, columns=result_columns)
match_results_df

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard,Match link
0,Namibia,Sri Lanka,Namibia,55 runs,Geelong,"Oct 16, 2022",T20I # 1823,/ci/engine/match/1298135.html
1,Netherlands,U.A.E.,Netherlands,3 wickets,Geelong,"Oct 16, 2022",T20I # 1825,/ci/engine/match/1298136.html
2,Scotland,West Indies,Scotland,42 runs,Hobart,"Oct 17, 2022",T20I # 1826,/ci/engine/match/1298137.html
3,Ireland,Zimbabwe,Zimbabwe,31 runs,Hobart,"Oct 17, 2022",T20I # 1828,/ci/engine/match/1298138.html
4,Namibia,Netherlands,Netherlands,5 wickets,Geelong,"Oct 18, 2022",T20I # 1830,/ci/engine/match/1298139.html
5,Sri Lanka,U.A.E.,Sri Lanka,79 runs,Geelong,"Oct 18, 2022",T20I # 1832,/ci/engine/match/1298140.html
6,Ireland,Scotland,Ireland,6 wickets,Hobart,"Oct 19, 2022",T20I # 1833,/ci/engine/match/1298141.html
7,West Indies,Zimbabwe,West Indies,31 runs,Hobart,"Oct 19, 2022",T20I # 1834,/ci/engine/match/1298142.html
8,Netherlands,Sri Lanka,Sri Lanka,16 runs,Geelong,"Oct 20, 2022",T20I # 1835,/ci/engine/match/1298143.html
9,Namibia,U.A.E.,U.A.E.,7 runs,Geelong,"Oct 20, 2022",T20I # 1836,/ci/engine/match/1298144.html


In [24]:
#Dropping the matches that were abandoned and getting their match ids
#A boolean mask is used instead of dropping rows in place while iterating over the
#same frame with iterrows(), which mutates the object being iterated.
is_abandoned = match_results_df['Winner'] == 'abandoned'
abandoned_matches = match_results_df.loc[is_abandoned, 'Scorecard'].tolist() #'Scorecard' values of the removed matches
match_results_df = match_results_df.loc[~is_abandoned] #Original index labels are kept, as with drop()
match_results_df

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard,Match link
0,Namibia,Sri Lanka,Namibia,55 runs,Geelong,"Oct 16, 2022",T20I # 1823,/ci/engine/match/1298135.html
1,Netherlands,U.A.E.,Netherlands,3 wickets,Geelong,"Oct 16, 2022",T20I # 1825,/ci/engine/match/1298136.html
2,Scotland,West Indies,Scotland,42 runs,Hobart,"Oct 17, 2022",T20I # 1826,/ci/engine/match/1298137.html
3,Ireland,Zimbabwe,Zimbabwe,31 runs,Hobart,"Oct 17, 2022",T20I # 1828,/ci/engine/match/1298138.html
4,Namibia,Netherlands,Netherlands,5 wickets,Geelong,"Oct 18, 2022",T20I # 1830,/ci/engine/match/1298139.html
5,Sri Lanka,U.A.E.,Sri Lanka,79 runs,Geelong,"Oct 18, 2022",T20I # 1832,/ci/engine/match/1298140.html
6,Ireland,Scotland,Ireland,6 wickets,Hobart,"Oct 19, 2022",T20I # 1833,/ci/engine/match/1298141.html
7,West Indies,Zimbabwe,West Indies,31 runs,Hobart,"Oct 19, 2022",T20I # 1834,/ci/engine/match/1298142.html
8,Netherlands,Sri Lanka,Sri Lanka,16 runs,Geelong,"Oct 20, 2022",T20I # 1835,/ci/engine/match/1298143.html
9,Namibia,U.A.E.,U.A.E.,7 runs,Geelong,"Oct 20, 2022",T20I # 1836,/ci/engine/match/1298144.html


## Building lists and functions to be used for scraping batting and bowling data
Getting batting and bowling details of individual matches using Selenium

#### Building the match-name and match-id lists that will later be used in the final dataframes

In [25]:
# Two label entries are produced per match ("A Vs B" and "B Vs A"),
# each paired with the same 'Scorecard' value in ids_list.
matches_list, ids_list = [], []

fixtures = zip(match_results_df['Team 1'], match_results_df['Team 2'], match_results_df['Scorecard'])
for team_one, team_two, scorecard in fixtures:
    matches_list.append(' Vs '.join((team_one, team_two)))
    matches_list.append(' Vs '.join((team_two, team_one)))
    ids_list += [scorecard, scorecard]
print(f"Matches list: {matches_list[:5]}")
print(f"Ids list: {ids_list[:5]}")

Matches list: ['Namibia Vs Sri Lanka', 'Sri Lanka Vs Namibia', 'Netherlands Vs U.A.E.', 'U.A.E. Vs Netherlands', 'Scotland Vs West Indies']
Ids list: ['T20I # 1823', 'T20I # 1823', 'T20I # 1825', 'T20I # 1825', 'T20I # 1826']


In [26]:
# Sanity check: both lists receive two entries per remaining match, so their lengths should be equal.
print(f"Length of matches list: {len(matches_list)}")
print(f"Length of ids list: {len(ids_list)}")

Length of matches list: 84
Length of ids list: 84


#### Function to fetch individual player stats from each match

In [49]:
def get_batting_details(driver,team_xpaths):
    """Read the batting table of each team from the currently loaded scorecard page.

    Parameters
    ----------
    driver : Selenium WebDriver with the match page already loaded.
    team_xpaths : sequence of XPath strings, one per batting table.

    Returns
    -------
    dict
        Maps the position of each XPath in ``team_xpaths`` (0, 1, ...) to a list of
        rows, each row being the list of cell texts with the cell at index 4 left out
        (the 'M' column according to the original author). Reading of a table stops
        at the first row that contains a cell whose text is exactly "Extras".
    """
    details_dict = {}
    for team_index, team_xpath in enumerate(team_xpaths):
        row_elements = driver.find_elements(By.XPATH, str(team_xpath + '/tbody/tr'))
        team_rows = []

        for row_element in row_elements:
            stats = []
            reached_extras = False

            for column, cell in enumerate(row_element.find_elements(By.TAG_NAME, "td")):
                text = cell.text
                # 'Extras' marks the end of the batters; nothing from this row is kept
                if text == "Extras":
                    reached_extras = True
                    break
                # Column 4 is not required, every other cell is stored
                if column != 4:
                    stats.append(text)

            if reached_extras:
                break
            team_rows.append(stats)

        details_dict[team_index] = team_rows

    return details_dict

In [50]:
def get_bowling_details(driver,team_xpaths):
    """Read the bowling table of each team from the currently loaded scorecard page.

    Parameters
    ----------
    driver : Selenium WebDriver with the match page already loaded.
    team_xpaths : sequence of XPath strings, one per bowling table.

    Returns
    -------
    dict
        Maps the position of each XPath in ``team_xpaths`` (0, 1, ...) to a list of
        rows, each row being the list of all cell texts of one table row. Reading of
        a table stops at the first row that contains a cell whose text is exactly
        "Extras".
    """
    details_dict={}
    for index,team_xpath in enumerate(team_xpaths):
        xpath = team_xpath + '/tbody/tr'
        table_rows = driver.find_elements(By.XPATH,str(xpath))
        player_list=[]

        for row in table_rows:
            individual_stats=[]
            found_extras=False

            for d in row.find_elements(By.TAG_NAME,"td"):
                cell_text = d.text
                #Same 'Extras' stop condition as the batting reader; the partial row is discarded
                if cell_text == "Extras":
                    found_extras=True
                    break

                individual_stats.append(cell_text) #Every column of a bowling row is kept

            #Stop reading this table once an 'Extras' row has been reached
            if found_extras:
                break

            player_list.append(individual_stats) #Appending individual player stats to a final list

        details_dict[index]=player_list #index=0 for the first xpath, index=1 for the second

    return details_dict

#### Main Function to scrape the batting data from each url and convert to dictionary

In [54]:
def scrape_batting_data(url,team_xpaths):
    """Scrape the batting rows of both teams from one match scorecard page.

    A headless Chrome driver is started for the given url and is always shut down
    again before returning, including when scraping raises.

    Parameters
    ----------
    url : address of the match scorecard page.
    team_xpaths : XPaths of the batting tables, passed on to get_batting_details.

    Returns
    -------
    dict
        Maps each team name found on the page to a list of rows of the form
        ``[batting_position, *cell_texts]``. Rows equal to ``['']`` are skipped and
        do not consume a batting position.
    """
    #Loading the match details web page
    path = "C:/Program Files (x86)/chromedriver.exe"
    options = Options()
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument("--no-sandbox")
    options.add_argument("--window-size=1280,720")
    options.headless=True
    driver = webdriver.Chrome(options=options,executable_path=path)

    try:
        driver.get(url)

        #Extracting team names of both the teams
        team_names_cont = driver.find_elements(By.XPATH, "//span[@class='ds-text-title-xs ds-font-bold ds-capitalize']")
        team_names = [name_cont.text for name_cont in team_names_cont]

        #Scraping individual player batting stats for both the teams
        scrape_dict = get_batting_details(driver,team_xpaths)
    finally:
        #Without this every call leaves a Chrome process running
        driver.quit()

    #Framing the final dictionary with team name and individual player stats
    batting_dict={}
    for i,team in enumerate(team_names):
        batting_position=1
        player_list=[]
        for player in scrape_dict[i]:
            #Removing null entries
            if player==['']:
                continue

            player_list.append([batting_position,*player])
            batting_position+=1

        batting_dict[team]=player_list
    return batting_dict

#### Main Function to scrape the bowling data from each url and convert to dictionary

In [55]:
def scrape_bowling_data(url,team_xpaths):
    """Scrape the bowling rows of both teams from one match scorecard page.

    A headless Chrome driver is started for the given url and is always shut down
    again before returning, including when scraping raises.

    Parameters
    ----------
    url : address of the match scorecard page.
    team_xpaths : XPaths of the bowling tables, passed on to get_bowling_details.

    Returns
    -------
    dict
        Maps team names to lists of bowling rows (lists of cell texts). The team
        names found on the page are assigned in reverse order: the rows of the first
        table go to the last team name, and so on. Rows equal to ``['']`` are skipped.
    """
    #Loading the match details web page
    path = "C:/Program Files (x86)/chromedriver.exe"
    options = Options()
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument("--no-sandbox")
    options.add_argument("--window-size=1280,720")
    options.headless=True
    driver = webdriver.Chrome(options=options,executable_path=path)

    try:
        driver.get(url)

        #Extracting team names of both the teams
        team_names_cont = driver.find_elements(By.XPATH, "//span[@class='ds-text-title-xs ds-font-bold ds-capitalize']")
        team_names = [name_cont.text for name_cont in team_names_cont]

        #Scraping individual player bowling stats for both the teams
        scrape_dict = get_bowling_details(driver,team_xpaths)
    finally:
        #Without this every call leaves a Chrome process running
        driver.quit()

    #Framing the final dictionary with team name and individual player stats
    bowling_dict={}
    #Table i is paired with the team names read back to front
    for i,bowling_team in enumerate(reversed(team_names)):
        #Removing null entries
        bowling_dict[bowling_team]=[player for player in scrape_dict[i] if player!=['']]
    return bowling_dict

### Building the batting summary dataframe

In [56]:
#Adding each match data to batting dataframe
batting_columns=['Match','Match_id','Team Innings','Batting Position','Batsman Name','Dismissal','Runs','Balls','Fours','Sixes','SR']

#The table xpaths are the same for every match, so they are built once outside the loop
batting_xpath1='//*[@id="main-container"]/div[5]/div[1]/div/div[3]/div[1]/div[2]/div[1]/div[2]/div/div[2]/table[1]' #xpath for batting data table of first team
batting_xpath2='//*[@id="main-container"]/div[5]/div[1]/div/div[3]/div[1]/div[2]/div[1]/div[3]/div/div[2]/table[1]' #xpath for batting data table of second team
batting_xpaths=[batting_xpath1,batting_xpath2] #Combining the two xpaths in a list

batting_rows=[] #Rows are collected here and converted to a dataframe once, instead of growing the frame row by row

for match_number,link in enumerate(match_results_df['Match link']):
    url='https://stats.espncricinfo.com'+ link #Builds the url for each match to be passed to scrape_batting_data
    batting_dict = scrape_batting_data(url,batting_xpaths) #key-> team name ; value-> list of each player stats

    for innings_number,(key,value) in enumerate(batting_dict.items()):
        #matches_list and ids_list hold exactly two entries per match, so the label position is
        #derived from the match number. A running counter would shift the labels of every later
        #match whenever a scorecard yields fewer than two innings.
        label_index = 2*match_number + innings_number
        for details in value: # To access stats of each player
            position,batsman,dismissal,runs,balls,fours,sixes,strike_rate = details[:8]
            batting_rows.append([matches_list[label_index],ids_list[label_index],key,position,batsman,dismissal,runs,balls,fours,sixes,strike_rate])

batting_df=pd.DataFrame(batting_rows,columns=batting_columns)

print(batting_df.shape)

(699, 11)


In [57]:
# Display the assembled batting dataframe
batting_df

Unnamed: 0,Match,Match_id,Team Innings,Batting Position,Batsman Name,Dismissal,Runs,Balls,Fours,Sixes,SR
0,Namibia Vs Sri Lanka,T20I # 1823,Namibia,1,Michael van Lingen,c Pramod Madushan b Chameera,3,6,0,0,50.00
1,Namibia Vs Sri Lanka,T20I # 1823,Namibia,2,Divan la Cock,c Shanaka b Pramod Madushan,9,9,1,0,100.00
2,Namibia Vs Sri Lanka,T20I # 1823,Namibia,3,Jan Nicol Loftie-Eaton,c †Mendis b Karunaratne,20,12,1,2,166.66
3,Namibia Vs Sri Lanka,T20I # 1823,Namibia,4,Stephan Baard,c DM de Silva b Pramod Madushan,26,24,2,0,108.33
4,Namibia Vs Sri Lanka,T20I # 1823,Namibia,5,Gerhard Erasmus (c),c Gunathilaka b PWH de Silva,20,24,0,0,83.33
...,...,...,...,...,...,...,...,...,...,...,...
694,Pakistan Vs England,T20I # 1879,England,3,Phil Salt,c Iftikhar Ahmed b Haris Rauf,10,9,2,0,111.11
695,Pakistan Vs England,T20I # 1879,England,4,Ben Stokes,not out,52,49,5,1,106.12
696,Pakistan Vs England,T20I # 1879,England,5,Harry Brook,c Shaheen Shah Afridi b Shadab Khan,20,23,1,0,86.95
697,Pakistan Vs England,T20I # 1879,England,6,Moeen Ali,b Mohammad Wasim,19,13,3,0,146.15


In [58]:
# Number of batting rows recorded per match id
batting_df['Match_id'].value_counts()

T20I # 1843    21
T20I # 1832    21
T20I # 1861    21
T20I # 1850    20
T20I # 1834    20
T20I # 1828    19
T20I # 1825    19
T20I # 1835    19
T20I # 1862    19
T20I # 1864    19
T20I # 1849    19
T20I # 1823    19
T20I # 1851    18
T20I # 1855    18
T20I # 1846    18
T20I # 1847    18
T20I # 1840    18
T20I # 1859    18
T20I # 1853    18
T20I # 1873    18
T20I # 1826    18
T20I # 1842    18
T20I # 1852    17
T20I # 1867    17
T20I # 1879    17
T20I # 1872    17
T20I # 1860    16
T20I # 1858    16
T20I # 1839    16
T20I # 1871    16
T20I # 1848    15
T20I # 1856    15
T20I # 1838    15
T20I # 1830    15
T20I # 1836    15
T20I # 1845    13
T20I # 1841    13
T20I # 1833    12
T20I # 1877    11
T20I # 1837    10
T20I # 1878     9
T20I # 1844     8
Name: Match_id, dtype: int64

### Building the Bowling Summary dataframe

In [59]:
#Adding each match data to bowling dataframe
bowling_columns=['Match','Match_id','Bowling Team','Bowler Name','Overs','Maiden','Runs','Wickets','Economy','Zeros','Fours','Sixes','Wides','No Balls' ]

#The table xpaths are the same for every match, so they are built once outside the loop
bowling_xpath1='//*[@id="main-container"]/div[5]/div[1]/div/div[3]/div[1]/div[2]/div[1]/div[2]/div/div[2]/table[2]' #xpath for bowling data table of first team
bowling_xpath2='//*[@id="main-container"]/div[5]/div[1]/div/div[3]/div[1]/div[2]/div[1]/div[3]/div/div[2]/table[2]' #xpath for bowling data table of second team
bowling_xpaths=[bowling_xpath1,bowling_xpath2] #Combining the two xpaths in a list

bowling_rows=[] #Rows are collected here and converted to a dataframe once, instead of growing the frame row by row

for match_number,link in enumerate(match_results_df['Match link']):
    url='https://stats.espncricinfo.com'+link #Builds the url for each match to be passed to scrape_bowling_data
    bowling_dict = scrape_bowling_data(url,bowling_xpaths) #key-> team name ; value-> list of each player stats

    for innings_number,(key,value) in enumerate(bowling_dict.items()):
        #matches_list and ids_list hold exactly two entries per match, so the label position is
        #derived from the match number. A running counter would shift the labels of every later
        #match whenever a scorecard yields fewer than two innings.
        label_index = 2*match_number + innings_number
        for details in value: # To access stats of each player
            bowler,overs,maidens,runs,wickets,economy,zeros,fours,sixes,wides,no_balls = details[:11]
            bowling_rows.append([matches_list[label_index],ids_list[label_index],key,bowler,overs,maidens,runs,wickets,economy,zeros,fours,sixes,wides,no_balls])

bowling_df=pd.DataFrame(bowling_rows,columns=bowling_columns)

bowling_df

Unnamed: 0,Match,Match_id,Bowling Team,Bowler Name,Overs,Maiden,Runs,Wickets,Economy,Zeros,Fours,Sixes,Wides,No Balls
0,Namibia Vs Sri Lanka,T20I # 1823,Sri Lanka,Maheesh Theekshana,4,0,23,1,5.75,7,0,0,2,0
1,Namibia Vs Sri Lanka,T20I # 1823,Sri Lanka,Dushmantha Chameera,4,0,39,1,9.75,6,3,1,2,0
2,Namibia Vs Sri Lanka,T20I # 1823,Sri Lanka,Pramod Madushan,4,0,37,2,9.25,6,3,1,0,0
3,Namibia Vs Sri Lanka,T20I # 1823,Sri Lanka,Chamika Karunaratne,4,0,36,1,9.00,7,3,1,1,0
4,Namibia Vs Sri Lanka,T20I # 1823,Sri Lanka,Wanindu Hasaranga de Silva,4,0,27,1,6.75,8,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,Pakistan Vs England,T20I # 1879,Pakistan,Naseem Shah,4,0,30,0,7.50,15,3,1,1,0
496,Pakistan Vs England,T20I # 1879,Pakistan,Haris Rauf,4,0,23,2,5.75,13,3,0,1,0
497,Pakistan Vs England,T20I # 1879,Pakistan,Shadab Khan,4,0,20,1,5.00,10,1,0,0,0
498,Pakistan Vs England,T20I # 1879,Pakistan,Mohammad Wasim,4,0,38,1,9.50,5,5,0,2,0


## Player Summary
Getting personal details of players of each team using BeautifulSoup

In [60]:
def get_player_list(url):
    """Scrape the player cards of one team squad page.

    Parameters
    ----------
    url : address of the squad page.

    Returns
    -------
    list
        One ``[name, playing_role, age, batting_style, bowling_style]`` list per
        ``<div class="ds-flex-1">`` element on the page, except the first such
        element, which is skipped. The last three values are taken with regular
        expressions from the text of the element's third direct child; a value is
        an empty string when its pattern does not match.
    """
    team_page = r.get(url).text
    soup = BeautifulSoup(team_page,'html.parser')
    players_info = soup.find_all('div',class_='ds-flex-1')
    players_list=[]

    #Skipping first entry which has irrelevant data
    for player in players_info[1:]:
        temp_list=[detail.text for detail in player.find_all(recursive=False)]

        details_list=temp_list[:2] #Name and playing role data

        #Extracting age, batting style and bowling style data from the text.
        #Raw strings are used so that \d and \s reach the regex engine without
        #triggering Python's invalid-escape-sequence warning.
        temp_string = temp_list[2]
        age = re.findall(r": (\d+)",temp_string)
        bat = re.findall(r"Batting: ([A-Za-z\s]+)Bowling",temp_string)
        bowl = re.findall(r"Bowling: ([A-Za-z\s]+)",temp_string)
        details_list.extend([' '.join(age),' '.join(bat),' '.join(bowl)])

        players_list.append(details_list) #Appending the details of each player to the final list
    return players_list

In [61]:
# Collect the squad of each team: team name -> list of player detail lists.
player_info_dict = {}
base_url="https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2022-23-1298134/squads"
main_page = r.get(base_url).text
base_soup = BeautifulSoup(main_page,'html.parser')
team_links = base_soup.find_all('a',class_="ds-inline-flex ds-items-start ds-leading-none")

# Only the first 16 matching links are followed
for link in team_links[:16]:
    url = 'https://www.espncricinfo.com' + link['href']
    # The team name is the link text without its last word
    team_name = ' '.join(link.text.split()[:-1])
    player_info_dict[team_name] = get_player_list(url)
player_info_dict

{'Afghanistan': [['Mohammad Nabi(c)',
   'Allrounder',
   '37',
   'Right hand Bat',
   'Right arm Offbreak'],
  ['Azmatullah Omarzai',
   'Allrounder',
   '22',
   'Right hand Bat',
   'Right arm Medium fast'],
  ['Darwish Rasooli',
   'Top order Batter',
   '22',
   'Right hand Bat',
   'Right arm Offbreak'],
  ['Fareed Ahmad', 'Bowler', '28', 'Left hand Bat', 'Left arm Fast medium'],
  ['Fazalhaq Farooqi',
   'Bowler',
   '21',
   'Right hand Bat',
   'Left arm Fast medium'],
  ['Gulbadin Naib',
   'Batting Allrounder',
   '31',
   'Right hand Bat',
   'Right arm Medium fast'],
  ['Ibrahim Zadran',
   'Opening Batter',
   '20',
   'Right hand Bat',
   'Right arm Medium fast'],
  ['Mohammad Saleem', 'Bowler', '20', 'Right hand Bat', 'Right arm Fast'],
  ['Mujeeb Ur Rahman', 'Bowler', '21', 'Right hand Bat', 'Right arm Offbreak'],
  ['Najibullah Zadran',
   'Middle order Batter',
   '29',
   'Left hand Bat',
   'Right arm Offbreak'],
  ['Naveen-ul-Haq', 'Bowler', '22', 'Right hand Bat

Building the player info dataframe

In [62]:
# Flatten the team -> players mapping into one row per player.
# Each player list is [name, role, age, batting style, bowling style]; age and role are
# swapped so they line up with the column order below. The rows are collected first and
# the dataframe is created once, rather than growing it row by row with .loc.
player_info_rows = [
    [team_name, player_info[0], player_info[2], player_info[1], player_info[3], player_info[4]]
    for team_name, team_players in player_info_dict.items()
    for player_info in team_players
]
player_info_df = pd.DataFrame(player_info_rows, columns=['Team','Player Name','Age','Player Role','Batting Style','Bowling Style'])
player_info_df

Unnamed: 0,Team,Player Name,Age,Player Role,Batting Style,Bowling Style
0,Afghanistan,Mohammad Nabi(c),37,Allrounder,Right hand Bat,Right arm Offbreak
1,Afghanistan,Azmatullah Omarzai,22,Allrounder,Right hand Bat,Right arm Medium fast
2,Afghanistan,Darwish Rasooli,22,Top order Batter,Right hand Bat,Right arm Offbreak
3,Afghanistan,Fareed Ahmad,28,Bowler,Left hand Bat,Left arm Fast medium
4,Afghanistan,Fazalhaq Farooqi,21,Bowler,Right hand Bat,Left arm Fast medium
...,...,...,...,...,...,...
249,Zimbabwe,Blessing Muzarabani,25,Bowler,Right hand Bat,Right arm Fast medium
250,Zimbabwe,Richard Ngarava,24,Bowler,Left hand Bat,Left arm Fast medium
251,Zimbabwe,Milton Shumba,21,Top order Batter,Left hand Bat,Slow Left arm Orthodox
252,Zimbabwe,Sikandar Raza,36,Batting Allrounder,Right hand Bat,Right arm Offbreak


## Converting all the dataframes to csv files

In [63]:
# Write the match results to match_results.csv without the index column
match_results_df.to_csv('match_results.csv',index=False)

In [64]:
# Write the batting rows to batting_info.csv without the index column
batting_df.to_csv('batting_info.csv',index=False)

In [65]:
# Write the bowling rows to bowling_info.csv without the index column
bowling_df.to_csv('bowling_info.csv',index=False)

In [66]:
# Write the player details to player_info.csv without the index column
player_info_df.to_csv('player_info.csv',index=False)