In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import csv
import os

# Final Function to Get Match Summary Data

In [5]:
# ESPNcricinfo "team match results" records page for the ICC Men's T20 World Cup 2022/23.
# Read as a module-level global by get_match_summery_data().
summery_card_url = 'https://www.espncricinfo.com/records/tournament/team-match-results/icc-men-s-t20-world-cup-2022-23-14450'

def get_match_summery_data(file_path):
    """Scrape the match-results table and save it to ``file_path`` as CSV.

    The page at the module-level ``summery_card_url`` is loaded in a Chrome
    WebDriver. The table inside the ``ds-overflow-x-auto ds-scrollbar-hide``
    div is parsed, and one extra ``ScoreCard Link`` column is added holding
    the absolute URL built from the link found in the 7th cell of each row.

    Parameters
    ----------
    file_path : str
        Destination CSV file; it is overwritten if it already exists.

    Raises
    ------
    RuntimeError
        If the results table is not present in the loaded page.

    Notes
    -----
    Errors raised while writing the file are printed, not re-raised.
    """
    link_column = 7  # 1-based position of the cell that carries the scorecard link

    # Fetch the rendered HTML; the browser is always closed, even on failure.
    driver = webdriver.Chrome()
    try:
        driver.get(summery_card_url)
        # Pause AFTER navigation so the page has time to render before the
        # HTML is read (same order as get_batting_and_bowling_data).
        time.sleep(2)
        page_data = driver.page_source
    finally:
        driver.quit()

    # Parse the results table with bs4
    soup = BeautifulSoup(page_data, 'html.parser')
    req_div = soup.find('div', class_='ds-overflow-x-auto ds-scrollbar-hide')
    if req_div is None or req_div.table is None:
        raise RuntimeError(f"Match results table not found at {summery_card_url}")

    # Header cells, plus the column name for the scorecard link
    table_header = [td.text.strip() for td in req_div.table.thead.tr.find_all('td')]
    table_header.append('ScoreCard Link')

    # Body rows
    table_data = []
    for row in req_div.table.tbody.find_all('tr'):
        column = []
        for position, col in enumerate(row.find_all('td'), start=1):
            column.append(col.text.strip())
            if position == link_column:
                # A cell without a link yields an empty value instead of a
                # crash, so the row keeps the same width as the header.
                link_tag = col.find('a', href=True)
                if link_tag is not None:
                    column.append('https://espncricinfo.com' + link_tag['href'])
                else:
                    column.append('')
        table_data.append(column)

    # Save header + rows to file_path
    try:
        # newline='' is required by the csv module (prevents blank lines on
        # Windows); utf-8 matches the default used by pandas.read_csv.
        with open(file_path, 'w', newline='', encoding='utf-8') as file:
            csvWriter = csv.writer(file)
            csvWriter.writerow(table_header)
            csvWriter.writerows(table_data)
    except Exception as e:
        # Best effort: report the problem without interrupting the notebook.
        print(f"{e}")
    

In [6]:
# Scrape the results table and write it to match_summery_data.csv (opens a Chrome window).
get_match_summery_data('match_summery_data.csv')

In [7]:
# Load the scraped summary back from disk and preview the first rows.
df = pd.read_csv('match_summery_data.csv')
df.head()

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard,ScoreCard Link
0,England,Pakistan,England,5 wickets,Melbourne,"Nov 13, 2022",T20I # 1879,https://espncricinfo.com/series/icc-men-s-t20-...
1,England,India,England,10 wickets,Adelaide,"Nov 10, 2022",T20I # 1878,https://espncricinfo.com/series/icc-men-s-t20-...
2,New Zealand,Pakistan,Pakistan,7 wickets,Sydney,"Nov 9, 2022",T20I # 1877,https://espncricinfo.com/series/icc-men-s-t20-...
3,India,Zimbabwe,India,71 runs,Melbourne,"Nov 6, 2022",T20I # 1873,https://espncricinfo.com/series/icc-men-s-t20-...
4,Bangladesh,Pakistan,Pakistan,5 wickets,Adelaide,"Nov 6, 2022",T20I # 1872,https://espncricinfo.com/series/icc-men-s-t20-...


In [10]:
### Renaming the Scorecard col to Match_id
# Reassign instead of mutating in place: the cell stays safe to re-run and
# avoids the discouraged `inplace=True` pattern.
df = df.rename(columns={'Scorecard': 'Match_id'})

In [11]:
# Check that the column is now called Match_id.
df.head(2)

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Match_id,ScoreCard Link
0,England,Pakistan,England,5 wickets,Melbourne,"Nov 13, 2022",T20I # 1879,https://espncricinfo.com/series/icc-men-s-t20-...
1,England,India,England,10 wickets,Adelaide,"Nov 10, 2022",T20I # 1878,https://espncricinfo.com/series/icc-men-s-t20-...


In [12]:
# Overwrite the summary file so the Match_id column name is persisted;
# index=False keeps the DataFrame's row index out of the CSV.
df.to_csv('match_summery_data.csv', index = False)

# Final Function to Get the Batting and Bowling Scorecard of Each Game

In [13]:
def _scorecard_rows(table):
    """Return the stripped ``<td>`` texts of every ``<tr>`` in ``table``'s ``<tbody>``.

    Returns an empty list when ``table`` is ``None`` or has no ``<tbody>``.
    """
    if table is None or table.tbody is None:
        return []
    return [[col.text.strip() for col in tr.find_all('td')]
            for tr in table.tbody.find_all('tr')]


def get_batting_and_bowling_data(url, match_id, driver):
    """Scrape the batting and bowling scorecards of a single match.

    Parameters
    ----------
    url : str
        Scorecard page to load.
    match_id : str
        Identifier copied into the first column of every returned row.
    driver : selenium WebDriver
        Already-started browser. It is reused here and left open for the caller.

    Returns
    -------
    tuple[list[list], list[list]]
        ``(batting_data, bowling_data)``.

        * Batting rows: ``[match_id, batting_team, bowling_team, batting_pos]``
          followed by the 8 scraped cells of the row.
        * Bowling rows: ``[match_id, bowling_team, batting_team]`` followed by
          the 11 scraped cells of the row.

        An innings with no team header or no scorecard table on the page
        contributes no rows; if the page has no innings both lists are empty.
    """
    driver.get(url)
    time.sleep(2)  # fixed pause so the page can render before the HTML is read
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Innings header blocks in page order; each one names the batting team.
    teams_tag = soup.find_all('div', class_='ds-flex ds-px-4 ds-border-b ds-border-line ds-py-3 ds-bg-ui-fill-translucent-hover')
    teams = [
        team.find('span', class_='ds-text-title-xs ds-font-bold ds-capitalize').text.strip()
        for team in teams_tag
    ]

    tables = soup.find_all("table", class_="ci-scorecard-table")

    batting_data = []
    bowling_data = []

    # At most two innings. Fewer are processed when the page carries fewer
    # team headers or scorecard tables, instead of raising IndexError.
    for i in range(min(2, len(teams), len(tables))):
        batting_team = teams[i]
        # The opponent is the team of the other innings; blank if that
        # innings has no header on the page.
        bowling_team = teams[1 - i] if len(teams) > 1 else ''

        batting_table = tables[i]
        # The bowling figures are taken from the next <table> in document order.
        bowling_table = batting_table.find_next("table")

        ## Batting Data
        batting_pos = 1
        for columns in _scorecard_rows(batting_table):
            if len(columns) == 8:  # Valid batting row
                batting_data.append([match_id, batting_team, bowling_team, batting_pos] + columns)
                batting_pos += 1

        ## Bowling Data
        for columns in _scorecard_rows(bowling_table):
            if len(columns) == 11:  # Valid bowling row
                bowling_data.append([match_id, bowling_team, batting_team] + columns)

    return batting_data, bowling_data


def _append_scorecard_rows(csv_path, header, rows):
    """Append ``rows`` to ``csv_path``, writing ``header`` first if the file is new or empty."""
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    # utf-8 matches the default of pandas.read_csv (scraped names contain
    # non-ASCII characters); newline='' is what the csv module expects.
    with open(csv_path, 'a', newline='', encoding='utf-8') as file:
        csvWriter = csv.writer(file)
        if needs_header:
            csvWriter.writerow(header)
        csvWriter.writerows(rows)


def get_ScoreCard(summary_file_path):
    """Scrape the scorecard of every match listed in the summary CSV.

    Reads the ``Match_id`` and ``ScoreCard Link`` columns of
    ``summary_file_path``, scrapes each link with
    ``get_batting_and_bowling_data`` using a single shared Chrome WebDriver,
    and appends the rows to ``Batting_ScoreCard.csv`` and
    ``Bowling_ScoreCard.csv`` in the current working directory.

    Parameters
    ----------
    summary_file_path : str
        CSV file containing the ``Match_id`` and ``ScoreCard Link`` columns.

    Notes
    -----
    * The output files are opened in append mode after every match, so rows
      scraped before a failure are kept. Calling this function again adds the
      rows a second time; delete the two files first for a clean run.
    * Rows whose link is missing are skipped.
    * The browser is closed even if scraping a match raises.
    """
    df = pd.read_csv(summary_file_path, usecols=['Match_id', 'ScoreCard Link'])

    batting_header = ['Match_ID', 'Team', 'Opponent', 'Batting Position', 'Batsman_Name',
                      'Dismissal', 'Runs', 'Balls', 'Minutes Spent', '4s', '6s', 'Strike Rate']
    bowling_header = ['Match_ID', 'Team', 'Opponent', 'Bowler_Name', 'Overs', 'Maidens',
                      'Runs', 'Wickets', 'Economy', '0s', '4s', '6s', 'Wides', 'No Balls']

    driver = webdriver.Chrome()
    try:
        for match_id, scorecard_link in zip(df['Match_id'], df['ScoreCard Link']):
            # An empty CSV cell is read back as NaN (a float), not a URL.
            if not isinstance(scorecard_link, str):
                continue

            batsman_card, bowling_card = get_batting_and_bowling_data(scorecard_link, match_id, driver)

            _append_scorecard_rows('Batting_ScoreCard.csv', batting_header, batsman_card)
            _append_scorecard_rows('Bowling_ScoreCard.csv', bowling_header, bowling_card)
    finally:
        driver.quit()


In [14]:
# Scrape every match listed in the summary file; rows are appended to
# Batting_ScoreCard.csv and Bowling_ScoreCard.csv.
get_ScoreCard('match_summery_data.csv')

In [15]:
# Load the batting scorecard written by get_ScoreCard and preview it.
df1 = pd.read_csv('Batting_ScoreCard.csv')
df1.head()

Unnamed: 0,Match_ID,Team,Opponent,Batting Position,Batsman_Name,Dismissal,Runs,Balls,Minutes Spent,4s,6s,Strike Rate
0,T20I # 1879,Pakistan,England,1,Mohammad Rizwan †,b Curran,15,14,24,0,1,107.14
1,T20I # 1879,Pakistan,England,2,Babar Azam (c),c & b Rashid,32,28,58,2,0,114.28
2,T20I # 1879,Pakistan,England,3,Mohammad Haris,c Stokes b Rashid,8,12,15,1,0,66.66
3,T20I # 1879,Pakistan,England,4,Shan Masood,c Livingstone b Curran,38,28,46,2,1,135.71
4,T20I # 1879,Pakistan,England,5,Iftikhar Ahmed,c †Buttler b Stokes,0,6,8,0,0,0.0


In [16]:
# Load the bowling scorecard written by get_ScoreCard and preview it.
df2 = pd.read_csv("Bowling_ScoreCard.csv")
df2.head()

Unnamed: 0,Match_ID,Team,Opponent,Bowler_Name,Overs,Maidens,Runs,Wickets,Economy,0s,4s,6s,Wides,No Balls
0,T20I # 1879,England,Pakistan,Ben Stokes,4.0,0,32,1,8.0,6,1,0,2,1
1,T20I # 1879,England,Pakistan,Chris Woakes,3.0,0,26,0,8.66,7,2,1,2,0
2,T20I # 1879,England,Pakistan,Sam Curran,4.0,0,12,3,3.0,15,0,0,0,0
3,T20I # 1879,England,Pakistan,Adil Rashid,4.0,1,22,2,5.5,10,1,0,1,0
4,T20I # 1879,England,Pakistan,Chris Jordan,4.0,0,27,2,6.75,9,3,0,0,0
