In [3]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import csv
import os
import json
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## AIM 1 - Get the Summery of All Matches 

#### Rough Work

In [3]:
summery_card_url = 'https://www.espncricinfo.com/records/tournament/team-match-results/icc-men-s-t20-world-cup-2022-23-14450'

In [5]:
driver = webdriver.Chrome()
time.sleep(2)

driver.get(summery_card_url)
page_data = driver.page_source
driver.quit()

In [7]:
soup = BeautifulSoup(page_data, 'html.parser')

In [9]:
req_div = soup.find('div', class_ = 'ds-overflow-x-auto ds-scrollbar-hide')

In [28]:
for td in req_div.table.thead.tr.find_all('td'):
    print(td.text)

Team 1
Team 2
Winner
Margin
Ground
Match Date
Scorecard


In [66]:
data = []
for row in req_div.table.tbody.find_all('tr'):
    cols = []
    scr, i = '', 1
    for col in row.find_all('td'):
        cols.append(col.text)
        if i == 7:
            scr = col.span.a['href']
        cols.append(scr)
        i += 1
    print(cols)
    break

['England', '', 'Pakistan', '', 'England', '', '5 wickets', '', 'Melbourne', '', 'Nov 13, 2022', '', 'T20I # 1879', '/series/icc-men-s-t20-world-cup-2022-23-1298134/england-vs-pakistan-final-1298179/full-scorecard']


In [56]:
data[:3]

[['England',
  'Pakistan',
  'England',
  '5 wickets',
  'Melbourne',
  'Nov 13, 2022',
  'T20I # 1879'],
 ['England',
  'India',
  'England',
  '10 wickets',
  'Adelaide',
  'Nov 10, 2022',
  'T20I # 1878'],
 ['New Zealand',
  'Pakistan',
  'Pakistan',
  '7 wickets',
  'Sydney',
  'Nov 9, 2022',
  'T20I # 1877']]

# Final Function to Get Match Summery Data

In [276]:
summery_card_url = 'https://www.espncricinfo.com/records/tournament/team-match-results/icc-men-s-t20-world-cup-2022-23-14450'

def get_match_summery_data(file_path):
    # Open the page
    driver = webdriver.Chrome()
    time.sleep(2)
    driver.get(summery_card_url)
    page_data = driver.page_source
    
    # scrapping the data via bs4
    soup = BeautifulSoup(page_data, 'html.parser')
    req_div = soup.find('div', class_ = 'ds-overflow-x-auto ds-scrollbar-hide')
    
    # getting table header
    table_header = [td.text.strip() for td in req_div.table.thead.tr.find_all('td')]
    # col name for scorecard link
    table_header.append('ScoreCard Link')
    
    # getting table data
    table_data = []
    for row in req_div.table.tbody.find_all('tr'):
        column = []
        scorecard_link, i = '', 1
        for col in row.find_all('td'):
            column.append(col.text.strip())
            ## ADDITIONAL Col for Scorecard Link
            if i == 7:
                scorecard_link = 'https://espncricinfo.com' + col.span.a['href']
                column.append(scorecard_link)
            i += 1
            
        table_data.append(column)
        
    driver.quit()
    
    # Saving the summery data to file_path
    try:
        with open(file_path, 'w') as file:
            csvWriter = csv.writer(file)

            # writing the header
            csvWriter.writerow(table_header)
            # writing the data
            csvWriter.writerows(table_data)
    except Exception as e:
        print(f"{e}")
    

In [277]:
get_match_summery_data('match_summery_data.csv')

In [278]:
df = pd.read_csv('match_summery_data.csv')
df.head()

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard,ScoreCard Link
0,England,Pakistan,England,5 wickets,Melbourne,"Nov 13, 2022",T20I # 1879,https://espncricinfo.com/series/icc-men-s-t20-...
1,England,India,England,10 wickets,Adelaide,"Nov 10, 2022",T20I # 1878,https://espncricinfo.com/series/icc-men-s-t20-...
2,New Zealand,Pakistan,Pakistan,7 wickets,Sydney,"Nov 9, 2022",T20I # 1877,https://espncricinfo.com/series/icc-men-s-t20-...
3,India,Zimbabwe,India,71 runs,Melbourne,"Nov 6, 2022",T20I # 1873,https://espncricinfo.com/series/icc-men-s-t20-...
4,Bangladesh,Pakistan,Pakistan,5 wickets,Adelaide,"Nov 6, 2022",T20I # 1872,https://espncricinfo.com/series/icc-men-s-t20-...


In [279]:
### Renaming the Scorecard Col to Match_id
df.rename(columns= {'Scorecard': 'Match_id'}, inplace = True)

In [280]:
df.to_csv('match_summery_data.csv', index = False)

## AIM 2 - Get the Batting ScoreCard for each of the match

#### Rough Work

In [4]:
url = 'https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2022-23-1298134/india-vs-pakistan-16th-match-group-2-1298150/full-scorecard'

In [5]:
driver = webdriver.Chrome()
time.sleep(3)
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()

In [6]:
teams_tag = soup.find_all('div', class_ = 'ds-flex ds-px-4 ds-border-b ds-border-line ds-py-3 ds-bg-ui-fill-translucent-hover')

In [7]:
teams_tag

[<div class="ds-flex ds-px-4 ds-border-b ds-border-line ds-py-3 ds-bg-ui-fill-translucent-hover"><div class="ds-flex ds-flex-col ds-grow ds-justify-center"><span class="ds-text-title-xs ds-font-bold ds-text-typo"><span class="ds-text-title-xs ds-font-bold ds-capitalize">Pakistan</span><span class="ds-text-compact-xs ds-font-regular">  (20 ovs maximum)</span></span></div></div>,
 <div class="ds-flex ds-px-4 ds-border-b ds-border-line ds-py-3 ds-bg-ui-fill-translucent-hover"><div class="ds-flex ds-flex-col ds-grow ds-justify-center"><span class="ds-text-title-xs ds-font-bold ds-text-typo"><span class="ds-text-title-xs ds-font-bold ds-capitalize">India</span><span class="ds-text-compact-xs ds-font-regular">  (T: 160 runs from 20 ovs)</span></span></div></div>]

In [8]:
for tag in teams_tag:
    print(tag.find('span', class_ = 'ds-text-title-xs ds-font-bold ds-capitalize').text)

Pakistan
India


In [11]:
table = soup.find_all("table", class_="ci-scorecard-table")
print(len(table))

2


In [12]:
tables[0].tbody.tr.text

'Mohammad Rizwan\xa0†c Kumar b Arshdeep Singh412251033.33'

In [13]:
tables[0].next_sibling.tbody.tr.text

'Bhuvneshwar Kumar402215.50171130'

In [14]:
tables[1].tbody.tr.text

'KL Rahul\xa0 b Naseem Shah48130050.00'

In [15]:
batting_data = []
for tr in tables[1].tbody.find_all('tr'):
    column = []
    for col in tr.find_all('td'):
        column.append(col.text.strip())
    if len(column) == 8:   
        batting_data.append(column)
        
batting_data[0]

['KL Rahul', 'b Naseem Shah', '4', '8', '13', '0', '0', '50.00']

In [36]:
batting_data = []
for tr in tables[1].tbody.find_all('tr'):
    column = []
    for i, col in enumerate(tr.find_all('td')):
        if i == 0:
            s = col.find('a')
            if s is not None:
                print(col.text)
                print(s['href'])
        else:
            continue

KL Rahul 
/cricketers/kl-rahul-422108
Rohit Sharma (c)
/cricketers/rohit-sharma-34102
Virat Kohli 
/cricketers/virat-kohli-253802
Suryakumar Yadav 
/cricketers/suryakumar-yadav-446507
Axar Patel 
/cricketers/axar-patel-554691
Hardik Pandya 
/cricketers/hardik-pandya-625371
Dinesh Karthik †
/cricketers/dinesh-karthik-30045
Ravichandran Ashwin 
/cricketers/ravichandran-ashwin-26421
Did not bat: Mohammed Shami, Bhuvneshwar Kumar, Arshdeep Singh 
/cricketers/mohammed-shami-481896


In [29]:
bowling_data = []
for tr in tables[1].next_sibling.tbody.find_all('tr'):
    column = []
    for col in tr.find_all('td'):
        column.append(col.text.strip())
    if len(column) == 11:   
        bowling_data.append(column)
        
bowling_data

[['Shaheen Shah Afridi', '4', '0', '34', '0', '8.50', '7', '4', '0', '1', '0'],
 ['Naseem Shah', '4', '0', '23', '1', '5.75', '10', '1', '0', '0', '0'],
 ['Haris Rauf', '4', '0', '36', '2', '9.00', '11', '2', '2', '1', '0'],
 ['Shadab Khan', '4', '0', '21', '0', '5.25', '10', '2', '0', '0', '0'],
 ['Mohammad Nawaz', '4', '0', '42', '2', '10.50', '8', '0', '4', '2', '1']]

In [311]:
batting_data = []
bowling_data = []
def get_batting_and_bowling_data(url):
    driver = webdriver.Chrome()
    time.sleep(3)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    driver.quit()
    
    teams = []
    teams_tag = soup.find_all('div', class_='ds-flex ds-px-4 ds-border-b ds-border-line ds-py-3 ds-bg-ui-fill-translucent-hover')
    for team in teams_tag:
        teams.append(team.find('span', class_='ds-text-title-xs ds-font-bold ds-capitalize').text.strip())
    
    batting_first = teams[0].strip()
    batting_second = teams[1].strip()
    
    tables = soup.find_all("table", class_="ci-scorecard-table")
    
    for i in range(2):
        batting_team = teams[i]
        bowling_team = teams[1] if i == 0 else teams[0]
        
        table_data = []
        batting_pos = 1
        batting_table = table[i]
        bowling_table = batting_table.next_sibling
        ## batting data
        for tr in batting_table.tbody.find_all('tr'):
            column = []
            for col in tr.find_all('td'):
                column.append(col.text.strip())
            if len(column) == 8:
                match = batting_first +' vs '+ batting_second
                additional_data = [match, batting_team, batting_pos]
                column = additional_data + column
                table_data.append(column)
                batting_pos += 1
        batting_data.extend(table_data)
    
        ## Bowling Data
        bowling_table_data = []
        for tr in bowling_table.tbody.find_all('tr'):
            column = []
            for col in tr.find_all('td'):
                column.append(col.text.strip())
            if len(column) == 11:
                match = batting_first +' vs '+ batting_second
                additional_data = [match , bowling_team]
                column = additional_data + column
                bowling_table_data.append(column)
        bowling_data.extend(bowling_table_data)
        
    return batting_data, bowling_data


In [312]:
temp = get_batting_and_bowling_data(url)

In [320]:
for batsman in temp[0]:
    print(batsman)
    break

['Pakistan vs India', 'Pakistan', 1, 'Mohammad Rizwan\xa0†', 'c Kumar b Arshdeep Singh', '4', '12', '25', '1', '0', '33.33']


In [319]:
for bowler in temp[1]:
    print(bowler)
    break

['Pakistan vs India', 'India', 'Bhuvneshwar Kumar', '4', '0', '22', '1', '5.50', '17', '1', '1', '3', '0']


In [287]:
df =  pd.read_csv('match_summery_data.csv', usecols=['Match_id', 'ScoreCard Link'])

In [318]:
df.head(1)

Unnamed: 0,Match_id,ScoreCard Link
0,T20I # 1879,https://espncricinfo.com/series/icc-men-s-t20-...


In [292]:
df.shape

(42, 2)

In [293]:
df.iloc[0]

Match_id                                                T20I # 1879
ScoreCard Link    https://espncricinfo.com/series/icc-men-s-t20-...
Name: 0, dtype: object

In [294]:
df.iloc[0]['Match_id']

'T20I # 1879'

# Final Function To get the Batting and Bowling ScoreCard of Each Game

In [315]:
def get_batting_and_bowling_data(url, match_id, driver):
    driver.get(url)
    time.sleep(2) 
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    teams = []
    teams_tag = soup.find_all('div', class_='ds-flex ds-px-4 ds-border-b ds-border-line ds-py-3 ds-bg-ui-fill-translucent-hover')
    for team in teams_tag:
        teams.append(team.find('span', class_='ds-text-title-xs ds-font-bold ds-capitalize').text.strip())

    batting_first = teams[0].strip()
    batting_second = teams[1].strip()
    
    tables = soup.find_all("table", class_="ci-scorecard-table")
    
    batting_data = []
    bowling_data = []
    
    for i in range(2):  # Assuming two innings
        batting_team = teams[i]
        bowling_team = teams[1] if i == 0 else teams[0]
        
        
        if i < len(tables):
            batting_table = tables[i]
            bowling_table = batting_table.find_next("table")
        
            ## Batting Data 
            batting_pos = 1
            for tr in batting_table.tbody.find_all('tr'):
                columns = [col.text.strip() for col in tr.find_all('td')]
                
                if len(columns) == 8:  # Valid batting row
                    additional_data = [match_id, batting_team, bowling_team, batting_pos]
                    batting_data.append(additional_data + columns)
                    batting_pos += 1
        
            ## Bowling Data 
            for tr in bowling_table.tbody.find_all('tr'):
                columns = [col.text.strip() for col in tr.find_all('td')]
                
                if len(columns) == 11:  # Valid bowling row
                    additional_data = [match_id, bowling_team, batting_team]
                    bowling_data.append(additional_data + columns)
        
    return batting_data, bowling_data


def get_ScoreCard(summary_file_path):
    df = pd.read_csv(summary_file_path, usecols=['Match_id', 'ScoreCard Link'])
    
    driver = webdriver.Chrome()

    for i in range(df.shape[0]):
        match_id = df.iloc[i]['Match_id']
        scorecard_link = df.iloc[i]['ScoreCard Link']
        
        batsman_card, bowling_card = get_batting_and_bowling_data(scorecard_link, match_id, driver)
        
        # Check if file exists to decide on writing the header
        batting_file_exists = os.path.exists('Batting_ScoreCard.csv')
        bowling_file_exists = os.path.exists('Bowling_ScoreCard.csv')

        with open('Batting_ScoreCard.csv', 'a', newline='') as file:
            csvWriter = csv.writer(file)
            
            if not batting_file_exists:
                header = ['Match_ID', 'Team', 'Opponent', 'Batting Position', 'Batsman_Name', 
                          'Dismissal', 'Runs', 'Balls', 'Minutes Spent', '4s', '6s', 'Strike Rate']
                csvWriter.writerow(header)
            
            csvWriter.writerows(batsman_card)
        
        with open('Bowling_ScoreCard.csv', 'a', newline='') as file2:
            csvWriter = csv.writer(file2)
            
            if not bowling_file_exists:
                header2 = ['Match_ID', 'Team', 'Opponent', 'Bowler_Name', 'Overs', 'Maidens', 
                           'Runs', 'Wickets', 'Economy', '0s', '4s', '6s', 'Wides', 'No Balls']
                csvWriter.writerow(header2)
            
            csvWriter.writerows(bowling_card)

    driver.quit()


In [316]:
# get_ScoreCard('match_summery_data.csv')

In [321]:
df1 = pd.read_csv('Batting_ScoreCard.csv')
df1.head()

Unnamed: 0,Match_ID,Team,Opponent,Batting Position,Batsman_Name,Dismissal,Runs,Balls,Minutes Spent,4s,6s,Strike Rate
0,T20I # 1879,Pakistan,England,1,Mohammad Rizwan †,b Curran,15,14,24,0,1,107.14
1,T20I # 1879,Pakistan,England,2,Babar Azam (c),c & b Rashid,32,28,58,2,0,114.28
2,T20I # 1879,Pakistan,England,3,Mohammad Haris,c Stokes b Rashid,8,12,15,1,0,66.66
3,T20I # 1879,Pakistan,England,4,Shan Masood,c Livingstone b Curran,38,28,46,2,1,135.71
4,T20I # 1879,Pakistan,England,5,Iftikhar Ahmed,c †Buttler b Stokes,0,6,8,0,0,0.0


In [322]:
df2 = pd.read_csv("Bowling_ScoreCard.csv")
df2.head()

Unnamed: 0,Match_ID,Team,Opponent,Bowler_Name,Overs,Maidens,Runs,Wickets,Economy,0s,4s,6s,Wides,No Balls
0,T20I # 1879,England,Pakistan,Ben Stokes,4.0,0,32,1,8.0,6,1,0,2,1
1,T20I # 1879,England,Pakistan,Chris Woakes,3.0,0,26,0,8.66,7,2,1,2,0
2,T20I # 1879,England,Pakistan,Sam Curran,4.0,0,12,3,3.0,15,0,0,0,0
3,T20I # 1879,England,Pakistan,Adil Rashid,4.0,1,22,2,5.5,10,1,0,1,0
4,T20I # 1879,England,Pakistan,Chris Jordan,4.0,0,27,2,6.75,9,3,0,0,0
