In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException
import time
import csv
import os

In [3]:
def get_scorecard_links(fixture_result_url):
    """Collect the scorecard URLs listed on a fixtures-and-results page.

    Opens the page in a fresh Chrome WebDriver session, parses the rendered
    HTML and returns one absolute URL per match card found inside the first
    ``ds-mb-4`` container.

    Parameters
    ----------
    fixture_result_url : str
        URL of the series "match-schedule-fixtures-and-results" page.

    Returns
    -------
    list[str]
        Absolute scorecard URLs in page order; empty when the expected
        container is not present on the page.
    """
    base = 'https://espncricinfo.com'

    driver = webdriver.Chrome()
    try:
        driver.get(fixture_result_url)
        # Fixed pause before reading the page source; presumably gives the
        # page time to finish rendering -- TODO confirm 2s is sufficient.
        time.sleep(2)
        # Name the parser explicitly, consistent with the other cells in this notebook.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always release the browser, even when loading or parsing fails.
        driver.quit()

    containers = soup.find_all('div', class_='ds-mb-4')
    if not containers:
        print(f"No fixture container found on {fixture_result_url}")
        return []

    data = []
    for tag in containers[0].find_all('div', class_='ds-border-line'):
        anchor = tag.find('a')
        # Skip cards that carry no link instead of failing on a None lookup.
        if anchor is None or not anchor.get('href'):
            continue
        data.append(base + anchor['href'])

    return data

In [4]:
# NOTE(review): `url` is not assigned in any cell shown in this notebook; presumably it
# held the 2024 edition's fixtures-and-results page URL -- confirm and define it before this call.
wc24 = get_scorecard_links(url)

In [6]:
# Scorecard links from the "icc-men-s-t20-world-cup-2022-23" fixtures-and-results page.
wc22 = get_scorecard_links('https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2022-23-1298134/match-schedule-fixtures-and-results')

In [7]:
# Scorecard links from the "icc-men-s-t20-world-cup-2021-22" fixtures-and-results page.
wc21 = get_scorecard_links('https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2021-22-1267897/match-schedule-fixtures-and-results')

In [8]:
# Scorecard links from the "world-t20-2015-16" fixtures-and-results page.
wc16 = get_scorecard_links('https://www.espncricinfo.com/series/world-t20-2015-16-901359/match-schedule-fixtures-and-results')

In [9]:
# Scorecard links from the "world-t20-2013-14" fixtures-and-results page.
wc14 = get_scorecard_links('https://www.espncricinfo.com/series/world-t20-2013-14-628368/match-schedule-fixtures-and-results')

In [12]:
# Scorecard links from the "icc-world-twenty20-2012-13" fixtures-and-results page.
wc12 = get_scorecard_links('https://www.espncricinfo.com/series/icc-world-twenty20-2012-13-531597/match-schedule-fixtures-and-results')

In [14]:
# Scorecard links from the "icc-world-twenty20-2010" fixtures-and-results page.
wc10 = get_scorecard_links('https://www.espncricinfo.com/series/icc-world-twenty20-2010-412671/match-schedule-fixtures-and-results')

In [15]:
# Scorecard links from the "icc-world-twenty20-2009" fixtures-and-results page.
wc09 = get_scorecard_links('https://www.espncricinfo.com/series/icc-world-twenty20-2009-335113/match-schedule-fixtures-and-results')

In [16]:
# Scorecard links from the "icc-world-twenty20-2007-08" fixtures-and-results page.
wc07 = get_scorecard_links('https://www.espncricinfo.com/series/icc-world-twenty20-2007-08-286109/match-schedule-fixtures-and-results')

In [27]:
# Maps an edition label ("<year> T20WC") to the list of scorecard URLs collected for it.
# The label format matters: make_match_summary splits it on a space to build the match id.
t20wc_history_scorecards_links = {
    '2024 T20WC' : wc24,
    '2022 T20WC' : wc22,
    '2021 T20WC' : wc21,
    '2016 T20WC' : wc16,
    '2014 T20WC' : wc14,
    '2012 T20WC' : wc12,
    '2010 T20WC' : wc10,
    '2009 T20WC' : wc09,
    '2007 T20WC' : wc07
}

In [36]:
def extract_match_result(result):
    """Split a scorecard result sentence into ``(winner, margin)``.

    Recognised shapes:

    * ``"<team> won by <margin> (<extra>)"`` -> ``(team, margin)``
    * ``"... tied (<team> won ...)"``        -> ``(team, "Super Over win")``
    * text containing ``"No result"`` or ``"abandoned"`` -> ``("No result", "")``

    Anything else, including non-string input and a tied result whose
    parenthetical does not name a winner, yields ``("Unknown", "")``.

    Parameters
    ----------
    result : str
        Result text taken from the scorecard header.

    Returns
    -------
    tuple[str, str]
        Winner and margin, both stripped of surrounding whitespace.
    """
    if not isinstance(result, str):
        print(f"Error extracting match result: expected text, got {type(result).__name__}")
        return "Unknown", ''

    text = result.strip()

    if " won by " in text:
        parts = text.split(" won by ")
        winner = parts[0].strip()
        # Drop the trailing "(with N balls remaining)" style note from the margin.
        margin = parts[1].split(' (')[0].strip()
        return winner, margin

    if "tied" in text and "(" in text:
        detail = text.split("(")[1]
        # Only trust the parenthetical when it actually names a winner;
        # otherwise fall through so the text is not mistaken for a team name.
        if " won" in detail:
            return detail.split(" won")[0].strip(), "Super Over win"

    if "No result" in text or "abandoned" in text:
        return "No result", ''

    return "Unknown", ''

def make_match_summary(scorecard_link, match_no, wc, soup):
    """Build the one-row summary for a match from its parsed scorecard page.

    Parameters
    ----------
    scorecard_link : str
        URL of the scorecard; copied into the last column of the row.
    match_no : int
        1-based position of the match within its edition; used in the match id.
    wc : str
        Edition label of the form ``"<year> <name>"`` (e.g. ``"2024 T20WC"``).
    soup : BeautifulSoup
        Parsed scorecard page.

    Returns
    -------
    list
        ``[team1, team2, winner, margin, ground, date, wc, match_id,
        scorecard_link]``, or nine empty strings when any expected element
        is missing or an error occurs.
    """
    blank_summary = [''] * 9
    try:
        header_blocks = soup.find_all('div', class_='lg:ds-py-3')
        if not header_blocks:
            return blank_summary
        header = header_blocks[0]

        info_tag = header.find('div', class_='ds-mb-1')
        if not info_tag:
            return blank_summary

        # Comma-separated header line; the ground and the two date pieces sit
        # at positions that depend on how many parts the line has.
        info_parts = info_tag.text.split(',')
        # Every layout handled below needs at least five parts; shorter
        # headers used to reach the blank result only via an IndexError.
        if len(info_parts) < 5:
            return blank_summary

        if len(info_parts) == 5:
            ground_index = 1
        elif len(info_parts) == 6:
            ground_index = 2
        else:
            ground_index = 3
        # Strip the whitespace left over from the comma split, matching how
        # the team names are cleaned below.
        ground = info_parts[ground_index].strip()
        date = (info_parts[ground_index + 1] + info_parts[ground_index + 2]).strip()

        flex_blocks = soup.find_all('div', class_='ds-flex')
        if not flex_blocks:
            return blank_summary

        teams = flex_blocks[0].text.strip().split('vs')
        if len(teams) < 2:
            return blank_summary
        team1, team2 = teams[0].strip(), teams[1].strip()

        result_tag = header.find('p', class_='ds-text-tight-s ds-font-medium ds-truncate ds-text-typo')
        if not result_tag:
            return blank_summary

        winner, margin = extract_match_result(result_tag.text)

        wc_parts = wc.split(' ')
        if len(wc_parts) < 2:
            return blank_summary

        match_id = f"{wc_parts[1]} {wc_parts[0]} #{match_no}"

        return [team1, team2, winner, margin, ground, date, wc, match_id, scorecard_link]

    except Exception as e:
        # Report the failure like the sibling helpers do instead of hiding it.
        print(f"Error building match summary for {scorecard_link}: {e}")
        return blank_summary


def _collect_scorecard_rows(table, expected_columns, profile_link_data):
    """Return the non-empty cell texts of each row having ``expected_columns`` cells.

    As a side effect, for every row whose first cell contains a link, the
    pair ``[cell text, absolute profile URL]`` is appended to
    ``profile_link_data`` regardless of the row's column count.
    """
    rows = []
    body = table.tbody
    if body is None:
        # A table without a body has no rows to read; skip it rather than
        # aborting the whole match.
        return rows

    for tr in body.find_all('tr'):
        columns = []
        for col_index, col in enumerate(tr.find_all('td')):
            text = col.text.strip()
            if text != '':
                columns.append(text)
            if col_index == 0:
                anchor = col.find('a')
                if anchor is not None:
                    full_profile_link = 'https://espncricinfo.com' + anchor['href']
                    profile_link_data.append([text, full_profile_link])
        if len(columns) == expected_columns:
            rows.append(columns)
    return rows


def get_batting_and_bowling_data(match_id, wc, soup):
    """Extract batting rows, bowling rows and player profile links from a scorecard.

    Parameters
    ----------
    match_id : str
        Identifier copied into every batting and bowling row.
    wc : str
        Edition label copied into every batting and bowling row.
    soup : BeautifulSoup
        Parsed scorecard page.

    Returns
    -------
    tuple[list, list, list]
        ``(batting_data, bowling_data, profile_link_data)``.

        * batting rows: ``[wc, match_id, batting_team, bowling_team,
          batting_pos]`` followed by the 8 cell texts of the row;
        * bowling rows: ``[wc, match_id, bowling_team, batting_team]``
          followed by the 11 cell texts of the row;
        * profile rows: ``[first cell text, absolute profile URL]``.

        Three empty lists are returned when fewer than two team names or no
        scorecard table is found, or when an unexpected error occurs.
    """
    try:
        team_blocks = soup.find_all(
            'div',
            class_='ds-flex ds-px-4 ds-border-b ds-border-line ds-py-3 ds-bg-ui-fill-translucent-hover')
        teams = [
            block.find('span', class_='ds-text-title-xs ds-font-bold ds-capitalize').text.strip()
            for block in team_blocks
        ]
        if len(teams) < 2:
            return [], [], []

        tables = soup.find_all("table", class_="ci-scorecard-table")
        if not tables:
            return [], [], []

        profile_link_data = []
        batting_data, bowling_data = [], []

        # At most two innings; the bowling table is looked up as the next
        # <table> after each matched batting table.
        for innings in range(min(2, len(tables))):
            batting_team, bowling_team = teams[innings], teams[1 - innings]
            batting_table = tables[innings]
            bowling_table = batting_table.find_next("table")
            if bowling_table is None:
                continue

            batting_rows = _collect_scorecard_rows(batting_table, 8, profile_link_data)
            # Batting position counts only the rows that were accepted.
            for batting_pos, columns in enumerate(batting_rows, start=1):
                batting_data.append([wc, match_id, batting_team, bowling_team, batting_pos] + columns)

            for columns in _collect_scorecard_rows(bowling_table, 11, profile_link_data):
                bowling_data.append([wc, match_id, bowling_team, batting_team] + columns)

        return batting_data, bowling_data, profile_link_data

    except Exception as e:
        print(f"Error fetching batting and bowling data: {e}")
        return [], [], []

def save_to_csv(file_path, header, data):
    """Append rows to a CSV file, writing the header first when the file is new.

    Parameters
    ----------
    file_path : str
        Destination file; created if it does not exist.
    header : list
        Column names, written only when the file is missing or empty.
    data : iterable of rows
        Rows to append.

    Errors are reported with ``print`` and otherwise ignored, so a failed
    write never interrupts the caller.
    """
    try:
        # An existing but empty file still needs its header row.
        needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
        # newline='' lets the csv module control line endings (avoids blank
        # rows on Windows); UTF-8 keeps non-ASCII player names readable by
        # pandas.read_csv, whose default encoding is UTF-8.
        with open(file_path, 'a', newline='', encoding='utf-8') as file:
            csv_writer = csv.writer(file)
            if needs_header:
                csv_writer.writerow(header)
            csv_writer.writerows(data)
    except Exception as e:
        print(f"Error saving to CSV {file_path}: {e}")

def get_ScoreCard(match_id, wc, soup):
    """Extract one match's scorecard data and append it to the output CSV files.

    Batting rows, bowling rows and profile links are obtained from
    ``get_batting_and_bowling_data`` and each non-empty result is passed to
    ``save_to_csv`` with its own file name and header. Any exception is
    printed and swallowed; nothing is returned.
    """
    batting_header = ['Edition', 'Match_ID', 'Team', 'Opponent', 'Batting Position', 'Batsman_Name',
                      'Dismissal', 'Runs', 'Balls', 'Minutes Spent', '4s', '6s', 'Strike Rate']
    bowling_header = ['Edition', 'Match_ID', 'Team', 'Opponent', 'Bowler_Name', 'Overs', 'Maidens',
                      'Runs', 'Wickets', 'Economy', '0s', '4s', '6s', 'Wides', 'No Balls']
    profile_header = ['Player_Name', 'Profile_link']

    try:
        batting_rows, bowling_rows, profile_rows = get_batting_and_bowling_data(match_id, wc, soup)

        # (destination, header, rows) in the order the files are written.
        outputs = (
            ('T20WC_History_Batting_ScoreCard.csv', batting_header, batting_rows),
            ('T20WC_History_Bowling_ScoreCard.csv', bowling_header, bowling_rows),
            ('profile_link.csv', profile_header, profile_rows),
        )
        for csv_path, csv_header, rows in outputs:
            if rows:
                save_to_csv(csv_path, csv_header, rows)

    except Exception as e:
        print(f"Error in get_ScoreCard: {e}")


def get_t20_wc_all_games_history(t20wc_history_scorecards_links):
    """Scrape every scorecard of every edition and write the results to CSV.

    For each link the page is loaded (up to three attempts on timeout),
    summarised with ``make_match_summary`` and, when the summary is not
    blank, its batting/bowling/profile data are saved via ``get_ScoreCard``.
    The per-match summaries are appended to ``t20wc_all_games.csv`` at the
    end, including whatever was collected before an error stopped the run.

    Parameters
    ----------
    t20wc_history_scorecards_links : dict[str, list[str]]
        Edition label -> list of scorecard URLs.
    """
    driver = None
    t20_wc_all_games_summary = []

    try:
        driver = webdriver.Chrome()

        for wc, scorecard_links_list in t20wc_history_scorecards_links.items():
            for i, link in enumerate(scorecard_links_list):
                success = False
                for _attempt in range(3):
                    try:
                        driver.get(link)
                        time.sleep(2)
                        success = True
                        break
                    except TimeoutException:
                        print("Timeout occurred. Retrying...")

                if not success:
                    print(f"Skipping {link} due to repeated timeouts.")
                    continue

                try:
                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    match_summary = make_match_summary(link, i + 1, wc, soup)
                    # A blank summary (all empty strings) means the page could not be parsed.
                    if any(match_summary):
                        match_ID = match_summary[-2]
                        get_ScoreCard(match_ID, wc, soup)
                        t20_wc_all_games_summary.append(match_summary)
                except Exception as e:
                    print(f"Unexpected error while processing {link}: {e}")
                    continue

    except WebDriverException as e:
        print(f"WebDriver Error: {e}")
    except Exception as e:
        print(f"Unexpected Error: {e}")
    finally:
        # Quit exactly once, and only if the browser was actually started.
        if driver is not None:
            try:
                driver.quit()
            except WebDriverException as e:
                print(f"WebDriver Error while closing the browser: {e}")

    # Saved after the error handling above so that summaries gathered before
    # a failure are not thrown away.
    save_to_csv('t20wc_all_games.csv',
                ['team1', 'team2', 'winner', 'margin', 'ground', 'date', 'Edition', 'Match_id', 'ScoreCard Link'],
                t20_wc_all_games_summary)


In [37]:
# Run the scrape over every edition in the mapping; the helpers append their results to CSV
# files, so re-running this cell adds rows to any files that already exist.
get_t20_wc_all_games_history(t20wc_history_scorecards_links)

In [38]:
# Load the player-name / profile-link pairs written by get_ScoreCard during the scrape.
df = pd.read_csv('profile_link.csv')

In [39]:
### Handling invalid player names (values that start with a digit)

In [40]:
import numpy as np

In [41]:
# Values that start with a digit are treated as invalid names and replaced with NaN so the
# next step can drop them. Missing or empty values are passed through unchanged instead of
# raising on `x[0]`.
df['Player_Name'] = df['Player_Name'].apply(
    lambda x: np.nan if isinstance(x, str) and x[:1].isdigit() else x
)

In [42]:
# Drop the rows blanked out above (and any other row with a missing value).
# Reassigning instead of mutating in place keeps the cell re-runnable.
df = df.dropna()

In [43]:
# Display the frame to inspect the rows that remain after dropping invalid names.
df

Unnamed: 0,Player_Name,Profile_link
0,Aaron Johnson,https://espncricinfo.com/cricketers/aaron-john...
1,Navneet Dhaliwal,https://espncricinfo.com/cricketers/navneet-dh...
2,Pargat Singh,https://espncricinfo.com/cricketers/pargat-sin...
3,Nicholas Kirton,https://espncricinfo.com/cricketers/nicholas-k...
4,Shreyas Movva †,https://espncricinfo.com/cricketers/shreyas-mo...
...,...,...
9649,Sreesanth,https://espncricinfo.com/cricketers/sreesanth-...
9650,Joginder Sharma,https://espncricinfo.com/cricketers/joginder-s...
9651,Yusuf Pathan,https://espncricinfo.com/cricketers/yusuf-path...
9652,Irfan Pathan,https://espncricinfo.com/cricketers/irfan-path...


In [44]:
#### Handling the "Did not bat" case

In [45]:
def get_name(name):
    """Return the player name from a scraped first-column cell text.

    A cell of the form ``"<label containing 'Did'>: <player>, <player>, ..."``
    yields the first listed player, stripped of whitespace. Any other value,
    including text that contains ``"Did"`` but no colon and non-string
    values, is returned unchanged.
    """
    if not isinstance(name, str) or 'Did' not in name:
        return name

    # Keep only the first comma-separated entry, then take what follows the label.
    label_and_player = name.split(',')[0].split(':')
    if len(label_and_player) < 2:
        # No "label: player" structure -- treat it as an ordinary name.
        return name
    return label_and_player[1].strip()

In [46]:
# Replace each "Did ...: <player>, ..." cell text with the first listed player's name.
df['Player_Name'] = df['Player_Name'].apply(get_name)

In [47]:
### Dropping the Duplicates

In [48]:
# Remove rows that repeat the same (Player_Name, Profile_link) pair.
# Reassigning instead of mutating in place keeps the cell re-runnable.
df = df.drop_duplicates()

In [49]:
# Display the de-duplicated frame.
df

Unnamed: 0,Player_Name,Profile_link
0,Aaron Johnson,https://espncricinfo.com/cricketers/aaron-john...
1,Navneet Dhaliwal,https://espncricinfo.com/cricketers/navneet-dh...
2,Pargat Singh,https://espncricinfo.com/cricketers/pargat-sin...
3,Nicholas Kirton,https://espncricinfo.com/cricketers/nicholas-k...
4,Shreyas Movva †,https://espncricinfo.com/cricketers/shreyas-mo...
...,...,...
9407,Nadif Chowdhury,https://espncricinfo.com/cricketers/nadif-chow...
9431,Andre Nel,https://espncricinfo.com/cricketers/andre-nel-...
9459,Chris Tremlett,https://espncricinfo.com/cricketers/chris-trem...
9473,Joginder Sharma,https://espncricinfo.com/cricketers/joginder-s...


In [50]:
# Overwrite profile_link.csv (the same file loaded above) with the cleaned frame.
df.to_csv('profile_link.csv', index = False)

In [51]:
#### Getting the players' descriptions

In [52]:

def get_one_player_description(driver, profile_link):
    """Scrape one player's profile page.

    Parameters
    ----------
    driver : selenium WebDriver
        An already-started browser session used to load the page.
    profile_link : str
        URL of the player's profile page.

    Returns
    -------
    list[str]
        ``[name, img_link, team, batting_style, bowling_style,
        playing_role]``; any field that cannot be found is an empty string.
    """
    driver.get(profile_link)
    time.sleep(2)

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    name, img_link, team, batting_style, bowling_style, playing_role = '', '', '', '', '', ''

    name_tag = soup.find('h1', class_='ds-text-title-l ds-font-bold')
    if name_tag:
        name = name_tag.text.strip()

    # The image is identified by an alt text equal to the player's name, so
    # only search when a name was found; searching with alt='' could pick up
    # an unrelated image that has an empty alt attribute.
    if name:
        img_tag = soup.find('img', alt=name)
        if img_tag:
            # .get avoids a KeyError when the tag carries no src attribute.
            img_link = img_tag.get('src') or ''

    team_tag = soup.find('span', class_='ds-text-comfortable-s')
    if team_tag:
        team = team_tag.text.strip()

    # NOTE(review): assumes the first three 'ds-text-tight-s' spans are, in
    # order, batting style, bowling style and playing role -- confirm against
    # the live page layout.
    details = soup.find_all('span', class_='ds-text-tight-s')
    if len(details) >= 3:
        batting_style = details[0].text.strip()
        bowling_style = details[1].text.strip()
        playing_role = details[2].text.strip()

    return [name, img_link, team, batting_style, bowling_style, playing_role]

def get_all_player_description(profile_link_file_path):
    """Scrape every distinct profile link in a CSV and append the results to
    ``t20WC_Player_description.csv``.

    Parameters
    ----------
    profile_link_file_path : str
        CSV file with a ``Profile_link`` column.

    Each distinct link is visited once. A page that fails to load or parse
    is reported and skipped, and whatever was collected is still written.
    Returns ``None``.
    """
    # Read the input before starting the browser so a bad path does not
    # leave a Chrome session behind.
    df = pd.read_csv(profile_link_file_path)
    # The same profile can appear under several name spellings; visit each URL once.
    all_links = df['Profile_link'].dropna().drop_duplicates()

    output_path = 't20WC_Player_description.csv'
    header = ['Name', 'Image Link', 'Team', 'Batting Style', 'Bowling Style', 'Playing Role']

    data = []
    driver = webdriver.Chrome()
    try:
        for link in all_links:
            try:
                data.append(get_one_player_description(driver, link))
            except Exception as e:
                print(f"Skipping {link}: {e}")
    finally:
        try:
            driver.quit()
        except WebDriverException as e:
            print(f"WebDriver Error while closing the browser: {e}")

        # Save to CSV (inside finally so partial results survive an interruption).
        file_exists = os.path.exists(output_path)
        # newline='' lets the csv module control line endings (no blank rows on Windows).
        with open(output_path, 'a', encoding='utf-8', newline='') as file:
            csv_writer = csv.writer(file)
            if not file_exists:
                csv_writer.writerow(header)
            csv_writer.writerows(data)

    return

In [None]:
# Scrape a description for every profile link stored in the cleaned profile_link.csv.
get_all_player_description('profile_link.csv')