In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from datetime import datetime
import pandas as pd
import time
import os

In [2]:
def create_driver(executable_path='/Applications/Chdriver/chromedriver'):
    """Start a new Chrome WebDriver session.

    Args:
        executable_path: Filesystem path of the chromedriver binary handed to
            the Selenium ``Service``.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    chrome_options = Options()
    service = Service(executable_path=executable_path)
    # The options object used to be created and then dropped; pass it along so
    # any flag added to `chrome_options` above actually reaches the browser.
    return webdriver.Chrome(service=service, options=chrome_options)

def get_season_links(driver, url):
    """Collect the season page URLs listed on the series index page.

    Loads ``url`` in ``driver`` and reads the ``href`` of every ``<a>`` found
    inside the ``season-links`` containers of the first two ``season-block``
    elements on the page.

    Args:
        driver: WebDriver-like object used to load and query the page.
        url: Address of the index page.

    Returns:
        List of ``href`` values in page order.
    """
    driver.get(url)
    # Only the first two season blocks are read.
    leading_blocks = driver.find_elements(By.CLASS_NAME, 'season-block')[:2]
    return [
        anchor.get_attribute('href')
        for block in leading_blocks
        for container in block.find_elements(By.CLASS_NAME, 'season-links')
        for anchor in container.find_elements(By.TAG_NAME, 'a')
    ]

def get_series_links(driver, urls, filterd_series):
    """Collect links of the series whose title matches one of the filters.

    Every URL in ``urls`` except the first two is loaded. On each page the
    two fixed sections (``section[7]`` and ``section[10]``) are searched for
    ``teams`` elements; the first ``<a>`` of each gives the series title and
    link. A link is kept when any filter string occurs in the title
    (case-insensitive substring match).

    A page that lacks one of the sections, or a ``teams`` element without an
    anchor, is skipped instead of aborting the whole crawl.

    Args:
        driver: WebDriver-like object used to load and query the pages.
        urls: Season page URLs; ``urls[2:]`` are visited.
        filterd_series: Iterable of title fragments to keep.

    Returns:
        List of matching series ``href`` values in visit order.
    """
    section_xpaths = (
        '//*[@id="viewport"]/div[7]/div[2]/main/section/section[7]',
        '//*[@id="viewport"]/div[7]/div[2]/main/section/section[10]',
    )
    # Lower-case the filters once instead of once per series title.
    wanted = [fragment.lower() for fragment in filterd_series]
    series_links = []

    # The implicit wait is a driver-wide setting, so it only needs to be set once.
    driver.implicitly_wait(10)

    for url in urls[2:]:
        driver.get(url)

        for xpath in section_xpaths:
            try:
                section = driver.find_element(By.XPATH, xpath)
            except NoSuchElementException:
                # The XPath is positional; a page with a different layout must
                # not discard the links gathered from the other pages.
                print(f'Series section not found on {url}: {xpath}')
                continue

            for element in section.find_elements(By.CLASS_NAME, 'teams'):
                try:
                    anchor = element.find_element(By.TAG_NAME, 'a')
                except NoSuchElementException:
                    continue

                title = anchor.text.lower()
                if any(fragment in title for fragment in wanted):
                    series_links.append(anchor.get_attribute('href'))
    return series_links

def get_match_links(urls, batch=20):
    """Collect match scorecard links from a list of series pages.

    The URLs are processed in groups of ``batch``; each group gets its own
    browser from ``create_driver()``, which is always closed afterwards (even
    when a page fails), followed by a 3 second pause.

    Args:
        urls: Series page URLs to visit.
        batch: Number of pages loaded per browser session.

    Returns:
        List with the ``href`` of the first ``<a>`` inside every match card
        found, in visit order. Cards without an anchor are skipped.
    """
    match_links = []

    for start in range(0, len(urls), batch):
        driver = create_driver()
        try:
            # Driver-wide setting: set once per session rather than per page.
            driver.implicitly_wait(5)

            for url in urls[start:start + batch]:
                driver.get(url)

                cards = driver.find_elements(By.CSS_SELECTOR, 'div.ds-grow.ds-px-4.ds-border-r.ds-border-line-default-translucent')
                for card in cards:
                    anchors = card.find_elements(By.TAG_NAME, 'a')
                    if anchors:
                        match_links.append(anchors[0].get_attribute('href'))

            driver.delete_all_cookies()
        finally:
            # Without this, an exception on any page left a Chrome process behind.
            driver.quit()
        time.sleep(3)
    return match_links

In [5]:
# Entry page listing every season, and the title fragments of the series to keep.
url = 'https://www.espncricinfo.com/ci/engine/series/index.html'
# Matching is a case-insensitive substring test (see get_series_links), so each
# entry must be spelled exactly as it appears in the series title and must not
# carry stray leading/trailing spaces.
filterd = ['India', 'Australia', 'England', 'Pakistan', 'South Africa', 'West Indies', 'New Zealand',
    'Sri Lanka', 'Bangladesh', 'Zimbabwe', 'Afghanistan', 'Syed Mushtaq', 'Big Bash', 'Pakistan Super',
    'Super Smash', 'International League', 'SA20', 'Bangladesh Premier', 'Indian Premier', 'Vitality Blast',
    'Major Clubs', 'Lanka Premier', 'The Hundred', 'Caribbean Premier', 'CSA T20', 'National T20', 'Shpageeza',
    'Major League', 'Afghanistan Premier', 'Abu Dhabi T20', 'T20 World Cup']

# One browser for the season and series discovery; closed even if a page fails.
driver = create_driver()
try:
    season_urls = get_season_links(driver, url)
    series_urls = get_series_links(driver, season_urls, filterd)
finally:
    driver.quit()

# get_match_links opens and closes its own browser sessions per batch.
match_urls = get_match_links(series_urls)

In [15]:
# URL fragments that mark matches to leave out of the scrape.
EXCLUDED_URL_TOKENS = ('qualifier', 'region', 'unofficial', 'major-clubs', 'qlf', 'the-hundred')
filtered_urls = [
    url for url in match_urls
    if url and not any(token in url.lower() for token in EXCLUDED_URL_TOKENS)
]

# Column order matters: later batches are appended to the CSV without a header.
BATTER_COLUMNS = ['Sl No', 'Name', 'Team', 'Runs', 'Balls', 'Fours', 'Sixes',
                  'Strikes', 'Series', 'Match', 'Date', 'Link']
BOWLER_COLUMNS = ['Sl No', 'Name', 'Team', 'Overs', 'Runs', 'Maidens', 'Wickets',
                  'Wides', 'No Ball', 'Economy', 'Series', 'Match', 'Date', 'Link']

# Positional XPaths into the scorecard page.
SCORECARD_XPATH = '//*[@id="main-container"]/div[5]/div[1]/div/div[3]/div[1]'
HEADER_XPATH = SCORECARD_XPATH + '/div[1]/div/div[1]'
INFO_XPATH = HEADER_XPATH + '/div[1]/div/div[1]/div[2]'
TEAM_XPATH = HEADER_XPATH + '/div[2]/div[1]/div/div/div[2]/div/div[{position}]/div[1]/a/span'
# innings_div is 2 for the first innings and 3 for the second;
# table is 1 for the batting table and 2 for the bowling table.
INNINGS_TABLE_XPATH = SCORECARD_XPATH + '/div[2]/div[1]/div[{innings_div}]/div/div[2]/table[{table}]'
INNINGS_DIVS = (2, 3)


def _empty_table(columns):
    """Return a fresh ``{column: []}`` accumulator."""
    return {column: [] for column in columns}


def _extend_table(target, rows):
    """Append every column of ``rows`` to the same column of ``target``."""
    for column, values in rows.items():
        target[column].extend(values)


def _check_aligned(rows, label):
    """Raise ValueError unless every column of ``rows`` has the same length."""
    lengths = {column: len(values) for column, values in rows.items()}
    if len(set(lengths.values())) > 1:
        raise ValueError(f'{label} columns are misaligned: {lengths}')


def _parse_match_date(date_str):
    """Convert the header date text (e.g. 'February 14, 2023') to 'YYYY-MM-DD'.

    When the plain format fails, the second space-separated token is used as
    the day (with its last character dropped unless the text contains '-')
    together with the first and last tokens.
    """
    try:
        parsed = datetime.strptime(date_str, "%B %d, %Y")
    except ValueError:
        date_parts = date_str.split(' ')
        day = date_parts[1] if '-' in date_str else date_parts[1][:-1]
        date_str_fixed = ' '.join([date_parts[0], day, date_parts[-1]])
        parsed = datetime.strptime(date_str_fixed, "%B %d %Y")
    return parsed.strftime("%Y-%m-%d")


def _parse_match_info(info):
    """Split the header text into ``(match_no, date, series)``.

    The first line is comma separated; the position of the date fields
    depends on how many fields there are. The second line is the series name.
    """
    parts = info.split('\n')
    info_parts = parts[0].split(',')

    if len(info_parts) > 5:
        match_no, date_field = info_parts[0].split()[0].strip(), 3
    elif len(info_parts) < 5:
        match_no, date_field = '-', 1
    else:
        match_no, date_field = info_parts[0].split()[0].strip(), 2

    month = info_parts[date_field].strip()
    year = info_parts[date_field + 1].strip()
    date = _parse_match_date(', '.join([month, year]))
    series = parts[1].strip()
    return match_no, date, series


def _column_texts(table, nth_child, drop_last=0):
    """Return the text of every ``td:nth-child(n)`` cell, minus the last ``drop_last``."""
    cells = table.find_elements(By.CSS_SELECTOR, f'td:nth-child({nth_child})')
    if drop_last:
        cells = cells[:-drop_last]
    return [cell.text for cell in cells]


def _scrape_batting(table, row_context):
    """Read one batting table into a column dict; ``row_context`` holds the per-row constants."""
    rows = _empty_table(BATTER_COLUMNS)
    batters = table.find_elements(By.CSS_SELECTOR, 'td.ds-w-0.ds-whitespace-nowrap.ds-min-w-max.ds-flex.ds-items-center')
    for batter in batters:
        anchor = batter.find_element(By.TAG_NAME, 'a')
        rows['Name'].append(anchor.text)
        rows['Link'].append(anchor.get_attribute('href'))
        for column, value in row_context.items():
            rows[column].append(value)
    # The last two cells of columns 3 and 4 are dropped, as in the original
    # cell (presumably the extras/total rows - TODO confirm against the page).
    rows['Runs'] = _column_texts(table, 3, drop_last=2)
    rows['Balls'] = _column_texts(table, 4, drop_last=2)
    rows['Fours'] = _column_texts(table, 6)
    rows['Sixes'] = _column_texts(table, 7)
    rows['Strikes'] = _column_texts(table, 8)
    _check_aligned(rows, 'Batting')
    return rows


def _scrape_bowling(table, row_context):
    """Read one bowling table into a column dict; ``row_context`` holds the per-row constants."""
    rows = _empty_table(BOWLER_COLUMNS)
    bowlers = table.find_elements(By.CSS_SELECTOR, 'td.ds-flex.ds-items-center')
    for bowler in bowlers:
        rows['Name'].append(bowler.find_element(By.TAG_NAME, 'span').text)
        rows['Link'].append(bowler.find_element(By.TAG_NAME, 'a').get_attribute('href'))
        for column, value in row_context.items():
            rows[column].append(value)
    rows['Overs'] = _column_texts(table, 2)
    rows['Maidens'] = _column_texts(table, 3)
    rows['Runs'] = _column_texts(table, 4)
    rows['Wickets'] = _column_texts(table, 5)
    rows['Economy'] = _column_texts(table, 6)
    rows['Wides'] = _column_texts(table, 10)
    rows['No Ball'] = _column_texts(table, 11)
    _check_aligned(rows, 'Bowling')
    return rows


batter_data = _empty_table(BATTER_COLUMNS)
bowler_data = _empty_table(BOWLER_COLUMNS)

# Resume point: these values restart the crawl at match 5660 / batch 188
# (presumably where an earlier run stopped). Use 0 / 0 for a fresh crawl; the
# CSV files are only (re)created with a header when batch_cnt reaches 1.
START_INDEX = 5660
cnt = START_INDEX
batch_cnt = 188
batch = 30

for i in range(START_INDEX, len(filtered_urls), batch):
    batch_urls = filtered_urls[i:i+batch]
    driver = create_driver()
    batch_cnt += 1

    try:
        # Driver-wide setting, so once per browser session is enough.
        driver.implicitly_wait(2)

        for url in batch_urls:
            driver.get(url)

            cnt += 1
            print(f'Reading batch {batch_cnt} match {cnt}')

            try:
                info = driver.find_element(By.XPATH, INFO_XPATH).text
                match_no, date, series = _parse_match_info(info)

                team1 = driver.find_element(By.XPATH, TEAM_XPATH.format(position=1)).text
                team2 = driver.find_element(By.XPATH, TEAM_XPATH.format(position=2)).text

                shared = {'Sl No': cnt, 'Series': series, 'Match': match_no, 'Date': date}

                batting_tables = [
                    driver.find_element(By.XPATH, INNINGS_TABLE_XPATH.format(innings_div=div, table=1))
                    for div in INNINGS_DIVS
                ]
                batting_rows = [
                    _scrape_batting(table, {**shared, 'Team': team})
                    for table, team in zip(batting_tables, [team1, team2])
                ]
                # Commit only after both innings were read completely, so a
                # failure half-way cannot leave columns of different lengths.
                for rows in batting_rows:
                    _extend_table(batter_data, rows)

                bowling_tables = [
                    driver.find_element(By.XPATH, INNINGS_TABLE_XPATH.format(innings_div=div, table=2))
                    for div in INNINGS_DIVS
                ]
                # The bowling table of an innings belongs to the fielding side,
                # hence the swapped team order.
                bowling_rows = [
                    _scrape_bowling(table, {**shared, 'Team': team})
                    for table, team in zip(bowling_tables, [team2, team1])
                ]
                for rows in bowling_rows:
                    _extend_table(bowler_data, rows)

            except NoSuchElementException:
                print(f'Skipped match {cnt}: expected scorecard element not found ({url})')
            except (ValueError, IndexError) as exc:
                # Unparseable header text or misaligned table columns.
                print(f'Skipped match {cnt}: {exc} ({url})')

        driver.delete_all_cookies()
    finally:
        # Always release the browser, even when a page load raises.
        driver.quit()
    time.sleep(5)

    df1 = pd.DataFrame(batter_data, columns=BATTER_COLUMNS)
    df2 = pd.DataFrame(bowler_data, columns=BOWLER_COLUMNS)

    # First batch creates the files with a header; later batches append rows.
    first_batch = batch_cnt == 1
    write_mode = 'w' if first_batch else 'a'
    df1.to_csv('batter_data.csv', mode=write_mode, header=first_batch, index=False)
    df2.to_csv('bowler_data.csv', mode=write_mode, header=first_batch, index=False)

    # Clear data dictionaries for the next batch
    batter_data = _empty_table(BATTER_COLUMNS)
    bowler_data = _empty_table(BOWLER_COLUMNS)


Reading batch 189 match 5661
Reading batch 189 match 5662
Reading batch 189 match 5663
Reading batch 189 match 5664
Reading batch 189 match 5665
Reading batch 189 match 5666
Reading batch 189 match 5667
Reading batch 189 match 5668
Reading batch 189 match 5669
Reading batch 189 match 5670
Reading batch 189 match 5671
Reading batch 189 match 5672
Reading batch 189 match 5673
Reading batch 189 match 5674
Reading batch 189 match 5675
Reading batch 189 match 5676
Reading batch 189 match 5677
Reading batch 189 match 5678
Reading batch 189 match 5679
Reading batch 189 match 5680
Reading batch 189 match 5681
Reading batch 189 match 5682
Reading batch 189 match 5683
Reading batch 189 match 5684
Reading batch 189 match 5685
Reading batch 189 match 5686
Reading batch 189 match 5687
Reading batch 189 match 5688
Reading batch 189 match 5689
Reading batch 189 match 5690
Reading batch 190 match 5691
Reading batch 190 match 5692
Reading batch 190 match 5693
Reading batch 190 match 5694
Reading batch 

Reading batch 198 match 5944
Reading batch 198 match 5945
Reading batch 198 match 5946
Reading batch 198 match 5947
Reading batch 198 match 5948
Reading batch 198 match 5949
Reading batch 198 match 5950
Reading batch 198 match 5951
Reading batch 198 match 5952
Reading batch 198 match 5953
Reading batch 198 match 5954
Reading batch 198 match 5955
Reading batch 198 match 5956
Reading batch 198 match 5957
Reading batch 198 match 5958
Reading batch 198 match 5959
Reading batch 198 match 5960
Reading batch 199 match 5961
Reading batch 199 match 5962
Reading batch 199 match 5963
Reading batch 199 match 5964
Reading batch 199 match 5965
Reading batch 199 match 5966
Reading batch 199 match 5967
Reading batch 199 match 5968
Reading batch 199 match 5969
Reading batch 199 match 5970
Reading batch 199 match 5971
Reading batch 199 match 5972
Reading batch 199 match 5973
Reading batch 199 match 5974
Reading batch 199 match 5975
Reading batch 199 match 5976
Reading batch 199 match 5977
Reading batch 

Reading batch 207 match 6227
Reading batch 207 match 6228
Reading batch 207 match 6229
Reading batch 207 match 6230
Reading batch 208 match 6231
Reading batch 208 match 6232
Reading batch 208 match 6233
Reading batch 208 match 6234
Reading batch 208 match 6235
Reading batch 208 match 6236
Reading batch 208 match 6237
Reading batch 208 match 6238
Reading batch 208 match 6239
Reading batch 208 match 6240
Reading batch 208 match 6241
Reading batch 208 match 6242
Reading batch 208 match 6243
Reading batch 208 match 6244
Reading batch 208 match 6245
Reading batch 208 match 6246
Reading batch 208 match 6247
Reading batch 208 match 6248
Reading batch 208 match 6249
Reading batch 208 match 6250
Reading batch 208 match 6251
Reading batch 208 match 6252
Reading batch 208 match 6253
Reading batch 208 match 6254
Reading batch 208 match 6255
Reading batch 208 match 6256
Reading batch 208 match 6257
Reading batch 208 match 6258
Reading batch 208 match 6259
Reading batch 208 match 6260
Reading batch 

Reading batch 217 match 6510
Reading batch 217 match 6511
Reading batch 217 match 6512
Reading batch 217 match 6513
Reading batch 217 match 6514
Reading batch 217 match 6515
Reading batch 217 match 6516
Reading batch 217 match 6517
Reading batch 217 match 6518
Reading batch 217 match 6519
Reading batch 217 match 6520
Reading batch 217 match 6521
Reading batch 217 match 6522
Reading batch 217 match 6523
Reading batch 217 match 6524
Reading batch 217 match 6525
Reading batch 217 match 6526
Reading batch 217 match 6527
Reading batch 217 match 6528
Reading batch 217 match 6529
Reading batch 217 match 6530
Reading batch 218 match 6531
Reading batch 218 match 6532
Reading batch 218 match 6533
Reading batch 218 match 6534
Reading batch 218 match 6535
Reading batch 218 match 6536
Reading batch 218 match 6537
Reading batch 218 match 6538
Reading batch 218 match 6539
Reading batch 218 match 6540
Reading batch 218 match 6541
Reading batch 218 match 6542
Reading batch 218 match 6543
Reading batch 

In [4]:
# Show full cell contents (e.g. long URLs) when frames are displayed.
pd.set_option('display.max_colwidth', None)

# Reload the scraped batting and bowling rows from disk.
df1, df2 = (pd.read_csv(csv_path) for csv_path in ('batter_data.csv', 'bowler_data.csv'))

# Distinct player profile links across both files, in order of first appearance.
link = pd.concat([frame['Link'] for frame in (df1, df2)], ignore_index=True)
player_links = pd.unique(link)

In [8]:
# Column order matters: later batches are appended to the CSV without a header.
PLAYER_COLUMNS = ['Sl No', 'Link', 'pName', 'Country', 'Born']

# Positional XPaths into the player profile page.
PLAYER_NAME_XPATH = '//*[@id="main-container"]/div[5]/div[1]/div[2]/div[1]/div[1]/div/div/div/div/div/div[1]/h1'
PLAYER_COUNTRY_XPATH = '//*[@id="main-container"]/div[5]/div[1]/div[2]/div[1]/div[1]/div/div/div/div/div/div[1]/div/span[1]'
PLAYER_BORN_XPATH = '//*[@id="main-container"]/div[5]/div[1]/div[2]/div[2]/div[2]/div/div/div[1]/div[2]/span/p'


def _empty_player_table():
    """Return a fresh ``{column: []}`` accumulator for player rows."""
    return {column: [] for column in PLAYER_COLUMNS}


def _parse_birth_date(date_str):
    """Convert the 'Born' text to 'YYYY-MM-DD', or '-' when it holds no full date.

    The text is split on ', ' and the first two pieces are read as
    '<Month> <day>, <year>'. Anything that does not parse (year only, free
    text, missing pieces) yields '-'. Parsing is attempted directly instead of
    guessing from the string length, which used to reject dates like 'May 5'.
    """
    date_parts = date_str.split(', ')
    if len(date_parts) < 2:
        return '-'
    try:
        parsed = datetime.strptime(', '.join(date_parts[:2]), "%B %d, %Y")
    except ValueError:
        return '-'
    return parsed.strftime("%Y-%m-%d")


player_data = _empty_player_table()

# Resume point: these values restart the crawl at player 3850 / batch 78
# (presumably where an earlier run stopped). Use 0 / 0 for a fresh crawl; the
# CSV file is only (re)created with a header when pbatch_cnt reaches 1.
PLAYER_START_INDEX = 3850
pcnt = PLAYER_START_INDEX
pbatch_cnt = 78
pbatch = 50

for i in range(PLAYER_START_INDEX, len(player_links), pbatch):
    plinks = player_links[i:i+pbatch]

    driver = create_driver()
    pbatch_cnt += 1
    print(f'Reading batch {pbatch_cnt} player {pcnt} ...')

    try:
        # Driver-wide setting, so once per browser session is enough.
        driver.implicitly_wait(1)

        for link in plinks:
            driver.get(link)
            pcnt += 1

            try:
                name = driver.find_element(By.XPATH, PLAYER_NAME_XPATH).text
                country = driver.find_element(By.XPATH, PLAYER_COUNTRY_XPATH).text
                date_str = driver.find_element(By.XPATH, PLAYER_BORN_XPATH).text
            except NoSuchElementException:
                # Profile page with a different layout: leave this player out.
                print(f'Skipped player {pcnt}: expected profile element not found ({link})')
                continue

            player_data['Link'].append(link)
            player_data['pName'].append(name)
            player_data['Country'].append(country)
            player_data['Born'].append(_parse_birth_date(date_str))
            player_data['Sl No'].append(pcnt)

        driver.delete_all_cookies()
    finally:
        # Always release the browser, even when a page load raises.
        driver.quit()
    time.sleep(1)

    player_df = pd.DataFrame(player_data, columns=PLAYER_COLUMNS)

    # First batch creates the file with a header; later batches append rows.
    first_batch = pbatch_cnt == 1
    player_df.to_csv('player_data.csv', mode='w' if first_batch else 'a',
                     header=first_batch, index=False)

    player_data = _empty_player_table()

Reading batch 79 player 3850 ...
Reading batch 80 player 3900 ...
Reading batch 81 player 3950 ...
Reading batch 82 player 4000 ...
Reading batch 83 player 4050 ...
Reading batch 84 player 4100 ...
Reading batch 85 player 4150 ...
Reading batch 86 player 4200 ...
Reading batch 87 player 4250 ...
Reading batch 88 player 4300 ...
Reading batch 89 player 4350 ...
Reading batch 90 player 4400 ...
Reading batch 91 player 4450 ...
Reading batch 92 player 4500 ...
Reading batch 93 player 4550 ...
Reading batch 94 player 4600 ...
Reading batch 95 player 4650 ...
Reading batch 96 player 4700 ...
Reading batch 97 player 4750 ...
Reading batch 98 player 4800 ...
Reading batch 99 player 4850 ...
Reading batch 100 player 4900 ...
Reading batch 101 player 4950 ...


In [120]:
# NOTE(review): `bowler_df` is not created in any cell visible here; it is
# presumably built (and its columns renamed to Over/M/W/NB, joined with the
# player Country/Born data) in cells that are not shown - verify before re-running.
# Keep only the final columns, in export order.
bowler_df = bowler_df.loc[:, [
    'Name', 'Date', 'Over', 'Runs', 'Wickets', 'M', 'W', 'NB',
    'Economy', 'Team', 'Series', 'Match', 'Country', 'Born',
]]
# NOTE(review): this overwrites bowler_data.csv, the same file the scraping cell
# appends to and the player-link cell reads the `Link` column from; `Link` is
# not among the columns kept above. The DataFrame index is written as well.
bowler_df.to_csv('bowler_data.csv')