# Extracting IPL Data from Cricbuzz

In [146]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from datetime import datetime
import pandas as pd
import time
import os

In [151]:
# Scrape every match scorecard of the IPL 2023 series on Cricbuzz into two
# column-oriented tables: batter_df (one row per batter entry) and bowler_df
# (one row per bowler entry).  Every row keeps the player's profile link.
service = Service(executable_path='/Applications/Chdriver/chromedriver')
driver = webdriver.Chrome(service=service)

# Stat columns, in the order their 'text-right' cells are read from a row.
BATTER_STATS = ('Runs', 'Balls', 'Fours', 'Sixes', 'Strike Rate')
BOWLER_STATS = ('Overs', 'Maiden', 'Runs', 'Wicket', 'No Ball', 'Wide', 'Economy')

# Innings header elements; the team abbreviation is derived from their text.
# These XPaths start with '//' and are therefore resolved against the whole
# document, so they are looked up from the driver rather than from a section.
INNINGS_1_TITLE = '//*[@id="innings_1"]/div[1]/div[1]/span[1]'
INNINGS_2_TITLE = '//*[@id="innings_2"]/div[1]/div[1]/span[1]'

batter_data = {
    'Match Date': [],
    'Venue': [],
    'Link': [],
    'Runs': [],
    'Balls': [],
    'Fours': [],
    'Sixes': [],
    'Strike Rate': [],
    'Team': [],
    'Season': []
}

bowler_data = {
    'Match Date': [],
    'Venue': [],
    'Link': [],
    'Overs': [],
    'Maiden': [],
    'Runs': [],
    'Wicket': [],
    'No Ball': [],
    'Wide': [],
    'Economy': [],
    'Team': [],
    'Season': []
}


def _team_initials(title):
    """Return the initials of every word of *title* except the last one.

    Splitting on arbitrary whitespace (instead of a single space) avoids an
    IndexError on empty words when the title contains repeated spaces.
    """
    return ''.join(word[0] for word in title.split()[:-1])


def _scorecard_row(item, stat_names, shared):
    """Build one complete record for a scorecard row element.

    Args:
        item: row element holding a 'cb-text-link' player link and
            'text-right' stat cells.
        stat_names: column names for the stat cells, in cell order.
        shared: values common to the row (match date, venue, season, team).

    Returns:
        dict mapping every column name to its value for this row.

    Raises:
        NoSuchElementException: the row has no player link.
        IndexError: the row has fewer stat cells than *stat_names*.
    """
    row = dict(shared)
    row['Link'] = item.find_element(By.CLASS_NAME, 'cb-text-link').get_attribute('href')
    cells = item.find_elements(By.CLASS_NAME, 'text-right')
    for position, stat in enumerate(stat_names):
        row[stat] = cells[position].text
    return row


def _store_row(table, row):
    """Append *row* to every column list of *table*.

    Rows are stored only once they are complete, so a failure while reading
    a row can never leave the column lists with different lengths.
    """
    for column, values in table.items():
        values.append(row[column])


try:
    url = 'https://www.cricbuzz.com/cricket-series/5945/indian-premier-league-2023/matches'
    retry_get(driver, url)

    # Turn each listed match link into the link of its full scorecard page.
    match_urls = []
    matches = driver.find_elements(By.CLASS_NAME, 'cb-series-matches')
    for match in matches:
        link_element = match.find_element(By.CLASS_NAME, 'text-hvr-underline').get_attribute('href')
        link = link_element.replace("cricket-scores", "live-cricket-scorecard")
        match_urls.append(link)

    for url in match_urls:
        driver.get(url)
        time.sleep(1)

        try:
            # Read match-level values once instead of once per player row.
            venue = driver.find_element(By.XPATH, '//*[@id="page-wrapper"]/div[4]/div[1]/div[2]/a[2]/span/span[2]/span').text

            season_element = driver.find_element(By.XPATH, '//*[@id="page-wrapper"]/div[4]/div[1]/div[2]/a[1]/span')
            # The season label is everything after the first three words.
            season = ' '.join(season_element.text.split()[3:])

            date = driver.find_element(By.CSS_SELECTOR, 'span.schedule-date')
            date_object = pd.to_datetime(date.text, format='%A, %B %d, %Y')

            shared = {
                'Match Date': date_object.strftime('%Y-%m-%d'),
                'Venue': venue,
                'Season': season,
            }

            batting1 = driver.find_element(By.XPATH, '//*[@id="innings_1"]/div[1]')
            batting2 = driver.find_element(By.XPATH, '//*[@id="innings_2"]/div[1]')
            team1 = _team_initials(driver.find_element(By.XPATH, INNINGS_1_TITLE).text)
            team2 = _team_initials(driver.find_element(By.XPATH, INNINGS_2_TITLE).text)

            for section, team_name in ((batting1, team1), (batting2, team2)):
                batters = section.find_elements(By.CLASS_NAME, 'cb-scrd-itms')
                # The last three items are skipped -- presumably the
                # extras / total / did-not-bat rows; verify against the page.
                for batter in batters[:-3]:
                    row = _scorecard_row(batter, BATTER_STATS, {**shared, 'Team': team_name})
                    _store_row(batter_data, row)

            bowling1 = driver.find_element(By.XPATH, '//*[@id="innings_1"]/div[4]')
            bowling2 = driver.find_element(By.XPATH, '//*[@id="innings_2"]/div[4]')
            # Bowlers listed under an innings belong to the team that bats in
            # the other innings, hence the swapped team names.
            for section, team_name in ((bowling1, team2), (bowling2, team1)):
                bowlers = section.find_elements(By.CLASS_NAME, 'cb-scrd-itms')
                for bowler in bowlers:
                    row = _scorecard_row(bowler, BOWLER_STATS, {**shared, 'Team': team_name})
                    _store_row(bowler_data, row)

        except (NoSuchElementException, IndexError) as exc:
            # Best effort: rows stored so far are kept, the rest of this
            # scorecard is skipped -- but the skip is reported, not hidden.
            print(f'Skipped remaining rows of {url}: {type(exc).__name__}')
finally:
    # Always release the browser, even when scraping aborts with an error.
    driver.quit()

batter_df = pd.DataFrame(batter_data)
bowler_df = pd.DataFrame(bowler_data)

In [166]:
# Distinct player profile URLs: first those seen among batters, then any
# additional ones seen only among bowlers (order of first appearance).
batter_link = batter_df[['Link']].drop_duplicates().reset_index(drop=True)
bowler_link = bowler_df[['Link']].drop_duplicates().reset_index(drop=True)
combined_links = pd.concat([batter_link['Link'], bowler_link['Link']], ignore_index=True)
player_link = combined_links.unique()

In [171]:
# Visit every player profile page in player_link and collect the player's
# name, country and birth date into player_df (one row per profile link).
service = Service(executable_path='/Applications/Chdriver/chromedriver')
driver = webdriver.Chrome(service=service)

player_data = {
    'Link': [],
    'Name': [],
    'Country': [],
    'Born': []
}
try:
    for link in player_link:
        retry_get(driver, link)
        time.sleep(0.1)

        name = retry_found_element(driver, By.XPATH, '//*[@id="playerProfile"]/div[1]/div[2]/h1').text
        country = retry_found_element(driver, By.XPATH, '//*[@id="playerProfile"]/div[1]/div[2]/h3').text

        # Only the text before the first '(' is treated as the date.
        date_string = retry_found_element(driver, By.XPATH, '//*[@id="playerProfile"]/div[2]/div[1]/div/div[3]').text
        date_part = date_string.split('(')[0].strip()
        try:
            formatted_date = datetime.strptime(date_part, '%b %d, %Y').strftime('%Y-%m-%d')
        except ValueError:
            # One profile with a missing or differently formatted birth date
            # must not discard every profile scraped so far.
            formatted_date = None
            print(f'Could not parse birth date {date_part!r} for {link}')

        # Store the record only once all of its fields are known, so the
        # column lists always stay the same length.
        player_data['Link'].append(link)
        player_data['Name'].append(name)
        player_data['Country'].append(country)
        player_data['Born'].append(formatted_date)
finally:
    # Always release the browser, even when scraping aborts with an error.
    driver.quit()

player_df = pd.DataFrame(player_data)

In [172]:
# Display the scraped player table (one row per profile link).
player_df

Unnamed: 0,Link,Name,Country,Born
0,https://www.cricbuzz.com/profiles/9838/conway,Devon Conway,New Zealand,1991-07-08
1,https://www.cricbuzz.com/profiles/11813/rutura...,Ruturaj Gaikwad,India,1997-01-31
2,https://www.cricbuzz.com/profiles/6692/moeen,Moeen Ali,England,1987-06-18
3,https://www.cricbuzz.com/profiles/6557/stokes,Ben Stokes,England,1991-06-04
4,https://www.cricbuzz.com/profiles/6311/rayudu,Ambati Rayudu,India,1985-09-23
...,...,...,...,...
274,https://www.cricbuzz.com/profiles/36496/raghav...,Raghav Goyal,India,2001-01-26
275,https://www.cricbuzz.com/profiles/11221/obed-m...,Obed McCoy,West Indies,1997-01-04
276,https://www.cricbuzz.com/profiles/8019/root,Joe Root,England,1990-12-30
277,https://www.cricbuzz.com/profiles/14701/nitish...,Nitish Reddy,India,2003-05-26


In [173]:
# Attach each player's profile details to the scorecard rows via the shared
# 'Link' column (left join keeps every scorecard row), then keep only the
# reporting columns in their final order.
batter_columns = [
    'Match Date', 'Name', 'Runs', 'Balls', 'Fours', 'Sixes', 'Strike Rate',
    'Country', 'Born', 'Team', 'Venue', 'Season',
]
bowler_columns = [
    'Match Date', 'Name', 'Overs', 'Maiden', 'Runs', 'Wicket', 'No Ball',
    'Wide', 'Economy', 'Country', 'Born', 'Team', 'Venue', 'Season',
]
df1 = batter_df.merge(player_df, how='left', on='Link')[batter_columns]
df2 = bowler_df.merge(player_df, how='left', on='Link')[bowler_columns]

In [180]:
# Write the final batter and bowler tables to CSV.
batter_file = 'ipl-2023-batter.csv'
bowler_file = 'ipl-2023-bowler.csv'

# to_csv opens the target in write mode by default, so an existing file is
# replaced rather than appended to.  The header must therefore always be
# written; making it conditional on the file not existing produced a
# header-less file on every re-run.
df1.to_csv(batter_file, index=False)
df2.to_csv(bowler_file, index=False)


In [178]:
# Display the batter table after joining player details.
df1

Unnamed: 0,Match Date,Name,Runs,Balls,Fours,Sixes,Strike Rate,Country,Born,Team,Venue,Season
0,2023-03-31,Devon Conway,1,6,0,0,16.67,New Zealand,1991-07-08,CSK,Ahmedabad,2023
1,2023-03-31,Ruturaj Gaikwad,92,50,4,9,184.00,India,1997-01-31,CSK,Ahmedabad,2023
2,2023-03-31,Moeen Ali,23,17,4,1,135.29,England,1987-06-18,CSK,Ahmedabad,2023
3,2023-03-31,Ben Stokes,7,6,1,0,116.67,England,1991-06-04,CSK,Ahmedabad,2023
4,2023-03-31,Ambati Rayudu,12,12,0,1,100.00,India,1985-09-23,CSK,Ahmedabad,2023
...,...,...,...,...,...,...,...,...,...,...,...,...
1168,2023-05-29,Shivam Dube,32,21,0,2,152.38,India,1993-06-26,CSK,Ahmedabad,2023
1169,2023-05-29,Ajinkya Rahane,27,13,2,2,207.69,India,1988-06-06,CSK,Ahmedabad,2023
1170,2023-05-29,Ambati Rayudu,19,8,1,2,237.50,India,1985-09-23,CSK,Ahmedabad,2023
1171,2023-05-29,MS Dhoni,0,1,0,0,0.00,India,1981-07-07,CSK,Ahmedabad,2023


In [179]:
# Display the bowler table after joining player details.
df2

Unnamed: 0,Match Date,Name,Overs,Maiden,Runs,Wicket,No Ball,Wide,Economy,Country,Born,Team,Venue,Season
0,2023-03-31,Mohammed Shami,4,0,29,2,1,0,7.20,India,1990-09-03,GT,Ahmedabad,2023
1,2023-03-31,Hardik Pandya,3,0,28,0,0,0,9.30,India,1993-10-11,GT,Ahmedabad,2023
2,2023-03-31,Joshua Little,4,0,41,1,0,0,10.20,Ireland,1999-11-01,GT,Ahmedabad,2023
3,2023-03-31,Rashid Khan,4,0,26,2,0,0,6.50,Afghanistan,1998-09-20,GT,Ahmedabad,2023
4,2023-03-31,Alzarri Joseph,4,0,33,2,0,0,8.20,West Indies,1996-11-20,GT,Ahmedabad,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,2023-05-29,Hardik Pandya,1,0,14,0,0,1,14.00,India,1993-10-11,GT,Ahmedabad,2023
888,2023-05-29,Rashid Khan,3,0,44,0,0,0,14.70,Afghanistan,1998-09-20,GT,Ahmedabad,2023
889,2023-05-29,Noor Ahmad,3,0,17,2,0,3,5.70,Afghanistan,2005-01-03,GT,Ahmedabad,2023
890,2023-05-29,Joshua Little,2,0,30,0,0,0,15.00,Ireland,1999-11-01,GT,Ahmedabad,2023


In [None]:
# Read the exported batter CSV back in and display it.
df = pd.read_csv('ipl-2023-batter.csv')
df