## Scraping test

In [24]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import undetected_chromedriver as uc

class ScrapeData:

    def __init__(self, player_name):
        self.player_name = player_name
        self.player_id = None
        self.player_url = None
    
        # Initialize class variables for storing stats
        self.battingstats = None
        self.bowlingstats = None
        self.allroundstats = None
        self.fieldingstats = None
        self.player_info = None

        # Set up the WebDriver and open the search URL
        options = uc.ChromeOptions()
        options.add_argument("--window-size=1920,1080")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")

        print("Setting up WebDriver...")
        self.driver = uc.Chrome(options=options)        

        # Call get_player_url() to fetch the player's URL and ID when the object is initialized
        self.get_player_url()

    def get_player_url(self):
        start_time = time.time()
        print(f"Extracting {self.player_name}'s player URL and Player ID....")
        search_url = f"https://search.espncricinfo.com/ci/content/site/search.html?search={self.player_name.lower().replace(' ', '%20')};type=player"
        self.driver.get(search_url)

        try:
            player_link_element = self.driver.find_element(By.CSS_SELECTOR, "h3.name.link-cta a")
            self.player_url = player_link_element.get_attribute("href")
            self.player_id = self.player_url.split('-')[-1]
            print(f"Extraction Successful for {self.player_name}.")
            end_time = time.time()
            print(f"Time taken to extract URL: {end_time - start_time:.2f} seconds")
        except Exception as e:
            print(f"Error in extracting {self.player_name}'s url:", e)
            return None, None

    def extract_inns_data(self, record_type):
        start_time = time.time()
        print(f"Starting extraction of {self.player_name}'s {record_type} stats....")
        
        # Construct the search URL based on record_type (batting, bowling, etc.)
        search_url = f"https://stats.espncricinfo.com/ci/engine/player/{self.player_id}.html?class=11;template=results;type={record_type};view=innings"
        
        # Open the URL
        self.driver.get(search_url)

        # Step 1: Extract the headers of the table
        headers = self.driver.find_elements(By.CSS_SELECTOR, "thead tr.headlinks th")
        header_names = [header.text for header in headers if header.text != ''] + ['Match id']  # Add match_id column name
        
        # Step 2: Extract the data from the 4th tbody
        rows = self.driver.find_elements(By.XPATH, "(//tbody)[4]//tr")
        
        # Step 3: Extract the data column-wise and store it in a list
        player_data = []
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            row_data = [cell.text for cell in cells if cell.text != '']
            player_data.append(row_data)
        
        # Step 4: Create a DataFrame from the extracted data
        innings_data = pd.DataFrame(player_data, columns=header_names)
        
        end_time = time.time()
        print(f"Extracted {innings_data.shape[0]} records in {end_time - start_time:.2f} seconds")
        
        return innings_data

    def extract_player_info(self):
        try:
            start_time = time.time()
            print(f"Starting extraction of {self.player_name}'s personal info....")
            
            # Start by opening the player info URL
            self.driver.get(self.player_url)

            # Step 1: Extract headers within the specified div tag
            headers = self.driver.find_elements(By.XPATH, "//div[@class='ds-grid lg:ds-grid-cols-3 ds-grid-cols-2 ds-gap-4 ds-mb-8']//p[@class='ds-text-tight-m ds-font-regular ds-uppercase ds-text-typo-mid3']")
            header_names = ['Player ID','Player URL']+[header.text for header in headers]

            # Step 2: Extract values within the specified div tag
            values = self.driver.find_elements(By.XPATH, "//div[@class='ds-grid lg:ds-grid-cols-3 ds-grid-cols-2 ds-gap-4 ds-mb-8']//span[@class='ds-text-title-s ds-font-bold ds-text-typo']")
            value_texts = [self.player_id,self.player_url]+[value.text for value in values]

            # Step 3: Create a DataFrame from the extracted data
            player_info = pd.DataFrame([value_texts], columns=header_names)

            end_time = time.time()
            print(f"Extracted player info in {end_time - start_time:.2f} seconds")

            return player_info
            
        except Exception as e:
            print(f"Error in extracting {self.player_name}'s personal info:", e)
            return None

    def get_player_stats(self, stats_type="all"):
        try:
            # Ensure that player ID or player URL is available
            if not (self.player_id or self.player_url):
                print("Player ID is not available. Run get_player_url() first.")
                return
            
            # Fetch personal information if 'personal_info' is passed
            if stats_type == "personal_info":
                self.player_info = self.extract_player_info()
            
            # Fetch batting stats if 'all' or 'batting' is passed
            if stats_type == "all" or stats_type == "batting":
                self.battingstats = self.extract_inns_data('batting')

            # Fetch bowling stats if 'all' or 'bowling' is passed
            if stats_type == "all" or stats_type == "bowling":
                self.bowlingstats = self.extract_inns_data('bowling')

            # Check if the player is an all-rounder and fetch all-round stats
            if stats_type == "all" or stats_type == "allround":
                self.player_info = self.extract_player_info()
                if self.player_info is not None and 'allround' in self.player_info['PLAYING ROLE'][0].lower():
                    self.allroundstats = self.extract_inns_data('allround')

            # Fetch fielding stats if 'all' or 'fielding' is passed
            if stats_type == "all" or stats_type == "fielding":
                self.fieldingstats = self.extract_inns_data('fielding')

        except Exception as e:
            print(f"Error in extracting stats for {self.player_name}: ", e)

    def __del__(self):
        try:
            self.driver.quit()
            print("WebDriver closed successfully.")
        except Exception as e:
            print("Error while closing the WebDriver:", e)



##### Extracting Ground Information

The following functions are built to extract ground information. However, these are very resource-intensive, so we will take it up later. 

In [None]:
def extract_ground_links(player_id):
    """
    This function extracts ground links from the player innings data, ensuring no duplicate ground info is scraped.
    """
    # Step 1: Initialize the DataFrame to store ground info
    ground_info_df = pd.DataFrame(columns=["Ground ID", "Stadium Name", "Location", "Home Team", "Image URL"])

    # Set up the WebDriver for scraping
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
    # Scrape Player Stats Page
    search_url = f"https://stats.espncricinfo.com/ci/engine/player/{player_id}.html?class=11;template=results;type=batting;view=innings"
    driver.get(search_url)
    
    # Extract ground links from innings data
    rows = driver.find_elements(By.XPATH, "(//tbody)[4]//tr")
    ground_links = []
    
    for row in rows:
        try:
            ground_name_element = row.find_element(By.XPATH, ".//td[contains(@class, 'left')][2]/a")
            ground_name = ground_name_element.text
            ground_link = ground_name_element.get_attribute('href')
            ground_links.append((ground_name, ground_link))
        except Exception as e:
            print(f"Error extracting ground data: {e}")
            continue

    # Step 2: Check if the ground has already been scraped (exists in ground_info DataFrame)
    for ground_name, ground_link in ground_links:
        if ground_name not in ground_info_df['Stadium Name'].values:
            # Create a new DataFrame for the new ground
            new_data = pd.DataFrame({"Stadium Name": [ground_name], "Ground Link": [ground_link]})
            ground_info_df = pd.concat([ground_info_df, new_data], ignore_index=True)
        else:
            print(f"Ground {ground_name} has already been scraped. Skipping.")

    # Step 3: Extract ground info for each link and append it to the ground_info_df
    for ground_link in ground_info_df['Ground Link']:
        ground_info_df = extract_ground_info(ground_link, ground_info_df)
    
    driver.quit()
    return ground_info_df

def extract_ground_info(ground_url, ground_info_df):
    """
    This function extracts ground information (ID, stadium name, location, home team, image URL)
    from a given ground URL and appends the data to the provided dataframe.
    """
    start_time = time.time()
    
    # Set up the WebDriver for scraping ground info
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
    driver.get(ground_url)
    
    try:
        # 1. Ground ID (numeric portion of the URL)
        ground_id = ground_url.split('/')[-1].split('.')[0]
        
        # 2. Ground image URL
        img_element = driver.find_element(By.XPATH, "//div[@class='ds-p-0']//img[1]")
        image_url = img_element.get_attribute("src")
        
        # 3. Stadium Name
        stadium_name = driver.find_element(By.XPATH, "//span[contains(@class, 'ds-text-title-m') and contains(@class, 'ds-font-bold')]").text
        
        # 4. Location (City)
        location = driver.find_element(By.XPATH, "//span[contains(@class, 'ds-text-compact-s') and contains(@class, 'ds-font-bold')]").text.strip().replace("\n", ", ")
        
        # 5. Home Team (Country)
        home_team_text = driver.find_element(By.XPATH, "//span[contains(text(), 'Grounds in')]").text
        home_team = home_team_text.split("Grounds in")[-1].strip()
        
        # Prepare the ground info as a dictionary
        ground_info = pd.DataFrame({
            "Ground ID": [ground_id],
            "Stadium Name": [stadium_name],
            "Location": [location],
            "Home Team": [home_team],
            "Image URL": [image_url]
        })
        
        # Append the ground info to the DataFrame
        ground_info_df = pd.concat([ground_info_df,ground_info], ignore_index=True)
        print(f"Extracted info for ground {stadium_name} in {time.time() - start_time:.2f} seconds.")
    
    except Exception as e:
        print(f"Error while extracting info for {ground_url}: {e}")
    
    driver.quit()
    return ground_info_df

In [52]:
ground_url = 'https://www.espncricinfo.com/cricket-grounds/rangiri-dambulla-international-stadium-59368'

# Set up the WebDriver for scraping ground info
options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
driver.get(ground_url)
    
try:
    # 1. Ground ID (numeric portion of the URL)
    ground_id = ground_url.split('/')[-1].split('.')[0]
        
    # 2. Ground image URL
    img_element = driver.find_element(By.XPATH, "//div[@class='ds-p-0']//img")
    image_url = img_element.get_attribute("src")
        
        # 3. Stadium Name
    stadium_name = driver.find_element(By.XPATH, "//span[contains(@class, 'ds-text-title-m') and contains(@class, 'ds-font-bold')]").text
        
    # 4. Location (City)
    location = driver.find_element(By.XPATH, "//span[contains(@class, 'ds-text-compact-s') and contains(@class, 'ds-font-bold')]").text.strip().replace("\n", ", ")
        
    # 5. Home Team (Country)
    home_team_text = driver.find_element(By.XPATH, "//span[contains(text(), 'Grounds in')]").text
    home_team = home_team_text.split("Grounds in")[-1].strip()
        
    # Prepare the ground info as a dictionary
    ground_info = pd.DataFrame({
            "Ground ID": [ground_id],
            "Stadium Name": [stadium_name],
            "Location": [location],
            "Home Team": [home_team],
            "Image URL": [image_url]
        })

except Exception as e : print(e)
    
ground_info

Message: no such element: Unable to locate element: {"method":"xpath","selector":"//div[@class='ds-p-0']//img"}
  (Session info: chrome=135.0.7049.85); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00B88073+60707]
	GetHandleVerifier [0x00B880B4+60772]
	(No symbol) [0x009B0683]
	(No symbol) [0x009F8660]
	(No symbol) [0x009F89FB]
	(No symbol) [0x00A41022]
	(No symbol) [0x00A1D094]
	(No symbol) [0x00A3E824]
	(No symbol) [0x00A1CE46]
	(No symbol) [0x009EC5D3]
	(No symbol) [0x009ED424]
	GetHandleVerifier [0x00DCBB53+2435075]
	GetHandleVerifier [0x00DC70F3+2416035]
	GetHandleVerifier [0x00DE349C+2531660]
	GetHandleVerifier [0x00B9F145+155125]
	GetHandleVerifier [0x00BA5AED+182173]
	GetHandleVerifier [0x00B8F948+91640]
	GetHandleVerifier [0x00B8FAF0+92064]
	GetHandleVerifier [0x00B7A5B0+4704]
	BaseThreadInitThunk [0x75D35D49+25]
	RtlInitializeExceptionChain [0x7

NameError: name 'ground_info' is not defined

In [None]:
player_id = 253802
grounds = extract_ground_links(player_id)

grounds

## Transformation

In [25]:
import time
import pandas as pd
import numpy as np

class TransformData:
    
    def __init__(self, player_name):
        self.player_name = player_name
        self.player_info = None
        self.battingstats = None
        self.bowlingstats = None
        self.allroundstats = None
        self.fieldingstats = None
        self.player_id = None
        self.player_url = None

    #transforming data

    def transform_data(self,df):
        
        #STEP 1: Replacing incorrect values. 
        repl_dict = {
            r'\*': '',       
            r'^DNB$': np.nan,  
            r'^TDNB$': np.nan, 
            r'^DNF$': np.nan,  
            r'^TDNF$': np.nan, 
            r'^-$': np.nan,    
            r'^sub$': np.nan   
                    }

        df = df.replace(repl_dict,regex=True)

        #STEP 2: Opposition column
        df['Format'] = df['Opposition'].str.extract(r'(^.*?)\sv\s')
        df['Opposition'] = df['Opposition'].str.extract(r'\sv\s(.*?$)')

        #STEP 3: Ground column
        ground_mapping = {
        "Colombo (SSC)": "Colombo",
        "Colombo (PSS)": "Colombo",
        "Colombo (RPS)": "Colombo",
        "Eden Gardens": "Kolkata",
        "Wankhede": "Mumbai",
        "Brabourne": "Mumbai",
        "Kingston": "Kingston Jamaica",
        "The Oval": "London",
        "Lord's": "London",
        "W.A.C.A": "Perth",
        "Dharamsala": "Dharamshala",
        "Hamilton": "Hamilton Waikato",
        "Fatullah": "Fatullah Dhaka",
        "Providence": "Providence Guyana",
        "Dubai (DICS)": "Dubai",
        "Chattogram": "Chattogram Chittagong"
        }

        df['Ground']=df['Ground'].replace(ground_mapping)
        df = df.rename(columns={'Ground':'Location'})

        #STEP 4: START DATE
        df['Start Date'] = df['Start Date'].astype('datetime64[ns]')

        #STEP 5: MATCH ID
        df['Match id']='#'+df['Match id'].str.extract(r'(\d+$)')
        df = df.rename(columns={'Match id':'Match ID'})

        return df
    
    def final_df(self, df, common_cols, custom_cols):

        dtype_mapping = {

                # Common Columns
                'Match ID': 'string',
                'Start Date': 'datetime64[ns]',
                'Format': 'string',
                'Inns': 'Int64',  # Allows NaN handling
                'Opposition': 'string',
                'Location': 'string',
                
                # Batting Columns
                'Pos': 'Int64',
                'Runs': 'Int64',
                'BF': 'Int64',
                '4s': 'Int64',
                '6s': 'Int64',
                'SR': 'float64',
                'Mins': 'Int64',
                'Dismissal': 'string',

                # Bowling Columns
                'Overs': 'float64',
                'Mdns': 'Int64',
                'Runs': 'Int64',
                'Wkts': 'Int64',
                'Econ': 'float64',

                # Fielding Columns
                'Dis': 'Int64',
                'Ct': 'Int64',

                # Allround Columns
                'Score': 'string',  # Could be runs or DNB, TDNB
                'Conc': 'Int64',
                'St': 'Int64'
            }

        if df is not None:
            df = self.transform_data(df)

            # Select the necessary columns
            df = df[common_cols[:-2] + custom_cols + common_cols[-2:]]

            # Apply type casting
            for col in df.columns:
                if col in dtype_mapping:
                    try:
                        df[col] = df[col].astype(dtype_mapping[col])
                    except Exception as e:
                        print(f"Data type casting failed for column {col}: {e}")

            return df


    def process_data(self,type="all"):

        common = ['Match ID','Start Date','Format','Inns','Opposition','Location']
        batcols = ['Pos','Runs','BF','4s','6s','SR','Mins','Dismissal']
        bowlcols = ['Pos','Overs','Mdns','Runs','Wkts','Econ']
        fieldcols = ['Dis','Ct']
        allroundcols = ['Score','Overs','Conc','Wkts','Ct','St']

        try:
        
            # Process batting stats
            if type == 'all' or type == 'batting':
                print(f"Processing {self.player_name}'s batting stats...")
                self.battingstats = self.final_df(self.battingstats, common, batcols)
                print(f"Batting stats processed successfully.")

            # Process bowling stats
            if type == 'all' or type == 'bowling':
                print(f"Processing {self.player_name}'s bowling stats...")
                self.bowlingstats = self.final_df(self.bowlingstats, common, bowlcols)
                print(f"Bowling stats processed successfully.")

            # Process fielding stats
            if type == 'all' or type == 'fielding':
                print(f"Processing {self.player_name}'s fielding stats...")
                self.fieldingstats = self.final_df(self.fieldingstats, common, fieldcols)
                print(f"Fielding stats processed successfully.")

            # Process allround stats
            if type == 'all' or type == 'allround':
                if self.player_info is not None and 'allround' in self.player_info['PLAYING ROLE'][0].lower():
                    print(f"Processing {self.player_name}'s all-round stats...")
                    self.allroundstats = self.final_df(self.allroundstats, common, allroundcols)
                    print(f"All-round stats processed successfully.")

           
        except Exception as e:
            print(f"Error in processing data for {self.player_name}: ", e)

## Loading

In [None]:
import pandas as pd
import boto3
import botocore.exceptions
from io import StringIO
from dotenv import load_dotenv
import os

load_dotenv()  # Load AWS credentials from .env

s3 = boto3.client("s3")

class LoadData:

    def __init__(self, player_name, data_type="raw"):
        
        """
        Initializes the CricketerStatsLoader with player name and data type.
        Args:
            player_name (str): Name of the player.
            data_type (str): Type of data ('raw' or 'tf').
        """

        self.player_name = player_name.lower().replace(" ", "_")
        self.data_type = data_type  # 'raw' or 'tf'
        self.battingstats = None
        self.bowlingstats = None
        self.fieldingstats = None
        self.allroundstats = None
        self.player_info = None

    def ensure_bucket_exists(self, bucket_name):

        """Checks if the S3 bucket exists, and creates it if not."""
        
        try:
            s3.head_bucket(Bucket=bucket_name)
            print(f"✅ Bucket '{bucket_name}' already exists.")
        
        except botocore.exceptions.ClientError as e:
            error_code = int(e.response["Error"]["Code"])
            
            if error_code == 404:
                print(f"⚠️ Bucket '{bucket_name}' does not exist. Creating...")
                s3.create_bucket(
                    Bucket=bucket_name,
                    CreateBucketConfiguration={
                        'LocationConstraint': os.getenv("AWS_DEFAULT_REGION")
                                            }
                )
                
                print(f"✅ Bucket '{bucket_name}' created successfully.")
            
            else: raise e
    
    def upload_df(self, bucket_name, object_key, df):

        """Uploads a Pandas DataFrame as a CSV file to S3."""

        if df is None or df.empty:
            print(f"Warning: {object_key} is empty, skipping upload.")
            return

        csv_buffer = StringIO()
        df.to_csv(csv_buffer, index=False)

        s3.put_object(
            Bucket=bucket_name,
            Key=object_key,
            Body=csv_buffer.getvalue(),
            ContentType="text/csv"
        )

        print(f" Uploaded to s3://{bucket_name}/{object_key}")

    def download_df(self, bucket_name, stat_type):

        """Downloads a cricket stat CSV from S3 into a DataFrame."""

        file_name_map = {
            "batting": "batting_stats.csv",
            "bowling": "bowling_stats.csv",
            "fielding": "fielding_stats.csv",
            "allround": "allround_stats.csv",
            "personal_info": "personal_info.csv"
        }

        '''If 'all' is passed, download all stat types
        if stat_type == "all":
            data = {}
            for stat in file_name_map:
                data[stat] = self.download_df(bucket_name, stat)  # Recursively call download_df for each type
            return data
        '''

        if stat_type not in file_name_map:
            print(f" Invalid stat_type '{stat_type}'.")
            return None

        object_key = f"{self.player_name}/{self.data_type}/{file_name_map[stat_type]}"

        try:

            response = s3.get_object(Bucket=bucket_name, Key=object_key)
            content = response["Body"].read().decode("utf-8")
            df = pd.read_csv(StringIO(content))
            print(f" Downloaded from s3://{bucket_name}/{object_key}")
            return df
        
        except Exception as e:
            print(f"Error downloading {stat_type} stats: {e}")
            return None

    def load_data(self, bucket_name, load_type, stat_type="all"):
    
        """
        Loads cricket stats data to/from S3.

        Args:
            bucket_name (str): Name of the S3 bucket.
            load_type (str): Type of load operation ('upload' or 'download').
            stat_type (str): Type of stats to load ('all', 'batting', 'bowling', 'fielding', 'allround', 'personal_info').
        """            
        
        if load_type not in ["upload", "download"]:
            print(f"Invalid load type '{load_type}'. Must be 'upload' or 'download'.")
            return

        if stat_type not in ["all", "batting", "bowling", "fielding", "allround", "personal_info"]:
            print(f"Invalid stat type '{stat_type}'. Must be 'all', 'batting', 'bowling', 'fielding', 'allround', or 'personal_info'.")
            return

        # Ensure the S3 bucket exists
        self.ensure_bucket_exists(bucket_name)

        # Perform the upload operation
        if load_type == "upload":

            print(f"Uploading {self.player_name}'s {self.data_type} {stat_type} data to S3...")
            
            base_folder = f"{self.player_name}/{self.data_type}/"

            if self.battingstats is not None and stat_type in ["all", "batting"]:
                self.upload_df(bucket_name, base_folder + "batting_stats.csv", self.battingstats)

            if self.bowlingstats is not None and stat_type in ["all", "bowling"]:
                self.upload_df(bucket_name, base_folder + "bowling_stats.csv", self.bowlingstats)

            if self.fieldingstats is not None and stat_type in ["all", "fielding"]:
                self.upload_df(bucket_name, base_folder + "fielding_stats.csv", self.fieldingstats)

            if self.allroundstats is not None and stat_type in ["all", "allround"]:
                self.upload_df(bucket_name, base_folder + "allround_stats.csv", self.allroundstats)

            if self.player_info is not None and stat_type in ["all", "personal_info"]:
                self.upload_df(bucket_name, base_folder + "personal_info.csv", self.player_info)

            print(f"All {self.data_type} data uploaded to s3://{bucket_name}/{base_folder}")

        # Perform the download operation
        elif load_type == "download":

            print(f"Downloading {self.player_name}'s {self.data_type} data from S3...")

            if stat_type in ["all", "personal_info"]:
                self.player_info = self.download_df(bucket_name, "personal_info")

            if stat_type in ["all", "batting"]:
                self.battingstats = self.download_df(bucket_name, "batting")

            if stat_type in ["all", "bowling"]:
                self.bowlingstats = self.download_df(bucket_name, "bowling")

            if stat_type in ["all", "fielding"]:
                self.fieldingstats = self.download_df(bucket_name, "fielding")

            if stat_type in ["all", "allround"]:

                # Download allround stats if PLAYING ROLE is "Allrounder"
                if self.player_info is not None and "Allrounder" in self.player_info["PLAYING ROLE"].values:
                    self.allroundstats = self.download_df(bucket_name, "allround")
                
                else:
                    print(f"Warning: Player {self.player_name} is not an Allrounder. Skipping allround stats download.")
                    self.allroundstats = None
                
            print(f"All {self.data_type} data downloaded from s3://{bucket_name}/{self.player_name}/{self.data_type}/")


## Testing the code

### Virat Kohli

In [None]:
# --- 🔹 Step 1: Scrape raw data ---
player_name = "Virat Kohli"
virat_raw = ScrapeData(player_name)
virat_raw.get_player_stats()

# --- 🔹 Step 2: Upload raw data to S3 ---
bucket_name = "cricketer-stats"
virat_raw_loader = LoadData(player_name, data_type="raw")

virat_raw_loader.battingstats = virat_raw.battingstats
virat_raw_loader.bowlingstats = virat_raw.bowlingstats
virat_raw_loader.fieldingstats = virat_raw.fieldingstats
virat_raw_loader.allroundstats = virat_raw.allroundstats
virat_raw_loader.player_info = virat_raw.player_info

virat_raw_loader.load_data(bucket_name, load_type="upload")


⚠️ Bucket 'cricketer-stats' does not exist. Creating...
✅ Bucket 'cricketer-stats' created successfully.
Uploading virat_kohli's raw all data to S3...
 Uploaded to s3://cricketer-stats/virat_kohli/raw/batting_stats.csv
 Uploaded to s3://cricketer-stats/virat_kohli/raw/bowling_stats.csv
 Uploaded to s3://cricketer-stats/virat_kohli/raw/fielding_stats.csv
 Uploaded to s3://cricketer-stats/virat_kohli/raw/personal_info.csv
All raw data uploaded to s3://cricketer-stats/virat_kohli/raw/


In [61]:
# --- 🔹 Step 3: Download raw data from S3 ---
virat_tf = TransformData(player_name)
virat_tf_loader = LoadData(player_name, data_type="raw")

virat_tf_loader.load_data(bucket_name, load_type="download", stat_type="all")

virat_tf.battingstats = virat_tf_loader.battingstats
virat_tf.bowlingstats = virat_tf_loader.bowlingstats
virat_tf.fieldingstats = virat_tf_loader.fieldingstats
virat_tf.allroundstats = virat_tf_loader.allroundstats
virat_tf.player_info = virat_tf_loader.player_info

# --- 🔹 Step 4: Transform the data ---
virat_tf.process_data()

✅ Bucket 'cricketer-stats' already exists.
Downloading virat_kohli's raw data from S3...
 Downloaded from s3://cricketer-stats/virat_kohli/raw/personal_info.csv
 Downloaded from s3://cricketer-stats/virat_kohli/raw/batting_stats.csv
 Downloaded from s3://cricketer-stats/virat_kohli/raw/bowling_stats.csv
 Downloaded from s3://cricketer-stats/virat_kohli/raw/fielding_stats.csv
All raw data downloaded from s3://cricketer-stats/virat_kohli/raw/
Processing Virat Kohli's batting stats...
Batting stats processed successfully.
Processing Virat Kohli's bowling stats...
Bowling stats processed successfully.
Processing Virat Kohli's fielding stats...
Fielding stats processed successfully.


In [62]:
# --- 🔹 Step 5: Upload transformed data to S3 ---
virat_tf_loader = LoadData(player_name, data_type="tf")

virat_tf_loader.battingstats = virat_tf.battingstats
virat_tf_loader.bowlingstats = virat_tf.bowlingstats
virat_tf_loader.fieldingstats = virat_tf.fieldingstats
virat_tf_loader.allroundstats = virat_tf.allroundstats
virat_tf_loader.player_info = virat_tf.player_info

virat_tf_loader.load_data(bucket_name, load_type="upload", stat_type="all")

✅ Bucket 'cricketer-stats' already exists.
Uploading virat_kohli's tf all data to S3...
 Uploaded to s3://cricketer-stats/virat_kohli/tf/batting_stats.csv
 Uploaded to s3://cricketer-stats/virat_kohli/tf/bowling_stats.csv
 Uploaded to s3://cricketer-stats/virat_kohli/tf/fielding_stats.csv
 Uploaded to s3://cricketer-stats/virat_kohli/tf/personal_info.csv
All tf data uploaded to s3://cricketer-stats/virat_kohli/tf/


### Jacques Kallis

In [70]:
# --- 🔹 Step 1: Scrape raw data ---
player_name = "Jacques Kallis"
kallis_raw = ScrapeData(player_name)
kallis_raw.get_player_stats()

# --- 🔹 Step 2: Upload raw data to S3 ---
bucket_name = "cricketer-stats"
kallis_raw_loader = LoadData(player_name, data_type="raw")

kallis_raw_loader.battingstats = kallis_raw.battingstats
kallis_raw_loader.bowlingstats = kallis_raw.bowlingstats
kallis_raw_loader.fieldingstats = kallis_raw.fieldingstats
kallis_raw_loader.allroundstats = kallis_raw.allroundstats
kallis_raw_loader.player_info = kallis_raw.player_info

kallis_raw_loader.load_data(bucket_name, load_type="upload")

Setting up WebDriver...
Extracting Jacques Kallis's player URL and Player ID....
Extraction Successful for Jacques Kallis.
Time taken to extract URL: 4.13 seconds
Starting extraction of Jacques Kallis's batting stats....
Extracted 646 records in 138.87 seconds
Starting extraction of Jacques Kallis's bowling stats....
Extracted 668 records in 118.81 seconds
Starting extraction of Jacques Kallis's personal info....
Extracted player info in 4.10 seconds
Starting extraction of Jacques Kallis's allround stats....
Extracted 1314 records in 247.82 seconds
Starting extraction of Jacques Kallis's fielding stats....
Extracted 668 records in 110.29 seconds
✅ Bucket 'cricketer-stats' already exists.
Uploading jacques_kallis's raw all data to S3...
 Uploaded to s3://cricketer-stats/jacques_kallis/raw/batting_stats.csv
 Uploaded to s3://cricketer-stats/jacques_kallis/raw/bowling_stats.csv
 Uploaded to s3://cricketer-stats/jacques_kallis/raw/fielding_stats.csv
 Uploaded to s3://cricketer-stats/jacque

In [71]:
# --- 🔹 Step 3: Download raw data from S3 ---
kallis_tf = TransformData(player_name)
kallis_tf_loader = LoadData(player_name, data_type="raw")

kallis_tf_loader.load_data(bucket_name, load_type="download", stat_type="all")

kallis_tf.battingstats = kallis_tf_loader.battingstats
kallis_tf.bowlingstats = kallis_tf_loader.bowlingstats
kallis_tf.fieldingstats = kallis_tf_loader.fieldingstats
kallis_tf.allroundstats = kallis_tf_loader.allroundstats
kallis_tf.player_info = kallis_tf_loader.player_info

# --- 🔹 Step 4: Transform the data ---
kallis_tf.process_data()

✅ Bucket 'cricketer-stats' already exists.
Downloading jacques_kallis's raw data from S3...
 Downloaded from s3://cricketer-stats/jacques_kallis/raw/personal_info.csv
 Downloaded from s3://cricketer-stats/jacques_kallis/raw/batting_stats.csv
 Downloaded from s3://cricketer-stats/jacques_kallis/raw/bowling_stats.csv
 Downloaded from s3://cricketer-stats/jacques_kallis/raw/fielding_stats.csv
 Downloaded from s3://cricketer-stats/jacques_kallis/raw/allround_stats.csv
All raw data downloaded from s3://cricketer-stats/jacques_kallis/raw/
Processing Jacques Kallis's batting stats...
Batting stats processed successfully.
Processing Jacques Kallis's bowling stats...
Bowling stats processed successfully.
Processing Jacques Kallis's fielding stats...
Fielding stats processed successfully.
Processing Jacques Kallis's all-round stats...
All-round stats processed successfully.


In [72]:
# --- 🔹 Step 5: Upload transformed data to S3 ---
kallis_tf_loader = LoadData(player_name, data_type="tf")

kallis_tf_loader.battingstats = kallis_tf.battingstats
kallis_tf_loader.bowlingstats = kallis_tf.bowlingstats
kallis_tf_loader.fieldingstats = kallis_tf.fieldingstats
kallis_tf_loader.allroundstats = kallis_tf.allroundstats
kallis_tf_loader.player_info = kallis_tf.player_info

kallis_tf_loader.load_data(bucket_name, load_type="upload", stat_type="all")

✅ Bucket 'cricketer-stats' already exists.
Uploading jacques_kallis's tf all data to S3...
 Uploaded to s3://cricketer-stats/jacques_kallis/tf/batting_stats.csv
 Uploaded to s3://cricketer-stats/jacques_kallis/tf/bowling_stats.csv
 Uploaded to s3://cricketer-stats/jacques_kallis/tf/fielding_stats.csv
 Uploaded to s3://cricketer-stats/jacques_kallis/tf/allround_stats.csv
 Uploaded to s3://cricketer-stats/jacques_kallis/tf/personal_info.csv
All tf data uploaded to s3://cricketer-stats/jacques_kallis/tf/


### James Anderson

In [75]:
# --- 🔹 Step 1: Scrape raw data ---
player_name = "JM Anderson"
anderson_raw = ScrapeData(player_name)
anderson_raw.get_player_stats()

# --- 🔹 Step 2: Upload raw data to S3 ---
bucket_name = "cricketer-stats"
anderson_raw_loader = LoadData(player_name, data_type="raw")

anderson_raw_loader.battingstats = anderson_raw.battingstats
anderson_raw_loader.bowlingstats = anderson_raw.bowlingstats
anderson_raw_loader.fieldingstats = anderson_raw.fieldingstats
anderson_raw_loader.allroundstats = anderson_raw.allroundstats
anderson_raw_loader.player_info = anderson_raw.player_info

anderson_raw_loader.load_data(bucket_name, load_type="upload")

Setting up WebDriver...
Extracting JM Anderson's player URL and Player ID....
Extraction Successful for JM Anderson.
Time taken to extract URL: 3.55 seconds
WebDriver closed successfully.
Starting extraction of JM Anderson's batting stats....
Extracted 556 records in 128.56 seconds
Starting extraction of JM Anderson's bowling stats....
Extracted 569 records in 113.30 seconds
Starting extraction of JM Anderson's personal info....
Extracted player info in 4.20 seconds
Starting extraction of JM Anderson's fielding stats....
Extracted 569 records in 104.76 seconds
✅ Bucket 'cricketer-stats' already exists.
Uploading jm_anderson's raw all data to S3...
 Uploaded to s3://cricketer-stats/jm_anderson/raw/batting_stats.csv
 Uploaded to s3://cricketer-stats/jm_anderson/raw/bowling_stats.csv
 Uploaded to s3://cricketer-stats/jm_anderson/raw/fielding_stats.csv
 Uploaded to s3://cricketer-stats/jm_anderson/raw/personal_info.csv
All raw data uploaded to s3://cricketer-stats/jm_anderson/raw/


In [76]:
# --- 🔹 Step 3: Download raw data from S3 ---
anderson_tf = TransformData(player_name)
anderson_tf_loader = LoadData(player_name, data_type="raw")

anderson_tf_loader.load_data(bucket_name, load_type="download", stat_type="all")

anderson_tf.battingstats = anderson_tf_loader.battingstats
anderson_tf.bowlingstats = anderson_tf_loader.bowlingstats
anderson_tf.fieldingstats = anderson_tf_loader.fieldingstats
anderson_tf.allroundstats = anderson_tf_loader.allroundstats
anderson_tf.player_info = anderson_tf_loader.player_info

# --- 🔹 Step 4: Transform the data ---
anderson_tf.process_data()

✅ Bucket 'cricketer-stats' already exists.
Downloading jm_anderson's raw data from S3...
 Downloaded from s3://cricketer-stats/jm_anderson/raw/personal_info.csv
 Downloaded from s3://cricketer-stats/jm_anderson/raw/batting_stats.csv
 Downloaded from s3://cricketer-stats/jm_anderson/raw/bowling_stats.csv
 Downloaded from s3://cricketer-stats/jm_anderson/raw/fielding_stats.csv
All raw data downloaded from s3://cricketer-stats/jm_anderson/raw/
Processing JM Anderson's batting stats...
Batting stats processed successfully.
Processing JM Anderson's bowling stats...
Bowling stats processed successfully.
Processing JM Anderson's fielding stats...
Fielding stats processed successfully.


In [77]:
# --- 🔹 Step 5: Upload transformed data to S3 ---
anderson_tf_loader = LoadData(player_name, data_type="tf")

anderson_tf_loader.battingstats = anderson_tf.battingstats
anderson_tf_loader.bowlingstats = anderson_tf.bowlingstats
anderson_tf_loader.fieldingstats = anderson_tf.fieldingstats
anderson_tf_loader.allroundstats = anderson_tf.allroundstats
anderson_tf_loader.player_info = anderson_tf.player_info

anderson_tf_loader.load_data(bucket_name, load_type="upload", stat_type="all")

✅ Bucket 'cricketer-stats' already exists.
Uploading jm_anderson's tf all data to S3...
 Uploaded to s3://cricketer-stats/jm_anderson/tf/batting_stats.csv
 Uploaded to s3://cricketer-stats/jm_anderson/tf/bowling_stats.csv
 Uploaded to s3://cricketer-stats/jm_anderson/tf/fielding_stats.csv
 Uploaded to s3://cricketer-stats/jm_anderson/tf/personal_info.csv
All tf data uploaded to s3://cricketer-stats/jm_anderson/tf/


## Debugging Scraper

In [None]:
def get_player_url(player_name,driver):
    start_time = time.time()
    print(f"Extracting {player_name}'s player URL and Player ID....")
    search_url = f"https://search.espncricinfo.com/ci/content/site/search.html?search={player_name.lower().replace(' ', '%20')};type=player"
    driver.get(search_url)

    try:
        player_link_element = driver.find_element(By.CSS_SELECTOR, "h3.name.link-cta a")
        player_url = player_link_element.get_attribute("href")
        player_id = player_url.split('-')[-1]
        print(f"Extraction Successful for {player_name}.")
        end_time = time.time()
        print(f"Time taken to extract URL: {end_time - start_time:.2f} seconds")

        return player_url, player_id
        
    except Exception as e:
        print(f"Error in extracting {player_name}'s url:", e)
        return None, None

In [26]:
def extract_player_info(driver,player_name, player_id, player_url):
        try:
            start_time = time.time()
            print(f"Starting extraction of {player_name}'s personal info....")
            
            # Start by opening the player info URL
            driver.get(player_url)

            # Step 1: Extract headers within the specified div tag
            headers = driver.find_elements(By.XPATH, "//div//p[@class='ds-text-tight-m ds-font-regular ds-uppercase ds-text-typo-mid3']")
            header_names = ['Player ID','Player URL']+[header.text for header in headers]

            # Step 2: Extract values within the specified div tag
            values = driver.find_elements(By.XPATH, "//div//span[@class='ds-text-title-s ds-font-bold ds-text-typo']")
            value_texts = [player_id,player_url]+[value.text for value in values]

            # Step 3: Create a DataFrame from the extracted data
            player_info = pd.DataFrame([value_texts], columns=header_names)

            end_time = time.time()
            print(f"Extracted player info in {end_time - start_time:.2f} seconds")

            return player_info
            
        except Exception as e:
            print(f"Error in extracting {player_name}'s personal info:", e)
            return None

In [2]:
player_name = "Virat Kohli"
player_id = 253802
player_url = "https://www.espncricinfo.com/cricketers/virat-kohli-253802"

In [None]:
driver.get(player_url)

headers = driver.find_elements(By.XPATH, "//div//p[@class='ds-text-tight-m ds-font-regular ds-uppercase ds-text-typo-mid3']")
[header.text for header in headers]

values = driver.find_elements(By.XPATH, "//div//span[@class='ds-text-title-s ds-font-bold ds-text-typo']")
[value.text for value in values]

['Virat Kohli',
 'November 05, 1988, Delhi',
 '36y 162d',
 'Right hand Bat',
 'Right arm Medium',
 'Top order Batter']