## Scraping test

In [4]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import undetected_chromedriver as uc

class ScrapeData:

    def __init__(self, player_name):
        self.player_name = player_name
        self.player_id = None
        self.player_url = None
    
        # Initialize class variables for storing stats
        self.battingstats = None
        self.bowlingstats = None
        self.allroundstats = None
        self.fieldingstats = None
        self.player_info = None

        # Set up the WebDriver and open the search URL
        options = uc.ChromeOptions()
        options.add_argument("--window-size=1920,1080")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")

        print("Setting up WebDriver...")
        self.driver = uc.Chrome(options=options)        

        # Call get_player_url() to fetch the player's URL and ID when the object is initialized
        self.get_player_url()

    def get_player_url(self):
        start_time = time.time()
        print(f"Extracting {self.player_name}'s player URL and Player ID....")
        search_url = f"https://search.espncricinfo.com/ci/content/site/search.html?search={self.player_name.lower().replace(' ', '%20')};type=player"
        self.driver.get(search_url)

        try:
            player_link_element = self.driver.find_element(By.CSS_SELECTOR, "h3.name.link-cta a")
            self.player_url = player_link_element.get_attribute("href")
            self.player_id = self.player_url.split('-')[-1]
            print(f"Extraction Successful for {self.player_name}.")
            end_time = time.time()
            print(f"Time taken to extract URL: {end_time - start_time:.2f} seconds")
        except Exception as e:
            print(f"Error in extracting {self.player_name}'s url:", e)
            return None, None

    def extract_inns_data(self, record_type):
        start_time = time.time()
        print(f"Starting extraction of {self.player_name}'s {record_type} stats....")
        
        # Construct the search URL based on record_type (batting, bowling, etc.)
        search_url = f"https://stats.espncricinfo.com/ci/engine/player/{self.player_id}.html?class=11;template=results;type={record_type};view=innings"
        
        # Open the URL
        self.driver.get(search_url)

        # Step 1: Extract the headers of the table
        headers = self.driver.find_elements(By.CSS_SELECTOR, "thead tr.headlinks th")
        header_names = [header.text for header in headers if header.text != ''] + ['Match id']  # Add match_id column name
        
        # Step 2: Extract the data from the 4th tbody
        rows = self.driver.find_elements(By.XPATH, "(//tbody)[4]//tr")
        
        # Step 3: Extract the data column-wise and store it in a list
        player_data = []
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            row_data = [cell.text for cell in cells if cell.text != '']
            player_data.append(row_data)
        
        # Step 4: Create a DataFrame from the extracted data
        innings_data = pd.DataFrame(player_data, columns=header_names)
        
        end_time = time.time()
        print(f"Extracted {innings_data.shape[0]} records in {end_time - start_time:.2f} seconds")
        
        return innings_data

    def extract_player_info(self):
        try:
            start_time = time.time()
            print(f"Starting extraction of {self.player_name}'s personal info....")
            
            # Start by opening the player info URL
            self.driver.get(self.player_url)

            # Step 1: Extract headers within the specified div tag
            headers = self.driver.find_elements(By.XPATH, "//div[@class='ds-grid lg:ds-grid-cols-3 ds-grid-cols-2 ds-gap-4 ds-mb-8']//p[@class='ds-text-tight-m ds-font-regular ds-uppercase ds-text-typo-mid3']")
            header_names = ['Player ID','Player URL']+[header.text for header in headers]

            # Step 2: Extract values within the specified div tag
            values = self.driver.find_elements(By.XPATH, "//div[@class='ds-grid lg:ds-grid-cols-3 ds-grid-cols-2 ds-gap-4 ds-mb-8']//span[@class='ds-text-title-s ds-font-bold ds-text-typo']")
            value_texts = [self.player_id,self.player_url]+[value.text for value in values]

            # Step 3: Create a DataFrame from the extracted data
            player_info = pd.DataFrame([value_texts], columns=header_names)

            end_time = time.time()
            print(f"Extracted player info in {end_time - start_time:.2f} seconds")

            return player_info
            
        except Exception as e:
            print(f"Error in extracting {self.player_name}'s personal info:", e)
            return None

    def get_player_stats(self, stats_type="all"):
        try:
            # Ensure that player ID or player URL is available
            if not (self.player_id or self.player_url):
                print("Player ID is not available. Run get_player_url() first.")
                return
            
            # Fetch personal information if 'personal_info' is passed
            if stats_type == "personal_info":
                self.player_info = self.extract_player_info()
            
            # Fetch batting stats if 'all' or 'batting' is passed
            if stats_type == "all" or stats_type == "batting":
                self.battingstats = self.extract_inns_data('batting')

            # Fetch bowling stats if 'all' or 'bowling' is passed
            if stats_type == "all" or stats_type == "bowling":
                self.bowlingstats = self.extract_inns_data('bowling')

            # Check if the player is an all-rounder and fetch all-round stats
            if stats_type == "all" or stats_type == "allround":
                self.player_info = self.extract_player_info()
                if self.player_info is not None and 'allround' in self.player_info['PLAYING ROLE'][0].lower():
                    self.allroundstats = self.extract_inns_data('allround')

            # Fetch fielding stats if 'all' or 'fielding' is passed
            if stats_type == "all" or stats_type == "fielding":
                self.fieldingstats = self.extract_inns_data('fielding')

        except Exception as e:
            print(f"Error in extracting stats for {self.player_name}: ", e)

    def __del__(self):
        try:
            self.driver.quit()
            print("WebDriver closed successfully.")
        except Exception as e:
            print("Error while closing the WebDriver:", e)

##### Extracting Ground Information

The following functions are built to extract ground information. However, these are very resource-intensive, so we will take it up later. 

In [None]:
def extract_ground_links(player_id):
    """
    This function extracts ground links from the player innings data, ensuring no duplicate ground info is scraped.
    """
    # Step 1: Initialize the DataFrame to store ground info
    ground_info_df = pd.DataFrame(columns=["Ground ID", "Stadium Name", "Location", "Home Team", "Image URL"])

    # Set up the WebDriver for scraping
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
    # Scrape Player Stats Page
    search_url = f"https://stats.espncricinfo.com/ci/engine/player/{player_id}.html?class=11;template=results;type=batting;view=innings"
    driver.get(search_url)
    
    # Extract ground links from innings data
    rows = driver.find_elements(By.XPATH, "(//tbody)[4]//tr")
    ground_links = []
    
    for row in rows:
        try:
            ground_name_element = row.find_element(By.XPATH, ".//td[contains(@class, 'left')][2]/a")
            ground_name = ground_name_element.text
            ground_link = ground_name_element.get_attribute('href')
            ground_links.append((ground_name, ground_link))
        except Exception as e:
            print(f"Error extracting ground data: {e}")
            continue

    # Step 2: Check if the ground has already been scraped (exists in ground_info DataFrame)
    for ground_name, ground_link in ground_links:
        if ground_name not in ground_info_df['Stadium Name'].values:
            # Create a new DataFrame for the new ground
            new_data = pd.DataFrame({"Stadium Name": [ground_name], "Ground Link": [ground_link]})
            ground_info_df = pd.concat([ground_info_df, new_data], ignore_index=True)
        else:
            print(f"Ground {ground_name} has already been scraped. Skipping.")

    # Step 3: Extract ground info for each link and append it to the ground_info_df
    for ground_link in ground_info_df['Ground Link']:
        ground_info_df = extract_ground_info(ground_link, ground_info_df)
    
    driver.quit()
    return ground_info_df

def extract_ground_info(ground_url, ground_info_df):
    """
    This function extracts ground information (ID, stadium name, location, home team, image URL)
    from a given ground URL and appends the data to the provided dataframe.
    """
    start_time = time.time()
    
    # Set up the WebDriver for scraping ground info
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
    driver.get(ground_url)
    
    try:
        # 1. Ground ID (numeric portion of the URL)
        ground_id = ground_url.split('/')[-1].split('.')[0]
        
        # 2. Ground image URL
        img_element = driver.find_element(By.XPATH, "//div[@class='ds-p-0']//img[1]")
        image_url = img_element.get_attribute("src")
        
        # 3. Stadium Name
        stadium_name = driver.find_element(By.XPATH, "//span[contains(@class, 'ds-text-title-m') and contains(@class, 'ds-font-bold')]").text
        
        # 4. Location (City)
        location = driver.find_element(By.XPATH, "//span[contains(@class, 'ds-text-compact-s') and contains(@class, 'ds-font-bold')]").text.strip().replace("\n", ", ")
        
        # 5. Home Team (Country)
        home_team_text = driver.find_element(By.XPATH, "//span[contains(text(), 'Grounds in')]").text
        home_team = home_team_text.split("Grounds in")[-1].strip()
        
        # Prepare the ground info as a dictionary
        ground_info = pd.DataFrame({
            "Ground ID": [ground_id],
            "Stadium Name": [stadium_name],
            "Location": [location],
            "Home Team": [home_team],
            "Image URL": [image_url]
        })
        
        # Append the ground info to the DataFrame
        ground_info_df = pd.concat([ground_info_df,ground_info], ignore_index=True)
        print(f"Extracted info for ground {stadium_name} in {time.time() - start_time:.2f} seconds.")
    
    except Exception as e:
        print(f"Error while extracting info for {ground_url}: {e}")
    
    driver.quit()
    return ground_info_df

In [52]:
ground_url = 'https://www.espncricinfo.com/cricket-grounds/rangiri-dambulla-international-stadium-59368'

# Set up the WebDriver for scraping ground info
options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
driver.get(ground_url)
    
try:
    # 1. Ground ID (numeric portion of the URL)
    ground_id = ground_url.split('/')[-1].split('.')[0]
        
    # 2. Ground image URL
    img_element = driver.find_element(By.XPATH, "//div[@class='ds-p-0']//img")
    image_url = img_element.get_attribute("src")
        
        # 3. Stadium Name
    stadium_name = driver.find_element(By.XPATH, "//span[contains(@class, 'ds-text-title-m') and contains(@class, 'ds-font-bold')]").text
        
    # 4. Location (City)
    location = driver.find_element(By.XPATH, "//span[contains(@class, 'ds-text-compact-s') and contains(@class, 'ds-font-bold')]").text.strip().replace("\n", ", ")
        
    # 5. Home Team (Country)
    home_team_text = driver.find_element(By.XPATH, "//span[contains(text(), 'Grounds in')]").text
    home_team = home_team_text.split("Grounds in")[-1].strip()
        
    # Prepare the ground info as a dictionary
    ground_info = pd.DataFrame({
            "Ground ID": [ground_id],
            "Stadium Name": [stadium_name],
            "Location": [location],
            "Home Team": [home_team],
            "Image URL": [image_url]
        })

except Exception as e : print(e)
    
ground_info

Message: no such element: Unable to locate element: {"method":"xpath","selector":"//div[@class='ds-p-0']//img"}
  (Session info: chrome=135.0.7049.85); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00B88073+60707]
	GetHandleVerifier [0x00B880B4+60772]
	(No symbol) [0x009B0683]
	(No symbol) [0x009F8660]
	(No symbol) [0x009F89FB]
	(No symbol) [0x00A41022]
	(No symbol) [0x00A1D094]
	(No symbol) [0x00A3E824]
	(No symbol) [0x00A1CE46]
	(No symbol) [0x009EC5D3]
	(No symbol) [0x009ED424]
	GetHandleVerifier [0x00DCBB53+2435075]
	GetHandleVerifier [0x00DC70F3+2416035]
	GetHandleVerifier [0x00DE349C+2531660]
	GetHandleVerifier [0x00B9F145+155125]
	GetHandleVerifier [0x00BA5AED+182173]
	GetHandleVerifier [0x00B8F948+91640]
	GetHandleVerifier [0x00B8FAF0+92064]
	GetHandleVerifier [0x00B7A5B0+4704]
	BaseThreadInitThunk [0x75D35D49+25]
	RtlInitializeExceptionChain [0x7

NameError: name 'ground_info' is not defined

In [None]:
player_id = 253802
grounds = extract_ground_links(player_id)

grounds

## Transformation

In [5]:
import time
import pandas as pd
import numpy as np

class TransformData:
    
    def __init__(self, player_name):
        self.player_name = player_name
        self.player_info = None
        self.battingstats = None
        self.bowlingstats = None
        self.allroundstats = None
        self.fieldingstats = None
        self.player_id = None
        self.player_url = None

    #transforming data

    def transform_data(self,df):
        
        #STEP 1: Replacing incorrect values. 
        repl_dict = {
            r'\*': '',       
            r'^DNB$': np.nan,  
            r'^TDNB$': np.nan, 
            r'^DNF$': np.nan,  
            r'^TDNF$': np.nan, 
            r'^-$': np.nan,    
            r'^sub$': np.nan   
                    }

        df = df.replace(repl_dict,regex=True)

        #STEP 2: Opposition column
        df['Format'] = df['Opposition'].str.extract(r'(^.*?)\sv\s')
        df['Opposition'] = df['Opposition'].str.extract(r'\sv\s(.*?$)')

        #STEP 3: Ground column
        ground_mapping = {
        "Colombo (SSC)": "Colombo",
        "Colombo (PSS)": "Colombo",
        "Colombo (RPS)": "Colombo",
        "Eden Gardens": "Kolkata",
        "Wankhede": "Mumbai",
        "Brabourne": "Mumbai",
        "Kingston": "Kingston Jamaica",
        "The Oval": "London",
        "Lord's": "London",
        "W.A.C.A": "Perth",
        "Dharamsala": "Dharamshala",
        "Hamilton": "Hamilton Waikato",
        "Fatullah": "Fatullah Dhaka",
        "Providence": "Providence Guyana",
        "Dubai (DICS)": "Dubai",
        "Chattogram": "Chattogram Chittagong"
        }

        df['Ground']=df['Ground'].replace(ground_mapping)
        df = df.rename(columns={'Ground':'Location'})

        #STEP 4: START DATE
        df['Start Date'] = df['Start Date'].astype('datetime64[ns]')

        #STEP 5: MATCH ID
        df['Match id']='#'+df['Match id'].str.extract(r'(\d+$)')
        df = df.rename(columns={'Match id':'Match ID'})

        return df
    
    def final_df(self, df, common_cols, custom_cols):

        dtype_mapping = {

                # Common Columns
                'Match ID': 'string',
                'Start Date': 'datetime64[ns]',
                'Format': 'string',
                'Inns': 'Int64',  # Allows NaN handling
                'Opposition': 'string',
                'Location': 'string',
                
                # Batting Columns
                'Pos': 'Int64',
                'Runs': 'Int64',
                'BF': 'Int64',
                '4s': 'Int64',
                '6s': 'Int64',
                'SR': 'float64',
                'Mins': 'Int64',
                'Dismissal': 'string',

                # Bowling Columns
                'Overs': 'float64',
                'Mdns': 'Int64',
                'Runs': 'Int64',
                'Wkts': 'Int64',
                'Econ': 'float64',

                # Fielding Columns
                'Dis': 'Int64',
                'Ct': 'Int64',

                # Allround Columns
                'Score': 'string',  # Could be runs or DNB, TDNB
                'Conc': 'Int64',
                'St': 'Int64'
            }

        if df is not None:
            df = self.transform_data(df)

            # Select the necessary columns
            df = df[common_cols[:-2] + custom_cols + common_cols[-2:]]

            # Apply type casting
            for col in df.columns:
                if col in dtype_mapping:
                    try:
                        df[col] = df[col].astype(dtype_mapping[col])
                    except Exception as e:
                        print(f"Data type casting failed for column {col}: {e}")

            return df


    def process_data(self,type="all"):

        common = ['Match ID','Start Date','Format','Inns','Opposition','Location']
        batcols = ['Pos','Runs','BF','4s','6s','SR','Mins','Dismissal']
        bowlcols = ['Pos','Overs','Mdns','Runs','Wkts','Econ']
        fieldcols = ['Dis','Ct']
        allroundcols = ['Score','Overs','Conc','Wkts','Ct','St']

        try:
        
            # Process batting stats
            if type == 'all' or type == 'batting':
                print(f"Processing {self.player_name}'s batting stats...")
                self.battingstats = self.final_df(self.battingstats, common, batcols)
                print(f"Batting stats processed successfully.")

            # Process bowling stats
            if type == 'all' or type == 'bowling':
                print(f"Processing {self.player_name}'s bowling stats...")
                self.bowlingstats = self.final_df(self.bowlingstats, common, bowlcols)
                print(f"Bowling stats processed successfully.")

            # Process fielding stats
            if type == 'all' or type == 'fielding':
                print(f"Processing {self.player_name}'s fielding stats...")
                self.fieldingstats = self.final_df(self.fieldingstats, common, fieldcols)
                print(f"Fielding stats processed successfully.")

            # Process allround stats
            if type == 'all' or type == 'allround':
                if self.player_info is not None and 'allround' in self.player_info['PLAYING ROLE'][0].lower():
                    print(f"Processing {self.player_name}'s all-round stats...")
                    self.allroundstats = self.final_df(self.allroundstats, common, allroundcols)
                    print(f"All-round stats processed successfully.")

           
        except Exception as e:
            print(f"Error in processing data for {self.player_name}: ", e)

## Loading

In [6]:
import pandas as pd
import boto3
from botocore.exceptions import ClientError
from io import StringIO
from dotenv import load_dotenv
import os

load_dotenv()  # Load AWS credentials from .env

s3 = boto3.client("s3")

class LoadData:

    def __init__(self, player_name, data_type="raw"):
        
        """
        Initializes the CricketerStatsLoader with player name and data type.
        Args:
            player_name (str): Name of the player.
            data_type (str): Type of data ('raw' or 'tf').
        """

        self.player_name = player_name.lower().replace(" ", "_")
        self.data_type = data_type  # 'raw' or 'tf'
        self.battingstats = None
        self.bowlingstats = None
        self.fieldingstats = None
        self.allroundstats = None
        self.player_info = None

    def ensure_bucket_exists(self, bucket_name, flag=0):

        """Checks if the S3 bucket exists, and creates it if not."""
        
        try:
            s3.head_bucket(Bucket=bucket_name)
            print(f"Bucket '{bucket_name}' already exists.")
        
        except ClientError as e:
            error_code = int(e.response["Error"]["Code"])
            
            if error_code == 404:
                print(f"Bucket '{bucket_name}' does not exist.")

                if flag == 1:
                    
                    print(f"Creating bucket '{bucket_name}'...")
                    s3.create_bucket(
                        Bucket=bucket_name,
                        CreateBucketConfiguration={
                            'LocationConstraint': os.getenv("AWS_DEFAULT_REGION")
                                                }
                    )
                    print(f"Bucket '{bucket_name}' created successfully.")
            
            else: raise e
    
    def upload_df(self, bucket_name, object_key, df):

        """Uploads a Pandas DataFrame as a CSV file to S3."""

        if df is None or df.empty:
            print(f"Warning: {object_key} is empty, skipping upload.")
            return

        csv_buffer = StringIO()
        df.to_csv(csv_buffer, index=False)

        s3.put_object(
            Bucket=bucket_name,
            Key=object_key,
            Body=csv_buffer.getvalue(),
            ContentType="text/csv"
        )

        print(f" Uploaded to s3://{bucket_name}/{object_key}")

    def download_df(self, bucket_name, stat_type):

        """Downloads a cricket stat CSV from S3 into a DataFrame."""

        file_name_map = {
            "batting": "batting_stats.csv",
            "bowling": "bowling_stats.csv",
            "fielding": "fielding_stats.csv",
            "allround": "allround_stats.csv",
            "personal_info": "personal_info.csv"
        }


        if stat_type not in file_name_map:
            print(f" Invalid stat_type '{stat_type}'.")
            return None

        object_key = f"{self.player_name}/{self.data_type}/{file_name_map[stat_type]}"

        try:

            response = s3.get_object(Bucket=bucket_name, Key=object_key)
            content = response["Body"].read().decode("utf-8")
            df = pd.read_csv(StringIO(content))
            print(f" Downloaded from s3://{bucket_name}/{object_key}")
            return df
        
        except Exception as e:
            print(f"Error downloading {stat_type} stats: {e}")
            return None

    def load_data(self, bucket_name, load_type, stat_type="all"):
    
        """
        Loads cricket stats data to/from S3.

        Args:
            bucket_name (str): Name of the S3 bucket.
            load_type (str): Type of load operation ('upload' or 'download').
            stat_type (str): Type of stats to load ('all', 'batting', 'bowling', 'fielding', 'allround', 'personal_info').
        """            
        
        if load_type not in ["upload", "download"]:
            print(f"Invalid load type '{load_type}'. Must be 'upload' or 'download'.")
            return

        if stat_type not in ["all", "batting", "bowling", "fielding", "allround", "personal_info"]:
            print(f"Invalid stat type '{stat_type}'. Must be 'all', 'batting', 'bowling', 'fielding', 'allround', or 'personal_info'.")
            return

        # Perform the upload operation
        if load_type == "upload":

            # Ensure the S3 bucket exists
            self.ensure_bucket_exists(bucket_name,flag=1)

            print(f"Uploading {self.player_name}'s {self.data_type} {stat_type} data to S3...")
            
            base_folder = f"{self.player_name}/{self.data_type}/"

            if self.battingstats is not None and stat_type in ["all", "batting"]:
                self.upload_df(bucket_name, base_folder + "batting_stats.csv", self.battingstats)

            if self.bowlingstats is not None and stat_type in ["all", "bowling"]:
                self.upload_df(bucket_name, base_folder + "bowling_stats.csv", self.bowlingstats)

            if self.fieldingstats is not None and stat_type in ["all", "fielding"]:
                self.upload_df(bucket_name, base_folder + "fielding_stats.csv", self.fieldingstats)

            if self.allroundstats is not None and stat_type in ["all", "allround"]:
                self.upload_df(bucket_name, base_folder + "allround_stats.csv", self.allroundstats)

            if self.player_info is not None and stat_type in ["all", "personal_info"]:
                self.upload_df(bucket_name, base_folder + "personal_info.csv", self.player_info)

            print(f"All {self.data_type} data uploaded to s3://{bucket_name}/{base_folder}")

        # Perform the download operation
        elif load_type == "download":

            # Ensure the S3 bucket exists
            self.ensure_bucket_exists(bucket_name)

            print(f"Downloading {self.player_name}'s {self.data_type} data from S3...")

            if stat_type in ["all", "personal_info"]:
                self.player_info = self.download_df(bucket_name, "personal_info")

            if stat_type in ["all", "batting"]:
                self.battingstats = self.download_df(bucket_name, "batting")

            if stat_type in ["all", "bowling"]:
                self.bowlingstats = self.download_df(bucket_name, "bowling")

            if stat_type in ["all", "fielding"]:
                self.fieldingstats = self.download_df(bucket_name, "fielding")

            if stat_type in ["all", "allround"]:

                # Download allround stats if PLAYING ROLE is "Allrounder"
                if self.player_info is not None and "Allrounder" in self.player_info["PLAYING ROLE"].values:
                    self.allroundstats = self.download_df(bucket_name, "allround")
                
                else:
                    print(f"Warning: Player {self.player_name} is not an Allrounder. Skipping allround stats download.")
                    self.allroundstats = None
                
            print(f"All {self.data_type} data downloaded from s3://{bucket_name}/{self.player_name}/{self.data_type}/")


## Testing the code

### Virat Kohli

In [None]:
# --- 🔹 Step 1: Scrape raw data ---
player_name = "Virat Kohli"
virat_raw = ScrapeData(player_name)
virat_raw.get_player_stats()

# --- 🔹 Step 2: Upload raw data to S3 ---
bucket_name = "cricketer-stats"
virat_raw_loader = LoadData(player_name, data_type="raw")

virat_raw_loader.battingstats = virat_raw.battingstats
virat_raw_loader.bowlingstats = virat_raw.bowlingstats
virat_raw_loader.fieldingstats = virat_raw.fieldingstats
virat_raw_loader.allroundstats = virat_raw.allroundstats
virat_raw_loader.player_info = virat_raw.player_info

virat_raw_loader.load_data(bucket_name, load_type="upload")


⚠️ Bucket 'cricketer-stats' does not exist. Creating...
✅ Bucket 'cricketer-stats' created successfully.
Uploading virat_kohli's raw all data to S3...
 Uploaded to s3://cricketer-stats/virat_kohli/raw/batting_stats.csv
 Uploaded to s3://cricketer-stats/virat_kohli/raw/bowling_stats.csv
 Uploaded to s3://cricketer-stats/virat_kohli/raw/fielding_stats.csv
 Uploaded to s3://cricketer-stats/virat_kohli/raw/personal_info.csv
All raw data uploaded to s3://cricketer-stats/virat_kohli/raw/


In [61]:
# --- 🔹 Step 3: Download raw data from S3 ---
virat_tf = TransformData(player_name)
virat_tf_loader = LoadData(player_name, data_type="raw")

virat_tf_loader.load_data(bucket_name, load_type="download", stat_type="all")

virat_tf.battingstats = virat_tf_loader.battingstats
virat_tf.bowlingstats = virat_tf_loader.bowlingstats
virat_tf.fieldingstats = virat_tf_loader.fieldingstats
virat_tf.allroundstats = virat_tf_loader.allroundstats
virat_tf.player_info = virat_tf_loader.player_info

# --- 🔹 Step 4: Transform the data ---
virat_tf.process_data()

✅ Bucket 'cricketer-stats' already exists.
Downloading virat_kohli's raw data from S3...
 Downloaded from s3://cricketer-stats/virat_kohli/raw/personal_info.csv
 Downloaded from s3://cricketer-stats/virat_kohli/raw/batting_stats.csv
 Downloaded from s3://cricketer-stats/virat_kohli/raw/bowling_stats.csv
 Downloaded from s3://cricketer-stats/virat_kohli/raw/fielding_stats.csv
All raw data downloaded from s3://cricketer-stats/virat_kohli/raw/
Processing Virat Kohli's batting stats...
Batting stats processed successfully.
Processing Virat Kohli's bowling stats...
Bowling stats processed successfully.
Processing Virat Kohli's fielding stats...
Fielding stats processed successfully.


In [62]:
# --- 🔹 Step 5: Upload transformed data to S3 ---
virat_tf_loader = LoadData(player_name, data_type="tf")

virat_tf_loader.battingstats = virat_tf.battingstats
virat_tf_loader.bowlingstats = virat_tf.bowlingstats
virat_tf_loader.fieldingstats = virat_tf.fieldingstats
virat_tf_loader.allroundstats = virat_tf.allroundstats
virat_tf_loader.player_info = virat_tf.player_info

virat_tf_loader.load_data(bucket_name, load_type="upload", stat_type="all")

✅ Bucket 'cricketer-stats' already exists.
Uploading virat_kohli's tf all data to S3...
 Uploaded to s3://cricketer-stats/virat_kohli/tf/batting_stats.csv
 Uploaded to s3://cricketer-stats/virat_kohli/tf/bowling_stats.csv
 Uploaded to s3://cricketer-stats/virat_kohli/tf/fielding_stats.csv
 Uploaded to s3://cricketer-stats/virat_kohli/tf/personal_info.csv
All tf data uploaded to s3://cricketer-stats/virat_kohli/tf/


### Jacques Kallis

In [70]:
# --- 🔹 Step 1: Scrape raw data ---
player_name = "Jacques Kallis"
kallis_raw = ScrapeData(player_name)
kallis_raw.get_player_stats()

# --- 🔹 Step 2: Upload raw data to S3 ---
bucket_name = "cricketer-stats"
kallis_raw_loader = LoadData(player_name, data_type="raw")

kallis_raw_loader.battingstats = kallis_raw.battingstats
kallis_raw_loader.bowlingstats = kallis_raw.bowlingstats
kallis_raw_loader.fieldingstats = kallis_raw.fieldingstats
kallis_raw_loader.allroundstats = kallis_raw.allroundstats
kallis_raw_loader.player_info = kallis_raw.player_info

kallis_raw_loader.load_data(bucket_name, load_type="upload")

Setting up WebDriver...
Extracting Jacques Kallis's player URL and Player ID....
Extraction Successful for Jacques Kallis.
Time taken to extract URL: 4.13 seconds
Starting extraction of Jacques Kallis's batting stats....
Extracted 646 records in 138.87 seconds
Starting extraction of Jacques Kallis's bowling stats....
Extracted 668 records in 118.81 seconds
Starting extraction of Jacques Kallis's personal info....
Extracted player info in 4.10 seconds
Starting extraction of Jacques Kallis's allround stats....
Extracted 1314 records in 247.82 seconds
Starting extraction of Jacques Kallis's fielding stats....
Extracted 668 records in 110.29 seconds
✅ Bucket 'cricketer-stats' already exists.
Uploading jacques_kallis's raw all data to S3...
 Uploaded to s3://cricketer-stats/jacques_kallis/raw/batting_stats.csv
 Uploaded to s3://cricketer-stats/jacques_kallis/raw/bowling_stats.csv
 Uploaded to s3://cricketer-stats/jacques_kallis/raw/fielding_stats.csv
 Uploaded to s3://cricketer-stats/jacque

In [71]:
# --- 🔹 Step 3: Download raw data from S3 ---
kallis_tf = TransformData(player_name)
kallis_tf_loader = LoadData(player_name, data_type="raw")

kallis_tf_loader.load_data(bucket_name, load_type="download", stat_type="all")

kallis_tf.battingstats = kallis_tf_loader.battingstats
kallis_tf.bowlingstats = kallis_tf_loader.bowlingstats
kallis_tf.fieldingstats = kallis_tf_loader.fieldingstats
kallis_tf.allroundstats = kallis_tf_loader.allroundstats
kallis_tf.player_info = kallis_tf_loader.player_info

# --- 🔹 Step 4: Transform the data ---
kallis_tf.process_data()

✅ Bucket 'cricketer-stats' already exists.
Downloading jacques_kallis's raw data from S3...
 Downloaded from s3://cricketer-stats/jacques_kallis/raw/personal_info.csv
 Downloaded from s3://cricketer-stats/jacques_kallis/raw/batting_stats.csv
 Downloaded from s3://cricketer-stats/jacques_kallis/raw/bowling_stats.csv
 Downloaded from s3://cricketer-stats/jacques_kallis/raw/fielding_stats.csv
 Downloaded from s3://cricketer-stats/jacques_kallis/raw/allround_stats.csv
All raw data downloaded from s3://cricketer-stats/jacques_kallis/raw/
Processing Jacques Kallis's batting stats...
Batting stats processed successfully.
Processing Jacques Kallis's bowling stats...
Bowling stats processed successfully.
Processing Jacques Kallis's fielding stats...
Fielding stats processed successfully.
Processing Jacques Kallis's all-round stats...
All-round stats processed successfully.


In [72]:
# --- 🔹 Step 5: Upload transformed data to S3 ---
kallis_tf_loader = LoadData(player_name, data_type="tf")

kallis_tf_loader.battingstats = kallis_tf.battingstats
kallis_tf_loader.bowlingstats = kallis_tf.bowlingstats
kallis_tf_loader.fieldingstats = kallis_tf.fieldingstats
kallis_tf_loader.allroundstats = kallis_tf.allroundstats
kallis_tf_loader.player_info = kallis_tf.player_info

kallis_tf_loader.load_data(bucket_name, load_type="upload", stat_type="all")

✅ Bucket 'cricketer-stats' already exists.
Uploading jacques_kallis's tf all data to S3...
 Uploaded to s3://cricketer-stats/jacques_kallis/tf/batting_stats.csv
 Uploaded to s3://cricketer-stats/jacques_kallis/tf/bowling_stats.csv
 Uploaded to s3://cricketer-stats/jacques_kallis/tf/fielding_stats.csv
 Uploaded to s3://cricketer-stats/jacques_kallis/tf/allround_stats.csv
 Uploaded to s3://cricketer-stats/jacques_kallis/tf/personal_info.csv
All tf data uploaded to s3://cricketer-stats/jacques_kallis/tf/


### James Anderson

In [75]:
# --- 🔹 Step 1: Scrape raw data ---
player_name = "JM Anderson"
anderson_raw = ScrapeData(player_name)
anderson_raw.get_player_stats()

# --- 🔹 Step 2: Upload raw data to S3 ---
bucket_name = "cricketer-stats"
anderson_raw_loader = LoadData(player_name, data_type="raw")

anderson_raw_loader.battingstats = anderson_raw.battingstats
anderson_raw_loader.bowlingstats = anderson_raw.bowlingstats
anderson_raw_loader.fieldingstats = anderson_raw.fieldingstats
anderson_raw_loader.allroundstats = anderson_raw.allroundstats
anderson_raw_loader.player_info = anderson_raw.player_info

anderson_raw_loader.load_data(bucket_name, load_type="upload")

Setting up WebDriver...
Extracting JM Anderson's player URL and Player ID....
Extraction Successful for JM Anderson.
Time taken to extract URL: 3.55 seconds
WebDriver closed successfully.
Starting extraction of JM Anderson's batting stats....
Extracted 556 records in 128.56 seconds
Starting extraction of JM Anderson's bowling stats....
Extracted 569 records in 113.30 seconds
Starting extraction of JM Anderson's personal info....
Extracted player info in 4.20 seconds
Starting extraction of JM Anderson's fielding stats....
Extracted 569 records in 104.76 seconds
✅ Bucket 'cricketer-stats' already exists.
Uploading jm_anderson's raw all data to S3...
 Uploaded to s3://cricketer-stats/jm_anderson/raw/batting_stats.csv
 Uploaded to s3://cricketer-stats/jm_anderson/raw/bowling_stats.csv
 Uploaded to s3://cricketer-stats/jm_anderson/raw/fielding_stats.csv
 Uploaded to s3://cricketer-stats/jm_anderson/raw/personal_info.csv
All raw data uploaded to s3://cricketer-stats/jm_anderson/raw/


In [76]:
# --- 🔹 Step 3: Download raw data from S3 ---
anderson_tf = TransformData(player_name)
anderson_tf_loader = LoadData(player_name, data_type="raw")

anderson_tf_loader.load_data(bucket_name, load_type="download", stat_type="all")

anderson_tf.battingstats = anderson_tf_loader.battingstats
anderson_tf.bowlingstats = anderson_tf_loader.bowlingstats
anderson_tf.fieldingstats = anderson_tf_loader.fieldingstats
anderson_tf.allroundstats = anderson_tf_loader.allroundstats
anderson_tf.player_info = anderson_tf_loader.player_info

# --- 🔹 Step 4: Transform the data ---
anderson_tf.process_data()

✅ Bucket 'cricketer-stats' already exists.
Downloading jm_anderson's raw data from S3...
 Downloaded from s3://cricketer-stats/jm_anderson/raw/personal_info.csv
 Downloaded from s3://cricketer-stats/jm_anderson/raw/batting_stats.csv
 Downloaded from s3://cricketer-stats/jm_anderson/raw/bowling_stats.csv
 Downloaded from s3://cricketer-stats/jm_anderson/raw/fielding_stats.csv
All raw data downloaded from s3://cricketer-stats/jm_anderson/raw/
Processing JM Anderson's batting stats...
Batting stats processed successfully.
Processing JM Anderson's bowling stats...
Bowling stats processed successfully.
Processing JM Anderson's fielding stats...
Fielding stats processed successfully.


In [77]:
# --- 🔹 Step 5: Upload transformed data to S3 ---
anderson_tf_loader = LoadData(player_name, data_type="tf")

anderson_tf_loader.battingstats = anderson_tf.battingstats
anderson_tf_loader.bowlingstats = anderson_tf.bowlingstats
anderson_tf_loader.fieldingstats = anderson_tf.fieldingstats
anderson_tf_loader.allroundstats = anderson_tf.allroundstats
anderson_tf_loader.player_info = anderson_tf.player_info

anderson_tf_loader.load_data(bucket_name, load_type="upload", stat_type="all")

✅ Bucket 'cricketer-stats' already exists.
Uploading jm_anderson's tf all data to S3...
 Uploaded to s3://cricketer-stats/jm_anderson/tf/batting_stats.csv
 Uploaded to s3://cricketer-stats/jm_anderson/tf/bowling_stats.csv
 Uploaded to s3://cricketer-stats/jm_anderson/tf/fielding_stats.csv
 Uploaded to s3://cricketer-stats/jm_anderson/tf/personal_info.csv
All tf data uploaded to s3://cricketer-stats/jm_anderson/tf/


## Debugging Scraper

In [None]:
def get_player_url(player_name,driver):
    start_time = time.time()
    print(f"Extracting {player_name}'s player URL and Player ID....")
    search_url = f"https://search.espncricinfo.com/ci/content/site/search.html?search={player_name.lower().replace(' ', '%20')};type=player"
    driver.get(search_url)

    try:
        player_link_element = driver.find_element(By.CSS_SELECTOR, "h3.name.link-cta a")
        player_url = player_link_element.get_attribute("href")
        player_id = player_url.split('-')[-1]
        print(f"Extraction Successful for {player_name}.")
        end_time = time.time()
        print(f"Time taken to extract URL: {end_time - start_time:.2f} seconds")

        return player_url, player_id
        
    except Exception as e:
        print(f"Error in extracting {player_name}'s url:", e)
        return None, None

In [26]:
def extract_player_info(driver,player_name, player_id, player_url):
        try:
            start_time = time.time()
            print(f"Starting extraction of {player_name}'s personal info....")
            
            # Start by opening the player info URL
            driver.get(player_url)

            # Step 1: Extract headers within the specified div tag
            headers = driver.find_elements(By.XPATH, "//div//p[@class='ds-text-tight-m ds-font-regular ds-uppercase ds-text-typo-mid3']")
            header_names = ['Player ID','Player URL']+[header.text for header in headers]

            # Step 2: Extract values within the specified div tag
            values = driver.find_elements(By.XPATH, "//div//span[@class='ds-text-title-s ds-font-bold ds-text-typo']")
            value_texts = [player_id,player_url]+[value.text for value in values]

            # Step 3: Create a DataFrame from the extracted data
            player_info = pd.DataFrame([value_texts], columns=header_names)

            end_time = time.time()
            print(f"Extracted player info in {end_time - start_time:.2f} seconds")

            return player_info
            
        except Exception as e:
            print(f"Error in extracting {player_name}'s personal info:", e)
            return None

In [2]:
player_name = "Virat Kohli"
player_id = 253802
player_url = "https://www.espncricinfo.com/cricketers/virat-kohli-253802"

In [None]:
driver.get(player_url)

headers = driver.find_elements(By.XPATH, "//div//p[@class='ds-text-tight-m ds-font-regular ds-uppercase ds-text-typo-mid3']")
[header.text for header in headers]

values = driver.find_elements(By.XPATH, "//div//span[@class='ds-text-title-s ds-font-bold ds-text-typo']")
[value.text for value in values]

['Virat Kohli',
 'November 05, 1988, Delhi',
 '36y 162d',
 'Right hand Bat',
 'Right arm Medium',
 'Top order Batter']

## Data Modelling

To create a data model, we will follow the following steps - 

1. create a master dataframe for each of the datasets. It would consist of combine Statistics of all scraped players. 
2. Use inns_id - (player_id + match_id + inns_no) - as a composite primary key. 
3. upload data to "master" folder.

In [223]:
import pandas as pd
import numpy as np
import time

In [1]:
import boto3
import os

In [3]:
s3 = boto3.client("s3")

response = s3.list_buckets()
print(response)


{'ResponseMetadata': {'RequestId': '6A6EPTAC70BKYJHC', 'HostId': 'Z/DuKG0P00LQ5NARJn3Y22FSpqAYkcFJuHkG1z+9sKY1nZstEKwIqhFiL9wKfiaZWEaH9LMK/NA=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'Z/DuKG0P00LQ5NARJn3Y22FSpqAYkcFJuHkG1z+9sKY1nZstEKwIqhFiL9wKfiaZWEaH9LMK/NA=', 'x-amz-request-id': '6A6EPTAC70BKYJHC', 'date': 'Mon, 05 May 2025 05:56:45 GMT', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'Buckets': [{'Name': 'airlines-reviews', 'CreationDate': datetime.datetime(2025, 4, 26, 15, 20, 18, tzinfo=tzutc())}, {'Name': 'cricketer-stats', 'CreationDate': datetime.datetime(2025, 4, 16, 10, 14, 53, tzinfo=tzutc())}], 'Owner': {'ID': 'fca0ef78a9a2f6e77610782ee76f1164826155183738f9645eb03b6daa9646a3'}}


### Testing phase 1 - single player

In [38]:
player_name = "Virat Kohli"
bucket_name = "cricketer-stats"

#downloading transformed data from s3
virat_tf_loader = LoadData(player_name, data_type="tf")
virat_tf_loader.load_data(bucket_name, load_type="download", stat_type="all")

Bucket 'cricketer-stats' already exists.
Downloading virat_kohli's tf data from S3...
 Downloaded from s3://cricketer-stats/virat_kohli/tf/personal_info.csv
 Downloaded from s3://cricketer-stats/virat_kohli/tf/batting_stats.csv
 Downloaded from s3://cricketer-stats/virat_kohli/tf/bowling_stats.csv
 Downloaded from s3://cricketer-stats/virat_kohli/tf/fielding_stats.csv
All tf data downloaded from s3://cricketer-stats/virat_kohli/tf/


In [39]:
display(
    virat_tf_loader.battingstats,
    virat_tf_loader.bowlingstats,
    virat_tf_loader.fieldingstats,
    virat_tf_loader.player_info
)

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location
0,#2742,2008-08-18,ODI,1.0,2.0,12.0,22.0,1.0,0.0,54.54,33.0,lbw,Sri Lanka,Dambulla
1,#2745,2008-08-20,ODI,2.0,2.0,37.0,67.0,6.0,0.0,55.22,82.0,caught,Sri Lanka,Dambulla
2,#2750,2008-08-24,ODI,1.0,1.0,25.0,38.0,4.0,0.0,65.78,40.0,run out,Sri Lanka,Colombo
3,#2755,2008-08-27,ODI,1.0,1.0,54.0,66.0,7.0,0.0,81.81,87.0,bowled,Sri Lanka,Colombo
4,#2756,2008-08-29,ODI,2.0,1.0,31.0,46.0,3.0,1.0,67.39,45.0,lbw,Sri Lanka,Colombo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,#4844,2025-02-20,ODI,2.0,3.0,22.0,38.0,1.0,0.0,57.89,,caught,Bangladesh,Dubai
642,#4847,2025-02-23,ODI,2.0,3.0,100.0,111.0,7.0,0.0,90.09,151.0,not out,Pakistan,Dubai
643,#4852,2025-03-02,ODI,1.0,3.0,11.0,14.0,2.0,0.0,78.57,18.0,caught,New Zealand,Dubai
644,#4853,2025-03-04,ODI,2.0,3.0,84.0,98.0,5.0,0.0,85.71,135.0,caught,Australia,Dubai


Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location
0,#2742,2008-08-18,ODI,2.0,,,,,,,Sri Lanka,Dambulla
1,#2745,2008-08-20,ODI,1.0,,,,,,,Sri Lanka,Dambulla
2,#2750,2008-08-24,ODI,2.0,,,,,,,Sri Lanka,Colombo
3,#2755,2008-08-27,ODI,2.0,,,,,,,Sri Lanka,Colombo
4,#2756,2008-08-29,ODI,1.0,,,,,,,Sri Lanka,Colombo
...,...,...,...,...,...,...,...,...,...,...,...,...
658,#4844,2025-02-20,ODI,1.0,,,,,,,Bangladesh,Dubai
659,#4847,2025-02-23,ODI,1.0,,,,,,,Pakistan,Dubai
660,#4852,2025-03-02,ODI,2.0,,,,,,,New Zealand,Dubai
661,#4853,2025-03-04,ODI,1.0,,,,,,,Australia,Dubai


Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location
0,#2742,2008-08-18,ODI,2.0,0.0,0.0,Sri Lanka,Dambulla
1,#2745,2008-08-20,ODI,1.0,3.0,3.0,Sri Lanka,Dambulla
2,#2750,2008-08-24,ODI,2.0,0.0,0.0,Sri Lanka,Colombo
3,#2755,2008-08-27,ODI,2.0,0.0,0.0,Sri Lanka,Colombo
4,#2756,2008-08-29,ODI,1.0,0.0,0.0,Sri Lanka,Colombo
...,...,...,...,...,...,...,...,...
658,#4844,2025-02-20,ODI,1.0,2.0,2.0,Bangladesh,Dubai
659,#4847,2025-02-23,ODI,1.0,2.0,2.0,Pakistan,Dubai
660,#4852,2025-03-02,ODI,2.0,1.0,1.0,New Zealand,Dubai
661,#4853,2025-03-04,ODI,1.0,2.0,2.0,Australia,Dubai


Unnamed: 0,Player ID,Player URL,FULL NAME,BORN,AGE,BATTING STYLE,BOWLING STYLE,PLAYING ROLE
0,253802,https://www.espncricinfo.com/cricketers/virat-...,Virat Kohli,"November 05, 1988, Delhi",36y 165d,Right hand Bat,Right arm Medium,Top order Batter


Now, let us start with the aggregation. First, we will create 5 empty dataframes - master dfs. 

In [69]:
batting_master = None
bowling_master = None
fielding_master = None
allround_master = None
info_master = None

#### Batting

Now, we will start with batting. First, let us append the batting stats to the master df. 

In [70]:
concat_df = virat_tf_loader.battingstats.copy()
concat_df['Player ID'] = virat_tf_loader.player_info['Player ID'][0]
concat_df['Inns ID'] = (concat_df['Player ID'].astype(str)+"_"+
                             concat_df['Match ID'].astype(str)+"_"+
                             concat_df['Inns'].astype(str)
                            )
concat_df


Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location,Player ID,Inns ID
0,#2742,2008-08-18,ODI,1.0,2.0,12.0,22.0,1.0,0.0,54.54,33.0,lbw,Sri Lanka,Dambulla,253802,253802_#2742_1.0
1,#2745,2008-08-20,ODI,2.0,2.0,37.0,67.0,6.0,0.0,55.22,82.0,caught,Sri Lanka,Dambulla,253802,253802_#2745_2.0
2,#2750,2008-08-24,ODI,1.0,1.0,25.0,38.0,4.0,0.0,65.78,40.0,run out,Sri Lanka,Colombo,253802,253802_#2750_1.0
3,#2755,2008-08-27,ODI,1.0,1.0,54.0,66.0,7.0,0.0,81.81,87.0,bowled,Sri Lanka,Colombo,253802,253802_#2755_1.0
4,#2756,2008-08-29,ODI,2.0,1.0,31.0,46.0,3.0,1.0,67.39,45.0,lbw,Sri Lanka,Colombo,253802,253802_#2756_2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,#4844,2025-02-20,ODI,2.0,3.0,22.0,38.0,1.0,0.0,57.89,,caught,Bangladesh,Dubai,253802,253802_#4844_2.0
642,#4847,2025-02-23,ODI,2.0,3.0,100.0,111.0,7.0,0.0,90.09,151.0,not out,Pakistan,Dubai,253802,253802_#4847_2.0
643,#4852,2025-03-02,ODI,1.0,3.0,11.0,14.0,2.0,0.0,78.57,18.0,caught,New Zealand,Dubai,253802,253802_#4852_1.0
644,#4853,2025-03-04,ODI,2.0,3.0,84.0,98.0,5.0,0.0,85.71,135.0,caught,Australia,Dubai,253802,253802_#4853_2.0


Good. Now, we will create a composite primary key - (player_id + match_id + inns_no). 

In [72]:
if concat_df is None:
    pass
elif batting_master is None:
    batting_master = concat_df
else:
    batting_master = pd.concat([batting_master, concat_df])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

batting_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location,Player ID,Inns ID
0,#2742,2008-08-18,ODI,1.0,2.0,12.0,22.0,1.0,0.0,54.54,33.0,lbw,Sri Lanka,Dambulla,253802,253802_#2742_1.0
1,#2745,2008-08-20,ODI,2.0,2.0,37.0,67.0,6.0,0.0,55.22,82.0,caught,Sri Lanka,Dambulla,253802,253802_#2745_2.0
2,#2750,2008-08-24,ODI,1.0,1.0,25.0,38.0,4.0,0.0,65.78,40.0,run out,Sri Lanka,Colombo,253802,253802_#2750_1.0
3,#2755,2008-08-27,ODI,1.0,1.0,54.0,66.0,7.0,0.0,81.81,87.0,bowled,Sri Lanka,Colombo,253802,253802_#2755_1.0
4,#2756,2008-08-29,ODI,2.0,1.0,31.0,46.0,3.0,1.0,67.39,45.0,lbw,Sri Lanka,Colombo,253802,253802_#2756_2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,#4844,2025-02-20,ODI,2.0,3.0,22.0,38.0,1.0,0.0,57.89,,caught,Bangladesh,Dubai,253802,253802_#4844_2.0
642,#4847,2025-02-23,ODI,2.0,3.0,100.0,111.0,7.0,0.0,90.09,151.0,not out,Pakistan,Dubai,253802,253802_#4847_2.0
643,#4852,2025-03-02,ODI,1.0,3.0,11.0,14.0,2.0,0.0,78.57,18.0,caught,New Zealand,Dubai,253802,253802_#4852_1.0
644,#4853,2025-03-04,ODI,2.0,3.0,84.0,98.0,5.0,0.0,85.71,135.0,caught,Australia,Dubai,253802,253802_#4853_2.0


In [73]:
batting_master['Inns ID'].nunique()

646

Great. this works! Now, let us test it by splitting our dataset, and then performing it. 

In [76]:
batting_master = None

concat_df = virat_tf_loader.battingstats.copy()
concat_df['Player ID'] = virat_tf_loader.player_info['Player ID'][0]
concat_df['Inns ID'] = (concat_df['Player ID'].astype(str)+"_"+
                             concat_df['Match ID'].astype(str)+"_"+
                             concat_df['Inns'].astype(str)
                            )

concat_df_1 = concat_df.copy().iloc[:300,]
concat_df_2 = concat_df.copy().iloc[200:,]

In [77]:
if concat_df_1 is None:
    pass
elif batting_master is None:
    batting_master = concat_df_1
else:
    batting_master = pd.concat([batting_master, concat_df_1])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

batting_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location,Player ID,Inns ID
0,#2742,2008-08-18,ODI,1.0,2.0,12.0,22.0,1.0,0.0,54.54,33.0,lbw,Sri Lanka,Dambulla,253802,253802_#2742_1.0
1,#2745,2008-08-20,ODI,2.0,2.0,37.0,67.0,6.0,0.0,55.22,82.0,caught,Sri Lanka,Dambulla,253802,253802_#2745_2.0
2,#2750,2008-08-24,ODI,1.0,1.0,25.0,38.0,4.0,0.0,65.78,40.0,run out,Sri Lanka,Colombo,253802,253802_#2750_1.0
3,#2755,2008-08-27,ODI,1.0,1.0,54.0,66.0,7.0,0.0,81.81,87.0,bowled,Sri Lanka,Colombo,253802,253802_#2755_1.0
4,#2756,2008-08-29,ODI,2.0,1.0,31.0,46.0,3.0,1.0,67.39,45.0,lbw,Sri Lanka,Colombo,253802,253802_#2756_2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,#563,2016-08-28,T20I,2.0,,,,,,,,,West Indies,Lauderhill,253802,253802_#563_2.0
296,#2221,2016-09-22,Test,1.0,4.0,9.0,10.0,2.0,0.0,90.00,12.0,caught,New Zealand,Kanpur,253802,253802_#2221_1.0
297,#2221,2016-09-22,Test,3.0,4.0,18.0,40.0,3.0,0.0,45.00,45.0,caught,New Zealand,Kanpur,253802,253802_#2221_3.0
298,#2222,2016-09-30,Test,1.0,4.0,9.0,28.0,1.0,0.0,32.14,42.0,caught,New Zealand,Kolkata,253802,253802_#2222_1.0


In [78]:
if concat_df_2 is None:
    pass
elif batting_master is None:
    batting_master = concat_df_2
else:
    batting_master = pd.concat([batting_master, concat_df_2])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

batting_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location,Player ID,Inns ID
0,#2742,2008-08-18,ODI,1.0,2.0,12.0,22.0,1.0,0.0,54.54,33.0,lbw,Sri Lanka,Dambulla,253802,253802_#2742_1.0
1,#2745,2008-08-20,ODI,2.0,2.0,37.0,67.0,6.0,0.0,55.22,82.0,caught,Sri Lanka,Dambulla,253802,253802_#2745_2.0
2,#2750,2008-08-24,ODI,1.0,1.0,25.0,38.0,4.0,0.0,65.78,40.0,run out,Sri Lanka,Colombo,253802,253802_#2750_1.0
3,#2755,2008-08-27,ODI,1.0,1.0,54.0,66.0,7.0,0.0,81.81,87.0,bowled,Sri Lanka,Colombo,253802,253802_#2755_1.0
4,#2756,2008-08-29,ODI,2.0,1.0,31.0,46.0,3.0,1.0,67.39,45.0,lbw,Sri Lanka,Colombo,253802,253802_#2756_2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,#4844,2025-02-20,ODI,2.0,3.0,22.0,38.0,1.0,0.0,57.89,,caught,Bangladesh,Dubai,253802,253802_#4844_2.0
642,#4847,2025-02-23,ODI,2.0,3.0,100.0,111.0,7.0,0.0,90.09,151.0,not out,Pakistan,Dubai,253802,253802_#4847_2.0
643,#4852,2025-03-02,ODI,1.0,3.0,11.0,14.0,2.0,0.0,78.57,18.0,caught,New Zealand,Dubai,253802,253802_#4852_1.0
644,#4853,2025-03-04,ODI,2.0,3.0,84.0,98.0,5.0,0.0,85.71,135.0,caught,Australia,Dubai,253802,253802_#4853_2.0


Perfect! It works!

#### Bowling

Now, let us test it for bowling stats. 

In [79]:
bowling_master = None

concat_df = virat_tf_loader.bowlingstats.copy()
concat_df['Player ID'] = virat_tf_loader.player_info['Player ID'][0]
concat_df['Inns ID'] = (concat_df['Player ID'].astype(str)+"_"+
                             concat_df['Match ID'].astype(str)+"_"+
                             concat_df['Inns'].astype(str)
                            )
concat_df

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location,Player ID,Inns ID
0,#2742,2008-08-18,ODI,2.0,,,,,,,Sri Lanka,Dambulla,253802,253802_#2742_2.0
1,#2745,2008-08-20,ODI,1.0,,,,,,,Sri Lanka,Dambulla,253802,253802_#2745_1.0
2,#2750,2008-08-24,ODI,2.0,,,,,,,Sri Lanka,Colombo,253802,253802_#2750_2.0
3,#2755,2008-08-27,ODI,2.0,,,,,,,Sri Lanka,Colombo,253802,253802_#2755_2.0
4,#2756,2008-08-29,ODI,1.0,,,,,,,Sri Lanka,Colombo,253802,253802_#2756_1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,#4844,2025-02-20,ODI,1.0,,,,,,,Bangladesh,Dubai,253802,253802_#4844_1.0
659,#4847,2025-02-23,ODI,1.0,,,,,,,Pakistan,Dubai,253802,253802_#4847_1.0
660,#4852,2025-03-02,ODI,2.0,,,,,,,New Zealand,Dubai,253802,253802_#4852_2.0
661,#4853,2025-03-04,ODI,1.0,,,,,,,Australia,Dubai,253802,253802_#4853_1.0


In [80]:
if concat_df is None:
    pass
elif bowling_master is None:
    bowling_master = concat_df
else:
    bowling_master = pd.concat([bowling_master, concat_df])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

bowling_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location,Player ID,Inns ID
0,#2742,2008-08-18,ODI,2.0,,,,,,,Sri Lanka,Dambulla,253802,253802_#2742_2.0
1,#2745,2008-08-20,ODI,1.0,,,,,,,Sri Lanka,Dambulla,253802,253802_#2745_1.0
2,#2750,2008-08-24,ODI,2.0,,,,,,,Sri Lanka,Colombo,253802,253802_#2750_2.0
3,#2755,2008-08-27,ODI,2.0,,,,,,,Sri Lanka,Colombo,253802,253802_#2755_2.0
4,#2756,2008-08-29,ODI,1.0,,,,,,,Sri Lanka,Colombo,253802,253802_#2756_1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,#4844,2025-02-20,ODI,1.0,,,,,,,Bangladesh,Dubai,253802,253802_#4844_1.0
659,#4847,2025-02-23,ODI,1.0,,,,,,,Pakistan,Dubai,253802,253802_#4847_1.0
660,#4852,2025-03-02,ODI,2.0,,,,,,,New Zealand,Dubai,253802,253802_#4852_2.0
661,#4853,2025-03-04,ODI,1.0,,,,,,,Australia,Dubai,253802,253802_#4853_1.0


In [81]:
bowling_master['Inns ID'].nunique()

663

Great. this works! Now, let us test it by splitting our dataset, and then performing it. 

In [82]:
bowling_master = None

concat_df = virat_tf_loader.bowlingstats.copy()
concat_df['Player ID'] = virat_tf_loader.player_info['Player ID'][0]
concat_df['Inns ID'] = (concat_df['Player ID'].astype(str)+"_"+
                             concat_df['Match ID'].astype(str)+"_"+
                             concat_df['Inns'].astype(str)
                            )

concat_df_1 = concat_df.copy().iloc[:300,]
concat_df_2 = concat_df.copy().iloc[200:,]

In [83]:
if concat_df_1 is None:
    pass
elif bowling_master is None:
    bowling_master = concat_df_1
else:
    bowling_master = pd.concat([bowling_master, concat_df_1])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

bowling_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location,Player ID,Inns ID
0,#2742,2008-08-18,ODI,2.0,,,,,,,Sri Lanka,Dambulla,253802,253802_#2742_2.0
1,#2745,2008-08-20,ODI,1.0,,,,,,,Sri Lanka,Dambulla,253802,253802_#2745_1.0
2,#2750,2008-08-24,ODI,2.0,,,,,,,Sri Lanka,Colombo,253802,253802_#2750_2.0
3,#2755,2008-08-27,ODI,2.0,,,,,,,Sri Lanka,Colombo,253802,253802_#2755_2.0
4,#2756,2008-08-29,ODI,1.0,,,,,,,Sri Lanka,Colombo,253802,253802_#2756_1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,#2215,2016-08-09,Test,4.0,,,,,,,West Indies,Gros Islet,253802,253802_#2215_4.0
296,#2218,2016-08-18,Test,1.0,,,,,,,West Indies,Port of Spain,253802,253802_#2218_1.0
297,#562,2016-08-27,T20I,1.0,,,,,,,West Indies,Lauderhill,253802,253802_#562_1.0
298,#563,2016-08-28,T20I,1.0,,,,,,,West Indies,Lauderhill,253802,253802_#563_1.0


In [84]:
if concat_df_2 is None:
    pass
elif bowling_master is None:
    bowling_master = concat_df_2
else:
    bowling_master = pd.concat([bowling_master, concat_df_2])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

bowling_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location,Player ID,Inns ID
0,#2742,2008-08-18,ODI,2.0,,,,,,,Sri Lanka,Dambulla,253802,253802_#2742_2.0
1,#2745,2008-08-20,ODI,1.0,,,,,,,Sri Lanka,Dambulla,253802,253802_#2745_1.0
2,#2750,2008-08-24,ODI,2.0,,,,,,,Sri Lanka,Colombo,253802,253802_#2750_2.0
3,#2755,2008-08-27,ODI,2.0,,,,,,,Sri Lanka,Colombo,253802,253802_#2755_2.0
4,#2756,2008-08-29,ODI,1.0,,,,,,,Sri Lanka,Colombo,253802,253802_#2756_1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,#4844,2025-02-20,ODI,1.0,,,,,,,Bangladesh,Dubai,253802,253802_#4844_1.0
659,#4847,2025-02-23,ODI,1.0,,,,,,,Pakistan,Dubai,253802,253802_#4847_1.0
660,#4852,2025-03-02,ODI,2.0,,,,,,,New Zealand,Dubai,253802,253802_#4852_2.0
661,#4853,2025-03-04,ODI,1.0,,,,,,,Australia,Dubai,253802,253802_#4853_1.0


Perfect! this works as well. 

#### Fielding

Now, let us test it for fielding stats. 

In [85]:
fielding_master = None

concat_df = virat_tf_loader.fieldingstats.copy()
concat_df['Player ID'] = virat_tf_loader.player_info['Player ID'][0]
concat_df['Inns ID'] = (concat_df['Player ID'].astype(str)+"_"+
                             concat_df['Match ID'].astype(str)+"_"+
                             concat_df['Inns'].astype(str)
                            )
concat_df

Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location,Player ID,Inns ID
0,#2742,2008-08-18,ODI,2.0,0.0,0.0,Sri Lanka,Dambulla,253802,253802_#2742_2.0
1,#2745,2008-08-20,ODI,1.0,3.0,3.0,Sri Lanka,Dambulla,253802,253802_#2745_1.0
2,#2750,2008-08-24,ODI,2.0,0.0,0.0,Sri Lanka,Colombo,253802,253802_#2750_2.0
3,#2755,2008-08-27,ODI,2.0,0.0,0.0,Sri Lanka,Colombo,253802,253802_#2755_2.0
4,#2756,2008-08-29,ODI,1.0,0.0,0.0,Sri Lanka,Colombo,253802,253802_#2756_1.0
...,...,...,...,...,...,...,...,...,...,...
658,#4844,2025-02-20,ODI,1.0,2.0,2.0,Bangladesh,Dubai,253802,253802_#4844_1.0
659,#4847,2025-02-23,ODI,1.0,2.0,2.0,Pakistan,Dubai,253802,253802_#4847_1.0
660,#4852,2025-03-02,ODI,2.0,1.0,1.0,New Zealand,Dubai,253802,253802_#4852_2.0
661,#4853,2025-03-04,ODI,1.0,2.0,2.0,Australia,Dubai,253802,253802_#4853_1.0


In [86]:
if concat_df is None:
    pass
elif fielding_master is None:
    fielding_master = concat_df
else:
    fielding_master = pd.concat([fielding_master, concat_df])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

fielding_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location,Player ID,Inns ID
0,#2742,2008-08-18,ODI,2.0,0.0,0.0,Sri Lanka,Dambulla,253802,253802_#2742_2.0
1,#2745,2008-08-20,ODI,1.0,3.0,3.0,Sri Lanka,Dambulla,253802,253802_#2745_1.0
2,#2750,2008-08-24,ODI,2.0,0.0,0.0,Sri Lanka,Colombo,253802,253802_#2750_2.0
3,#2755,2008-08-27,ODI,2.0,0.0,0.0,Sri Lanka,Colombo,253802,253802_#2755_2.0
4,#2756,2008-08-29,ODI,1.0,0.0,0.0,Sri Lanka,Colombo,253802,253802_#2756_1.0
...,...,...,...,...,...,...,...,...,...,...
658,#4844,2025-02-20,ODI,1.0,2.0,2.0,Bangladesh,Dubai,253802,253802_#4844_1.0
659,#4847,2025-02-23,ODI,1.0,2.0,2.0,Pakistan,Dubai,253802,253802_#4847_1.0
660,#4852,2025-03-02,ODI,2.0,1.0,1.0,New Zealand,Dubai,253802,253802_#4852_2.0
661,#4853,2025-03-04,ODI,1.0,2.0,2.0,Australia,Dubai,253802,253802_#4853_1.0


In [87]:
fielding_master['Inns ID'].nunique()

663

Great. this works! Now, let us test it by splitting our dataset, and then performing it. 

In [88]:
fielding_master = None

concat_df = virat_tf_loader.fieldingstats.copy()
concat_df['Player ID'] = virat_tf_loader.player_info['Player ID'][0]
concat_df['Inns ID'] = (concat_df['Player ID'].astype(str)+"_"+
                             concat_df['Match ID'].astype(str)+"_"+
                             concat_df['Inns'].astype(str)
                            )

concat_df_1 = concat_df.copy().iloc[:300,]
concat_df_2 = concat_df.copy().iloc[200:,]

In [89]:
if concat_df_1 is None:
    pass
elif fielding_master is None:
    fielding_master = concat_df_1
else:
    fielding_master = pd.concat([fielding_master, concat_df_1])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

fielding_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location,Player ID,Inns ID
0,#2742,2008-08-18,ODI,2.0,0.0,0.0,Sri Lanka,Dambulla,253802,253802_#2742_2.0
1,#2745,2008-08-20,ODI,1.0,3.0,3.0,Sri Lanka,Dambulla,253802,253802_#2745_1.0
2,#2750,2008-08-24,ODI,2.0,0.0,0.0,Sri Lanka,Colombo,253802,253802_#2750_2.0
3,#2755,2008-08-27,ODI,2.0,0.0,0.0,Sri Lanka,Colombo,253802,253802_#2755_2.0
4,#2756,2008-08-29,ODI,1.0,0.0,0.0,Sri Lanka,Colombo,253802,253802_#2756_1.0
...,...,...,...,...,...,...,...,...,...,...
295,#2215,2016-08-09,Test,4.0,1.0,1.0,West Indies,Gros Islet,253802,253802_#2215_4.0
296,#2218,2016-08-18,Test,1.0,0.0,0.0,West Indies,Port of Spain,253802,253802_#2218_1.0
297,#562,2016-08-27,T20I,1.0,0.0,0.0,West Indies,Lauderhill,253802,253802_#562_1.0
298,#563,2016-08-28,T20I,1.0,1.0,1.0,West Indies,Lauderhill,253802,253802_#563_1.0


In [90]:
if concat_df_2 is None:
    pass
elif fielding_master is None:
    fielding_master = concat_df_2
else:
    fielding_master = pd.concat([fielding_master, concat_df_2])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

fielding_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location,Player ID,Inns ID
0,#2742,2008-08-18,ODI,2.0,0.0,0.0,Sri Lanka,Dambulla,253802,253802_#2742_2.0
1,#2745,2008-08-20,ODI,1.0,3.0,3.0,Sri Lanka,Dambulla,253802,253802_#2745_1.0
2,#2750,2008-08-24,ODI,2.0,0.0,0.0,Sri Lanka,Colombo,253802,253802_#2750_2.0
3,#2755,2008-08-27,ODI,2.0,0.0,0.0,Sri Lanka,Colombo,253802,253802_#2755_2.0
4,#2756,2008-08-29,ODI,1.0,0.0,0.0,Sri Lanka,Colombo,253802,253802_#2756_1.0
...,...,...,...,...,...,...,...,...,...,...
658,#4844,2025-02-20,ODI,1.0,2.0,2.0,Bangladesh,Dubai,253802,253802_#4844_1.0
659,#4847,2025-02-23,ODI,1.0,2.0,2.0,Pakistan,Dubai,253802,253802_#4847_1.0
660,#4852,2025-03-02,ODI,2.0,1.0,1.0,New Zealand,Dubai,253802,253802_#4852_2.0
661,#4853,2025-03-04,ODI,1.0,2.0,2.0,Australia,Dubai,253802,253802_#4853_1.0


#### Personal Info

Now, let us test it for personal info. 

In [91]:
info_master = None

concat_df = virat_tf_loader.player_info.copy()
concat_df

Unnamed: 0,Player ID,Player URL,FULL NAME,BORN,AGE,BATTING STYLE,BOWLING STYLE,PLAYING ROLE
0,253802,https://www.espncricinfo.com/cricketers/virat-...,Virat Kohli,"November 05, 1988, Delhi",36y 165d,Right hand Bat,Right arm Medium,Top order Batter


In [92]:
if concat_df is None:
    pass
elif info_master is None:
    info_master = concat_df
else:
    info_master = pd.concat([info_master, concat_df])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

info_master

Unnamed: 0,Player ID,Player URL,FULL NAME,BORN,AGE,BATTING STYLE,BOWLING STYLE,PLAYING ROLE
0,253802,https://www.espncricinfo.com/cricketers/virat-...,Virat Kohli,"November 05, 1988, Delhi",36y 165d,Right hand Bat,Right arm Medium,Top order Batter


In [95]:
info_master['Player ID'].nunique()

1

Perfect! this works as well. Since this is the dimension table, we will already have `Player ID` as primary key. 

### Phase 2 - multiple players

In [215]:
bucket_name = "cricketer-stats"

player_1 = "Virat Kohli"
player_2 = "Jacques Kallis"
player_3 = "JM Anderson"

#downloading transformed data from s3
p1_loader = LoadData(player_1, data_type="tf")
p1_loader.load_data(bucket_name, load_type="download", stat_type="all")

p2_loader = LoadData(player_2, data_type="tf")
p2_loader.load_data(bucket_name, load_type="download", stat_type="all")

p3_loader = LoadData(player_3, data_type="tf")
p3_loader.load_data(bucket_name, load_type="download", stat_type="all")

Bucket 'cricketer-stats' already exists.
Downloading virat_kohli's tf data from S3...
 Downloaded from s3://cricketer-stats/virat_kohli/tf/personal_info.csv
 Downloaded from s3://cricketer-stats/virat_kohli/tf/batting_stats.csv
 Downloaded from s3://cricketer-stats/virat_kohli/tf/bowling_stats.csv
 Downloaded from s3://cricketer-stats/virat_kohli/tf/fielding_stats.csv
All tf data downloaded from s3://cricketer-stats/virat_kohli/tf/
Bucket 'cricketer-stats' already exists.
Downloading jacques_kallis's tf data from S3...
 Downloaded from s3://cricketer-stats/jacques_kallis/tf/personal_info.csv
 Downloaded from s3://cricketer-stats/jacques_kallis/tf/batting_stats.csv
 Downloaded from s3://cricketer-stats/jacques_kallis/tf/bowling_stats.csv
 Downloaded from s3://cricketer-stats/jacques_kallis/tf/fielding_stats.csv
 Downloaded from s3://cricketer-stats/jacques_kallis/tf/allround_stats.csv
All tf data downloaded from s3://cricketer-stats/jacques_kallis/tf/
Bucket 'cricketer-stats' already ex

In [97]:
display(
    p1_loader.battingstats,
    p1_loader.bowlingstats,
    p1_loader.fieldingstats,
    p1_loader.player_info
)

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location
0,#2742,2008-08-18,ODI,1.0,2.0,12.0,22.0,1.0,0.0,54.54,33.0,lbw,Sri Lanka,Dambulla
1,#2745,2008-08-20,ODI,2.0,2.0,37.0,67.0,6.0,0.0,55.22,82.0,caught,Sri Lanka,Dambulla
2,#2750,2008-08-24,ODI,1.0,1.0,25.0,38.0,4.0,0.0,65.78,40.0,run out,Sri Lanka,Colombo
3,#2755,2008-08-27,ODI,1.0,1.0,54.0,66.0,7.0,0.0,81.81,87.0,bowled,Sri Lanka,Colombo
4,#2756,2008-08-29,ODI,2.0,1.0,31.0,46.0,3.0,1.0,67.39,45.0,lbw,Sri Lanka,Colombo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,#4844,2025-02-20,ODI,2.0,3.0,22.0,38.0,1.0,0.0,57.89,,caught,Bangladesh,Dubai
642,#4847,2025-02-23,ODI,2.0,3.0,100.0,111.0,7.0,0.0,90.09,151.0,not out,Pakistan,Dubai
643,#4852,2025-03-02,ODI,1.0,3.0,11.0,14.0,2.0,0.0,78.57,18.0,caught,New Zealand,Dubai
644,#4853,2025-03-04,ODI,2.0,3.0,84.0,98.0,5.0,0.0,85.71,135.0,caught,Australia,Dubai


Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location
0,#2742,2008-08-18,ODI,2.0,,,,,,,Sri Lanka,Dambulla
1,#2745,2008-08-20,ODI,1.0,,,,,,,Sri Lanka,Dambulla
2,#2750,2008-08-24,ODI,2.0,,,,,,,Sri Lanka,Colombo
3,#2755,2008-08-27,ODI,2.0,,,,,,,Sri Lanka,Colombo
4,#2756,2008-08-29,ODI,1.0,,,,,,,Sri Lanka,Colombo
...,...,...,...,...,...,...,...,...,...,...,...,...
658,#4844,2025-02-20,ODI,1.0,,,,,,,Bangladesh,Dubai
659,#4847,2025-02-23,ODI,1.0,,,,,,,Pakistan,Dubai
660,#4852,2025-03-02,ODI,2.0,,,,,,,New Zealand,Dubai
661,#4853,2025-03-04,ODI,1.0,,,,,,,Australia,Dubai


Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location
0,#2742,2008-08-18,ODI,2.0,0.0,0.0,Sri Lanka,Dambulla
1,#2745,2008-08-20,ODI,1.0,3.0,3.0,Sri Lanka,Dambulla
2,#2750,2008-08-24,ODI,2.0,0.0,0.0,Sri Lanka,Colombo
3,#2755,2008-08-27,ODI,2.0,0.0,0.0,Sri Lanka,Colombo
4,#2756,2008-08-29,ODI,1.0,0.0,0.0,Sri Lanka,Colombo
...,...,...,...,...,...,...,...,...
658,#4844,2025-02-20,ODI,1.0,2.0,2.0,Bangladesh,Dubai
659,#4847,2025-02-23,ODI,1.0,2.0,2.0,Pakistan,Dubai
660,#4852,2025-03-02,ODI,2.0,1.0,1.0,New Zealand,Dubai
661,#4853,2025-03-04,ODI,1.0,2.0,2.0,Australia,Dubai


Unnamed: 0,Player ID,Player URL,FULL NAME,BORN,AGE,BATTING STYLE,BOWLING STYLE,PLAYING ROLE
0,253802,https://www.espncricinfo.com/cricketers/virat-...,Virat Kohli,"November 05, 1988, Delhi",36y 165d,Right hand Bat,Right arm Medium,Top order Batter


In [98]:
display(
    p2_loader.battingstats,
    p2_loader.bowlingstats,
    p2_loader.fieldingstats,
    p2_loader.player_info
)

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location
0,#1318,1995-12-14,Test,1.0,6.0,1.0,12.0,0.0,0.0,8.33,11.0,caught,England,Durban
1,#1321,1996-01-02,Test,2.0,7.0,7.0,65.0,1.0,0.0,10.76,117.0,lbw,England,Cape Town
2,#1321,1996-01-02,Test,4.0,,,,,,,,,England,Cape Town
3,#1033,1996-01-09,ODI,1.0,7.0,38.0,65.0,3.0,0.0,58.46,75.0,caught,England,Cape Town
4,#1034,1996-01-11,ODI,1.0,4.0,29.0,35.0,2.0,1.0,82.85,51.0,caught,England,Bloemfontein
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,#2111,2013-12-26,Test,2.0,4.0,115.0,316.0,13.0,0.0,36.39,393.0,caught,India,Durban
642,#2111,2013-12-26,Test,4.0,,,,,,,,,India,Durban
643,#3500,2014-07-06,ODI,1.0,3.0,0.0,2.0,0.0,0.0,0.00,3.0,lbw,Sri Lanka,Colombo
644,#3501,2014-07-09,ODI,2.0,3.0,1.0,7.0,0.0,0.0,14.28,15.0,caught,Sri Lanka,Pallekele


Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location
0,#1318,1995-12-14,Test,2.0,,,,,,,England,Durban
1,#1321,1996-01-02,Test,1.0,5.0,4.0,2.0,2.0,0.0,0.50,England,Cape Town
2,#1321,1996-01-02,Test,3.0,,,,,,,England,Cape Town
3,#1033,1996-01-09,ODI,2.0,7.0,3.0,0.0,14.0,0.0,4.66,England,Cape Town
4,#1034,1996-01-11,ODI,2.0,6.0,5.0,0.0,27.0,0.0,5.40,England,Bloemfontein
...,...,...,...,...,...,...,...,...,...,...,...,...
663,#2111,2013-12-26,Test,1.0,4.0,11.0,1.0,36.0,0.0,3.27,India,Durban
664,#2111,2013-12-26,Test,3.0,,,,,,,India,Durban
665,#3500,2014-07-06,ODI,2.0,,,,,,,Sri Lanka,Colombo
666,#3501,2014-07-09,ODI,1.0,,,,,,,Sri Lanka,Pallekele


Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location
0,#1318,1995-12-14,Test,2.0,0.0,0.0,England,Durban
1,#1321,1996-01-02,Test,1.0,0.0,0.0,England,Cape Town
2,#1321,1996-01-02,Test,3.0,1.0,1.0,England,Cape Town
3,#1033,1996-01-09,ODI,2.0,0.0,0.0,England,Cape Town
4,#1034,1996-01-11,ODI,2.0,0.0,0.0,England,Bloemfontein
...,...,...,...,...,...,...,...,...
663,#2111,2013-12-26,Test,1.0,1.0,1.0,India,Durban
664,#2111,2013-12-26,Test,3.0,0.0,0.0,India,Durban
665,#3500,2014-07-06,ODI,2.0,1.0,1.0,Sri Lanka,Colombo
666,#3501,2014-07-09,ODI,1.0,0.0,0.0,Sri Lanka,Pallekele


Unnamed: 0,Player ID,Player URL,FULL NAME,BORN,AGE,BATTING STYLE,BOWLING STYLE,PLAYING ROLE
0,45789,https://www.espncricinfo.com/cricketers/jacque...,Jacques Henry Kallis,"October 16, 1975, Pinelands, Cape Town, Cape P...",49y 182d,Right hand Bat,Right arm Fast medium,Allrounder


In [99]:
display(
    p3_loader.battingstats,
    p3_loader.bowlingstats,
    p3_loader.fieldingstats,
    p3_loader.player_info
)

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location
0,#1919,2002-12-15,ODI,2.0,10.0,6.0,13.0,1.0,0.0,46.15,18.0,bowled,Australia,Melbourne
1,#1922,2002-12-17,ODI,1.0,10.0,0.0,8.0,0.0,0.0,0.00,10.0,bowled,Sri Lanka,Brisbane
2,#1924,2002-12-20,ODI,1.0,10.0,1.0,3.0,0.0,0.0,33.33,3.0,not out,Sri Lanka,Perth
3,#1932,2003-01-11,ODI,2.0,,,,,,,,,Australia,Hobart
4,#1934,2003-01-13,ODI,2.0,10.0,1.0,11.0,0.0,0.0,9.09,11.0,bowled,Sri Lanka,Sydney
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551,#2531,2024-02-23,Test,1.0,11.0,0.0,4.0,0.0,0.0,0.00,7.0,lbw,India,Ranchi
552,#2531,2024-02-23,Test,3.0,11.0,0.0,3.0,0.0,0.0,0.00,1.0,caught,India,Ranchi
553,#2534,2024-03-07,Test,1.0,11.0,0.0,3.0,0.0,0.0,0.00,2.0,caught,India,Dharamshala
554,#2534,2024-03-07,Test,3.0,11.0,0.0,5.0,0.0,0.0,0.00,9.0,not out,India,Dharamshala


Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location
0,#1919,2002-12-15,ODI,1.0,2.0,6.0,0.0,46.0,1.0,7.66,Australia,Melbourne
1,#1922,2002-12-17,ODI,2.0,2.0,10.0,1.0,48.0,2.0,4.80,Sri Lanka,Brisbane
2,#1924,2002-12-20,ODI,2.0,2.0,8.0,2.0,23.0,2.0,2.87,Sri Lanka,Perth
3,#1932,2003-01-11,ODI,1.0,2.0,10.0,1.0,40.0,2.0,4.00,Australia,Hobart
4,#1934,2003-01-13,ODI,1.0,2.0,9.0,0.0,58.0,2.0,6.44,Sri Lanka,Sydney
...,...,...,...,...,...,...,...,...,...,...,...,...
564,#2531,2024-02-23,Test,2.0,1.0,18.0,4.0,48.0,2.0,2.66,India,Ranchi
565,#2531,2024-02-23,Test,4.0,4.0,3.0,1.0,12.0,0.0,4.00,India,Ranchi
566,#2534,2024-03-07,Test,2.0,1.0,16.0,2.0,60.0,2.0,3.75,India,Dharamshala
567,#2538,2024-07-10,Test,1.0,1.0,10.4,3.0,26.0,1.0,2.43,West Indies,London


Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location
0,#1919,2002-12-15,ODI,1.0,0.0,0.0,Australia,Melbourne
1,#1922,2002-12-17,ODI,2.0,0.0,0.0,Sri Lanka,Brisbane
2,#1924,2002-12-20,ODI,2.0,0.0,0.0,Sri Lanka,Perth
3,#1932,2003-01-11,ODI,1.0,0.0,0.0,Australia,Hobart
4,#1934,2003-01-13,ODI,1.0,1.0,1.0,Sri Lanka,Sydney
...,...,...,...,...,...,...,...,...
564,#2531,2024-02-23,Test,2.0,0.0,0.0,India,Ranchi
565,#2531,2024-02-23,Test,4.0,1.0,1.0,India,Ranchi
566,#2534,2024-03-07,Test,2.0,0.0,0.0,India,Dharamshala
567,#2538,2024-07-10,Test,1.0,0.0,0.0,West Indies,London


Unnamed: 0,Player ID,Player URL,FULL NAME,BORN,AGE,NICKNAMES,BATTING STYLE,BOWLING STYLE,PLAYING ROLE,HEIGHT,EDUCATION
0,8608,https://www.espncricinfo.com/cricketers/james-...,James Michael Anderson,"July 30, 1982, Burnley, Lancashire",42y 260d,Jimmy,Left hand Bat,Right arm Fast medium,Bowler,6ft 2in,St Theodore's RC High School; St Theodore's RC...


Here, it is clear that there are other parameters in the `Player Info` data, only not for everyone. 
However, the additional data (Nicknames, Height and Education etc) are not relevant. So, we will omit them. 

In [100]:
batting_master = None
bowling_master = None
fielding_master = None
allround_master = None
info_master = None

#### Batting

We will test this - one by one, by partitioning data into 2 parts for each player, and then randomly uploading to check. 

In [None]:
# For Virat Kohli
p1_concat_df = p1_loader.battingstats.copy()
p1_concat_df['Player ID'] = p1_loader.player_info['Player ID'][0]
p1_concat_df['Inns ID'] = (p1_concat_df['Player ID'].astype(str)+"_"+
                             p1_concat_df['Match ID'].astype(str)+"_"+
                             p1_concat_df['Inns'].astype(str)
                            )
p1_concat_df_1 = p1_concat_df.copy().iloc[:300,]
p1_concat_df_2 = p1_concat_df.copy().iloc[200:,]

p1_concat_df_1.shape[0], p1_concat_df_2.shape[0]

(300, 446)

In [103]:
# For Jacques Kallis
p2_concat_df = p2_loader.battingstats.copy()
p2_concat_df['Player ID'] = p2_loader.player_info['Player ID'][0]
p2_concat_df['Inns ID'] = (p2_concat_df['Player ID'].astype(str)+"_"+
                             p2_concat_df['Match ID'].astype(str)+"_"+
                             p2_concat_df['Inns'].astype(str)
                            )
p2_concat_df_1 = p2_concat_df.copy().iloc[:300,]
p2_concat_df_2 = p2_concat_df.copy().iloc[200:,]

p2_concat_df_1.shape[0], p2_concat_df_2.shape[0]

(300, 446)

In [104]:
# For James Anderson
p3_concat_df = p3_loader.battingstats.copy()
p3_concat_df['Player ID'] = p3_loader.player_info['Player ID'][0]
p3_concat_df['Inns ID'] = (p3_concat_df['Player ID'].astype(str)+"_"+
                             p3_concat_df['Match ID'].astype(str)+"_"+
                             p3_concat_df['Inns'].astype(str)
                            )
p3_concat_df_1 = p3_concat_df.copy().iloc[:300,]
p3_concat_df_2 = p3_concat_df.copy().iloc[200:,]

p3_concat_df_1.shape[0], p3_concat_df_2.shape[0]

(300, 356)

##### Loading 1st parts of all 3 players

Now, we will load 1st part of all 3 players. 

In [105]:
batting_master = None

In [106]:
# Adding Virat Kohli's data
if p1_concat_df_1 is None:
    pass
elif batting_master is None:
    batting_master = p1_concat_df_1
else:
    batting_master = pd.concat([batting_master, p1_concat_df_1])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

batting_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location,Player ID,Inns ID
0,#2742,2008-08-18,ODI,1.0,2.0,12.0,22.0,1.0,0.0,54.54,33.0,lbw,Sri Lanka,Dambulla,253802,253802_#2742_1.0
1,#2745,2008-08-20,ODI,2.0,2.0,37.0,67.0,6.0,0.0,55.22,82.0,caught,Sri Lanka,Dambulla,253802,253802_#2745_2.0
2,#2750,2008-08-24,ODI,1.0,1.0,25.0,38.0,4.0,0.0,65.78,40.0,run out,Sri Lanka,Colombo,253802,253802_#2750_1.0
3,#2755,2008-08-27,ODI,1.0,1.0,54.0,66.0,7.0,0.0,81.81,87.0,bowled,Sri Lanka,Colombo,253802,253802_#2755_1.0
4,#2756,2008-08-29,ODI,2.0,1.0,31.0,46.0,3.0,1.0,67.39,45.0,lbw,Sri Lanka,Colombo,253802,253802_#2756_2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,#563,2016-08-28,T20I,2.0,,,,,,,,,West Indies,Lauderhill,253802,253802_#563_2.0
296,#2221,2016-09-22,Test,1.0,4.0,9.0,10.0,2.0,0.0,90.00,12.0,caught,New Zealand,Kanpur,253802,253802_#2221_1.0
297,#2221,2016-09-22,Test,3.0,4.0,18.0,40.0,3.0,0.0,45.00,45.0,caught,New Zealand,Kanpur,253802,253802_#2221_3.0
298,#2222,2016-09-30,Test,1.0,4.0,9.0,28.0,1.0,0.0,32.14,42.0,caught,New Zealand,Kolkata,253802,253802_#2222_1.0


In [107]:
batting_master.shape[0]

300

In [108]:
# Adding Jacques Kallis's data
if p2_concat_df_1 is None:
    pass
elif batting_master is None:
    batting_master = p2_concat_df_1
else:
    batting_master = pd.concat([batting_master, p2_concat_df_1])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

batting_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location,Player ID,Inns ID
0,#1318,1995-12-14,Test,1.0,6.0,1.0,12.0,0.0,0.0,8.33,11.0,caught,England,Durban,45789,45789_#1318_1.0
2,#1321,1996-01-02,Test,4.0,,,,,,,,,England,Cape Town,45789,45789_#1321_4.0
3,#1033,1996-01-09,ODI,1.0,7.0,38.0,65.0,3.0,0.0,58.46,75.0,caught,England,Cape Town,45789,45789_#1033_1.0
4,#1034,1996-01-11,ODI,1.0,4.0,29.0,35.0,2.0,1.0,82.85,51.0,caught,England,Bloemfontein,45789,45789_#1034_1.0
5,#1036,1996-01-13,ODI,2.0,5.0,16.0,49.0,0.0,0.0,32.65,51.0,run out,England,Johannesburg,45789,45789_#1036_2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,#563,2016-08-28,T20I,2.0,,,,,,,,,West Indies,Lauderhill,253802,253802_#563_2.0
296,#2221,2016-09-22,Test,1.0,4.0,9.0,10.0,2.0,0.0,90.00,12.0,caught,New Zealand,Kanpur,253802,253802_#2221_1.0
297,#2221,2016-09-22,Test,3.0,4.0,18.0,40.0,3.0,0.0,45.00,45.0,caught,New Zealand,Kanpur,253802,253802_#2221_3.0
298,#2222,2016-09-30,Test,1.0,4.0,9.0,28.0,1.0,0.0,32.14,42.0,caught,New Zealand,Kolkata,253802,253802_#2222_1.0


We can see there must be some common values.

In [117]:
pd.concat([p1_concat_df_1,p2_concat_df_1])['Inns ID'].value_counts()

Inns ID
45789_#1321_2.0     2
45789_#1397_1.0     2
45789_#1468_2.0     2
45789_#1507_1.0     2
45789_#1544_2.0     2
                   ..
45789_#1656_1.0     1
45789_#1656_3.0     1
45789_#1659_1.0     1
45789_#1659_3.0     1
253802_#2756_2.0    1
Name: count, Length: 593, dtype: int64

In [119]:
p2_concat_df[p2_concat_df['Match ID']=='#1321']

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location,Player ID,Inns ID
1,#1321,1996-01-02,Test,2.0,7.0,7.0,65.0,1.0,0.0,10.76,117.0,lbw,England,Cape Town,45789,45789_#1321_2.0
2,#1321,1996-01-02,Test,4.0,,,,,,,,,England,Cape Town,45789,45789_#1321_4.0
67,#1321,1998-04-19,ODI,2.0,3.0,39.0,51.0,4.0,1.0,76.47,71.0,caught,Sri Lanka,Bloemfontein,45789,45789_#1321_2.0


Ok. Let us modify the composite key - (player id + format + match id + inns number). This should work. 

In [121]:
# For Virat Kohli
p1_concat_df = p1_loader.battingstats.copy()
p1_concat_df['Player ID'] = p1_loader.player_info['Player ID'][0]
p1_concat_df['Inns ID'] = (p1_concat_df['Player ID'].astype(str)+"_"+
                           p1_concat_df['Format'].astype(str)+
                           p1_concat_df['Match ID'].astype(str)+"_"+
                           p1_concat_df['Inns'].astype(str)
                            )
p1_concat_df_1 = p1_concat_df.copy().iloc[:300,]
p1_concat_df_2 = p1_concat_df.copy().iloc[200:,]

print("p1:",p1_concat_df_1.shape[0], p1_concat_df_2.shape[0])

# For Jacques Kallis
p2_concat_df = p2_loader.battingstats.copy()
p2_concat_df['Player ID'] = p2_loader.player_info['Player ID'][0]
p2_concat_df['Inns ID'] = (p2_concat_df['Player ID'].astype(str)+"_"+
                           p2_concat_df['Format'].astype(str)+
                           p2_concat_df['Match ID'].astype(str)+"_"+
                           p2_concat_df['Inns'].astype(str)
                            )
p2_concat_df_1 = p2_concat_df.copy().iloc[:300,]
p2_concat_df_2 = p2_concat_df.copy().iloc[200:,]

print("p2:",p2_concat_df_1.shape[0], p2_concat_df_2.shape[0])

# For James Anderson
p3_concat_df = p3_loader.battingstats.copy()
p3_concat_df['Player ID'] = p3_loader.player_info['Player ID'][0]
p3_concat_df['Inns ID'] = (p3_concat_df['Player ID'].astype(str)+"_"+
                           p3_concat_df['Format'].astype(str)+
                           p3_concat_df['Match ID'].astype(str)+"_"+
                           p3_concat_df['Inns'].astype(str)
                            )
p3_concat_df_1 = p3_concat_df.copy().iloc[:300,]
p3_concat_df_2 = p3_concat_df.copy().iloc[200:,]

print("p3:",p3_concat_df_1.shape[0], p3_concat_df_2.shape[0])

p1: 300 446
p2: 300 446
p3: 300 356


Let us test now. 

In [122]:
batting_master = None

In [123]:
# Adding Virat Kohli's data
if p1_concat_df_1 is None:
    pass
elif batting_master is None:
    batting_master = p1_concat_df_1
else:
    batting_master = pd.concat([batting_master, p1_concat_df_1])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

batting_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location,Player ID,Inns ID
0,#2742,2008-08-18,ODI,1.0,2.0,12.0,22.0,1.0,0.0,54.54,33.0,lbw,Sri Lanka,Dambulla,253802,253802_ODI#2742_1.0
1,#2745,2008-08-20,ODI,2.0,2.0,37.0,67.0,6.0,0.0,55.22,82.0,caught,Sri Lanka,Dambulla,253802,253802_ODI#2745_2.0
2,#2750,2008-08-24,ODI,1.0,1.0,25.0,38.0,4.0,0.0,65.78,40.0,run out,Sri Lanka,Colombo,253802,253802_ODI#2750_1.0
3,#2755,2008-08-27,ODI,1.0,1.0,54.0,66.0,7.0,0.0,81.81,87.0,bowled,Sri Lanka,Colombo,253802,253802_ODI#2755_1.0
4,#2756,2008-08-29,ODI,2.0,1.0,31.0,46.0,3.0,1.0,67.39,45.0,lbw,Sri Lanka,Colombo,253802,253802_ODI#2756_2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,#563,2016-08-28,T20I,2.0,,,,,,,,,West Indies,Lauderhill,253802,253802_T20I#563_2.0
296,#2221,2016-09-22,Test,1.0,4.0,9.0,10.0,2.0,0.0,90.00,12.0,caught,New Zealand,Kanpur,253802,253802_Test#2221_1.0
297,#2221,2016-09-22,Test,3.0,4.0,18.0,40.0,3.0,0.0,45.00,45.0,caught,New Zealand,Kanpur,253802,253802_Test#2221_3.0
298,#2222,2016-09-30,Test,1.0,4.0,9.0,28.0,1.0,0.0,32.14,42.0,caught,New Zealand,Kolkata,253802,253802_Test#2222_1.0


In [124]:
# Adding Jacques Kallis's data
if p2_concat_df_1 is None:
    pass
elif batting_master is None:
    batting_master = p2_concat_df_1
else:
    batting_master = pd.concat([batting_master, p2_concat_df_1])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

batting_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location,Player ID,Inns ID
0,#1318,1995-12-14,Test,1.0,6.0,1.0,12.0,0.0,0.0,8.33,11.0,caught,England,Durban,45789,45789_Test#1318_1.0
1,#1321,1996-01-02,Test,2.0,7.0,7.0,65.0,1.0,0.0,10.76,117.0,lbw,England,Cape Town,45789,45789_Test#1321_2.0
2,#1321,1996-01-02,Test,4.0,,,,,,,,,England,Cape Town,45789,45789_Test#1321_4.0
3,#1033,1996-01-09,ODI,1.0,7.0,38.0,65.0,3.0,0.0,58.46,75.0,caught,England,Cape Town,45789,45789_ODI#1033_1.0
4,#1034,1996-01-11,ODI,1.0,4.0,29.0,35.0,2.0,1.0,82.85,51.0,caught,England,Bloemfontein,45789,45789_ODI#1034_1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,#563,2016-08-28,T20I,2.0,,,,,,,,,West Indies,Lauderhill,253802,253802_T20I#563_2.0
296,#2221,2016-09-22,Test,1.0,4.0,9.0,10.0,2.0,0.0,90.00,12.0,caught,New Zealand,Kanpur,253802,253802_Test#2221_1.0
297,#2221,2016-09-22,Test,3.0,4.0,18.0,40.0,3.0,0.0,45.00,45.0,caught,New Zealand,Kanpur,253802,253802_Test#2221_3.0
298,#2222,2016-09-30,Test,1.0,4.0,9.0,28.0,1.0,0.0,32.14,42.0,caught,New Zealand,Kolkata,253802,253802_Test#2222_1.0


In [125]:
batting_master['Inns ID'].nunique()

600

Perfect! This is taken care of, now. Let us move towards James Anderson's 1st part data.

In [126]:
# Adding James Anderson's data
if p3_concat_df_1 is None:
    pass
elif batting_master is None:
    batting_master = p3_concat_df_1
else:
    batting_master = pd.concat([batting_master, p3_concat_df_1])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

batting_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location,Player ID,Inns ID
0,#1919,2002-12-15,ODI,2.0,10.0,6.0,13.0,1.0,0.0,46.15,18.0,bowled,Australia,Melbourne,8608,8608_ODI#1919_2.0
1,#1922,2002-12-17,ODI,1.0,10.0,0.0,8.0,0.0,0.0,0.00,10.0,bowled,Sri Lanka,Brisbane,8608,8608_ODI#1922_1.0
2,#1924,2002-12-20,ODI,1.0,10.0,1.0,3.0,0.0,0.0,33.33,3.0,not out,Sri Lanka,Perth,8608,8608_ODI#1924_1.0
3,#1932,2003-01-11,ODI,2.0,,,,,,,,,Australia,Hobart,8608,8608_ODI#1932_2.0
4,#1934,2003-01-13,ODI,2.0,10.0,1.0,11.0,0.0,0.0,9.09,11.0,bowled,Sri Lanka,Sydney,8608,8608_ODI#1934_2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,#563,2016-08-28,T20I,2.0,,,,,,,,,West Indies,Lauderhill,253802,253802_T20I#563_2.0
296,#2221,2016-09-22,Test,1.0,4.0,9.0,10.0,2.0,0.0,90.00,12.0,caught,New Zealand,Kanpur,253802,253802_Test#2221_1.0
297,#2221,2016-09-22,Test,3.0,4.0,18.0,40.0,3.0,0.0,45.00,45.0,caught,New Zealand,Kanpur,253802,253802_Test#2221_3.0
298,#2222,2016-09-30,Test,1.0,4.0,9.0,28.0,1.0,0.0,32.14,42.0,caught,New Zealand,Kolkata,253802,253802_Test#2222_1.0


In [127]:
batting_master.nunique()

Match ID      710
Start Date    737
Format          3
Inns            4
Pos            11
Runs          133
BF            171
4s             24
6s              7
SR            428
Mins          209
Dismissal       7
Opposition     18
Location       99
Player ID       3
Inns ID       900
dtype: int64

Perfect! 1st part is loaded. Now, let's come to 2nd part. 

##### Loading 2nd Part of all 3 players

In [128]:
# Adding Virat Kohli's data
if p1_concat_df_2 is None:
    pass
elif batting_master is None:
    batting_master = p1_concat_df_2
else:
    batting_master = pd.concat([batting_master, p1_concat_df_2])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

batting_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location,Player ID,Inns ID
0,#1919,2002-12-15,ODI,2.0,10.0,6.0,13.0,1.0,0.0,46.15,18.0,bowled,Australia,Melbourne,8608,8608_ODI#1919_2.0
1,#1922,2002-12-17,ODI,1.0,10.0,0.0,8.0,0.0,0.0,0.00,10.0,bowled,Sri Lanka,Brisbane,8608,8608_ODI#1922_1.0
2,#1924,2002-12-20,ODI,1.0,10.0,1.0,3.0,0.0,0.0,33.33,3.0,not out,Sri Lanka,Perth,8608,8608_ODI#1924_1.0
3,#1932,2003-01-11,ODI,2.0,,,,,,,,,Australia,Hobart,8608,8608_ODI#1932_2.0
4,#1934,2003-01-13,ODI,2.0,10.0,1.0,11.0,0.0,0.0,9.09,11.0,bowled,Sri Lanka,Sydney,8608,8608_ODI#1934_2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,#4844,2025-02-20,ODI,2.0,3.0,22.0,38.0,1.0,0.0,57.89,,caught,Bangladesh,Dubai,253802,253802_ODI#4844_2.0
642,#4847,2025-02-23,ODI,2.0,3.0,100.0,111.0,7.0,0.0,90.09,151.0,not out,Pakistan,Dubai,253802,253802_ODI#4847_2.0
643,#4852,2025-03-02,ODI,1.0,3.0,11.0,14.0,2.0,0.0,78.57,18.0,caught,New Zealand,Dubai,253802,253802_ODI#4852_1.0
644,#4853,2025-03-04,ODI,2.0,3.0,84.0,98.0,5.0,0.0,85.71,135.0,caught,Australia,Dubai,253802,253802_ODI#4853_2.0


In [129]:
batting_master[batting_master['Player ID']==p1_loader.player_info['Player ID'][0]]

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location,Player ID,Inns ID
0,#2742,2008-08-18,ODI,1.0,2.0,12.0,22.0,1.0,0.0,54.54,33.0,lbw,Sri Lanka,Dambulla,253802,253802_ODI#2742_1.0
1,#2745,2008-08-20,ODI,2.0,2.0,37.0,67.0,6.0,0.0,55.22,82.0,caught,Sri Lanka,Dambulla,253802,253802_ODI#2745_2.0
2,#2750,2008-08-24,ODI,1.0,1.0,25.0,38.0,4.0,0.0,65.78,40.0,run out,Sri Lanka,Colombo,253802,253802_ODI#2750_1.0
3,#2755,2008-08-27,ODI,1.0,1.0,54.0,66.0,7.0,0.0,81.81,87.0,bowled,Sri Lanka,Colombo,253802,253802_ODI#2755_1.0
4,#2756,2008-08-29,ODI,2.0,1.0,31.0,46.0,3.0,1.0,67.39,45.0,lbw,Sri Lanka,Colombo,253802,253802_ODI#2756_2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,#4844,2025-02-20,ODI,2.0,3.0,22.0,38.0,1.0,0.0,57.89,,caught,Bangladesh,Dubai,253802,253802_ODI#4844_2.0
642,#4847,2025-02-23,ODI,2.0,3.0,100.0,111.0,7.0,0.0,90.09,151.0,not out,Pakistan,Dubai,253802,253802_ODI#4847_2.0
643,#4852,2025-03-02,ODI,1.0,3.0,11.0,14.0,2.0,0.0,78.57,18.0,caught,New Zealand,Dubai,253802,253802_ODI#4852_1.0
644,#4853,2025-03-04,ODI,2.0,3.0,84.0,98.0,5.0,0.0,85.71,135.0,caught,Australia,Dubai,253802,253802_ODI#4853_2.0


Perfect! This matches the records! Let us check for other two players as well. 

In [130]:
# Adding Jacques Kallis's data
if p2_concat_df_2 is None:
    pass
elif batting_master is None:
    batting_master = p2_concat_df_2
else:
    batting_master = pd.concat([batting_master, p2_concat_df_2])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

batting_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location,Player ID,Inns ID
0,#1919,2002-12-15,ODI,2.0,10.0,6.0,13.0,1.0,0.0,46.15,18.0,bowled,Australia,Melbourne,8608,8608_ODI#1919_2.0
1,#1922,2002-12-17,ODI,1.0,10.0,0.0,8.0,0.0,0.0,0.00,10.0,bowled,Sri Lanka,Brisbane,8608,8608_ODI#1922_1.0
2,#1924,2002-12-20,ODI,1.0,10.0,1.0,3.0,0.0,0.0,33.33,3.0,not out,Sri Lanka,Perth,8608,8608_ODI#1924_1.0
3,#1932,2003-01-11,ODI,2.0,,,,,,,,,Australia,Hobart,8608,8608_ODI#1932_2.0
4,#1934,2003-01-13,ODI,2.0,10.0,1.0,11.0,0.0,0.0,9.09,11.0,bowled,Sri Lanka,Sydney,8608,8608_ODI#1934_2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,#4844,2025-02-20,ODI,2.0,3.0,22.0,38.0,1.0,0.0,57.89,,caught,Bangladesh,Dubai,253802,253802_ODI#4844_2.0
642,#4847,2025-02-23,ODI,2.0,3.0,100.0,111.0,7.0,0.0,90.09,151.0,not out,Pakistan,Dubai,253802,253802_ODI#4847_2.0
643,#4852,2025-03-02,ODI,1.0,3.0,11.0,14.0,2.0,0.0,78.57,18.0,caught,New Zealand,Dubai,253802,253802_ODI#4852_1.0
644,#4853,2025-03-04,ODI,2.0,3.0,84.0,98.0,5.0,0.0,85.71,135.0,caught,Australia,Dubai,253802,253802_ODI#4853_2.0


In [131]:
batting_master[batting_master['Player ID']==p2_loader.player_info['Player ID'][0]]

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location,Player ID,Inns ID
0,#1318,1995-12-14,Test,1.0,6.0,1.0,12.0,0.0,0.0,8.33,11.0,caught,England,Durban,45789,45789_Test#1318_1.0
1,#1321,1996-01-02,Test,2.0,7.0,7.0,65.0,1.0,0.0,10.76,117.0,lbw,England,Cape Town,45789,45789_Test#1321_2.0
2,#1321,1996-01-02,Test,4.0,,,,,,,,,England,Cape Town,45789,45789_Test#1321_4.0
3,#1033,1996-01-09,ODI,1.0,7.0,38.0,65.0,3.0,0.0,58.46,75.0,caught,England,Cape Town,45789,45789_ODI#1033_1.0
4,#1034,1996-01-11,ODI,1.0,4.0,29.0,35.0,2.0,1.0,82.85,51.0,caught,England,Bloemfontein,45789,45789_ODI#1034_1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,#2111,2013-12-26,Test,2.0,4.0,115.0,316.0,13.0,0.0,36.39,393.0,caught,India,Durban,45789,45789_Test#2111_2.0
642,#2111,2013-12-26,Test,4.0,,,,,,,,,India,Durban,45789,45789_Test#2111_4.0
643,#3500,2014-07-06,ODI,1.0,3.0,0.0,2.0,0.0,0.0,0.00,3.0,lbw,Sri Lanka,Colombo,45789,45789_ODI#3500_1.0
644,#3501,2014-07-09,ODI,2.0,3.0,1.0,7.0,0.0,0.0,14.28,15.0,caught,Sri Lanka,Pallekele,45789,45789_ODI#3501_2.0


Perfect! This matches with the records as well. Let us check for the last one. 

In [132]:
# Adding James Anderson's data
if p3_concat_df_2 is None:
    pass
elif batting_master is None:
    batting_master = p3_concat_df_2
else:
    batting_master = pd.concat([batting_master, p3_concat_df_2])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

batting_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location,Player ID,Inns ID
0,#1919,2002-12-15,ODI,2.0,10.0,6.0,13.0,1.0,0.0,46.15,18.0,bowled,Australia,Melbourne,8608,8608_ODI#1919_2.0
1,#1922,2002-12-17,ODI,1.0,10.0,0.0,8.0,0.0,0.0,0.00,10.0,bowled,Sri Lanka,Brisbane,8608,8608_ODI#1922_1.0
2,#1924,2002-12-20,ODI,1.0,10.0,1.0,3.0,0.0,0.0,33.33,3.0,not out,Sri Lanka,Perth,8608,8608_ODI#1924_1.0
3,#1932,2003-01-11,ODI,2.0,,,,,,,,,Australia,Hobart,8608,8608_ODI#1932_2.0
4,#1934,2003-01-13,ODI,2.0,10.0,1.0,11.0,0.0,0.0,9.09,11.0,bowled,Sri Lanka,Sydney,8608,8608_ODI#1934_2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,#4844,2025-02-20,ODI,2.0,3.0,22.0,38.0,1.0,0.0,57.89,,caught,Bangladesh,Dubai,253802,253802_ODI#4844_2.0
642,#4847,2025-02-23,ODI,2.0,3.0,100.0,111.0,7.0,0.0,90.09,151.0,not out,Pakistan,Dubai,253802,253802_ODI#4847_2.0
643,#4852,2025-03-02,ODI,1.0,3.0,11.0,14.0,2.0,0.0,78.57,18.0,caught,New Zealand,Dubai,253802,253802_ODI#4852_1.0
644,#4853,2025-03-04,ODI,2.0,3.0,84.0,98.0,5.0,0.0,85.71,135.0,caught,Australia,Dubai,253802,253802_ODI#4853_2.0


In [133]:
batting_master[batting_master['Player ID']==p3_loader.player_info['Player ID'][0]]

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location,Player ID,Inns ID
0,#1919,2002-12-15,ODI,2.0,10.0,6.0,13.0,1.0,0.0,46.15,18.0,bowled,Australia,Melbourne,8608,8608_ODI#1919_2.0
1,#1922,2002-12-17,ODI,1.0,10.0,0.0,8.0,0.0,0.0,0.00,10.0,bowled,Sri Lanka,Brisbane,8608,8608_ODI#1922_1.0
2,#1924,2002-12-20,ODI,1.0,10.0,1.0,3.0,0.0,0.0,33.33,3.0,not out,Sri Lanka,Perth,8608,8608_ODI#1924_1.0
3,#1932,2003-01-11,ODI,2.0,,,,,,,,,Australia,Hobart,8608,8608_ODI#1932_2.0
4,#1934,2003-01-13,ODI,2.0,10.0,1.0,11.0,0.0,0.0,9.09,11.0,bowled,Sri Lanka,Sydney,8608,8608_ODI#1934_2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551,#2531,2024-02-23,Test,1.0,11.0,0.0,4.0,0.0,0.0,0.00,7.0,lbw,India,Ranchi,8608,8608_Test#2531_1.0
552,#2531,2024-02-23,Test,3.0,11.0,0.0,3.0,0.0,0.0,0.00,1.0,caught,India,Ranchi,8608,8608_Test#2531_3.0
553,#2534,2024-03-07,Test,1.0,11.0,0.0,3.0,0.0,0.0,0.00,2.0,caught,India,Dharamshala,8608,8608_Test#2534_1.0
554,#2534,2024-03-07,Test,3.0,11.0,0.0,5.0,0.0,0.0,0.00,9.0,not out,India,Dharamshala,8608,8608_Test#2534_3.0


Excellent. For reference, let us output the number of records for all 3 players. 

In [142]:
print("Original\n------------")
print(f"p1 ({p1_loader.player_info['Player ID'][0]}):",p1_loader.battingstats.shape[0])
print(f"p2 ({p2_loader.player_info['Player ID'][0]}):",p2_loader.battingstats.shape[0])
print(f"p3 ({p3_loader.player_info['Player ID'][0]}):", p3_loader.battingstats.shape[0])

print("\nInside master data sheet------------")
batting_master['Player ID'].value_counts()

Original
------------
p1 (253802): 646
p2 (45789): 646
p3 (8608): 556

Inside master data sheet------------


Player ID
45789     646
253802    646
8608      556
Name: count, dtype: int64

Perfect! The batting sheet is tested, and ready. 

#### Bowling

We will test this, again - one by one, by partitioning data into 2 parts for each player, and then randomly uploading to check. 

In [152]:
# For Virat Kohli
p1_concat_df = p1_loader.bowlingstats.copy()
p1_concat_df['Player ID'] = p1_loader.player_info['Player ID'][0]
p1_concat_df['Inns ID'] = (p1_concat_df['Player ID'].astype(str)+"_"+
                           p1_concat_df['Format'].astype(str)+
                           p1_concat_df['Match ID'].astype(str)+"_"+
                           p1_concat_df['Inns'].astype(str)
                            )
p1_concat_df_1 = p1_concat_df.copy().iloc[:300,]
p1_concat_df_2 = p1_concat_df.copy().iloc[200:,]

print("p1:",p1_concat_df_1.shape[0], p1_concat_df_2.shape[0])

# For Jacques Kallis
p2_concat_df = p2_loader.bowlingstats.copy()
p2_concat_df['Player ID'] = p2_loader.player_info['Player ID'][0]
p2_concat_df['Inns ID'] = (p2_concat_df['Player ID'].astype(str)+"_"+
                           p2_concat_df['Format'].astype(str)+
                           p2_concat_df['Match ID'].astype(str)+"_"+
                           p2_concat_df['Inns'].astype(str)
                            )
p2_concat_df_1 = p2_concat_df.copy().iloc[:300,]
p2_concat_df_2 = p2_concat_df.copy().iloc[200:,]

print("p2:",p2_concat_df_1.shape[0], p2_concat_df_2.shape[0])

# For James Anderson
p3_concat_df = p3_loader.bowlingstats.copy()
p3_concat_df['Player ID'] = p3_loader.player_info['Player ID'][0]
p3_concat_df['Inns ID'] = (p3_concat_df['Player ID'].astype(str)+"_"+
                           p3_concat_df['Format'].astype(str)+
                           p3_concat_df['Match ID'].astype(str)+"_"+
                           p3_concat_df['Inns'].astype(str)
                            )
p3_concat_df_1 = p3_concat_df.copy().iloc[:300,]
p3_concat_df_2 = p3_concat_df.copy().iloc[200:,]

print("p3:",p3_concat_df_1.shape[0], p3_concat_df_2.shape[0])

p1: 300 463
p2: 300 468
p3: 300 369


##### Loading 1st parts of all 3 players

Now, we will load 1st part of all 3 players. 

In [153]:
bowling_master = None

In [154]:
# Adding Virat Kohli's data
if p1_concat_df_1 is None:
    pass
elif bowling_master is None:
    bowling_master = p1_concat_df_1
else:
    bowling_master = pd.concat([bowling_master, p1_concat_df_1])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

bowling_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location,Player ID,Inns ID
0,#2742,2008-08-18,ODI,2.0,,,,,,,Sri Lanka,Dambulla,253802,253802_ODI#2742_2.0
1,#2745,2008-08-20,ODI,1.0,,,,,,,Sri Lanka,Dambulla,253802,253802_ODI#2745_1.0
2,#2750,2008-08-24,ODI,2.0,,,,,,,Sri Lanka,Colombo,253802,253802_ODI#2750_2.0
3,#2755,2008-08-27,ODI,2.0,,,,,,,Sri Lanka,Colombo,253802,253802_ODI#2755_2.0
4,#2756,2008-08-29,ODI,1.0,,,,,,,Sri Lanka,Colombo,253802,253802_ODI#2756_1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,#2215,2016-08-09,Test,4.0,,,,,,,West Indies,Gros Islet,253802,253802_Test#2215_4.0
296,#2218,2016-08-18,Test,1.0,,,,,,,West Indies,Port of Spain,253802,253802_Test#2218_1.0
297,#562,2016-08-27,T20I,1.0,,,,,,,West Indies,Lauderhill,253802,253802_T20I#562_1.0
298,#563,2016-08-28,T20I,1.0,,,,,,,West Indies,Lauderhill,253802,253802_T20I#563_1.0


In [155]:
bowling_master.shape[0]

300

In [156]:
# Adding Jacques Kallis's data
if p2_concat_df_1 is None:
    pass
elif bowling_master is None:
    bowling_master = p2_concat_df_1
else:
    bowling_master = pd.concat([bowling_master, p2_concat_df_1])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

bowling_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location,Player ID,Inns ID
0,#1318,1995-12-14,Test,2.0,,,,,,,England,Durban,45789,45789_Test#1318_2.0
1,#1321,1996-01-02,Test,1.0,5.0,4.0,2.0,2.0,0.0,0.50,England,Cape Town,45789,45789_Test#1321_1.0
2,#1321,1996-01-02,Test,3.0,,,,,,,England,Cape Town,45789,45789_Test#1321_3.0
3,#1033,1996-01-09,ODI,2.0,7.0,3.0,0.0,14.0,0.0,4.66,England,Cape Town,45789,45789_ODI#1033_2.0
4,#1034,1996-01-11,ODI,2.0,6.0,5.0,0.0,27.0,0.0,5.40,England,Bloemfontein,45789,45789_ODI#1034_2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,#2215,2016-08-09,Test,4.0,,,,,,,West Indies,Gros Islet,253802,253802_Test#2215_4.0
296,#2218,2016-08-18,Test,1.0,,,,,,,West Indies,Port of Spain,253802,253802_Test#2218_1.0
297,#562,2016-08-27,T20I,1.0,,,,,,,West Indies,Lauderhill,253802,253802_T20I#562_1.0
298,#563,2016-08-28,T20I,1.0,,,,,,,West Indies,Lauderhill,253802,253802_T20I#563_1.0


In [157]:
bowling_master['Inns ID'].nunique()

600

Perfect! This is taken care of, now. Let us move towards James Anderson's 1st part data.

In [158]:
# Adding James Anderson's data
if p3_concat_df_1 is None:
    pass
elif bowling_master is None:
    bowling_master = p3_concat_df_1
else:
    bowling_master = pd.concat([bowling_master, p3_concat_df_1])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

bowling_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location,Player ID,Inns ID
0,#1919,2002-12-15,ODI,1.0,2.0,6.0,0.0,46.0,1.0,7.66,Australia,Melbourne,8608,8608_ODI#1919_1.0
1,#1922,2002-12-17,ODI,2.0,2.0,10.0,1.0,48.0,2.0,4.80,Sri Lanka,Brisbane,8608,8608_ODI#1922_2.0
2,#1924,2002-12-20,ODI,2.0,2.0,8.0,2.0,23.0,2.0,2.87,Sri Lanka,Perth,8608,8608_ODI#1924_2.0
3,#1932,2003-01-11,ODI,1.0,2.0,10.0,1.0,40.0,2.0,4.00,Australia,Hobart,8608,8608_ODI#1932_1.0
4,#1934,2003-01-13,ODI,1.0,2.0,9.0,0.0,58.0,2.0,6.44,Sri Lanka,Sydney,8608,8608_ODI#1934_1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,#2215,2016-08-09,Test,4.0,,,,,,,West Indies,Gros Islet,253802,253802_Test#2215_4.0
296,#2218,2016-08-18,Test,1.0,,,,,,,West Indies,Port of Spain,253802,253802_Test#2218_1.0
297,#562,2016-08-27,T20I,1.0,,,,,,,West Indies,Lauderhill,253802,253802_T20I#562_1.0
298,#563,2016-08-28,T20I,1.0,,,,,,,West Indies,Lauderhill,253802,253802_T20I#563_1.0


In [159]:
bowling_master.nunique()

Match ID      699
Start Date    726
Format          3
Inns            4
Pos             8
Overs          93
Mdns           14
Runs          107
Wkts            8
Econ          286
Opposition     18
Location       99
Player ID       3
Inns ID       900
dtype: int64

Perfect! 1st part is loaded. Now, let's come to 2nd part. 

##### Loading 2nd Part of all 3 players

In [160]:
# Adding Virat Kohli's data
if p1_concat_df_2 is None:
    pass
elif bowling_master is None:
    bowling_master = p1_concat_df_2
else:
    bowling_master = pd.concat([bowling_master, p1_concat_df_2])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

bowling_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location,Player ID,Inns ID
0,#1919,2002-12-15,ODI,1.0,2.0,6.0,0.0,46.0,1.0,7.66,Australia,Melbourne,8608,8608_ODI#1919_1.0
1,#1922,2002-12-17,ODI,2.0,2.0,10.0,1.0,48.0,2.0,4.80,Sri Lanka,Brisbane,8608,8608_ODI#1922_2.0
2,#1924,2002-12-20,ODI,2.0,2.0,8.0,2.0,23.0,2.0,2.87,Sri Lanka,Perth,8608,8608_ODI#1924_2.0
3,#1932,2003-01-11,ODI,1.0,2.0,10.0,1.0,40.0,2.0,4.00,Australia,Hobart,8608,8608_ODI#1932_1.0
4,#1934,2003-01-13,ODI,1.0,2.0,9.0,0.0,58.0,2.0,6.44,Sri Lanka,Sydney,8608,8608_ODI#1934_1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,#4844,2025-02-20,ODI,1.0,,,,,,,Bangladesh,Dubai,253802,253802_ODI#4844_1.0
659,#4847,2025-02-23,ODI,1.0,,,,,,,Pakistan,Dubai,253802,253802_ODI#4847_1.0
660,#4852,2025-03-02,ODI,2.0,,,,,,,New Zealand,Dubai,253802,253802_ODI#4852_2.0
661,#4853,2025-03-04,ODI,1.0,,,,,,,Australia,Dubai,253802,253802_ODI#4853_1.0


In [161]:
bowling_master[bowling_master['Player ID']==p1_loader.player_info['Player ID'][0]]

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location,Player ID,Inns ID
0,#2742,2008-08-18,ODI,2.0,,,,,,,Sri Lanka,Dambulla,253802,253802_ODI#2742_2.0
1,#2745,2008-08-20,ODI,1.0,,,,,,,Sri Lanka,Dambulla,253802,253802_ODI#2745_1.0
2,#2750,2008-08-24,ODI,2.0,,,,,,,Sri Lanka,Colombo,253802,253802_ODI#2750_2.0
3,#2755,2008-08-27,ODI,2.0,,,,,,,Sri Lanka,Colombo,253802,253802_ODI#2755_2.0
4,#2756,2008-08-29,ODI,1.0,,,,,,,Sri Lanka,Colombo,253802,253802_ODI#2756_1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,#4844,2025-02-20,ODI,1.0,,,,,,,Bangladesh,Dubai,253802,253802_ODI#4844_1.0
659,#4847,2025-02-23,ODI,1.0,,,,,,,Pakistan,Dubai,253802,253802_ODI#4847_1.0
660,#4852,2025-03-02,ODI,2.0,,,,,,,New Zealand,Dubai,253802,253802_ODI#4852_2.0
661,#4853,2025-03-04,ODI,1.0,,,,,,,Australia,Dubai,253802,253802_ODI#4853_1.0


Perfect! This matches the records! Let us check for other two players as well. 

In [162]:
# Adding Jacques Kallis's data
if p2_concat_df_2 is None:
    pass
elif bowling_master is None:
    bowling_master = p2_concat_df_2
else:
    bowling_master = pd.concat([bowling_master, p2_concat_df_2])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

bowling_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location,Player ID,Inns ID
0,#1919,2002-12-15,ODI,1.0,2.0,6.0,0.0,46.0,1.0,7.66,Australia,Melbourne,8608,8608_ODI#1919_1.0
1,#1922,2002-12-17,ODI,2.0,2.0,10.0,1.0,48.0,2.0,4.80,Sri Lanka,Brisbane,8608,8608_ODI#1922_2.0
2,#1924,2002-12-20,ODI,2.0,2.0,8.0,2.0,23.0,2.0,2.87,Sri Lanka,Perth,8608,8608_ODI#1924_2.0
3,#1932,2003-01-11,ODI,1.0,2.0,10.0,1.0,40.0,2.0,4.00,Australia,Hobart,8608,8608_ODI#1932_1.0
4,#1934,2003-01-13,ODI,1.0,2.0,9.0,0.0,58.0,2.0,6.44,Sri Lanka,Sydney,8608,8608_ODI#1934_1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,#4844,2025-02-20,ODI,1.0,,,,,,,Bangladesh,Dubai,253802,253802_ODI#4844_1.0
659,#4847,2025-02-23,ODI,1.0,,,,,,,Pakistan,Dubai,253802,253802_ODI#4847_1.0
660,#4852,2025-03-02,ODI,2.0,,,,,,,New Zealand,Dubai,253802,253802_ODI#4852_2.0
661,#4853,2025-03-04,ODI,1.0,,,,,,,Australia,Dubai,253802,253802_ODI#4853_1.0


In [164]:
bowling_master[bowling_master['Player ID']==p2_loader.player_info['Player ID'][0]]

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location,Player ID,Inns ID
0,#1318,1995-12-14,Test,2.0,,,,,,,England,Durban,45789,45789_Test#1318_2.0
1,#1321,1996-01-02,Test,1.0,5.0,4.0,2.0,2.0,0.0,0.50,England,Cape Town,45789,45789_Test#1321_1.0
2,#1321,1996-01-02,Test,3.0,,,,,,,England,Cape Town,45789,45789_Test#1321_3.0
3,#1033,1996-01-09,ODI,2.0,7.0,3.0,0.0,14.0,0.0,4.66,England,Cape Town,45789,45789_ODI#1033_2.0
4,#1034,1996-01-11,ODI,2.0,6.0,5.0,0.0,27.0,0.0,5.40,England,Bloemfontein,45789,45789_ODI#1034_2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663,#2111,2013-12-26,Test,1.0,4.0,11.0,1.0,36.0,0.0,3.27,India,Durban,45789,45789_Test#2111_1.0
664,#2111,2013-12-26,Test,3.0,,,,,,,India,Durban,45789,45789_Test#2111_3.0
665,#3500,2014-07-06,ODI,2.0,,,,,,,Sri Lanka,Colombo,45789,45789_ODI#3500_2.0
666,#3501,2014-07-09,ODI,1.0,,,,,,,Sri Lanka,Pallekele,45789,45789_ODI#3501_1.0


Perfect! This matches with the records as well. Let us check for the last one. 

In [165]:
# Adding James Anderson's data
if p3_concat_df_2 is None:
    pass
elif bowling_master is None:
    bowling_master = p3_concat_df_2
else:
    bowling_master = pd.concat([bowling_master, p3_concat_df_2])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

bowling_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location,Player ID,Inns ID
0,#1919,2002-12-15,ODI,1.0,2.0,6.0,0.0,46.0,1.0,7.66,Australia,Melbourne,8608,8608_ODI#1919_1.0
1,#1922,2002-12-17,ODI,2.0,2.0,10.0,1.0,48.0,2.0,4.80,Sri Lanka,Brisbane,8608,8608_ODI#1922_2.0
2,#1924,2002-12-20,ODI,2.0,2.0,8.0,2.0,23.0,2.0,2.87,Sri Lanka,Perth,8608,8608_ODI#1924_2.0
3,#1932,2003-01-11,ODI,1.0,2.0,10.0,1.0,40.0,2.0,4.00,Australia,Hobart,8608,8608_ODI#1932_1.0
4,#1934,2003-01-13,ODI,1.0,2.0,9.0,0.0,58.0,2.0,6.44,Sri Lanka,Sydney,8608,8608_ODI#1934_1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,#4844,2025-02-20,ODI,1.0,,,,,,,Bangladesh,Dubai,253802,253802_ODI#4844_1.0
659,#4847,2025-02-23,ODI,1.0,,,,,,,Pakistan,Dubai,253802,253802_ODI#4847_1.0
660,#4852,2025-03-02,ODI,2.0,,,,,,,New Zealand,Dubai,253802,253802_ODI#4852_2.0
661,#4853,2025-03-04,ODI,1.0,,,,,,,Australia,Dubai,253802,253802_ODI#4853_1.0


In [166]:
bowling_master[bowling_master['Player ID']==p3_loader.player_info['Player ID'][0]]

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location,Player ID,Inns ID
0,#1919,2002-12-15,ODI,1.0,2.0,6.0,0.0,46.0,1.0,7.66,Australia,Melbourne,8608,8608_ODI#1919_1.0
1,#1922,2002-12-17,ODI,2.0,2.0,10.0,1.0,48.0,2.0,4.80,Sri Lanka,Brisbane,8608,8608_ODI#1922_2.0
2,#1924,2002-12-20,ODI,2.0,2.0,8.0,2.0,23.0,2.0,2.87,Sri Lanka,Perth,8608,8608_ODI#1924_2.0
3,#1932,2003-01-11,ODI,1.0,2.0,10.0,1.0,40.0,2.0,4.00,Australia,Hobart,8608,8608_ODI#1932_1.0
4,#1934,2003-01-13,ODI,1.0,2.0,9.0,0.0,58.0,2.0,6.44,Sri Lanka,Sydney,8608,8608_ODI#1934_1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,#2531,2024-02-23,Test,2.0,1.0,18.0,4.0,48.0,2.0,2.66,India,Ranchi,8608,8608_Test#2531_2.0
565,#2531,2024-02-23,Test,4.0,4.0,3.0,1.0,12.0,0.0,4.00,India,Ranchi,8608,8608_Test#2531_4.0
566,#2534,2024-03-07,Test,2.0,1.0,16.0,2.0,60.0,2.0,3.75,India,Dharamshala,8608,8608_Test#2534_2.0
567,#2538,2024-07-10,Test,1.0,1.0,10.4,3.0,26.0,1.0,2.43,West Indies,London,8608,8608_Test#2538_1.0


Excellent. For reference, let us output the number of records for all 3 players. 

In [168]:
print("Original\n------------")
print(f"p1 ({p1_loader.player_info['Player ID'][0]}):",p1_loader.bowlingstats.shape[0])
print(f"p2 ({p2_loader.player_info['Player ID'][0]}):",p2_loader.bowlingstats.shape[0])
print(f"p3 ({p3_loader.player_info['Player ID'][0]}):", p3_loader.bowlingstats.shape[0])

print("\nInside master data sheet------------")
bowling_master['Player ID'].value_counts()

Original
------------
p1 (253802): 663
p2 (45789): 668
p3 (8608): 569

Inside master data sheet------------


Player ID
45789     668
253802    663
8608      569
Name: count, dtype: int64

Perfect! The bowling sheet is tested, and ready. 

#### Fielding

We will test this, again - one by one, by partitioning data into 2 parts for each player, and then randomly uploading to check. 

In [169]:
# For Virat Kohli
p1_concat_df = p1_loader.fieldingstats.copy()
p1_concat_df['Player ID'] = p1_loader.player_info['Player ID'][0]
p1_concat_df['Inns ID'] = (p1_concat_df['Player ID'].astype(str)+"_"+
                           p1_concat_df['Format'].astype(str)+
                           p1_concat_df['Match ID'].astype(str)+"_"+
                           p1_concat_df['Inns'].astype(str)
                            )
p1_concat_df_1 = p1_concat_df.copy().iloc[:300,]
p1_concat_df_2 = p1_concat_df.copy().iloc[200:,]

print("p1:",p1_concat_df_1.shape[0], p1_concat_df_2.shape[0])

# For Jacques Kallis
p2_concat_df = p2_loader.fieldingstats.copy()
p2_concat_df['Player ID'] = p2_loader.player_info['Player ID'][0]
p2_concat_df['Inns ID'] = (p2_concat_df['Player ID'].astype(str)+"_"+
                           p2_concat_df['Format'].astype(str)+
                           p2_concat_df['Match ID'].astype(str)+"_"+
                           p2_concat_df['Inns'].astype(str)
                            )
p2_concat_df_1 = p2_concat_df.copy().iloc[:300,]
p2_concat_df_2 = p2_concat_df.copy().iloc[200:,]

print("p2:",p2_concat_df_1.shape[0], p2_concat_df_2.shape[0])

# For James Anderson
p3_concat_df = p3_loader.fieldingstats.copy()
p3_concat_df['Player ID'] = p3_loader.player_info['Player ID'][0]
p3_concat_df['Inns ID'] = (p3_concat_df['Player ID'].astype(str)+"_"+
                           p3_concat_df['Format'].astype(str)+
                           p3_concat_df['Match ID'].astype(str)+"_"+
                           p3_concat_df['Inns'].astype(str)
                            )
p3_concat_df_1 = p3_concat_df.copy().iloc[:300,]
p3_concat_df_2 = p3_concat_df.copy().iloc[200:,]

print("p3:",p3_concat_df_1.shape[0], p3_concat_df_2.shape[0])

p1: 300 463
p2: 300 468
p3: 300 369


##### Loading 1st parts of all 3 players

Now, we will load 1st part of all 3 players. 

In [170]:
fielding_master = None

In [171]:
# Adding Virat Kohli's data
if p1_concat_df_1 is None:
    pass
elif fielding_master is None:
    fielding_master = p1_concat_df_1
else:
    fielding_master = pd.concat([fielding_master, p1_concat_df_1])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

fielding_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location,Player ID,Inns ID
0,#2742,2008-08-18,ODI,2.0,0.0,0.0,Sri Lanka,Dambulla,253802,253802_ODI#2742_2.0
1,#2745,2008-08-20,ODI,1.0,3.0,3.0,Sri Lanka,Dambulla,253802,253802_ODI#2745_1.0
2,#2750,2008-08-24,ODI,2.0,0.0,0.0,Sri Lanka,Colombo,253802,253802_ODI#2750_2.0
3,#2755,2008-08-27,ODI,2.0,0.0,0.0,Sri Lanka,Colombo,253802,253802_ODI#2755_2.0
4,#2756,2008-08-29,ODI,1.0,0.0,0.0,Sri Lanka,Colombo,253802,253802_ODI#2756_1.0
...,...,...,...,...,...,...,...,...,...,...
295,#2215,2016-08-09,Test,4.0,1.0,1.0,West Indies,Gros Islet,253802,253802_Test#2215_4.0
296,#2218,2016-08-18,Test,1.0,0.0,0.0,West Indies,Port of Spain,253802,253802_Test#2218_1.0
297,#562,2016-08-27,T20I,1.0,0.0,0.0,West Indies,Lauderhill,253802,253802_T20I#562_1.0
298,#563,2016-08-28,T20I,1.0,1.0,1.0,West Indies,Lauderhill,253802,253802_T20I#563_1.0


In [None]:
fielding_master.shape[0]

300

In [172]:
# Adding Jacques Kallis's data
if p2_concat_df_1 is None:
    pass
elif fielding_master is None:
    fielding_master = p2_concat_df_1
else:
    fielding_master = pd.concat([fielding_master, p2_concat_df_1])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

fielding_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location,Player ID,Inns ID
0,#1318,1995-12-14,Test,2.0,0.0,0.0,England,Durban,45789,45789_Test#1318_2.0
1,#1321,1996-01-02,Test,1.0,0.0,0.0,England,Cape Town,45789,45789_Test#1321_1.0
2,#1321,1996-01-02,Test,3.0,1.0,1.0,England,Cape Town,45789,45789_Test#1321_3.0
3,#1033,1996-01-09,ODI,2.0,0.0,0.0,England,Cape Town,45789,45789_ODI#1033_2.0
4,#1034,1996-01-11,ODI,2.0,0.0,0.0,England,Bloemfontein,45789,45789_ODI#1034_2.0
...,...,...,...,...,...,...,...,...,...,...
295,#2215,2016-08-09,Test,4.0,1.0,1.0,West Indies,Gros Islet,253802,253802_Test#2215_4.0
296,#2218,2016-08-18,Test,1.0,0.0,0.0,West Indies,Port of Spain,253802,253802_Test#2218_1.0
297,#562,2016-08-27,T20I,1.0,0.0,0.0,West Indies,Lauderhill,253802,253802_T20I#562_1.0
298,#563,2016-08-28,T20I,1.0,1.0,1.0,West Indies,Lauderhill,253802,253802_T20I#563_1.0


In [173]:
fielding_master['Inns ID'].nunique()

600

Perfect! This is taken care of, now. Let us move towards James Anderson's 1st part data.

In [175]:
# Adding James Anderson's data
if p3_concat_df_1 is None:
    pass
elif fielding_master is None:
    fielding_master = p3_concat_df_1
else:
    fielding_master = pd.concat([fielding_master, p3_concat_df_1])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

fielding_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location,Player ID,Inns ID
0,#1919,2002-12-15,ODI,1.0,0.0,0.0,Australia,Melbourne,8608,8608_ODI#1919_1.0
1,#1922,2002-12-17,ODI,2.0,0.0,0.0,Sri Lanka,Brisbane,8608,8608_ODI#1922_2.0
2,#1924,2002-12-20,ODI,2.0,0.0,0.0,Sri Lanka,Perth,8608,8608_ODI#1924_2.0
3,#1932,2003-01-11,ODI,1.0,0.0,0.0,Australia,Hobart,8608,8608_ODI#1932_1.0
4,#1934,2003-01-13,ODI,1.0,1.0,1.0,Sri Lanka,Sydney,8608,8608_ODI#1934_1.0
...,...,...,...,...,...,...,...,...,...,...
295,#2215,2016-08-09,Test,4.0,1.0,1.0,West Indies,Gros Islet,253802,253802_Test#2215_4.0
296,#2218,2016-08-18,Test,1.0,0.0,0.0,West Indies,Port of Spain,253802,253802_Test#2218_1.0
297,#562,2016-08-27,T20I,1.0,0.0,0.0,West Indies,Lauderhill,253802,253802_T20I#562_1.0
298,#563,2016-08-28,T20I,1.0,1.0,1.0,West Indies,Lauderhill,253802,253802_T20I#563_1.0


In [176]:
fielding_master.nunique()

Match ID      699
Start Date    726
Format          3
Inns            4
Dis             4
Ct              4
Opposition     18
Location       99
Player ID       3
Inns ID       900
dtype: int64

Perfect! 1st part is loaded. Now, let's come to 2nd part. 

##### Loading 2nd Part of all 3 players

In [177]:
# Adding Virat Kohli's data
if p1_concat_df_2 is None:
    pass
elif fielding_master is None:
    fielding_master = p1_concat_df_2
else:
    fielding_master = pd.concat([fielding_master, p1_concat_df_2])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

fielding_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location,Player ID,Inns ID
0,#1919,2002-12-15,ODI,1.0,0.0,0.0,Australia,Melbourne,8608,8608_ODI#1919_1.0
1,#1922,2002-12-17,ODI,2.0,0.0,0.0,Sri Lanka,Brisbane,8608,8608_ODI#1922_2.0
2,#1924,2002-12-20,ODI,2.0,0.0,0.0,Sri Lanka,Perth,8608,8608_ODI#1924_2.0
3,#1932,2003-01-11,ODI,1.0,0.0,0.0,Australia,Hobart,8608,8608_ODI#1932_1.0
4,#1934,2003-01-13,ODI,1.0,1.0,1.0,Sri Lanka,Sydney,8608,8608_ODI#1934_1.0
...,...,...,...,...,...,...,...,...,...,...
658,#4844,2025-02-20,ODI,1.0,2.0,2.0,Bangladesh,Dubai,253802,253802_ODI#4844_1.0
659,#4847,2025-02-23,ODI,1.0,2.0,2.0,Pakistan,Dubai,253802,253802_ODI#4847_1.0
660,#4852,2025-03-02,ODI,2.0,1.0,1.0,New Zealand,Dubai,253802,253802_ODI#4852_2.0
661,#4853,2025-03-04,ODI,1.0,2.0,2.0,Australia,Dubai,253802,253802_ODI#4853_1.0


In [178]:
fielding_master[fielding_master['Player ID']==p1_loader.player_info['Player ID'][0]]

Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location,Player ID,Inns ID
0,#2742,2008-08-18,ODI,2.0,0.0,0.0,Sri Lanka,Dambulla,253802,253802_ODI#2742_2.0
1,#2745,2008-08-20,ODI,1.0,3.0,3.0,Sri Lanka,Dambulla,253802,253802_ODI#2745_1.0
2,#2750,2008-08-24,ODI,2.0,0.0,0.0,Sri Lanka,Colombo,253802,253802_ODI#2750_2.0
3,#2755,2008-08-27,ODI,2.0,0.0,0.0,Sri Lanka,Colombo,253802,253802_ODI#2755_2.0
4,#2756,2008-08-29,ODI,1.0,0.0,0.0,Sri Lanka,Colombo,253802,253802_ODI#2756_1.0
...,...,...,...,...,...,...,...,...,...,...
658,#4844,2025-02-20,ODI,1.0,2.0,2.0,Bangladesh,Dubai,253802,253802_ODI#4844_1.0
659,#4847,2025-02-23,ODI,1.0,2.0,2.0,Pakistan,Dubai,253802,253802_ODI#4847_1.0
660,#4852,2025-03-02,ODI,2.0,1.0,1.0,New Zealand,Dubai,253802,253802_ODI#4852_2.0
661,#4853,2025-03-04,ODI,1.0,2.0,2.0,Australia,Dubai,253802,253802_ODI#4853_1.0


Perfect! This matches the records! Let us check for other two players as well. 

In [179]:
# Adding Jacques Kallis's data
if p2_concat_df_2 is None:
    pass
elif fielding_master is None:
    fielding_master = p2_concat_df_2
else:
    fielding_master = pd.concat([fielding_master, p2_concat_df_2])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

fielding_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location,Player ID,Inns ID
0,#1919,2002-12-15,ODI,1.0,0.0,0.0,Australia,Melbourne,8608,8608_ODI#1919_1.0
1,#1922,2002-12-17,ODI,2.0,0.0,0.0,Sri Lanka,Brisbane,8608,8608_ODI#1922_2.0
2,#1924,2002-12-20,ODI,2.0,0.0,0.0,Sri Lanka,Perth,8608,8608_ODI#1924_2.0
3,#1932,2003-01-11,ODI,1.0,0.0,0.0,Australia,Hobart,8608,8608_ODI#1932_1.0
4,#1934,2003-01-13,ODI,1.0,1.0,1.0,Sri Lanka,Sydney,8608,8608_ODI#1934_1.0
...,...,...,...,...,...,...,...,...,...,...
658,#4844,2025-02-20,ODI,1.0,2.0,2.0,Bangladesh,Dubai,253802,253802_ODI#4844_1.0
659,#4847,2025-02-23,ODI,1.0,2.0,2.0,Pakistan,Dubai,253802,253802_ODI#4847_1.0
660,#4852,2025-03-02,ODI,2.0,1.0,1.0,New Zealand,Dubai,253802,253802_ODI#4852_2.0
661,#4853,2025-03-04,ODI,1.0,2.0,2.0,Australia,Dubai,253802,253802_ODI#4853_1.0


In [180]:
fielding_master[fielding_master['Player ID']==p2_loader.player_info['Player ID'][0]]

Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location,Player ID,Inns ID
0,#1318,1995-12-14,Test,2.0,0.0,0.0,England,Durban,45789,45789_Test#1318_2.0
1,#1321,1996-01-02,Test,1.0,0.0,0.0,England,Cape Town,45789,45789_Test#1321_1.0
2,#1321,1996-01-02,Test,3.0,1.0,1.0,England,Cape Town,45789,45789_Test#1321_3.0
3,#1033,1996-01-09,ODI,2.0,0.0,0.0,England,Cape Town,45789,45789_ODI#1033_2.0
4,#1034,1996-01-11,ODI,2.0,0.0,0.0,England,Bloemfontein,45789,45789_ODI#1034_2.0
...,...,...,...,...,...,...,...,...,...,...
663,#2111,2013-12-26,Test,1.0,1.0,1.0,India,Durban,45789,45789_Test#2111_1.0
664,#2111,2013-12-26,Test,3.0,0.0,0.0,India,Durban,45789,45789_Test#2111_3.0
665,#3500,2014-07-06,ODI,2.0,1.0,1.0,Sri Lanka,Colombo,45789,45789_ODI#3500_2.0
666,#3501,2014-07-09,ODI,1.0,0.0,0.0,Sri Lanka,Pallekele,45789,45789_ODI#3501_1.0


Perfect! This matches with the records as well. Let us check for the last one. 

In [181]:
# Adding James Anderson's data
if p3_concat_df_2 is None:
    pass
elif fielding_master is None:
    fielding_master = p3_concat_df_2
else:
    fielding_master = pd.concat([fielding_master, p3_concat_df_2])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

fielding_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location,Player ID,Inns ID
0,#1919,2002-12-15,ODI,1.0,0.0,0.0,Australia,Melbourne,8608,8608_ODI#1919_1.0
1,#1922,2002-12-17,ODI,2.0,0.0,0.0,Sri Lanka,Brisbane,8608,8608_ODI#1922_2.0
2,#1924,2002-12-20,ODI,2.0,0.0,0.0,Sri Lanka,Perth,8608,8608_ODI#1924_2.0
3,#1932,2003-01-11,ODI,1.0,0.0,0.0,Australia,Hobart,8608,8608_ODI#1932_1.0
4,#1934,2003-01-13,ODI,1.0,1.0,1.0,Sri Lanka,Sydney,8608,8608_ODI#1934_1.0
...,...,...,...,...,...,...,...,...,...,...
658,#4844,2025-02-20,ODI,1.0,2.0,2.0,Bangladesh,Dubai,253802,253802_ODI#4844_1.0
659,#4847,2025-02-23,ODI,1.0,2.0,2.0,Pakistan,Dubai,253802,253802_ODI#4847_1.0
660,#4852,2025-03-02,ODI,2.0,1.0,1.0,New Zealand,Dubai,253802,253802_ODI#4852_2.0
661,#4853,2025-03-04,ODI,1.0,2.0,2.0,Australia,Dubai,253802,253802_ODI#4853_1.0


In [182]:
fielding_master[bowling_master['Player ID']==p3_loader.player_info['Player ID'][0]]

Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location,Player ID,Inns ID
0,#1919,2002-12-15,ODI,1.0,0.0,0.0,Australia,Melbourne,8608,8608_ODI#1919_1.0
1,#1922,2002-12-17,ODI,2.0,0.0,0.0,Sri Lanka,Brisbane,8608,8608_ODI#1922_2.0
2,#1924,2002-12-20,ODI,2.0,0.0,0.0,Sri Lanka,Perth,8608,8608_ODI#1924_2.0
3,#1932,2003-01-11,ODI,1.0,0.0,0.0,Australia,Hobart,8608,8608_ODI#1932_1.0
4,#1934,2003-01-13,ODI,1.0,1.0,1.0,Sri Lanka,Sydney,8608,8608_ODI#1934_1.0
...,...,...,...,...,...,...,...,...,...,...
564,#2531,2024-02-23,Test,2.0,0.0,0.0,India,Ranchi,8608,8608_Test#2531_2.0
565,#2531,2024-02-23,Test,4.0,1.0,1.0,India,Ranchi,8608,8608_Test#2531_4.0
566,#2534,2024-03-07,Test,2.0,0.0,0.0,India,Dharamshala,8608,8608_Test#2534_2.0
567,#2538,2024-07-10,Test,1.0,0.0,0.0,West Indies,London,8608,8608_Test#2538_1.0


Excellent. For reference, let us output the number of records for all 3 players. 

In [183]:
print("Original\n------------")
print(f"p1 ({p1_loader.player_info['Player ID'][0]}):",p1_loader.bowlingstats.shape[0])
print(f"p2 ({p2_loader.player_info['Player ID'][0]}):",p2_loader.bowlingstats.shape[0])
print(f"p3 ({p3_loader.player_info['Player ID'][0]}):", p3_loader.bowlingstats.shape[0])

print("\nInside master data sheet------------")
bowling_master['Player ID'].value_counts()

Original
------------
p1 (253802): 663
p2 (45789): 668
p3 (8608): 569

Inside master data sheet------------


Player ID
45789     668
253802    663
8608      569
Name: count, dtype: int64

Perfect! The fielding sheet is tested, and ready. 

#### All round

We will test this, again - one by one, by partitioning data into 2 parts for each player, and then randomly uploading to check. 

In [187]:
# For Virat Kohli
p1_concat_df = p1_loader.allroundstats.copy() if p1_loader.allroundstats is not None else pd.DataFrame()
p1_concat_df['Player ID'] = p1_loader.player_info['Player ID'][0]
p1_concat_df['Inns ID'] = (p1_concat_df['Player ID'].astype(str)+"_"+
                           p1_concat_df['Format'].astype(str)+
                           p1_concat_df['Match ID'].astype(str)+"_"+
                           p1_concat_df['Inns'].astype(str)
                            ) if p1_loader.allroundstats is not None else None
p1_concat_df_1 = p1_concat_df.copy().iloc[:300,]
p1_concat_df_2 = p1_concat_df.copy().iloc[200:,]

print("p1:",p1_concat_df_1.shape[0], p1_concat_df_2.shape[0])

# For Jacques Kallis
p2_concat_df = p2_loader.allroundstats.copy() if p2_loader.allroundstats is not None else pd.DataFrame()
p2_concat_df['Player ID'] = p2_loader.player_info['Player ID'][0]
p2_concat_df['Inns ID'] = (p2_concat_df['Player ID'].astype(str)+"_"+
                           p2_concat_df['Format'].astype(str)+
                           p2_concat_df['Match ID'].astype(str)+"_"+
                           p2_concat_df['Inns'].astype(str)
                            ) if p2_loader.allroundstats is not None else None
p2_concat_df_1 = p2_concat_df.copy().iloc[:300,]
p2_concat_df_2 = p2_concat_df.copy().iloc[200:,]

print("p2:",p2_concat_df_1.shape[0], p2_concat_df_2.shape[0])

# For James Anderson
p3_concat_df = p3_loader.allroundstats.copy() if p3_loader.allroundstats is not None else pd.DataFrame()
p3_concat_df['Player ID'] = p3_loader.player_info['Player ID'][0]
p3_concat_df['Inns ID'] = (p3_concat_df['Player ID'].astype(str)+"_"+
                           p3_concat_df['Format'].astype(str)+
                           p3_concat_df['Match ID'].astype(str)+"_"+
                           p3_concat_df['Inns'].astype(str)
                            ) if p3_loader.allroundstats is not None else None
p3_concat_df_1 = p3_concat_df.copy().iloc[:300,]
p3_concat_df_2 = p3_concat_df.copy().iloc[200:,]

print("p3:",p3_concat_df_1.shape[0], p3_concat_df_2.shape[0])

p1: 0 0
p2: 300 1114
p3: 0 0


##### Loading 1st parts of all 3 players

Now, we will load 1st part of all 3 players. 

In [188]:
allround_master = None

In [189]:
# Adding Virat Kohli's data
if p1_concat_df_1 is None:
    pass
elif allround_master is None:
    allround_master = p1_concat_df_1
else:
    allround_master = pd.concat([allround_master, p1_concat_df_1])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

allround_master

Unnamed: 0,Player ID,Inns ID


In [191]:
allround_master.shape[0]

0

In [192]:
# Adding Jacques Kallis's data
if p2_concat_df_1 is None:
    pass
elif allround_master is None:
    allround_master = p2_concat_df_1
else:
    allround_master = pd.concat([allround_master, p2_concat_df_1])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

allround_master

Unnamed: 0,Player ID,Inns ID,Match ID,Start Date,Format,Inns,Score,Overs,Conc,Wkts,Ct,St,Opposition,Location
0,45789,45789_Test#1318_1,#1318,1995-12-14,Test,1.0,1.0,,,,,,England,Durban
1,45789,45789_Test#1318_2,#1318,1995-12-14,Test,2.0,,,,,0.0,0.0,England,Durban
2,45789,45789_Test#1321_1,#1321,1996-01-02,Test,1.0,,4.0,2.0,0.0,0.0,0.0,England,Cape Town
3,45789,45789_Test#1321_2,#1321,1996-01-02,Test,2.0,7.0,,,,,,England,Cape Town
4,45789,45789_Test#1321_3,#1321,1996-01-02,Test,3.0,,,,,1.0,0.0,England,Cape Town
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,45789,45789_Test#1486_3,#1486,2000-03-02,Test,3.0,,6.0,10.0,1.0,1.0,0.0,India,Bengaluru
296,45789,45789_ODI#1572_1,#1572,2000-03-09,ODI,1.0,37.0,,,,,,India,Kochi
297,45789,45789_ODI#1572_2,#1572,2000-03-09,ODI,2.0,,6.0,42.0,1.0,0.0,0.0,India,Kochi
298,45789,45789_ODI#1573_1,#1573,2000-03-12,ODI,1.0,18.0,,,,,,India,Jamshedpur


In [193]:
allround_master['Inns ID'].nunique()

300

Perfect! This is taken care of, now. Let us move towards James Anderson's 1st part data.

In [194]:
# Adding James Anderson's data
if p3_concat_df_1 is None:
    pass
elif allround_master is None:
    allround_master = p3_concat_df_1
else:
    allround_master = pd.concat([allround_master, p3_concat_df_1])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

allround_master

Unnamed: 0,Player ID,Inns ID,Match ID,Start Date,Format,Inns,Score,Overs,Conc,Wkts,Ct,St,Opposition,Location
0,45789,45789_Test#1318_1,#1318,1995-12-14,Test,1.0,1.0,,,,,,England,Durban
1,45789,45789_Test#1318_2,#1318,1995-12-14,Test,2.0,,,,,0.0,0.0,England,Durban
2,45789,45789_Test#1321_1,#1321,1996-01-02,Test,1.0,,4.0,2.0,0.0,0.0,0.0,England,Cape Town
3,45789,45789_Test#1321_2,#1321,1996-01-02,Test,2.0,7.0,,,,,,England,Cape Town
4,45789,45789_Test#1321_3,#1321,1996-01-02,Test,3.0,,,,,1.0,0.0,England,Cape Town
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,45789,45789_Test#1486_3,#1486,2000-03-02,Test,3.0,,6.0,10.0,1.0,1.0,0.0,India,Bengaluru
296,45789,45789_ODI#1572_1,#1572,2000-03-09,ODI,1.0,37.0,,,,,,India,Kochi
297,45789,45789_ODI#1572_2,#1572,2000-03-09,ODI,2.0,,6.0,42.0,1.0,0.0,0.0,India,Kochi
298,45789,45789_ODI#1573_1,#1573,2000-03-12,ODI,1.0,18.0,,,,,,India,Jamshedpur


In [195]:
allround_master.nunique()

Player ID       1
Inns ID       300
Match ID      112
Start Date    122
Format          2
Inns            5
Score          71
Overs          33
Conc           51
Wkts            6
Ct              4
St              1
Opposition     11
Location       38
dtype: int64

Perfect! 1st part is loaded. Now, let's come to 2nd part. 

##### Loading 2nd Part of all 3 players

In [196]:
# Adding Virat Kohli's data
if p1_concat_df_2 is None:
    pass
elif allround_master is None:
    allround_master = p1_concat_df_2
else:
    allround_master = pd.concat([allround_master, p1_concat_df_2])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

allround_master

Unnamed: 0,Player ID,Inns ID,Match ID,Start Date,Format,Inns,Score,Overs,Conc,Wkts,Ct,St,Opposition,Location
0,45789,45789_Test#1318_1,#1318,1995-12-14,Test,1.0,1.0,,,,,,England,Durban
1,45789,45789_Test#1318_2,#1318,1995-12-14,Test,2.0,,,,,0.0,0.0,England,Durban
2,45789,45789_Test#1321_1,#1321,1996-01-02,Test,1.0,,4.0,2.0,0.0,0.0,0.0,England,Cape Town
3,45789,45789_Test#1321_2,#1321,1996-01-02,Test,2.0,7.0,,,,,,England,Cape Town
4,45789,45789_Test#1321_3,#1321,1996-01-02,Test,3.0,,,,,1.0,0.0,England,Cape Town
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,45789,45789_Test#1486_3,#1486,2000-03-02,Test,3.0,,6.0,10.0,1.0,1.0,0.0,India,Bengaluru
296,45789,45789_ODI#1572_1,#1572,2000-03-09,ODI,1.0,37.0,,,,,,India,Kochi
297,45789,45789_ODI#1572_2,#1572,2000-03-09,ODI,2.0,,6.0,42.0,1.0,0.0,0.0,India,Kochi
298,45789,45789_ODI#1573_1,#1573,2000-03-12,ODI,1.0,18.0,,,,,,India,Jamshedpur


In [204]:
allround_master[allround_master['Player ID']==p1_loader.player_info['Player ID'][0]] \
    if p1_loader.allroundstats is not None \
    else allround_master['Player ID'].value_counts()

Player ID
45789    1314
Name: count, dtype: int64

Perfect! This matches the records, as player 1 is not an all-rounder! Let us check for other two players as well. 

In [201]:
# Adding Jacques Kallis's data
if p2_concat_df_2 is None:
    pass
elif allround_master is None:
    allround_master = p2_concat_df_2
else:
    allround_master = pd.concat([allround_master, p2_concat_df_2])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

allround_master

Unnamed: 0,Player ID,Inns ID,Match ID,Start Date,Format,Inns,Score,Overs,Conc,Wkts,Ct,St,Opposition,Location
0,45789,45789_Test#1318_1,#1318,1995-12-14,Test,1.0,1.0,,,,,,England,Durban
1,45789,45789_Test#1318_2,#1318,1995-12-14,Test,2.0,,,,,0.0,0.0,England,Durban
2,45789,45789_Test#1321_1,#1321,1996-01-02,Test,1.0,,4.0,2.0,0.0,0.0,0.0,England,Cape Town
3,45789,45789_Test#1321_2,#1321,1996-01-02,Test,2.0,7.0,,,,,,England,Cape Town
4,45789,45789_Test#1321_3,#1321,1996-01-02,Test,3.0,,,,,1.0,0.0,England,Cape Town
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1309,45789,45789_ODI#3500_2,#3500,2014-07-06,ODI,2.0,,,,,1.0,0.0,Sri Lanka,Colombo
1310,45789,45789_ODI#3501_1,#3501,2014-07-09,ODI,1.0,,,,,0.0,0.0,Sri Lanka,Pallekele
1311,45789,45789_ODI#3501_2,#3501,2014-07-09,ODI,2.0,1.0,,,,,,Sri Lanka,Pallekele
1312,45789,45789_ODI#3502_1,#3502,2014-07-12,ODI,1.0,4.0,,,,,,Sri Lanka,Hambantota


In [202]:
allround_master[allround_master['Player ID']==p2_loader.player_info['Player ID'][0]]

Unnamed: 0,Player ID,Inns ID,Match ID,Start Date,Format,Inns,Score,Overs,Conc,Wkts,Ct,St,Opposition,Location
0,45789,45789_Test#1318_1,#1318,1995-12-14,Test,1.0,1.0,,,,,,England,Durban
1,45789,45789_Test#1318_2,#1318,1995-12-14,Test,2.0,,,,,0.0,0.0,England,Durban
2,45789,45789_Test#1321_1,#1321,1996-01-02,Test,1.0,,4.0,2.0,0.0,0.0,0.0,England,Cape Town
3,45789,45789_Test#1321_2,#1321,1996-01-02,Test,2.0,7.0,,,,,,England,Cape Town
4,45789,45789_Test#1321_3,#1321,1996-01-02,Test,3.0,,,,,1.0,0.0,England,Cape Town
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1309,45789,45789_ODI#3500_2,#3500,2014-07-06,ODI,2.0,,,,,1.0,0.0,Sri Lanka,Colombo
1310,45789,45789_ODI#3501_1,#3501,2014-07-09,ODI,1.0,,,,,0.0,0.0,Sri Lanka,Pallekele
1311,45789,45789_ODI#3501_2,#3501,2014-07-09,ODI,2.0,1.0,,,,,,Sri Lanka,Pallekele
1312,45789,45789_ODI#3502_1,#3502,2014-07-12,ODI,1.0,4.0,,,,,,Sri Lanka,Hambantota


Perfect! This matches with the records as well. Let us check for the last one. 

In [203]:
# Adding James Anderson's data
if p3_concat_df_2 is None:
    pass
elif allround_master is None:
    allround_master = p3_concat_df_2
else:
    allround_master = pd.concat([allround_master, p3_concat_df_2])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

allround_master

Unnamed: 0,Player ID,Inns ID,Match ID,Start Date,Format,Inns,Score,Overs,Conc,Wkts,Ct,St,Opposition,Location
0,45789,45789_Test#1318_1,#1318,1995-12-14,Test,1.0,1.0,,,,,,England,Durban
1,45789,45789_Test#1318_2,#1318,1995-12-14,Test,2.0,,,,,0.0,0.0,England,Durban
2,45789,45789_Test#1321_1,#1321,1996-01-02,Test,1.0,,4.0,2.0,0.0,0.0,0.0,England,Cape Town
3,45789,45789_Test#1321_2,#1321,1996-01-02,Test,2.0,7.0,,,,,,England,Cape Town
4,45789,45789_Test#1321_3,#1321,1996-01-02,Test,3.0,,,,,1.0,0.0,England,Cape Town
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1309,45789,45789_ODI#3500_2,#3500,2014-07-06,ODI,2.0,,,,,1.0,0.0,Sri Lanka,Colombo
1310,45789,45789_ODI#3501_1,#3501,2014-07-09,ODI,1.0,,,,,0.0,0.0,Sri Lanka,Pallekele
1311,45789,45789_ODI#3501_2,#3501,2014-07-09,ODI,2.0,1.0,,,,,,Sri Lanka,Pallekele
1312,45789,45789_ODI#3502_1,#3502,2014-07-12,ODI,1.0,4.0,,,,,,Sri Lanka,Hambantota


In [205]:
allround_master[allround_master['Player ID']==p3_loader.player_info['Player ID'][0]] \
    if p3_loader.allroundstats is not None \
    else allround_master['Player ID'].value_counts()

Player ID
45789    1314
Name: count, dtype: int64

Excellent. For reference, let us output the number of records for all 3 players. 

In [208]:
print("Original\n------------")
print(f"p1 ({p1_loader.player_info['Player ID'][0]}):",p1_loader.allroundstats.shape[0]) if p1_loader.allroundstats is not None else 0
print(f"p2 ({p2_loader.player_info['Player ID'][0]}):",p2_loader.allroundstats.shape[0]) if p2_loader.allroundstats is not None else 0
print(f"p3 ({p3_loader.player_info['Player ID'][0]}):", p3_loader.allroundstats.shape[0]) if p3_loader.allroundstats is not None else 0

print("\nInside master data sheet------------")
allround_master['Player ID'].value_counts()

Original
------------
p2 (45789): 1314

Inside master data sheet------------


Player ID
45789    1314
Name: count, dtype: int64

Perfect! The allround sheet is tested, and ready. 

#### Personal info

Now, let us set it up for Personal info. 

In [216]:
info_master = None

In [217]:
p1_concat_df = p1_loader.player_info.copy()
p2_concat_df = p2_loader.player_info.copy()
p3_concat_df = p3_loader.player_info.copy()

In [220]:
# for player 1

if p1_concat_df is None:
    pass
elif info_master is None:
    info_master = p1_concat_df
else:
    info_master = pd.concat([info_master, p1_concat_df])\
                    .drop_duplicates(subset=['Player ID'], keep='last')\
                    .sort_values(by=['Player ID'])

info_master

Unnamed: 0,Player ID,Player URL,FULL NAME,BORN,AGE,BATTING STYLE,BOWLING STYLE,PLAYING ROLE
0,253802,https://www.espncricinfo.com/cricketers/virat-...,Virat Kohli,"November 05, 1988, Delhi",36y 165d,Right hand Bat,Right arm Medium,Top order Batter


In [221]:
# for player 2

if p2_concat_df is None:
    pass
elif info_master is None:
    info_master = p2_concat_df
else:
    info_master = pd.concat([info_master, p2_concat_df])\
                    .drop_duplicates(subset=['Player ID'], keep='last')\
                    .sort_values(by=['Player ID'])

info_master

Unnamed: 0,Player ID,Player URL,FULL NAME,BORN,AGE,BATTING STYLE,BOWLING STYLE,PLAYING ROLE
0,45789,https://www.espncricinfo.com/cricketers/jacque...,Jacques Henry Kallis,"October 16, 1975, Pinelands, Cape Town, Cape P...",49y 182d,Right hand Bat,Right arm Fast medium,Allrounder
0,253802,https://www.espncricinfo.com/cricketers/virat-...,Virat Kohli,"November 05, 1988, Delhi",36y 165d,Right hand Bat,Right arm Medium,Top order Batter


In [222]:
# for player 3

if p3_concat_df is None:
    pass
elif info_master is None:
    info_master = p3_concat_df
else:
    info_master = pd.concat([info_master, p3_concat_df])\
                    .drop_duplicates(subset=['Player ID'], keep='last')\
                    .sort_values(by=['Player ID'])

info_master

Unnamed: 0,Player ID,Player URL,FULL NAME,BORN,AGE,BATTING STYLE,BOWLING STYLE,PLAYING ROLE,NICKNAMES,HEIGHT,EDUCATION
0,8608,https://www.espncricinfo.com/cricketers/james-...,James Michael Anderson,"July 30, 1982, Burnley, Lancashire",42y 260d,Left hand Bat,Right arm Fast medium,Bowler,Jimmy,6ft 2in,St Theodore's RC High School; St Theodore's RC...
0,45789,https://www.espncricinfo.com/cricketers/jacque...,Jacques Henry Kallis,"October 16, 1975, Pinelands, Cape Town, Cape P...",49y 182d,Right hand Bat,Right arm Fast medium,Allrounder,,,
0,253802,https://www.espncricinfo.com/cricketers/virat-...,Virat Kohli,"November 05, 1988, Delhi",36y 165d,Right hand Bat,Right arm Medium,Top order Batter,,,


Perfect! This works!

### Final Code for all tables

In [243]:
bucket_name = "cricketer-stats"
player_name = input("Enter player name: ")

#downloading transformed data from s3
player_loader = LoadData(player_name, data_type="tf")
player_loader.load_data(bucket_name, load_type="download", stat_type="all")

#initializing master dataframes
batting_master = None
bowling_master = None
fielding_master = None
allround_master = None
info_master = None

Bucket 'cricketer-stats' already exists.
Downloading jacques_kallis's tf data from S3...
 Downloaded from s3://cricketer-stats/jacques_kallis/tf/personal_info.csv
 Downloaded from s3://cricketer-stats/jacques_kallis/tf/batting_stats.csv
 Downloaded from s3://cricketer-stats/jacques_kallis/tf/bowling_stats.csv
 Downloaded from s3://cricketer-stats/jacques_kallis/tf/fielding_stats.csv
 Downloaded from s3://cricketer-stats/jacques_kallis/tf/allround_stats.csv
All tf data downloaded from s3://cricketer-stats/jacques_kallis/tf/


In [244]:
# preparing concatenated df for allround stats
concat_df = player_loader.allroundstats.copy() if player_loader.allroundstats is not None else pd.DataFrame()
concat_df['Player ID'] = player_loader.player_info['Player ID'][0]
concat_df['Inns ID'] = (concat_df['Player ID'].astype(str)+"_"+
                        concat_df['Format'].astype(str)+
                        concat_df['Match ID'].astype(str)+"_"+
                        concat_df['Inns'].astype(str)
                       ) if player_loader.allroundstats is not None else None

# adding allround stats to master dataframe
if concat_df.empty:
    pass
elif allround_master is None:
    allround_master = concat_df
else:
    allround_master = pd.concat([allround_master, concat_df])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

allround_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Score,Overs,Conc,Wkts,Ct,St,Opposition,Location,Player ID,Inns ID
0,#1318,1995-12-14,Test,1,1.0,,,,,,England,Durban,45789,45789_Test#1318_1
1,#1318,1995-12-14,Test,2,,,,,0.0,0.0,England,Durban,45789,45789_Test#1318_2
2,#1321,1996-01-02,Test,1,,4.0,2.0,0.0,0.0,0.0,England,Cape Town,45789,45789_Test#1321_1
3,#1321,1996-01-02,Test,2,7.0,,,,,,England,Cape Town,45789,45789_Test#1321_2
4,#1321,1996-01-02,Test,3,,,,,1.0,0.0,England,Cape Town,45789,45789_Test#1321_3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1309,#3500,2014-07-06,ODI,2,,,,,1.0,0.0,Sri Lanka,Colombo,45789,45789_ODI#3500_2
1310,#3501,2014-07-09,ODI,1,,,,,0.0,0.0,Sri Lanka,Pallekele,45789,45789_ODI#3501_1
1311,#3501,2014-07-09,ODI,2,1.0,,,,,,Sri Lanka,Pallekele,45789,45789_ODI#3501_2
1312,#3502,2014-07-12,ODI,1,4.0,,,,,,Sri Lanka,Hambantota,45789,45789_ODI#3502_1


In [245]:
# preparing concatenated df for batting stats
concat_df = player_loader.battingstats.copy() if player_loader.battingstats is not None else pd.DataFrame()
concat_df['Player ID'] = player_loader.player_info['Player ID'][0]
concat_df['Inns ID'] = (concat_df['Player ID'].astype(str)+"_"+
                        concat_df['Format'].astype(str)+
                        concat_df['Match ID'].astype(str)+"_"+
                        concat_df['Inns'].astype(str)
                       ) if player_loader.battingstats is not None else None

# adding batting stats to master dataframe
if concat_df.empty:
    pass
elif batting_master is None:
    batting_master = concat_df
else:
    batting_master = pd.concat([batting_master, concat_df])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

batting_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location,Player ID,Inns ID
0,#1318,1995-12-14,Test,1.0,6.0,1.0,12.0,0.0,0.0,8.33,11.0,caught,England,Durban,45789,45789_Test#1318_1.0
1,#1321,1996-01-02,Test,2.0,7.0,7.0,65.0,1.0,0.0,10.76,117.0,lbw,England,Cape Town,45789,45789_Test#1321_2.0
2,#1321,1996-01-02,Test,4.0,,,,,,,,,England,Cape Town,45789,45789_Test#1321_4.0
3,#1033,1996-01-09,ODI,1.0,7.0,38.0,65.0,3.0,0.0,58.46,75.0,caught,England,Cape Town,45789,45789_ODI#1033_1.0
4,#1034,1996-01-11,ODI,1.0,4.0,29.0,35.0,2.0,1.0,82.85,51.0,caught,England,Bloemfontein,45789,45789_ODI#1034_1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,#2111,2013-12-26,Test,2.0,4.0,115.0,316.0,13.0,0.0,36.39,393.0,caught,India,Durban,45789,45789_Test#2111_2.0
642,#2111,2013-12-26,Test,4.0,,,,,,,,,India,Durban,45789,45789_Test#2111_4.0
643,#3500,2014-07-06,ODI,1.0,3.0,0.0,2.0,0.0,0.0,0.00,3.0,lbw,Sri Lanka,Colombo,45789,45789_ODI#3500_1.0
644,#3501,2014-07-09,ODI,2.0,3.0,1.0,7.0,0.0,0.0,14.28,15.0,caught,Sri Lanka,Pallekele,45789,45789_ODI#3501_2.0


In [246]:
# preparing concatenated df for bowling stats
concat_df = player_loader.bowlingstats.copy() if player_loader.bowlingstats is not None else pd.DataFrame()
concat_df['Player ID'] = player_loader.player_info['Player ID'][0]
concat_df['Inns ID'] = (concat_df['Player ID'].astype(str)+"_"+
                        concat_df['Format'].astype(str)+
                        concat_df['Match ID'].astype(str)+"_"+
                        concat_df['Inns'].astype(str)
                       ) if player_loader.bowlingstats is not None else None

# adding bowling stats to master dataframe
if concat_df.empty:
    pass
elif bowling_master is None:
    bowling_master = concat_df
else:
    bowling_master = pd.concat([bowling_master, concat_df])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

bowling_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location,Player ID,Inns ID
0,#1318,1995-12-14,Test,2.0,,,,,,,England,Durban,45789,45789_Test#1318_2.0
1,#1321,1996-01-02,Test,1.0,5.0,4.0,2.0,2.0,0.0,0.50,England,Cape Town,45789,45789_Test#1321_1.0
2,#1321,1996-01-02,Test,3.0,,,,,,,England,Cape Town,45789,45789_Test#1321_3.0
3,#1033,1996-01-09,ODI,2.0,7.0,3.0,0.0,14.0,0.0,4.66,England,Cape Town,45789,45789_ODI#1033_2.0
4,#1034,1996-01-11,ODI,2.0,6.0,5.0,0.0,27.0,0.0,5.40,England,Bloemfontein,45789,45789_ODI#1034_2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663,#2111,2013-12-26,Test,1.0,4.0,11.0,1.0,36.0,0.0,3.27,India,Durban,45789,45789_Test#2111_1.0
664,#2111,2013-12-26,Test,3.0,,,,,,,India,Durban,45789,45789_Test#2111_3.0
665,#3500,2014-07-06,ODI,2.0,,,,,,,Sri Lanka,Colombo,45789,45789_ODI#3500_2.0
666,#3501,2014-07-09,ODI,1.0,,,,,,,Sri Lanka,Pallekele,45789,45789_ODI#3501_1.0


In [247]:
# preparing concatenated df for fielding stats
concat_df = player_loader.fieldingstats.copy() if player_loader.fieldingstats is not None else pd.DataFrame()
concat_df['Player ID'] = player_loader.player_info['Player ID'][0]
concat_df['Inns ID'] = (concat_df['Player ID'].astype(str)+"_"+
                        concat_df['Format'].astype(str)+
                        concat_df['Match ID'].astype(str)+"_"+
                        concat_df['Inns'].astype(str)
                       ) if player_loader.fieldingstats is not None else None

# adding fielding stats to master dataframe
if concat_df.empty:
    pass
elif fielding_master is None:
    fielding_master = concat_df
else:
    fielding_master = pd.concat([fielding_master, concat_df])\
                    .drop_duplicates(subset=['Inns ID'], keep='last')\
                    .sort_values(by=['Player ID','Start Date'])

fielding_master

Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location,Player ID,Inns ID
0,#1318,1995-12-14,Test,2.0,0.0,0.0,England,Durban,45789,45789_Test#1318_2.0
1,#1321,1996-01-02,Test,1.0,0.0,0.0,England,Cape Town,45789,45789_Test#1321_1.0
2,#1321,1996-01-02,Test,3.0,1.0,1.0,England,Cape Town,45789,45789_Test#1321_3.0
3,#1033,1996-01-09,ODI,2.0,0.0,0.0,England,Cape Town,45789,45789_ODI#1033_2.0
4,#1034,1996-01-11,ODI,2.0,0.0,0.0,England,Bloemfontein,45789,45789_ODI#1034_2.0
...,...,...,...,...,...,...,...,...,...,...
663,#2111,2013-12-26,Test,1.0,1.0,1.0,India,Durban,45789,45789_Test#2111_1.0
664,#2111,2013-12-26,Test,3.0,0.0,0.0,India,Durban,45789,45789_Test#2111_3.0
665,#3500,2014-07-06,ODI,2.0,1.0,1.0,Sri Lanka,Colombo,45789,45789_ODI#3500_2.0
666,#3501,2014-07-09,ODI,1.0,0.0,0.0,Sri Lanka,Pallekele,45789,45789_ODI#3501_1.0


In [248]:
# preparing concatenated df for player info
concat_df = player_loader.player_info.copy()

# adding player info to master dataframe
if concat_df.empty:
    pass
elif info_master is None:
    info_master = concat_df
else:
    info_master = pd.concat([info_master, concat_df])\
                    .drop_duplicates(subset=['Player ID'], keep='last')\
                    .sort_values(by=['Player ID'])

info_master

Unnamed: 0,Player ID,Player URL,FULL NAME,BORN,AGE,BATTING STYLE,BOWLING STYLE,PLAYING ROLE
0,45789,https://www.espncricinfo.com/cricketers/jacque...,Jacques Henry Kallis,"October 16, 1975, Pinelands, Cape Town, Cape P...",49y 182d,Right hand Bat,Right arm Fast medium,Allrounder


Excellent! This is done. 