## Scraping test

In [1]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

class Cricketer_Stats_Scraper:

    def __init__(self, player_name):
        self.player_name = player_name
        self.player_id = None
        self.player_url = None
    
        # Initialize class variables for storing stats
        self.battingstats = None
        self.bowlingstats = None
        self.allroundstats = None
        self.fieldingstats = None
        self.player_info = None

        # Set up the WebDriver and open the search URL
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")  # Run in headless mode
        options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
        print("Setting up WebDriver...")
        self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        # Call get_player_url() to fetch the player's URL and ID when the object is initialized
        self.get_player_url()

    def get_player_url(self):
        start_time = time.time()
        print(f"Extracting {self.player_name}'s player URL and Player ID....")
        search_url = f"https://search.espncricinfo.com/ci/content/site/search.html?search={self.player_name.lower().replace(' ', '%20')};type=player"
        self.driver.get(search_url)

        try:
            player_link_element = self.driver.find_element(By.CSS_SELECTOR, "h3.name.link-cta a")
            self.player_url = player_link_element.get_attribute("href")
            self.player_id = self.player_url.split('-')[-1]
            print(f"Extraction Successful for {self.player_name}.")
            end_time = time.time()
            print(f"Time taken to extract URL: {end_time - start_time:.2f} seconds")
        except Exception as e:
            print(f"Error in extracting {self.player_name}'s url:", e)
            return None, None

    def extract_inns_data(self, record_type):
        start_time = time.time()
        print(f"Starting extraction of {self.player_name}'s {record_type} stats....")
        
        # Construct the search URL based on record_type (batting, bowling, etc.)
        search_url = f"https://stats.espncricinfo.com/ci/engine/player/{self.player_id}.html?class=11;template=results;type={record_type};view=innings"
        
        # Open the URL
        self.driver.get(search_url)

        # Step 1: Extract the headers of the table
        headers = self.driver.find_elements(By.CSS_SELECTOR, "thead tr.headlinks th")
        header_names = [header.text for header in headers if header.text != ''] + ['Match id']  # Add match_id column name
        
        # Step 2: Extract the data from the 4th tbody
        rows = self.driver.find_elements(By.XPATH, "(//tbody)[4]//tr")
        
        # Step 3: Extract the data column-wise and store it in a list
        player_data = []
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            row_data = [cell.text for cell in cells if cell.text != '']
            player_data.append(row_data)
        
        # Step 4: Create a DataFrame from the extracted data
        innings_data = pd.DataFrame(player_data, columns=header_names)
        
        end_time = time.time()
        print(f"Extracted {innings_data.shape[0]} records in {end_time - start_time:.2f} seconds")
        
        return innings_data

    def extract_player_info(self):
        try:
            start_time = time.time()
            print(f"Starting extraction of {self.player_name}'s personal info....")
            
            # Start by opening the player info URL
            self.driver.get(self.player_url)

            # Step 1: Extract headers within the specified div tag
            headers = self.driver.find_elements(By.XPATH, "//div[@class='ds-grid lg:ds-grid-cols-3 ds-grid-cols-2 ds-gap-4 ds-mb-8']//p[@class='ds-text-tight-m ds-font-regular ds-uppercase ds-text-typo-mid3']")
            header_names = ['Player ID','Player URL']+[header.text for header in headers]

            # Step 2: Extract values within the specified div tag
            values = self.driver.find_elements(By.XPATH, "//div[@class='ds-grid lg:ds-grid-cols-3 ds-grid-cols-2 ds-gap-4 ds-mb-8']//span[@class='ds-text-title-s ds-font-bold ds-text-typo']")
            value_texts = [self.player_id,self.player_url]+[value.text for value in values]

            # Step 3: Create a DataFrame from the extracted data
            player_info = pd.DataFrame([value_texts], columns=header_names)

            end_time = time.time()
            print(f"Extracted player info in {end_time - start_time:.2f} seconds")

            return player_info
            
        except Exception as e:
            print(f"Error in extracting {self.player_name}'s personal info:", e)
            return None

    def get_player_stats(self, stats_type="all"):
        try:
            # Ensure that player ID or player URL is available
            if not (self.player_id or self.player_url):
                print("Player ID is not available. Run get_player_url() first.")
                return
            
            # Fetch personal information if 'personal_info' is passed
            if stats_type == "personal_info":
                self.player_info = self.extract_player_info()
            
            # Fetch batting stats if 'all' or 'batting' is passed
            if stats_type == "all" or stats_type == "batting":
                self.battingstats = self.extract_inns_data('batting')

            # Fetch bowling stats if 'all' or 'bowling' is passed
            if stats_type == "all" or stats_type == "bowling":
                self.bowlingstats = self.extract_inns_data('bowling')

            # Check if the player is an all-rounder and fetch all-round stats
            if stats_type == "all" or stats_type == "allround":
                self.player_info = self.extract_player_info()
                if self.player_info is not None and 'allround' in self.player_info['PLAYING ROLE'][0].lower():
                    self.allroundstats = self.extract_inns_data('allround')

            # Fetch fielding stats if 'all' or 'fielding' is passed
            if stats_type == "all" or stats_type == "fielding":
                self.fieldingstats = self.extract_inns_data('fielding')

        except Exception as e:
            print(f"Error in extracting stats for {self.player_name}: ", e)


    def __del__(self):
        try:
            self.driver.quit()
            print("WebDriver closed successfully.")
        except Exception as e:
            print("Error while closing the WebDriver:", e)



##### Extracting Ground Information

The following functions are built to extract ground information. However, these are very resource-intensive, so we will take it up later. 

In [3]:
def extract_ground_links(player_id):
    """
    This function extracts ground links from the player innings data, ensuring no duplicate ground info is scraped.
    """
    # Step 1: Initialize the DataFrame to store ground info
    ground_info_df = pd.DataFrame(columns=["Ground ID", "Stadium Name", "Location", "Home Team", "Image URL"])

    # Set up the WebDriver for scraping
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
    # Scrape Player Stats Page
    search_url = f"https://stats.espncricinfo.com/ci/engine/player/{player_id}.html?class=11;template=results;type=batting;view=innings"
    driver.get(search_url)
    
    # Extract ground links from innings data
    rows = driver.find_elements(By.XPATH, "(//tbody)[4]//tr")
    ground_links = []
    
    for row in rows:
        try:
            ground_name_element = row.find_element(By.XPATH, ".//td[contains(@class, 'left')][2]/a")
            ground_name = ground_name_element.text
            ground_link = ground_name_element.get_attribute('href')
            ground_links.append((ground_name, ground_link))
        except Exception as e:
            print(f"Error extracting ground data: {e}")
            continue

    # Step 2: Check if the ground has already been scraped (exists in ground_info DataFrame)
    for ground_name, ground_link in ground_links:
        if ground_name not in ground_info_df['Stadium Name'].values:
            # Create a new DataFrame for the new ground
            new_data = pd.DataFrame({"Stadium Name": [ground_name], "Ground Link": [ground_link]})
            ground_info_df = pd.concat([ground_info_df, new_data], ignore_index=True)
        else:
            print(f"Ground {ground_name} has already been scraped. Skipping.")

    # Step 3: Extract ground info for each link and append it to the ground_info_df
    for ground_link in ground_info_df['Ground Link']:
        ground_info_df = extract_ground_info(ground_link, ground_info_df)
    
    driver.quit()
    return ground_info_df

def extract_ground_info(ground_url, ground_info_df):
    """
    This function extracts ground information (ID, stadium name, location, home team, image URL)
    from a given ground URL and appends the data to the provided dataframe.
    """
    start_time = time.time()
    
    # Set up the WebDriver for scraping ground info
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
    driver.get(ground_url)
    
    try:
        # 1. Ground ID (numeric portion of the URL)
        ground_id = ground_url.split('/')[-1].split('.')[0]
        
        # 2. Ground image URL
        img_element = driver.find_element(By.XPATH, "//div[@class='ds-p-0']//img[1]")
        image_url = img_element.get_attribute("src")
        
        # 3. Stadium Name
        stadium_name = driver.find_element(By.XPATH, "//span[contains(@class, 'ds-text-title-m') and contains(@class, 'ds-font-bold')]").text
        
        # 4. Location (City)
        location = driver.find_element(By.XPATH, "//span[contains(@class, 'ds-text-compact-s') and contains(@class, 'ds-font-bold')]").text.strip().replace("\n", ", ")
        
        # 5. Home Team (Country)
        home_team_text = driver.find_element(By.XPATH, "//span[contains(text(), 'Grounds in')]").text
        home_team = home_team_text.split("Grounds in")[-1].strip()
        
        # Prepare the ground info as a dictionary
        ground_info = pd.DataFrame({
            "Ground ID": [ground_id],
            "Stadium Name": [stadium_name],
            "Location": [location],
            "Home Team": [home_team],
            "Image URL": [image_url]
        })
        
        # Append the ground info to the DataFrame
        ground_info_df = pd.concat([ground_info_df,ground_info], ignore_index=True)
        print(f"Extracted info for ground {stadium_name} in {time.time() - start_time:.2f} seconds.")
    
    except Exception as e:
        print(f"Error while extracting info for {ground_url}: {e}")
    
    driver.quit()
    return ground_info_df

In [4]:
ground_url = 'https://www.espncricinfo.com/cricket-grounds/rangiri-dambulla-international-stadium-59368'

# Set up the WebDriver for scraping ground info
options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
driver.get(ground_url)
    
try:
    # 1. Ground ID (numeric portion of the URL)
    ground_id = ground_url.split('/')[-1].split('.')[0]
        
    # 2. Ground image URL
    img_element = driver.find_element(By.XPATH, "//div[@class='ds-p-0']//img")
    image_url = img_element.get_attribute("src")
        
        # 3. Stadium Name
    stadium_name = driver.find_element(By.XPATH, "//span[contains(@class, 'ds-text-title-m') and contains(@class, 'ds-font-bold')]").text
        
    # 4. Location (City)
    location = driver.find_element(By.XPATH, "//span[contains(@class, 'ds-text-compact-s') and contains(@class, 'ds-font-bold')]").text.strip().replace("\n", ", ")
        
    # 5. Home Team (Country)
    home_team_text = driver.find_element(By.XPATH, "//span[contains(text(), 'Grounds in')]").text
    home_team = home_team_text.split("Grounds in")[-1].strip()
        
    # Prepare the ground info as a dictionary
    ground_info = pd.DataFrame({
            "Ground ID": [ground_id],
            "Stadium Name": [stadium_name],
            "Location": [location],
            "Home Team": [home_team],
            "Image URL": [image_url]
        })

except Exception as e : print(e)
    
ground_info

KeyboardInterrupt: 

In [None]:
player_id = 253802
grounds = extract_ground_links(player_id)

grounds

## Transformation

In [9]:
import time
import pandas as pd
import numpy as np

class Cricketer_Stats_Transformer:
    
    def __init__(self, player_name):
        self.player_name = player_name
        self.player_info = None
        self.battingstats = None
        self.bowlingstats = None
        self.allroundstats = None
        self.fieldingstats = None
        self.player_id = None
        self.player_url = None

batting['Runs']

In [13]:
batting['Mins'].describe()

count     639
unique    208
top         -
freq       96
Name: Mins, dtype: object

In [14]:
batting['Mins'].unique()

array(['33', '82', '40', '87', '45', '6', '28', '-', '121', '53', '32',
       '20', '93', '19', '136', '98', '128', '3', '125', '123', '30',
       '35', '31', '13', '57', '5', '22', '193', '162', '78', '83', '10',
       '122', '43', '51', '126', '11', '113', '12', '16', '2', '38', '69',
       '15', '118', '1', '68', '135', '72', '62', '92', '9', '18', '102',
       '70', '134', '42', '21', '154', '99', '50', '107', '191', '237',
       '96', '29', '156', '41', '44', '116', '36', '133', '148', '211',
       '158', '80', '187', '65', '143', '284', '130', '67', '60', '61',
       '17', '86', '356', '25', '110', '266', '119', '157', '76', '46',
       '66', '120', '8', '26', '104', '4', '159', '7', '257', '254',
       '114', '179', '88', '27', '14', '129', '131', '195', '23', '75',
       '39', '77', '34', '54', '124', '164', '109', '192', '267', '279',
       '380', '315', '24', '168', '106', '49', '64', '71', '55', '178',
       '58', '155', '202', '73', '74', '259', '84', '174', '8

In [15]:
batting[batting['Mins']=='-']

Unnamed: 0,Runs,Mins,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start Date,match_id
7,,-,-,-,-,-,-,-,-,ODI v Australia,Centurion,28 Sep 2009,ODI # 2901
13,107.0,-,114,11,1,93.85,4,caught,2,ODI v Sri Lanka,Eden Gardens,24 Dec 2009,ODI # 2935
14,,-,-,-,-,-,-,-,-,ODI v Sri Lanka,Delhi,27 Dec 2009,ODI # 2936
20,31.0,-,46,1,0,67.39,5,caught,1,ODI v South Africa,Jaipur,21 Feb 2010,ODI # 2961
21,,-,-,-,-,-,-,-,1,ODI v South Africa,Gwalior,24 Feb 2010,ODI # 2962
...,...,...,...,...,...,...,...,...,...,...,...,...,...
583,,-,-,-,-,-,-,-,3,Test v West Indies,Port of Spain,20 Jul 2023,Test # 2513
584,,-,-,-,-,-,-,-,2,ODI v West Indies,Bridgetown,27 Jul 2023,ODI # 4622
586,,-,-,-,-,-,-,-,2,ODI v Nepal,Pallekele,4 Sep 2023,ODI # 4632
589,,-,-,-,-,-,-,-,2,ODI v Sri Lanka,Colombo (RPS),17 Sep 2023,ODI # 4649


In [16]:
batting['Mins'] = batting['Mins'].replace("-", np.nan).astype(float)
batting['Mins']

0       33.0
1       82.0
2       40.0
3       87.0
4       45.0
       ...  
634      NaN
635    125.0
636     40.0
637    101.0
638     14.0
Name: Mins, Length: 639, dtype: float64

In [17]:
batting['BF'].describe()

count     639
unique    166
top         -
freq       29
Name: BF, dtype: object

In [18]:
batting['BF'].unique()

array(['22', '67', '38', '66', '46', '2', '24', '-', '104', '41', '16',
       '19', '65', '114', '12', '102', '68', '95', '8', '71', '0', '92',
       '29', '21', '27', '14', '34', '3', '57', '121', '73', '70', '6',
       '83', '5', '53', '20', '76', '33', '49', '103', '1', '10', '54',
       '107', '9', '18', '36', '93', '63', '98', '30', '99', '111', '123',
       '11', '85', '81', '136', '213', '62', '94', '28', '25', '86',
       '120', '82', '148', '113', '119', '35', '48', '193', '39', '32',
       '13', '61', '55', '60', '295', '17', '79', '206', '129', '64',
       '52', '108', '88', '78', '84', '100', '181', '87', '135', '122',
       '51', '50', '44', '58', '75', '56', '126', '184', '175', '272',
       '230', '42', '4', '191', '140', '165', '97', '117', '47', '37',
       '283', '90', '40', '366', '134', '267', '109', '127', '340', '105',
       '26', '15', '246', '23', '101', '115', '96', '116', '125', '106',
       '287', '217', '159', '72', '225', '152', '197', '130', '

In [19]:
batting[batting['BF']=='-']

Unnamed: 0,Runs,Mins,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start Date,match_id
7,,,-,-,-,-,-,-,-,ODI v Australia,Centurion,28 Sep 2009,ODI # 2901
14,,,-,-,-,-,-,-,-,ODI v Sri Lanka,Delhi,27 Dec 2009,ODI # 2936
21,,,-,-,-,-,-,-,1,ODI v South Africa,Gwalior,24 Feb 2010,ODI # 2962
28,,,-,-,-,-,-,-,2,T20I v Zimbabwe,Harare,13 Jun 2010,T20I # 183
68,,,-,-,-,-,-,-,4,Test v West Indies,Roseau,6 Jul 2011,Test # 1999
109,,,-,-,-,-,-,-,2,T20I v South Africa,Johannesburg,30 Mar 2012,T20I # 242
145,,,-,-,-,-,-,-,4,Test v Australia,Chennai,22 Feb 2013,Test # 2074
164,,,-,-,-,-,-,-,2,ODI v Zimbabwe,Bulawayo,1 Aug 2013,ODI # 3402
165,,,-,-,-,-,-,-,2,ODI v Zimbabwe,Bulawayo,3 Aug 2013,ODI # 3403
170,,,-,-,-,-,-,-,2,ODI v Australia,Ranchi,23 Oct 2013,ODI # 3422


In [20]:
batting['BF'] = batting['BF'].replace("-", np.nan).astype(float)
batting['BF']

0      22.0
1      67.0
2      38.0
3      66.0
4      46.0
       ... 
634     NaN
635    86.0
636    29.0
637    69.0
638    12.0
Name: BF, Length: 639, dtype: float64

In [21]:
batting['4s'].describe()

count     639
unique     27
top         0
freq      125
Name: 4s, dtype: object

In [22]:
batting['4s'].unique()

array(['1', '6', '4', '7', '3', '0', '-', '9', '11', '5', '10', '8', '2',
       '16', '14', '22', '12', '15', '13', '18', '20', '24', '25', '17',
       '19', '21', '33'], dtype=object)

In [23]:
batting[batting['4s']=='-']

Unnamed: 0,Runs,Mins,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start Date,match_id
7,,,,-,-,-,-,-,-,ODI v Australia,Centurion,28 Sep 2009,ODI # 2901
14,,,,-,-,-,-,-,-,ODI v Sri Lanka,Delhi,27 Dec 2009,ODI # 2936
21,,,,-,-,-,-,-,1,ODI v South Africa,Gwalior,24 Feb 2010,ODI # 2962
28,,,,-,-,-,-,-,2,T20I v Zimbabwe,Harare,13 Jun 2010,T20I # 183
68,,,,-,-,-,-,-,4,Test v West Indies,Roseau,6 Jul 2011,Test # 1999
109,,,,-,-,-,-,-,2,T20I v South Africa,Johannesburg,30 Mar 2012,T20I # 242
145,,,,-,-,-,-,-,4,Test v Australia,Chennai,22 Feb 2013,Test # 2074
164,,,,-,-,-,-,-,2,ODI v Zimbabwe,Bulawayo,1 Aug 2013,ODI # 3402
165,,,,-,-,-,-,-,2,ODI v Zimbabwe,Bulawayo,3 Aug 2013,ODI # 3403
170,,,,-,-,-,-,-,2,ODI v Australia,Ranchi,23 Oct 2013,ODI # 3422


In [24]:
batting['4s'] = batting['4s'].replace("-", np.nan).astype(float)
batting['4s']

0      1.0
1      6.0
2      4.0
3      7.0
4      3.0
      ... 
634    NaN
635    4.0
636    0.0
637    0.0
638    1.0
Name: 4s, Length: 639, dtype: float64

In [25]:
batting['6s'].describe()

count     639
unique     10
top         0
freq      434
Name: 6s, dtype: object

In [26]:
batting['6s'].unique()

array(['0', '1', '-', '2', '7', '3', '4', '5', '6', '8'], dtype=object)

In [27]:
batting[batting['6s']=='-']

Unnamed: 0,Runs,Mins,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start Date,match_id
7,,,,,-,-,-,-,-,ODI v Australia,Centurion,28 Sep 2009,ODI # 2901
14,,,,,-,-,-,-,-,ODI v Sri Lanka,Delhi,27 Dec 2009,ODI # 2936
21,,,,,-,-,-,-,1,ODI v South Africa,Gwalior,24 Feb 2010,ODI # 2962
28,,,,,-,-,-,-,2,T20I v Zimbabwe,Harare,13 Jun 2010,T20I # 183
68,,,,,-,-,-,-,4,Test v West Indies,Roseau,6 Jul 2011,Test # 1999
109,,,,,-,-,-,-,2,T20I v South Africa,Johannesburg,30 Mar 2012,T20I # 242
145,,,,,-,-,-,-,4,Test v Australia,Chennai,22 Feb 2013,Test # 2074
164,,,,,-,-,-,-,2,ODI v Zimbabwe,Bulawayo,1 Aug 2013,ODI # 3402
165,,,,,-,-,-,-,2,ODI v Zimbabwe,Bulawayo,3 Aug 2013,ODI # 3403
170,,,,,-,-,-,-,2,ODI v Australia,Ranchi,23 Oct 2013,ODI # 3422


In [28]:
batting['6s'] = batting['6s'].replace("-", np.nan).astype(float)
batting['6s']

0      0.0
1      0.0
2      0.0
3      0.0
4      1.0
      ... 
634    NaN
635    0.0
636    0.0
637    0.0
638    0.0
Name: 6s, Length: 639, dtype: float64

In [29]:
batting['SR'].describe()

count      639
unique     423
top       0.00
freq        37
Name: SR, dtype: object

In [30]:
batting['SR'].unique()

array(['54.54', '55.22', '65.78', '81.81', '67.39', '100.00', '66.66',
       '-', '75.96', '73.17', '62.50', '142.10', '83.07', '93.85',
       '75.00', '89.21', '104.41', '107.36', '25.00', '80.28', '89.13',
       '62.06', '71.57', '123.80', '50.00', '71.42', '82.35', '0.00',
       '64.91', '97.52', '100.96', '87.67', '90.00', '147.36', '77.14',
       '64.70', '68.29', '94.56', '33.33', '120.48', '160.00', '64.15',
       '60.00', '77.63', '72.72', '42.85', '116.66', '78.64', '90.38',
       '40.00', '27.77', '25.23', '56.60', '80.00', '75.34', '38.88',
       '44.44', '115.05', '58.73', '114.28', '86.86', '93.75', '46.84',
       '55.26', '95.12', '209.09', '94.11', '52.38', '56.09', '37.50',
       '54.32', '55.14', '54.46', '35.48', '104.76', '129.16', '91.17',
       '81.91', '64.28', '48.00', '79.51', '77.77', '154.65', '80.48',
       '123.64', '93.80', '20.00', '58.46', '107.56', '65.71', '141.66',
       '54.20', '53.36', '62.19', '170.73', '128.20', '125.00', '115.38',
  

In [31]:
batting[batting['SR']=='-']

Unnamed: 0,Runs,Mins,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start Date,match_id
7,,,,,,-,-,-,-,ODI v Australia,Centurion,28 Sep 2009,ODI # 2901
14,,,,,,-,-,-,-,ODI v Sri Lanka,Delhi,27 Dec 2009,ODI # 2936
21,,,,,,-,-,-,1,ODI v South Africa,Gwalior,24 Feb 2010,ODI # 2962
23,0.0,3.0,0.0,0.0,0.0,-,3,run out,1,ODI v Zimbabwe,Bulawayo,28 May 2010,ODI # 2981
28,,,,,,-,-,-,2,T20I v Zimbabwe,Harare,13 Jun 2010,T20I # 183
68,,,,,,-,-,-,4,Test v West Indies,Roseau,6 Jul 2011,Test # 1999
109,,,,,,-,-,-,2,T20I v South Africa,Johannesburg,30 Mar 2012,T20I # 242
145,,,,,,-,-,-,4,Test v Australia,Chennai,22 Feb 2013,Test # 2074
164,,,,,,-,-,-,2,ODI v Zimbabwe,Bulawayo,1 Aug 2013,ODI # 3402
165,,,,,,-,-,-,2,ODI v Zimbabwe,Bulawayo,3 Aug 2013,ODI # 3403


In [32]:
batting.loc[batting['BF']==0, 'SR'] = 0
batting['SR'] = batting['SR'].replace("-", np.nan).astype(float)

batting['SR']

0      54.54
1      55.22
2      65.78
3      81.81
4      67.39
       ...  
634      NaN
635    41.86
636    17.24
637    24.63
638    50.00
Name: SR, Length: 639, dtype: float64

In [33]:
batting['Dismissal'].describe()

count        639
unique         8
top       caught
freq         368
Name: Dismissal, dtype: object

In [34]:
batting['Dismissal'].unique()

array(['lbw', 'caught', 'run out', 'bowled', 'not out', '-', 'stumped',
       'hit wicket'], dtype=object)

In [35]:
batting[batting['Dismissal']=='-']

Unnamed: 0,Runs,Mins,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start Date,match_id
7,,,,,,,-,-,-,ODI v Australia,Centurion,28 Sep 2009,ODI # 2901
14,,,,,,,-,-,-,ODI v Sri Lanka,Delhi,27 Dec 2009,ODI # 2936
21,,,,,,,-,-,1,ODI v South Africa,Gwalior,24 Feb 2010,ODI # 2962
28,,,,,,,-,-,2,T20I v Zimbabwe,Harare,13 Jun 2010,T20I # 183
68,,,,,,,-,-,4,Test v West Indies,Roseau,6 Jul 2011,Test # 1999
109,,,,,,,-,-,2,T20I v South Africa,Johannesburg,30 Mar 2012,T20I # 242
145,,,,,,,-,-,4,Test v Australia,Chennai,22 Feb 2013,Test # 2074
164,,,,,,,-,-,2,ODI v Zimbabwe,Bulawayo,1 Aug 2013,ODI # 3402
165,,,,,,,-,-,2,ODI v Zimbabwe,Bulawayo,3 Aug 2013,ODI # 3403
170,,,,,,,-,-,2,ODI v Australia,Ranchi,23 Oct 2013,ODI # 3422


In [36]:
batting['Dismissal'] = batting['Dismissal'].replace("-", np.nan)
batting['Dismissal']

0          lbw
1       caught
2      run out
3       bowled
4          lbw
        ...   
634        NaN
635     caught
636     caught
637     caught
638     caught
Name: Dismissal, Length: 639, dtype: object

In [37]:
batting['Pos'].describe()

count     639
unique      8
top         3
freq      316
Name: Pos, dtype: object

In [38]:
batting['Pos'].unique()

array(['2', '1', '7', '4', '-', '3', '5', '6'], dtype=object)

In [39]:
batting[batting['Pos']=='-']

Unnamed: 0,Runs,Mins,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start Date,match_id
7,,,,,,,-,,-,ODI v Australia,Centurion,28 Sep 2009,ODI # 2901
14,,,,,,,-,,-,ODI v Sri Lanka,Delhi,27 Dec 2009,ODI # 2936
21,,,,,,,-,,1,ODI v South Africa,Gwalior,24 Feb 2010,ODI # 2962
28,,,,,,,-,,2,T20I v Zimbabwe,Harare,13 Jun 2010,T20I # 183
68,,,,,,,-,,4,Test v West Indies,Roseau,6 Jul 2011,Test # 1999
109,,,,,,,-,,2,T20I v South Africa,Johannesburg,30 Mar 2012,T20I # 242
145,,,,,,,-,,4,Test v Australia,Chennai,22 Feb 2013,Test # 2074
164,,,,,,,-,,2,ODI v Zimbabwe,Bulawayo,1 Aug 2013,ODI # 3402
165,,,,,,,-,,2,ODI v Zimbabwe,Bulawayo,3 Aug 2013,ODI # 3403
170,,,,,,,-,,2,ODI v Australia,Ranchi,23 Oct 2013,ODI # 3422


In [40]:
batting['Pos'] = batting['Pos'].replace("-", np.nan).astype(float)
batting['Pos']

0      2.0
1      2.0
2      1.0
3      1.0
4      1.0
      ... 
634    NaN
635    4.0
636    4.0
637    4.0
638    4.0
Name: Pos, Length: 639, dtype: float64

In [41]:
batting['Inns'].describe()

count     639
unique      5
top         1
freq      268
Name: Inns, dtype: object

In [42]:
batting['Inns'].unique()

array(['1', '2', '-', '3', '4'], dtype=object)

In [43]:
batting[batting['Inns']=='-']

Unnamed: 0,Runs,Mins,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start Date,match_id
7,,,,,,,,,-,ODI v Australia,Centurion,28 Sep 2009,ODI # 2901
14,,,,,,,,,-,ODI v Sri Lanka,Delhi,27 Dec 2009,ODI # 2936
180,,,,,,,,,-,ODI v South Africa,Centurion,11 Dec 2013,ODI # 3444
293,,,,,,,,,-,Test v West Indies,Port of Spain,18 Aug 2016,Test # 2218
408,,,,,,,,,-,T20I v Australia,Melbourne,23 Nov 2018,T20I # 713
442,,,,,,,,,-,ODI v West Indies,Providence,8 Aug 2019,ODI # 4196
463,,,,,,,,,-,T20I v Sri Lanka,Guwahati,5 Jan 2020,T20I # 1025


In [44]:
batting['Inns'] = batting['Inns'].replace("-", np.nan).astype(float)
batting['Inns']

0      1.0
1      2.0
2      1.0
3      1.0
4      2.0
      ... 
634    4.0
635    2.0
636    4.0
637    1.0
638    3.0
Name: Inns, Length: 639, dtype: float64

In [45]:
batting['Opposition'].describe()

count                 639
unique                 38
top       ODI v Sri Lanka
freq                   56
Name: Opposition, dtype: object

In [46]:
batting['Opposition'].unique()

array(['ODI v Sri Lanka', 'ODI v Pakistan', 'ODI v Australia',
       'ODI v West Indies', 'ODI v Bangladesh', 'ODI v South Africa',
       'ODI v Zimbabwe', 'T20I v Zimbabwe', 'ODI v New Zealand',
       'T20I v South Africa', 'ODI v England', 'ODI v Ireland',
       'ODI v Netherlands', 'T20I v West Indies', 'Test v West Indies',
       'T20I v England', 'Test v Australia', 'T20I v Australia',
       'T20I v Sri Lanka', 'Test v New Zealand', 'T20I v New Zealand',
       'T20I v Afghanistan', 'T20I v Pakistan', 'Test v England',
       'Test v South Africa', 'ODI v Afghanistan', 'T20I v Bangladesh',
       'ODI v U.A.E.', 'Test v Bangladesh', 'Test v Sri Lanka',
       'T20I v U.A.E.', 'T20I v Ireland', 'T20I v Scotland',
       'T20I v Namibia', 'T20I v Hong Kong', 'T20I v Netherlands',
       'ODI v Nepal', 'T20I v U.S.A.'], dtype=object)

We will split this into 2 columns - "Format" and "Opposition". 

In [47]:
#extracting the format (content before " v ") and 
# opposition (content after " v ") from the 'Opposition' column

batting['Format'] = batting['Opposition'].str.extract(r'(^.*?)\sv\s')
batting['Opposition'] = batting['Opposition'].str.extract(r'\sv\s(.*?$)')

In [48]:
batting['Format'].unique()

array(['ODI', 'T20I', 'Test'], dtype=object)

In [49]:
batting['Opposition'].unique()

array(['Sri Lanka', 'Pakistan', 'Australia', 'West Indies', 'Bangladesh',
       'South Africa', 'Zimbabwe', 'New Zealand', 'England', 'Ireland',
       'Netherlands', 'Afghanistan', 'U.A.E.', 'Scotland', 'Namibia',
       'Hong Kong', 'Nepal', 'U.S.A.'], dtype=object)

In [50]:
batting['Ground'].describe()

count        639
unique        80
top       Mirpur
freq          29
Name: Ground, dtype: object

In [51]:
batting['Ground'].unique()

array(['Dambulla', 'Colombo (RPS)', 'Centurion', 'Johannesburg',
       'Vadodara', 'Mohali', 'Rajkot', 'Nagpur', 'Eden Gardens', 'Delhi',
       'Mirpur', 'Jaipur', 'Gwalior', 'Ahmedabad', 'Bulawayo', 'Harare',
       'Visakhapatnam', 'Guwahati', 'Bengaluru', 'Chennai', 'Durban',
       'Cape Town', 'Gqeberha', 'Wankhede', 'Port of Spain',
       'North Sound', 'Kingston', 'Bridgetown', 'Roseau', 'Manchester',
       'Chester-le-Street', 'Southampton', 'The Oval', "Lord's",
       'Cardiff', 'Hyderabad', 'Cuttack', 'Indore', 'Melbourne', 'Sydney',
       'W.A.C.A', 'Adelaide', 'Brisbane', 'Hobart', 'Hambantota',
       'Pallekele', 'Pune', 'Kochi', 'Ranchi', 'Dharamsala', 'Birmingham',
       'Kanpur', 'Napier', 'Hamilton', 'Auckland', 'Wellington',
       'Fatullah', 'Nottingham', 'Leeds', 'Galle', 'Colombo (PSS)',
       'Colombo (SSC)', 'Canberra', 'Gros Islet', 'Lauderhill',
       'Thiruvananthapuram', 'Dublin (Malahide)', 'Bristol', 'Brabourne',
       'Perth', 'Mount Maunganui'

We can see inconsistency here. Some places, the Ground names (Eden Gardens, W.A.C.A.) are given, and others - city names (Melbourne, Kanpur). 

To make this consistent, we will keep only the city names. For clearer context, we will rename the column to "Location". 

The list of mapping is as follows - 

1. All Colombo grounds (containing the word "Colombo") to be renamed as Colombo. 
2. Eden Gardens -> Kolkata
3. Wankhede,Brabourne -> Mumbai
4. Kingston -> Kingston Jamaica
5. The Oval, Lords -> London
6. W.A.C.A -> Perth
7. Dharamsala -> Dharamshala
8. Hamilton -> Hamilton Waikato
9. Fatullah -> Fatullah Dhaka
10. Providence -> Providence Guyana
11. Dubai (DICS) -> Dubai
12. Chattogram -> Chattogram Chittagong

Let us create a dictionary for this.

In [63]:
# Define the mapping dictionary
ground_mapping = {
    "Colombo (SSC)": "Colombo",
    "Colombo (PSS)": "Colombo",
    "Colombo (RPS)": "Colombo",
    "Eden Gardens": "Kolkata",
    "Wankhede": "Mumbai",
    "Brabourne": "Mumbai",
    "Kingston": "Kingston Jamaica",
    "The Oval": "London",
    "Lord's": "London",
    "W.A.C.A": "Perth",
    "Dharamsala": "Dharamshala",
    "Hamilton": "Hamilton Waikato",
    "Fatullah": "Fatullah Dhaka",
    "Providence": "Providence Guyana",
    "Dubai (DICS)": "Dubai",
    "Chattogram": "Chattogram Chittagong"
}

batting['Ground'].replace(ground_mapping).unique()

array(['Dambulla', 'Colombo', 'Centurion', 'Johannesburg', 'Vadodara',
       'Mohali', 'Rajkot', 'Nagpur', 'Kolkata', 'Delhi', 'Mirpur',
       'Jaipur', 'Gwalior', 'Ahmedabad', 'Bulawayo', 'Harare',
       'Visakhapatnam', 'Guwahati', 'Bengaluru', 'Chennai', 'Durban',
       'Cape Town', 'Gqeberha', 'Mumbai', 'Port of Spain', 'North Sound',
       'Kingston Jamaica', 'Bridgetown', 'Roseau', 'Manchester',
       'Chester-le-Street', 'Southampton', 'London', 'Cardiff',
       'Hyderabad', 'Cuttack', 'Indore', 'Melbourne', 'Sydney', 'Perth',
       'Adelaide', 'Brisbane', 'Hobart', 'Hambantota', 'Pallekele',
       'Pune', 'Kochi', 'Ranchi', 'Dharamshala', 'Birmingham', 'Kanpur',
       'Napier', 'Hamilton Waikato', 'Auckland', 'Wellington',
       'Fatullah Dhaka', 'Nottingham', 'Leeds', 'Galle', 'Canberra',
       'Gros Islet', 'Lauderhill', 'Thiruvananthapuram',
       'Dublin (Malahide)', 'Bristol', 'Mount Maunganui',
       'Providence Guyana', 'Christchurch', 'Dubai', 'Abu Dhabi',

It works. Now, let us apply this, and rename the column to 'Location'.

In [75]:
batting['Ground']=batting['Ground'].replace(ground_mapping)
batting = batting.rename(columns={'Ground':'Location'})

batting['Location'].unique()

array(['Dambulla', 'Colombo', 'Centurion', 'Johannesburg', 'Vadodara',
       'Mohali', 'Rajkot', 'Nagpur', 'Kolkata', 'Delhi', 'Mirpur',
       'Jaipur', 'Gwalior', 'Ahmedabad', 'Bulawayo', 'Harare',
       'Visakhapatnam', 'Guwahati', 'Bengaluru', 'Chennai', 'Durban',
       'Cape Town', 'Gqeberha', 'Mumbai', 'Port of Spain', 'North Sound',
       'Kingston Jamaica', 'Bridgetown', 'Roseau', 'Manchester',
       'Chester-le-Street', 'Southampton', 'London', 'Cardiff',
       'Hyderabad', 'Cuttack', 'Indore', 'Melbourne', 'Sydney', 'Perth',
       'Adelaide', 'Brisbane', 'Hobart', 'Hambantota', 'Pallekele',
       'Pune', 'Kochi', 'Ranchi', 'Dharamshala', 'Birmingham', 'Kanpur',
       'Napier', 'Hamilton Waikato', 'Auckland', 'Wellington',
       'Fatullah Dhaka', 'Nottingham', 'Leeds', 'Galle', 'Canberra',
       'Gros Islet', 'Lauderhill', 'Thiruvananthapuram',
       'Dublin (Malahide)', 'Bristol', 'Mount Maunganui',
       'Providence Guyana', 'Christchurch', 'Dubai', 'Abu Dhabi',

In [76]:
batting['match_id'].describe()

Match ID      string[python]
Start Date    datetime64[ns]
Format        string[python]
Inns                   Int64
Pos                    Int64
Runs                   Int64
BF                     Int64
4s                     Int64
6s                     Int64
SR                   float64
Mins                   Int64
Dismissal     string[python]
Opposition    string[python]
Location      string[python]
dtype: object

Match ID      string[python]
Start Date    datetime64[ns]
Format        string[python]
Inns                   Int64
Pos                    Int64
Overs                float64
Mdns                   Int64
Runs                   Int64
Wkts                   Int64
Econ                 float64
Opposition    string[python]
Location      string[python]
dtype: object

Match ID      string[python]
Start Date    datetime64[ns]
Format        string[python]
Inns                   Int64
Dis                    Int64
Ct                     Int64
Opposition    string[python]
Location      string[python]
dtype: object

count             639
unique            543
top       Test # 2081
freq                2
Name: match_id, dtype: object

In [82]:
batting['match_id'].unique()

array(['ODI # 2742', 'ODI # 2745', 'ODI # 2750', 'ODI # 2755',
       'ODI # 2756', 'ODI # 2889', 'ODI # 2898', 'ODI # 2901',
       'ODI # 2904', 'ODI # 2913', 'ODI # 2919', 'ODI # 2932',
       'ODI # 2933', 'ODI # 2935', 'ODI # 2936', 'ODI # 2938',
       'ODI # 2939', 'ODI # 2941', 'ODI # 2942', 'ODI # 2943',
       'ODI # 2961', 'ODI # 2962', 'ODI # 2963', 'ODI # 2981',
       'ODI # 2983', 'ODI # 2986', 'ODI # 2988', 'T20I # 182',
       'T20I # 183', 'ODI # 2993', 'ODI # 2996', 'ODI # 2999',
       'ODI # 3001', 'ODI # 3032', 'ODI # 3039', 'ODI # 3040',
       'ODI # 3060', 'ODI # 3070', 'ODI # 3072', 'ODI # 3074',
       'ODI # 3076', 'ODI # 3077', 'T20I # 196', 'ODI # 3079',
       'ODI # 3080', 'ODI # 3082', 'ODI # 3084', 'ODI # 3087',
       'ODI # 3100', 'ODI # 3110', 'ODI # 3121', 'ODI # 3124',
       'ODI # 3128', 'ODI # 3141', 'ODI # 3143', 'ODI # 3147',
       'ODI # 3148', 'T20I # 200', 'ODI # 3159', 'ODI # 3160',
       'ODI # 3161', 'ODI # 3162', 'ODI # 3163', 'Test 

We can clearly see that the format mentioned here is not needed. We can simply strip it to contain text after the #. 

In [83]:
#extracting the match id by keeping the hashtag and numbers
batting['match_id']='#'+batting['match_id'].str.extract(r'(\d+$)')
batting = batting.rename(columns={'match_id':'Match ID'})

batting['Match ID']

0      #2742
1      #2745
2      #2750
3      #2755
4      #2756
       ...  
634    #2570
635    #2571
636    #2571
637    #2575
638    #2575
Name: Match ID, Length: 639, dtype: object

In [84]:
batting = batting[['Match ID','Start Date','Format','Inns','Pos','Runs','BF','4s','6s','SR','Mins','Dismissal',
         'Opposition','Location']]

batting.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 639 entries, 0 to 638
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Match ID    639 non-null    object        
 1   Start Date  639 non-null    datetime64[ns]
 2   Format      639 non-null    object        
 3   Inns        632 non-null    float64       
 4   Pos         610 non-null    float64       
 5   Runs        610 non-null    float64       
 6   BF          610 non-null    float64       
 7   4s          610 non-null    float64       
 8   6s          610 non-null    float64       
 9   SR          610 non-null    float64       
 10  Mins        543 non-null    float64       
 11  Dismissal   610 non-null    object        
 12  Opposition  639 non-null    object        
 13  Location    639 non-null    object        
dtypes: datetime64[ns](1), float64(8), object(5)
memory usage: 70.0+ KB


In [85]:
batting.head()

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Runs,BF,4s,6s,SR,Mins,Dismissal,Opposition,Location
0,#2742,2008-08-18,ODI,1.0,2.0,12.0,22.0,1.0,0.0,54.54,33.0,lbw,Sri Lanka,Dambulla
1,#2745,2008-08-20,ODI,2.0,2.0,37.0,67.0,6.0,0.0,55.22,82.0,caught,Sri Lanka,Dambulla
2,#2750,2008-08-24,ODI,1.0,1.0,25.0,38.0,4.0,0.0,65.78,40.0,run out,Sri Lanka,Colombo
3,#2755,2008-08-27,ODI,1.0,1.0,54.0,66.0,7.0,0.0,81.81,87.0,bowled,Sri Lanka,Colombo
4,#2756,2008-08-29,ODI,2.0,1.0,31.0,46.0,3.0,1.0,67.39,45.0,lbw,Sri Lanka,Colombo


Our Batting Dataset is ready.

In [86]:
bowling.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 656 entries, 0 to 655
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Overs       656 non-null    object
 1   Mdns        656 non-null    object
 2   Runs        656 non-null    object
 3   Wkts        656 non-null    object
 4   Econ        656 non-null    object
 5   Pos         656 non-null    object
 6   Inns        656 non-null    object
 7   Opposition  656 non-null    object
 8   Ground      656 non-null    object
 9   Start Date  656 non-null    object
 10  match_id    656 non-null    object
dtypes: object(11)
memory usage: 56.5+ KB


In [87]:
bowling.head()

Unnamed: 0,Overs,Mdns,Runs,Wkts,Econ,Pos,Inns,Opposition,Ground,Start Date,match_id
0,DNB,-,-,-,-,-,2,ODI v Sri Lanka,Dambulla,18 Aug 2008,ODI # 2742
1,DNB,-,-,-,-,-,1,ODI v Sri Lanka,Dambulla,20 Aug 2008,ODI # 2745
2,DNB,-,-,-,-,-,2,ODI v Sri Lanka,Colombo (RPS),24 Aug 2008,ODI # 2750
3,DNB,-,-,-,-,-,2,ODI v Sri Lanka,Colombo (RPS),27 Aug 2008,ODI # 2755
4,DNB,-,-,-,-,-,1,ODI v Sri Lanka,Colombo (RPS),29 Aug 2008,ODI # 2756


We will follow the same process here:

1. Replace all the DNB, TDNB values with NaN. 
2. Replace all the '-' values with NaN. 
3. Split the opposition column into Opposition and Format. 
4. Clean the Ground column to keep only Location.
5. Change Data type of start date. 
6. Extract only the numeric part of match ID. 

Let us begin. 

In [88]:
bowling['Overs'].describe()

count     656
unique     17
top       DNB
freq      578
Name: Overs, dtype: object

In [89]:
bowling['Overs'].unique()

array(['DNB', '3.0', '1.4', '2.0', '1.0', '6.0', '5.0', '4.0', '2.4',
       '8.0', '2.5', '7.0', '1.1', '0.5', 'TDNB', '0.2', '0.3'],
      dtype=object)

In [93]:
bowling['Overs'] = bowling['Overs'].str.strip("*").replace(
    ["TDNB","DNB"], np.nan).astype(float)

bowling['Overs'].unique()

array([nan, 3. , 1.4, 2. , 1. , 6. , 5. , 4. , 2.4, 8. , 2.5, 7. , 1.1,
       0.5, 0.2, 0.3])

In [94]:
bowling['Mdns'].describe()

count     656
unique      3
top         -
freq      582
Name: Mdns, dtype: object

In [95]:
bowling['Mdns'].unique()

array(['-', '0', '1'], dtype=object)

In [99]:
bowling['Mdns'] = bowling['Mdns'].replace('-',np.nan).astype(float)
bowling['Mdns'].unique()

array([nan,  0.,  1.])

In [100]:
bowling['Runs'].describe()

count     656
unique     28
top         -
freq      582
Name: Runs, dtype: object

In [101]:
bowling['Runs'].unique()

array(['-', '21', '12', '11', '16', '6', '7', '13', '22', '44', '18',
       '20', '14', '9', '23', '3', '4', '15', '27', '10', '26', '17',
       '24', '36', '1', '0', '5', '2'], dtype=object)

In [105]:
bowling['Runs'] = bowling['Runs'].replace('-',np.nan).astype(float)
bowling['Runs'].unique()

array([nan, 21., 12., 11., 16.,  6.,  7., 13., 22., 44., 18., 20., 14.,
        9., 23.,  3.,  4., 15., 27., 10., 26., 17., 24., 36.,  1.,  0.,
        5.,  2.])

In [106]:
bowling['Wkts'].describe()

count     656
unique      3
top         -
freq      582
Name: Wkts, dtype: object

In [107]:
bowling['Wkts'].unique()

array(['-', '0', '1'], dtype=object)

In [109]:
bowling['Wkts'] = bowling['Wkts'].replace('-',np.nan).astype(float)
bowling['Wkts'].unique()

array([nan,  0.,  1.])

In [110]:
bowling['Econ'].describe()

count     656
unique     34
top         -
freq      582
Name: Econ, dtype: object

In [111]:
bowling['Econ'].unique()

array(['-', '7.00', '7.20', '5.50', '5.33', '6.00', '4.33', '7.33',
       '3.66', '3.60', '6.66', '3.50', '4.87', '4.50', '11.00', '2.87',
       '3.00', '4.00', '14.00', '15.00', '9.00', '10.00', '10.50', '7.41',
       '6.50', '5.66', '12.00', '18.00', '7.50', '5.14', '2.16', '0.00',
       '4.28', '1.00'], dtype=object)

In [113]:
bowling['Econ'] = bowling['Econ'].replace('-',np.nan).astype(float)
bowling['Econ'].unique()

array([  nan,  7.  ,  7.2 ,  5.5 ,  5.33,  6.  ,  4.33,  7.33,  3.66,
        3.6 ,  6.66,  3.5 ,  4.87,  4.5 , 11.  ,  2.87,  3.  ,  4.  ,
       14.  , 15.  ,  9.  , 10.  , 10.5 ,  7.41,  6.5 ,  5.66, 12.  ,
       18.  ,  7.5 ,  5.14,  2.16,  0.  ,  4.28,  1.  ])

In [114]:
bowling['Pos'].describe()

count     656
unique      7
top         -
freq      582
Name: Pos, dtype: object

In [115]:
bowling['Pos'].unique()

array(['-', '4', '3', '6', '5', '7', '8'], dtype=object)

In [117]:
bowling['Pos'] = bowling['Pos'].replace('-',np.nan).astype(float)
bowling['Pos'].unique()

array([nan,  4.,  3.,  6.,  5.,  7.,  8.])

In [118]:
bowling['Inns'].describe()

count     656
unique      5
top         1
freq      274
Name: Inns, dtype: object

In [119]:
bowling['Inns'].unique()

array(['2', '1', '4', '3', '-'], dtype=object)

In [121]:
bowling['Inns'] = bowling['Inns'].replace('-',np.nan).astype(float)
bowling['Inns'].unique()

array([ 2.,  1.,  4.,  3., nan])

In [122]:
bowling['Opposition'].describe()

count                  656
unique                  38
top       Test v Australia
freq                    58
Name: Opposition, dtype: object

In [123]:
bowling['Opposition'].unique()

array(['ODI v Sri Lanka', 'ODI v Pakistan', 'ODI v Australia',
       'ODI v West Indies', 'ODI v Bangladesh', 'ODI v South Africa',
       'ODI v Zimbabwe', 'T20I v Zimbabwe', 'ODI v New Zealand',
       'T20I v South Africa', 'ODI v England', 'ODI v Ireland',
       'ODI v Netherlands', 'T20I v West Indies', 'Test v West Indies',
       'T20I v England', 'Test v Australia', 'T20I v Australia',
       'T20I v Sri Lanka', 'Test v New Zealand', 'T20I v New Zealand',
       'T20I v Afghanistan', 'T20I v Pakistan', 'Test v England',
       'Test v South Africa', 'ODI v Afghanistan', 'T20I v Bangladesh',
       'ODI v U.A.E.', 'Test v Bangladesh', 'Test v Sri Lanka',
       'T20I v U.A.E.', 'T20I v Ireland', 'T20I v Scotland',
       'T20I v Namibia', 'T20I v Hong Kong', 'T20I v Netherlands',
       'ODI v Nepal', 'T20I v U.S.A.'], dtype=object)

In [124]:
bowling['Format'] = bowling['Opposition'].str.extract(r'(^.*?)\sv\s')
bowling['Opposition'] = bowling['Opposition'].str.extract(r'\sv\s(.*?$)')

bowling[['Opposition','Format']]

Unnamed: 0,Opposition,Format
0,Sri Lanka,ODI
1,Sri Lanka,ODI
2,Sri Lanka,ODI
3,Sri Lanka,ODI
4,Sri Lanka,ODI
...,...,...
651,Australia,Test
652,Australia,Test
653,Australia,Test
654,Australia,Test


In [125]:
bowling['Ground'].describe()

count        656
unique        80
top       Mirpur
freq          29
Name: Ground, dtype: object

In [126]:
bowling['Ground'].unique()

array(['Dambulla', 'Colombo (RPS)', 'Centurion', 'Johannesburg',
       'Vadodara', 'Mohali', 'Rajkot', 'Nagpur', 'Eden Gardens', 'Delhi',
       'Mirpur', 'Jaipur', 'Gwalior', 'Ahmedabad', 'Bulawayo', 'Harare',
       'Visakhapatnam', 'Guwahati', 'Bengaluru', 'Chennai', 'Durban',
       'Cape Town', 'Gqeberha', 'Wankhede', 'Port of Spain',
       'North Sound', 'Kingston', 'Bridgetown', 'Roseau', 'Manchester',
       'Chester-le-Street', 'Southampton', 'The Oval', "Lord's",
       'Cardiff', 'Hyderabad', 'Cuttack', 'Indore', 'Melbourne', 'Sydney',
       'W.A.C.A', 'Adelaide', 'Brisbane', 'Hobart', 'Hambantota',
       'Pallekele', 'Pune', 'Kochi', 'Ranchi', 'Dharamsala', 'Birmingham',
       'Kanpur', 'Napier', 'Hamilton', 'Auckland', 'Wellington',
       'Fatullah', 'Nottingham', 'Leeds', 'Galle', 'Colombo (PSS)',
       'Colombo (SSC)', 'Canberra', 'Gros Islet', 'Lauderhill',
       'Thiruvananthapuram', 'Dublin (Malahide)', 'Bristol', 'Brabourne',
       'Perth', 'Mount Maunganui'

In [127]:
# Define the mapping dictionary
ground_mapping = {
    "Colombo (SSC)": "Colombo",
    "Colombo (PSS)": "Colombo",
    "Colombo (RPS)": "Colombo",
    "Eden Gardens": "Kolkata",
    "Wankhede": "Mumbai",
    "Brabourne": "Mumbai",
    "Kingston": "Kingston Jamaica",
    "The Oval": "London",
    "Lord's": "London",
    "W.A.C.A": "Perth",
    "Dharamsala": "Dharamshala",
    "Hamilton": "Hamilton Waikato",
    "Fatullah": "Fatullah Dhaka",
    "Providence": "Providence Guyana",
    "Dubai (DICS)": "Dubai",
    "Chattogram": "Chattogram Chittagong"
}

bowling['Ground'].replace(ground_mapping).unique()

array(['Dambulla', 'Colombo', 'Centurion', 'Johannesburg', 'Vadodara',
       'Mohali', 'Rajkot', 'Nagpur', 'Kolkata', 'Delhi', 'Mirpur',
       'Jaipur', 'Gwalior', 'Ahmedabad', 'Bulawayo', 'Harare',
       'Visakhapatnam', 'Guwahati', 'Bengaluru', 'Chennai', 'Durban',
       'Cape Town', 'Gqeberha', 'Mumbai', 'Port of Spain', 'North Sound',
       'Kingston Jamaica', 'Bridgetown', 'Roseau', 'Manchester',
       'Chester-le-Street', 'Southampton', 'London', 'Cardiff',
       'Hyderabad', 'Cuttack', 'Indore', 'Melbourne', 'Sydney', 'Perth',
       'Adelaide', 'Brisbane', 'Hobart', 'Hambantota', 'Pallekele',
       'Pune', 'Kochi', 'Ranchi', 'Dharamshala', 'Birmingham', 'Kanpur',
       'Napier', 'Hamilton Waikato', 'Auckland', 'Wellington',
       'Fatullah Dhaka', 'Nottingham', 'Leeds', 'Galle', 'Canberra',
       'Gros Islet', 'Lauderhill', 'Thiruvananthapuram',
       'Dublin (Malahide)', 'Bristol', 'Mount Maunganui',
       'Providence Guyana', 'Christchurch', 'Dubai', 'Abu Dhabi',

In [128]:
bowling['Ground']=bowling['Ground'].replace(ground_mapping)
bowling = bowling.rename(columns={'Ground':'Location'})

bowling['Location'].unique()

array(['Dambulla', 'Colombo', 'Centurion', 'Johannesburg', 'Vadodara',
       'Mohali', 'Rajkot', 'Nagpur', 'Kolkata', 'Delhi', 'Mirpur',
       'Jaipur', 'Gwalior', 'Ahmedabad', 'Bulawayo', 'Harare',
       'Visakhapatnam', 'Guwahati', 'Bengaluru', 'Chennai', 'Durban',
       'Cape Town', 'Gqeberha', 'Mumbai', 'Port of Spain', 'North Sound',
       'Kingston Jamaica', 'Bridgetown', 'Roseau', 'Manchester',
       'Chester-le-Street', 'Southampton', 'London', 'Cardiff',
       'Hyderabad', 'Cuttack', 'Indore', 'Melbourne', 'Sydney', 'Perth',
       'Adelaide', 'Brisbane', 'Hobart', 'Hambantota', 'Pallekele',
       'Pune', 'Kochi', 'Ranchi', 'Dharamshala', 'Birmingham', 'Kanpur',
       'Napier', 'Hamilton Waikato', 'Auckland', 'Wellington',
       'Fatullah Dhaka', 'Nottingham', 'Leeds', 'Galle', 'Canberra',
       'Gros Islet', 'Lauderhill', 'Thiruvananthapuram',
       'Dublin (Malahide)', 'Bristol', 'Mount Maunganui',
       'Providence Guyana', 'Christchurch', 'Dubai', 'Abu Dhabi',

In [129]:
bowling['Start Date'].describe()

count             656
unique            543
top       14 Mar 2013
freq                2
Name: Start Date, dtype: object

In [130]:
bowling['Start Date'].unique()

array(['18 Aug 2008', '20 Aug 2008', '24 Aug 2008', '27 Aug 2008',
       '29 Aug 2008', '14 Sep 2009', '26 Sep 2009', '28 Sep 2009',
       '30 Sep 2009', '25 Oct 2009', '2 Nov 2009', '15 Dec 2009',
       '18 Dec 2009', '24 Dec 2009', '27 Dec 2009', '5 Jan 2010',
       '7 Jan 2010', '10 Jan 2010', '11 Jan 2010', '13 Jan 2010',
       '21 Feb 2010', '24 Feb 2010', '27 Feb 2010', '28 May 2010',
       '30 May 2010', '3 Jun 2010', '5 Jun 2010', '12 Jun 2010',
       '13 Jun 2010', '16 Jun 2010', '19 Jun 2010', '22 Jun 2010',
       '24 Jun 2010', '16 Aug 2010', '25 Aug 2010', '28 Aug 2010',
       '20 Oct 2010', '28 Nov 2010', '1 Dec 2010', '4 Dec 2010',
       '7 Dec 2010', '10 Dec 2010', '9 Jan 2011', '12 Jan 2011',
       '15 Jan 2011', '18 Jan 2011', '21 Jan 2011', '23 Jan 2011',
       '19 Feb 2011', '27 Feb 2011', '6 Mar 2011', '9 Mar 2011',
       '12 Mar 2011', '20 Mar 2011', '24 Mar 2011', '30 Mar 2011',
       '2 Apr 2011', '4 Jun 2011', '6 Jun 2011', '8 Jun 2011',
       '11

In [132]:
bowling['Start Date'] = bowling['Start Date'].astype('datetime64[ns]')
bowling['Start Date']

0     2008-08-18
1     2008-08-20
2     2008-08-24
3     2008-08-27
4     2008-08-29
         ...    
651   2024-12-14
652   2024-12-26
653   2024-12-26
654   2025-01-03
655   2025-01-03
Name: Start Date, Length: 656, dtype: datetime64[ns]

In [134]:
bowling['match_id'].describe()

count             656
unique            543
top       Test # 2081
freq                2
Name: match_id, dtype: object

In [135]:
bowling['match_id'].unique()

array(['ODI # 2742', 'ODI # 2745', 'ODI # 2750', 'ODI # 2755',
       'ODI # 2756', 'ODI # 2889', 'ODI # 2898', 'ODI # 2901',
       'ODI # 2904', 'ODI # 2913', 'ODI # 2919', 'ODI # 2932',
       'ODI # 2933', 'ODI # 2935', 'ODI # 2936', 'ODI # 2938',
       'ODI # 2939', 'ODI # 2941', 'ODI # 2942', 'ODI # 2943',
       'ODI # 2961', 'ODI # 2962', 'ODI # 2963', 'ODI # 2981',
       'ODI # 2983', 'ODI # 2986', 'ODI # 2988', 'T20I # 182',
       'T20I # 183', 'ODI # 2993', 'ODI # 2996', 'ODI # 2999',
       'ODI # 3001', 'ODI # 3032', 'ODI # 3039', 'ODI # 3040',
       'ODI # 3060', 'ODI # 3070', 'ODI # 3072', 'ODI # 3074',
       'ODI # 3076', 'ODI # 3077', 'T20I # 196', 'ODI # 3079',
       'ODI # 3080', 'ODI # 3082', 'ODI # 3084', 'ODI # 3087',
       'ODI # 3100', 'ODI # 3110', 'ODI # 3121', 'ODI # 3124',
       'ODI # 3128', 'ODI # 3141', 'ODI # 3143', 'ODI # 3147',
       'ODI # 3148', 'T20I # 200', 'ODI # 3159', 'ODI # 3160',
       'ODI # 3161', 'ODI # 3162', 'ODI # 3163', 'Test 

In [136]:
#extracting the match id by keeping the hashtag and numbers
bowling['match_id']='#'+bowling['match_id'].str.extract(r'(\d+$)')
bowling = bowling.rename(columns={'match_id':'Match ID'})

bowling['Match ID']

0      #2742
1      #2745
2      #2750
3      #2755
4      #2756
       ...  
651    #2570
652    #2571
653    #2571
654    #2575
655    #2575
Name: Match ID, Length: 656, dtype: object

In [138]:
bowling = bowling[['Match ID','Start Date','Format','Inns','Pos','Overs','Mdns','Runs','Wkts','Econ',
         'Opposition','Location']]

bowling.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 656 entries, 0 to 655
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Match ID    656 non-null    object        
 1   Start Date  656 non-null    datetime64[ns]
 2   Format      656 non-null    object        
 3   Inns        652 non-null    float64       
 4   Pos         74 non-null     float64       
 5   Overs       74 non-null     float64       
 6   Mdns        74 non-null     float64       
 7   Runs        74 non-null     float64       
 8   Wkts        74 non-null     float64       
 9   Econ        74 non-null     float64       
 10  Opposition  656 non-null    object        
 11  Location    656 non-null    object        
dtypes: datetime64[ns](1), float64(7), object(4)
memory usage: 61.6+ KB


In [140]:
bowling.head(10)

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location
0,#2742,2008-08-18,ODI,2.0,,,,,,,Sri Lanka,Dambulla
1,#2745,2008-08-20,ODI,1.0,,,,,,,Sri Lanka,Dambulla
2,#2750,2008-08-24,ODI,2.0,,,,,,,Sri Lanka,Colombo
3,#2755,2008-08-27,ODI,2.0,,,,,,,Sri Lanka,Colombo
4,#2756,2008-08-29,ODI,1.0,,,,,,,Sri Lanka,Colombo
5,#2889,2009-09-14,ODI,2.0,,,,,,,Sri Lanka,Colombo
6,#2898,2009-09-26,ODI,1.0,4.0,3.0,0.0,21.0,0.0,7.0,Pakistan,Centurion
7,#2901,2009-09-28,ODI,1.0,,,,,,,Australia,Centurion
8,#2904,2009-09-30,ODI,1.0,,,,,,,West Indies,Johannesburg
9,#2913,2009-10-25,ODI,1.0,,,,,,,Australia,Vadodara


Our bowling Dataset is ready.

In [141]:
fielding.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 656 entries, 0 to 655
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Dis         656 non-null    object
 1   Ct          656 non-null    object
 2   St          656 non-null    object
 3   Ct Wk       656 non-null    object
 4   Ct Fi       656 non-null    object
 5   Inns        656 non-null    object
 6   Opposition  656 non-null    object
 7   Ground      656 non-null    object
 8   Start Date  656 non-null    object
 9   match_id    656 non-null    object
dtypes: object(10)
memory usage: 51.4+ KB


In [142]:
fielding.head()

Unnamed: 0,Dis,Ct,St,Ct Wk,Ct Fi,Inns,Opposition,Ground,Start Date,match_id
0,0,0,0,0,0,2,ODI v Sri Lanka,Dambulla,18 Aug 2008,ODI # 2742
1,3,3,0,0,3,1,ODI v Sri Lanka,Dambulla,20 Aug 2008,ODI # 2745
2,0,0,0,0,0,2,ODI v Sri Lanka,Colombo (RPS),24 Aug 2008,ODI # 2750
3,0,0,0,0,0,2,ODI v Sri Lanka,Colombo (RPS),27 Aug 2008,ODI # 2755
4,0,0,0,0,0,1,ODI v Sri Lanka,Colombo (RPS),29 Aug 2008,ODI # 2756


We need to follow a different approach here: 

1. First, we will check for inappropriate values in the dataset, one by one.
2. Then, we will drop the 'St' and 'Ct Wk' column, since it is only relevant to a cricketer who is a keeper. 
3. Same formatting principles for Opposition, Ground, Start Date and Match ID. 

Let us begin.

In [143]:
fielding['Dis'].describe()

count     656
unique      5
top         0
freq      392
Name: Dis, dtype: object

In [144]:
fielding['Dis'].unique()

array(['0', '3', '1', '2', 'TDNF'], dtype=object)

We can see that the incorrect entry is 'TDNF', i.e. Team Did Not Field. Let us correct this. 

In [146]:
fielding['Dis']= fielding['Dis'].replace("TDNF", np.nan).astype(float)

fielding['Dis'].unique()

array([ 0.,  3.,  1.,  2., nan])

In [147]:
fielding['Ct'].describe()

count     656
unique      5
top         0
freq      392
Name: Ct, dtype: object

In [148]:
fielding['Ct'].unique()

array(['0', '3', '1', '2', '-'], dtype=object)

In [149]:
fielding['Ct'] = fielding['Ct'].replace('-',np.nan).astype(float)
fielding['Ct'].unique()

array([ 0.,  3.,  1.,  2., nan])

In [150]:
fielding['St'].describe()

count     656
unique      2
top         0
freq      652
Name: St, dtype: object

In [151]:
fielding['St'].unique()

array(['0', '-'], dtype=object)

In [152]:
fielding['St'] = fielding['St'].replace('-',np.nan).astype(float)
fielding['St'].unique()

array([ 0., nan])

In [153]:
fielding['Ct Wk'].describe()

count     656
unique      2
top         0
freq      652
Name: Ct Wk, dtype: object

In [154]:
fielding['Ct Wk'].unique()

array(['0', '-'], dtype=object)

In [155]:
fielding['Ct Wk'] = fielding['Ct Wk'].replace('-',np.nan).astype(float)
fielding['Ct Wk'].unique()

array([ 0., nan])

In [156]:
fielding['Ct Fi'].describe()

count     656
unique      5
top         0
freq      392
Name: Ct Fi, dtype: object

In [157]:
fielding['Ct Fi'].unique()

array(['0', '3', '1', '2', '-'], dtype=object)

In [158]:
fielding['Ct Fi'] = fielding['Ct Fi'].replace('-',np.nan).astype(float)
fielding['Ct Fi'].unique()

array([ 0.,  3.,  1.,  2., nan])

We can notice something logical - since the player will only participate as a fielder, this column should be equivalent to the 'Ct' column. Let us check for this. 

In [164]:
fielding[fielding['Ct']!=fielding['Ct Fi']]

Unnamed: 0,Dis,Ct,St,Ct Wk,Ct Fi,Inns,Opposition,Ground,Start Date,match_id
237,,,,,,-,ODI v Australia,Sydney,26 Jan 2015,ODI # 3592
339,,,,,,-,ODI v West Indies,Port of Spain,23 Jun 2017,ODI # 3895
477,,,,,,-,T20I v Sri Lanka,Guwahati,5 Jan 2020,T20I # 1025
603,,,,,,-,ODI v Pakistan,Pallekele,2 Sep 2023,ODI # 4630


Clearly, they are equal. Hence, we will be dropping this column as well. 

In [None]:
bowling['Pos'].describe()

count     656
unique      7
top         -
freq      582
Name: Pos, dtype: object

In [None]:
bowling['Pos'].unique()

array(['-', '4', '3', '6', '5', '7', '8'], dtype=object)

In [None]:
bowling['Pos'] = bowling['Pos'].replace('-',np.nan).astype(float)
bowling['Pos'].unique()

array([nan,  4.,  3.,  6.,  5.,  7.,  8.])

In [166]:
fielding['Inns'].describe()

count     656
unique      5
top         1
freq      274
Name: Inns, dtype: object

In [167]:
fielding['Inns'].unique()

array(['2', '1', '4', '3', '-'], dtype=object)

In [168]:
fielding['Inns'] = fielding['Inns'].replace('-',np.nan).astype(float)
fielding['Inns'].unique()

array([ 2.,  1.,  4.,  3., nan])

In [169]:
fielding['Opposition'].describe()

count                  656
unique                  38
top       Test v Australia
freq                    58
Name: Opposition, dtype: object

In [170]:
fielding['Opposition'].unique()

array(['ODI v Sri Lanka', 'ODI v Pakistan', 'ODI v Australia',
       'ODI v West Indies', 'ODI v Bangladesh', 'ODI v South Africa',
       'ODI v Zimbabwe', 'T20I v Zimbabwe', 'ODI v New Zealand',
       'T20I v South Africa', 'ODI v England', 'ODI v Ireland',
       'ODI v Netherlands', 'T20I v West Indies', 'Test v West Indies',
       'T20I v England', 'Test v Australia', 'T20I v Australia',
       'T20I v Sri Lanka', 'Test v New Zealand', 'T20I v New Zealand',
       'T20I v Afghanistan', 'T20I v Pakistan', 'Test v England',
       'Test v South Africa', 'ODI v Afghanistan', 'T20I v Bangladesh',
       'ODI v U.A.E.', 'Test v Bangladesh', 'Test v Sri Lanka',
       'T20I v U.A.E.', 'T20I v Ireland', 'T20I v Scotland',
       'T20I v Namibia', 'T20I v Hong Kong', 'T20I v Netherlands',
       'ODI v Nepal', 'T20I v U.S.A.'], dtype=object)

In [171]:
fielding['Format'] = fielding['Opposition'].str.extract(r'(^.*?)\sv\s')
fielding['Opposition'] = fielding['Opposition'].str.extract(r'\sv\s(.*?$)')

fielding[['Opposition','Format']]

Unnamed: 0,Opposition,Format
0,Sri Lanka,ODI
1,Sri Lanka,ODI
2,Sri Lanka,ODI
3,Sri Lanka,ODI
4,Sri Lanka,ODI
...,...,...
651,Australia,Test
652,Australia,Test
653,Australia,Test
654,Australia,Test


In [172]:
fielding['Ground'].describe()

count        656
unique        80
top       Mirpur
freq          29
Name: Ground, dtype: object

In [173]:
fielding['Ground'].unique()

array(['Dambulla', 'Colombo (RPS)', 'Centurion', 'Johannesburg',
       'Vadodara', 'Mohali', 'Rajkot', 'Nagpur', 'Eden Gardens', 'Delhi',
       'Mirpur', 'Jaipur', 'Gwalior', 'Ahmedabad', 'Bulawayo', 'Harare',
       'Visakhapatnam', 'Guwahati', 'Bengaluru', 'Chennai', 'Durban',
       'Cape Town', 'Gqeberha', 'Wankhede', 'Port of Spain',
       'North Sound', 'Kingston', 'Bridgetown', 'Roseau', 'Manchester',
       'Chester-le-Street', 'Southampton', 'The Oval', "Lord's",
       'Cardiff', 'Hyderabad', 'Cuttack', 'Indore', 'Melbourne', 'Sydney',
       'W.A.C.A', 'Adelaide', 'Brisbane', 'Hobart', 'Hambantota',
       'Pallekele', 'Pune', 'Kochi', 'Ranchi', 'Dharamsala', 'Birmingham',
       'Kanpur', 'Napier', 'Hamilton', 'Auckland', 'Wellington',
       'Fatullah', 'Nottingham', 'Leeds', 'Galle', 'Colombo (PSS)',
       'Colombo (SSC)', 'Canberra', 'Gros Islet', 'Lauderhill',
       'Thiruvananthapuram', 'Dublin (Malahide)', 'Bristol', 'Brabourne',
       'Perth', 'Mount Maunganui'

In [174]:
# Define the mapping dictionary
ground_mapping = {
    "Colombo (SSC)": "Colombo",
    "Colombo (PSS)": "Colombo",
    "Colombo (RPS)": "Colombo",
    "Eden Gardens": "Kolkata",
    "Wankhede": "Mumbai",
    "Brabourne": "Mumbai",
    "Kingston": "Kingston Jamaica",
    "The Oval": "London",
    "Lord's": "London",
    "W.A.C.A": "Perth",
    "Dharamsala": "Dharamshala",
    "Hamilton": "Hamilton Waikato",
    "Fatullah": "Fatullah Dhaka",
    "Providence": "Providence Guyana",
    "Dubai (DICS)": "Dubai",
    "Chattogram": "Chattogram Chittagong"
}

fielding['Ground'].replace(ground_mapping).unique()

array(['Dambulla', 'Colombo', 'Centurion', 'Johannesburg', 'Vadodara',
       'Mohali', 'Rajkot', 'Nagpur', 'Kolkata', 'Delhi', 'Mirpur',
       'Jaipur', 'Gwalior', 'Ahmedabad', 'Bulawayo', 'Harare',
       'Visakhapatnam', 'Guwahati', 'Bengaluru', 'Chennai', 'Durban',
       'Cape Town', 'Gqeberha', 'Mumbai', 'Port of Spain', 'North Sound',
       'Kingston Jamaica', 'Bridgetown', 'Roseau', 'Manchester',
       'Chester-le-Street', 'Southampton', 'London', 'Cardiff',
       'Hyderabad', 'Cuttack', 'Indore', 'Melbourne', 'Sydney', 'Perth',
       'Adelaide', 'Brisbane', 'Hobart', 'Hambantota', 'Pallekele',
       'Pune', 'Kochi', 'Ranchi', 'Dharamshala', 'Birmingham', 'Kanpur',
       'Napier', 'Hamilton Waikato', 'Auckland', 'Wellington',
       'Fatullah Dhaka', 'Nottingham', 'Leeds', 'Galle', 'Canberra',
       'Gros Islet', 'Lauderhill', 'Thiruvananthapuram',
       'Dublin (Malahide)', 'Bristol', 'Mount Maunganui',
       'Providence Guyana', 'Christchurch', 'Dubai', 'Abu Dhabi',

In [175]:
fielding['Ground']= fielding['Ground'].replace(ground_mapping)
fielding = fielding.rename(columns={'Ground':'Location'})

fielding['Location'].unique()

array(['Dambulla', 'Colombo', 'Centurion', 'Johannesburg', 'Vadodara',
       'Mohali', 'Rajkot', 'Nagpur', 'Kolkata', 'Delhi', 'Mirpur',
       'Jaipur', 'Gwalior', 'Ahmedabad', 'Bulawayo', 'Harare',
       'Visakhapatnam', 'Guwahati', 'Bengaluru', 'Chennai', 'Durban',
       'Cape Town', 'Gqeberha', 'Mumbai', 'Port of Spain', 'North Sound',
       'Kingston Jamaica', 'Bridgetown', 'Roseau', 'Manchester',
       'Chester-le-Street', 'Southampton', 'London', 'Cardiff',
       'Hyderabad', 'Cuttack', 'Indore', 'Melbourne', 'Sydney', 'Perth',
       'Adelaide', 'Brisbane', 'Hobart', 'Hambantota', 'Pallekele',
       'Pune', 'Kochi', 'Ranchi', 'Dharamshala', 'Birmingham', 'Kanpur',
       'Napier', 'Hamilton Waikato', 'Auckland', 'Wellington',
       'Fatullah Dhaka', 'Nottingham', 'Leeds', 'Galle', 'Canberra',
       'Gros Islet', 'Lauderhill', 'Thiruvananthapuram',
       'Dublin (Malahide)', 'Bristol', 'Mount Maunganui',
       'Providence Guyana', 'Christchurch', 'Dubai', 'Abu Dhabi',

In [None]:
fielding['Start Date'].describe()

count             656
unique            543
top       14 Mar 2013
freq                2
Name: Start Date, dtype: object

In [176]:
fielding['Start Date'].unique()

array(['18 Aug 2008', '20 Aug 2008', '24 Aug 2008', '27 Aug 2008',
       '29 Aug 2008', '14 Sep 2009', '26 Sep 2009', '28 Sep 2009',
       '30 Sep 2009', '25 Oct 2009', '2 Nov 2009', '15 Dec 2009',
       '18 Dec 2009', '24 Dec 2009', '27 Dec 2009', '5 Jan 2010',
       '7 Jan 2010', '10 Jan 2010', '11 Jan 2010', '13 Jan 2010',
       '21 Feb 2010', '24 Feb 2010', '27 Feb 2010', '28 May 2010',
       '30 May 2010', '3 Jun 2010', '5 Jun 2010', '12 Jun 2010',
       '13 Jun 2010', '16 Jun 2010', '19 Jun 2010', '22 Jun 2010',
       '24 Jun 2010', '16 Aug 2010', '25 Aug 2010', '28 Aug 2010',
       '20 Oct 2010', '28 Nov 2010', '1 Dec 2010', '4 Dec 2010',
       '7 Dec 2010', '10 Dec 2010', '9 Jan 2011', '12 Jan 2011',
       '15 Jan 2011', '18 Jan 2011', '21 Jan 2011', '23 Jan 2011',
       '19 Feb 2011', '27 Feb 2011', '6 Mar 2011', '9 Mar 2011',
       '12 Mar 2011', '20 Mar 2011', '24 Mar 2011', '30 Mar 2011',
       '2 Apr 2011', '4 Jun 2011', '6 Jun 2011', '8 Jun 2011',
       '11

In [177]:
fielding['Start Date'] = fielding['Start Date'].astype('datetime64[ns]')
fielding['Start Date']

0     2008-08-18
1     2008-08-20
2     2008-08-24
3     2008-08-27
4     2008-08-29
         ...    
651   2024-12-14
652   2024-12-26
653   2024-12-26
654   2025-01-03
655   2025-01-03
Name: Start Date, Length: 656, dtype: datetime64[ns]

In [178]:
fielding['match_id'].describe()

count             656
unique            543
top       Test # 2081
freq                2
Name: match_id, dtype: object

In [179]:
fielding['match_id'].unique()

array(['ODI # 2742', 'ODI # 2745', 'ODI # 2750', 'ODI # 2755',
       'ODI # 2756', 'ODI # 2889', 'ODI # 2898', 'ODI # 2901',
       'ODI # 2904', 'ODI # 2913', 'ODI # 2919', 'ODI # 2932',
       'ODI # 2933', 'ODI # 2935', 'ODI # 2936', 'ODI # 2938',
       'ODI # 2939', 'ODI # 2941', 'ODI # 2942', 'ODI # 2943',
       'ODI # 2961', 'ODI # 2962', 'ODI # 2963', 'ODI # 2981',
       'ODI # 2983', 'ODI # 2986', 'ODI # 2988', 'T20I # 182',
       'T20I # 183', 'ODI # 2993', 'ODI # 2996', 'ODI # 2999',
       'ODI # 3001', 'ODI # 3032', 'ODI # 3039', 'ODI # 3040',
       'ODI # 3060', 'ODI # 3070', 'ODI # 3072', 'ODI # 3074',
       'ODI # 3076', 'ODI # 3077', 'T20I # 196', 'ODI # 3079',
       'ODI # 3080', 'ODI # 3082', 'ODI # 3084', 'ODI # 3087',
       'ODI # 3100', 'ODI # 3110', 'ODI # 3121', 'ODI # 3124',
       'ODI # 3128', 'ODI # 3141', 'ODI # 3143', 'ODI # 3147',
       'ODI # 3148', 'T20I # 200', 'ODI # 3159', 'ODI # 3160',
       'ODI # 3161', 'ODI # 3162', 'ODI # 3163', 'Test 

In [180]:
#extracting the match id by keeping the hashtag and numbers
fielding['match_id']='#' + fielding['match_id'].str.extract(r'(\d+$)')
fielding = fielding.rename(columns={'match_id':'Match ID'})

fielding['Match ID']

0      #2742
1      #2745
2      #2750
3      #2755
4      #2756
       ...  
651    #2570
652    #2571
653    #2571
654    #2575
655    #2575
Name: Match ID, Length: 656, dtype: object

Now, we will drop the following columns - 'St','Ct Wk','Ct Fi' from our dataset, and re-order the columns. 

In [182]:
fielding.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 656 entries, 0 to 655
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Dis         652 non-null    float64       
 1   Ct          652 non-null    float64       
 2   St          652 non-null    float64       
 3   Ct Wk       652 non-null    float64       
 4   Ct Fi       652 non-null    float64       
 5   Inns        652 non-null    float64       
 6   Opposition  656 non-null    object        
 7   Location    656 non-null    object        
 8   Start Date  656 non-null    datetime64[ns]
 9   Match ID    656 non-null    object        
 10  Format      656 non-null    object        
dtypes: datetime64[ns](1), float64(6), object(4)
memory usage: 56.5+ KB


In [184]:
fielding = fielding[['Match ID','Start Date','Format','Inns',
                     'Dis','Ct','Opposition','Location']]

fielding.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 656 entries, 0 to 655
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Match ID    656 non-null    object        
 1   Start Date  656 non-null    datetime64[ns]
 2   Format      656 non-null    object        
 3   Inns        652 non-null    float64       
 4   Dis         652 non-null    float64       
 5   Ct          652 non-null    float64       
 6   Opposition  656 non-null    object        
 7   Location    656 non-null    object        
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 41.1+ KB


In [185]:
fielding.head(10)

Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location
0,#2742,2008-08-18,ODI,2.0,0.0,0.0,Sri Lanka,Dambulla
1,#2745,2008-08-20,ODI,1.0,3.0,3.0,Sri Lanka,Dambulla
2,#2750,2008-08-24,ODI,2.0,0.0,0.0,Sri Lanka,Colombo
3,#2755,2008-08-27,ODI,2.0,0.0,0.0,Sri Lanka,Colombo
4,#2756,2008-08-29,ODI,1.0,0.0,0.0,Sri Lanka,Colombo
5,#2889,2009-09-14,ODI,2.0,0.0,0.0,Sri Lanka,Colombo
6,#2898,2009-09-26,ODI,1.0,1.0,1.0,Pakistan,Centurion
7,#2901,2009-09-28,ODI,1.0,0.0,0.0,Australia,Centurion
8,#2904,2009-09-30,ODI,1.0,0.0,0.0,West Indies,Johannesburg
9,#2913,2009-10-25,ODI,1.0,1.0,1.0,Australia,Vadodara


Our fielding Dataset is ready.

In [187]:
allround = pd.read_csv('../data/player_allround_stats.csv')

In [188]:
allround.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1294 entries, 0 to 1293
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Inns        1294 non-null   int64 
 1   Score       1294 non-null   object
 2   Overs       1294 non-null   object
 3   Conc        1294 non-null   object
 4   Wkts        1294 non-null   object
 5   Ct          1294 non-null   object
 6   St          1294 non-null   object
 7   Opposition  1294 non-null   object
 8   Ground      1294 non-null   object
 9   Start Date  1294 non-null   object
 10  match_id    1294 non-null   object
dtypes: int64(1), object(10)
memory usage: 111.3+ KB


In [189]:
allround.head()

Unnamed: 0,Inns,Score,Overs,Conc,Wkts,Ct,St,Opposition,Ground,Start Date,match_id
0,1,12,-,-,-,-,-,ODI v Sri Lanka,Dambulla,18 Aug 2008,ODI # 2742
1,2,-,DNB,-,-,0,0,ODI v Sri Lanka,Dambulla,18 Aug 2008,ODI # 2742
2,1,-,DNB,-,-,3,0,ODI v Sri Lanka,Dambulla,20 Aug 2008,ODI # 2745
3,2,37,-,-,-,-,-,ODI v Sri Lanka,Dambulla,20 Aug 2008,ODI # 2745
4,1,25,-,-,-,-,-,ODI v Sri Lanka,Colombo (RPS),24 Aug 2008,ODI # 2750


We can clearly see that the information available here can be obtained by simply combining the previous 3 datasets on the common columns. 

Hence, there is no need to examine this Dataset directly. We will simply do this when we come to our analysis by creating a Data Model. 

In [224]:
display(
    batting.head(),
    bowling.head(),
    fielding.head())

Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location


Unnamed: 0,Match ID,Start Date,Format,Inns,Pos,Overs,Mdns,Runs,Wkts,Econ,Opposition,Location
0,#2742,2008-08-18,ODI,2.0,,,,,,,Sri Lanka,Dambulla
1,#2745,2008-08-20,ODI,1.0,,,,,,,Sri Lanka,Dambulla
2,#2750,2008-08-24,ODI,2.0,,,,,,,Sri Lanka,Colombo
3,#2755,2008-08-27,ODI,2.0,,,,,,,Sri Lanka,Colombo
4,#2756,2008-08-29,ODI,1.0,,,,,,,Sri Lanka,Colombo


Unnamed: 0,Match ID,Start Date,Format,Inns,Dis,Ct,Opposition,Location
0,#2742,2008-08-18,ODI,2.0,0.0,0.0,Sri Lanka,Dambulla
1,#2745,2008-08-20,ODI,1.0,3.0,3.0,Sri Lanka,Dambulla
2,#2750,2008-08-24,ODI,2.0,0.0,0.0,Sri Lanka,Colombo
3,#2755,2008-08-27,ODI,2.0,0.0,0.0,Sri Lanka,Colombo
4,#2756,2008-08-29,ODI,1.0,0.0,0.0,Sri Lanka,Colombo


In [229]:
repl_dict = {
    '*':'',
    'DNB':np.nan,
    'TDNB':np.nan,
    'DNF':np.nan,
    'TDNF':np.nan,
    '-':np.nan
}

In [233]:
allround.replace(repl_dict)

Unnamed: 0,Inns,Score,Overs,Conc,Wkts,Ct,St,Opposition,Ground,Start Date,match_id
0,1,12,,,,,,ODI v Sri Lanka,Dambulla,18 Aug 2008,ODI # 2742
1,2,,,,,0,0,ODI v Sri Lanka,Dambulla,18 Aug 2008,ODI # 2742
2,1,,,,,3,0,ODI v Sri Lanka,Dambulla,20 Aug 2008,ODI # 2745
3,2,37,,,,,,ODI v Sri Lanka,Dambulla,20 Aug 2008,ODI # 2745
4,1,25,,,,,,ODI v Sri Lanka,Colombo (RPS),24 Aug 2008,ODI # 2750
...,...,...,...,...,...,...,...,...,...,...,...
1289,4,5,,,,,,Test v Australia,Melbourne,26 Dec 2024,Test # 2571
1290,1,17,,,,,,Test v Australia,Sydney,3 Jan 2025,Test # 2575
1291,2,,,,,1,0,Test v Australia,Sydney,3 Jan 2025,Test # 2575
1292,3,6,,,,,,Test v Australia,Sydney,3 Jan 2025,Test # 2575


In [234]:
kallis_tf.allroundstats[kallis_tf.allroundstats['Location'].isna()]

Unnamed: 0,Match ID,Start Date,Format,Inns,Score,Overs,Conc,Wkts,Ct,St,Opposition,Location
