In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
from IPython.display import display

# Scrape ESPNcricinfo ball-by-ball commentary for several IPL seasons and
# store one row per delivery in a CSV file.
#
# The pure parsing helpers are defined first so they can be imported and
# tested without touching the network; the crawl itself runs under the
# ``__main__`` guard (which is also true inside a Jupyter notebook, so running
# the cell still performs the scrape and leaves ``all_data`` / ``df`` as
# globals).

# Define User-Agent
headers = {"User-Agent": "Mozilla/5.0"}

# Site root, prepended to relative match links found on a series page
SITE_ROOT = "https://www.espncricinfo.com"

# Base URL for IPL series
BASE_URL = "https://www.espncricinfo.com/series/"

# IPL Series IDs for 2021, 2022, 2023
IPL_SERIES = {
    "IPL 2021": "indian-premier-league-2021-1249214",
    "IPL 2022": "indian-premier-league-2022-1298423",
    "IPL 2023": "indian-premier-league-2023-1345038",
}

# Seconds to wait for a response before giving up on a request
REQUEST_TIMEOUT = 30

# Seconds to pause before each match request (avoid request flooding)
REQUEST_DELAY_SECONDS = 2

# (marker in commentary, shot type, runs); checked in order, first hit wins
RUN_OUTCOMES = (
    ("SIX", "boundary", 6),
    ("FOUR", "boundary", 4),
    ("1 run", "single", 1),
    ("2 runs", "double", 2),
    ("3 runs", "triple", 3),
    ("no run", "dot", 0),
)

# Define column names
columns = [
    "Series Name", "Series Year", "Match Name", "Match Venue & Date", "Team 1 Score", "Team 2 Score",
    "Ball No", "Over", "Bowler Name", "Batter Name", "Ball Type", "Shot Type", "Speed of Ball", "Runs Scored"
]


def series_year(series_id: str) -> str:
    """Return the four-digit year embedded in a series slug.

    The slug ends with the numeric series ID (e.g. ``...-2021-1249214``), so
    the last dash-separated token is NOT the year. The first four-digit token
    is used instead; if none exists the last token is returned as a fallback.
    """
    tokens = series_id.split("-")
    for token in tokens:
        if len(token) == 4 and token.isdecimal():
            return token
    return tokens[-1]


def classify_runs(text: str):
    """Return ``(shot_type, runs_scored)`` for a piece of commentary text.

    Markers are matched as substrings in the order given by ``RUN_OUTCOMES``.
    When nothing matches, ``("other", "unknown")`` is returned.
    """
    for marker, shot_type, runs in RUN_OUTCOMES:
        if marker in text:
            return shot_type, runs
    return "other", "unknown"


def parse_delivery(text: str):
    """Parse one commentary line such as ``"0.1 Boult to Rohit, no run"``.

    Returns ``(ball_no, over, bowler, batter, shot_type, runs_scored)``, or
    ``None`` when the line does not describe a delivery (no numeric
    ``over.ball`` prefix, or no ``"<bowler> to <batter>"`` part).
    """
    pieces = text.split(None, 1)
    if not pieces:
        return None

    ball_no = pieces[0]
    over, dot, ball = ball_no.partition(".")
    # Require a real "<digits>.<digits>" token: anything else (e.g. "e.g.")
    # is ordinary prose and must neither crash nor produce a garbage row.
    if not (dot and over.isdecimal() and ball.isdecimal()):
        return None

    rest = pieces[1] if len(pieces) > 1 else ""

    # The players are named before the first comma. Split on " to " with
    # surrounding spaces so names containing "to" (e.g. Stoinis) stay intact.
    players = rest.split(",", 1)[0]
    bowler, sep, batter = players.partition(" to ")
    bowler, batter = bowler.strip(), batter.strip()
    if not (sep and bowler and batter):
        return None

    # Classify on the text after the ball token so the ball number itself can
    # never be mistaken for part of a run marker.
    shot_type, runs_scored = classify_runs(rest)
    return ball_no, over, bowler, batter, shot_type, runs_scored


def fetch_page(url: str):
    """Return the HTML of *url*, or ``None`` if the request fails.

    A request counts as failed when it raises a ``requests`` exception
    (connection error, timeout, ...) or the status code is not 200.
    """
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"❌ Request error for {url}: {exc}")
        return None
    if response.status_code != 200:
        return None
    return response.text


def collect_match_links(soup):
    """Return the unique commentary URLs linked from a parsed series page.

    Every anchor whose href mentions ``full-scorecard`` is rewritten to the
    matching ``ball-by-ball-commentary`` URL. Order of first appearance is
    kept and repeated links are dropped so no match is scraped twice.
    """
    unique_links = {}
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        if "full-scorecard" not in href:
            continue
        href = href.replace("full-scorecard", "ball-by-ball-commentary")
        url = href if href.startswith("http") else SITE_ROOT + href
        unique_links[url] = None
    return list(unique_links)


def scrape_match(match_html: str, series_name: str, series_id: str):
    """Return one row (list matching ``columns``) per delivery found in *match_html*."""
    match_soup = BeautifulSoup(match_html, "html.parser")

    # Extract Match Details
    heading = match_soup.find("h1")
    match_details = heading.text.strip() if heading else "Unknown Match"
    venue_date_info = match_soup.find("span", class_="ds-text-title-xs ds-font-bold ds-text-typo-mid3")
    venue_date = venue_date_info.text.strip() if venue_date_info else "Unknown Venue & Date"

    # Extract Team Scores (if available)
    team_scores = match_soup.find_all("div", class_="ds-text-compact-s ds-text-typo-mid3")
    team1_score = team_scores[0].text.strip() if len(team_scores) > 0 else "N/A"
    team2_score = team_scores[1].text.strip() if len(team_scores) > 1 else "N/A"

    # Not available from the commentary text; kept as placeholder columns
    ball_type = "Unknown"
    speed = "N/A"

    year = series_year(series_id)
    rows = []
    for comment in match_soup.find_all("div", class_="ds-text-tight-m"):
        parsed = parse_delivery(comment.get_text(strip=True))
        if parsed is None:
            continue  # Skip invalid entries
        ball_no, over, bowler_name, batter_name, shot_type, runs_scored = parsed
        rows.append([
            series_name, year,  # Series Name & Year
            match_details, venue_date, team1_score, team2_score,  # Match Info
            ball_no, over, bowler_name, batter_name, ball_type, shot_type, speed, runs_scored  # Ball Details
        ])
    return rows


if __name__ == "__main__":
    # Initialize Data Storage
    all_data = []

    # Loop through each IPL Series
    for series_name, series_id in IPL_SERIES.items():
        series_url = f"{BASE_URL}{series_id}"
        print(f"🔄 Fetching matches for: {series_name}")

        # Fetch Series Page
        series_html = fetch_page(series_url)
        if series_html is None:
            print(f"❌ Failed to load {series_name} page")
            continue

        # Find all match links
        match_links = collect_match_links(BeautifulSoup(series_html, "html.parser"))
        print(f"✅ Found {len(match_links)} matches for {series_name}")

        # Loop through each match
        for match_url in match_links:
            time.sleep(REQUEST_DELAY_SECONDS)  # Avoid request flooding
            print(f"📌 Fetching commentary for match: {match_url}")

            # Fetch Match Commentary Page
            match_html = fetch_page(match_url)
            if match_html is None:
                print(f"❌ Failed to load match commentary: {match_url}")
                continue

            all_data.extend(scrape_match(match_html, series_name, series_id))

    # Create DataFrame
    df = pd.DataFrame(all_data, columns=columns)

    # Save to CSV
    file_name = "IPL_2021_2023_BALLBYBALL_COMMENTARY.csv"
    df.to_csv(file_name, index=False)

    # Print file path
    file_path = os.path.abspath(file_name)
    if df.empty:
        # Make it obvious that the file holds only the header row
        print("⚠️ No commentary rows were scraped; the CSV contains only the header.")
    print(f"✅ Data saved successfully at: {file_path}")

    # Display first 5 rows
    display(df.head())


🔄 Fetching matches for: IPL 2021
❌ Failed to load IPL 2021 page
🔄 Fetching matches for: IPL 2022
❌ Failed to load IPL 2022 page
🔄 Fetching matches for: IPL 2023
❌ Failed to load IPL 2023 page
✅ Data saved successfully at: C:\Users\HP\IPL_2021_2023_BALLBYBALL_COMMENTARY.csv


Unnamed: 0,Series Name,Series Year,Match Name,Match Venue & Date,Team 1 Score,Team 2 Score,Ball No,Over,Bowler Name,Batter Name,Ball Type,Shot Type,Speed of Ball,Runs Scored
