In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import json
from tqdm import tqdm

import os

from google.colab import drive


In [None]:
# Attach Google Drive to the Colab filesystem so the dataset folder is readable
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder on the mounted Drive that holds the per-match JSON files
data_path = (
    "/content/drive/MyDrive/Colab Notebooks"
    "/Zaryab Project/Dataset"
)


In [None]:
# List the JSON files in the dataset directory to verify the upload.
# os.listdir() returns entries in arbitrary order, so sort the names: this keeps
# json_files[0] (the sample inspected below) and the row order of the exported
# CSVs identical across kernel restarts.
json_files = sorted(f for f in os.listdir(data_path) if f.endswith('.json'))
print(f"Total JSON files found: {len(json_files)}")

Total JSON files found: 15011


In [None]:
# Read one match file (the first in json_files) to inspect the JSON layout
sample_file = os.path.join(data_path, json_files[0])
with open(sample_file, 'r') as f:
    match_data = json.loads(f.read())

# Show which top-level sections the parsed document contains
top_level_keys = match_data.keys()
print(top_level_keys)


dict_keys(['meta', 'info', 'innings'])


In [None]:
# Pretty-print the "info" section of the sample match (4-space indentation)
info_pretty = json.dumps(match_data["info"], indent=4)
print(info_pretty)

{
    "balls_per_over": 6,
    "city": "Colombo",
    "dates": [
        "2024-08-02"
    ],
    "event": {
        "name": "India tour of Sri Lanka",
        "match_number": 1
    },
    "gender": "male",
    "match_type": "ODI",
    "match_type_number": 4752,
    "officials": {
        "match_referees": [
            "RS Madugalle"
        ],
        "reserve_umpires": [
            "RSA Palliyaguruge"
        ],
        "tv_umpires": [
            "PR Reiffel"
        ],
        "umpires": [
            "JS Wilson",
            "RR Wimalasiri"
        ]
    },
    "outcome": {
        "result": "tie"
    },
    "overs": 50,
    "player_of_match": [
        "DN Wellalage"
    ],
    "players": {
        "Sri Lanka": [
            "P Nissanka",
            "WIA Fernando",
            "BKG Mendis",
            "S Samarawickrama",
            "KIC Asalanka",
            "J Liyanage",
            "DN Wellalage",
            "PWH de Silva",
            "A Dananjaya",
            "M Shiraz

In [None]:
def _describe_outcome(outcome):
    """Turn a match ``outcome`` mapping into a single descriptive string.

    Parameters
    ----------
    outcome : dict
        The ``info["outcome"]`` mapping of one match (may be empty).

    Returns
    -------
    str
        * ``"<winner> won by <n> innings and <r> runs"`` for an innings win,
        * ``"<winner> won by <r> runs"`` for a win by runs,
        * ``"<winner> won by <w> wickets"`` for a win by wickets,
        * ``"<winner> won"`` when a winner is named without any margin,
        * the raw ``result`` value (e.g. "tie", "draw") when there is no winner,
        * ``"unknown"`` when neither key is present.
    """
    if "winner" in outcome:
        winner = outcome["winner"]
        margin = outcome.get("by", {})
        win_by_runs = margin.get("runs", 0)
        win_by_wickets = margin.get("wickets", 0)
        win_by_innings = margin.get("innings", 0)

        if win_by_innings > 0:
            return f"{winner} won by {win_by_innings} innings and {win_by_runs} runs"
        if win_by_runs > 0:
            return f"{winner} won by {win_by_runs} runs"
        # Chasing sides win by wickets; without this branch every such match
        # was reported as "won by 0 runs".
        if win_by_wickets > 0:
            return f"{winner} won by {win_by_wickets} wickets"
        # A winner with no recorded margin.
        return f"{winner} won"

    if "result" in outcome:
        return outcome["result"]  # e.g. "draw" or "tie"

    return "unknown"


def _match_row(match):
    """Flatten one parsed match document into the row stored in ``df_matches``.

    Parameters
    ----------
    match : dict
        A parsed match JSON document; only its ``"info"`` section is read.

    Returns
    -------
    dict
        One record with the keys ``match_id``, ``match_type``, ``season``,
        ``venue``, ``team_1``, ``team_2``, ``team_1_players``,
        ``team_2_players``, ``toss_winner``, ``toss_decision`` and ``outcome``.
    """
    info = match.get("info", {})
    teams = info.get("teams", [])
    toss = info.get("toss", {})
    players = info.get("players", {})

    # Resolve both team names once so the player lookups below can never index
    # past the end of `teams` (the original indexed teams[0]/teams[1] unguarded).
    if len(teams) > 1:
        team_1, team_2 = teams[0], teams[1]
    else:
        team_1 = team_2 = "unknown"

    return {
        # NOTE(review): match_type_number is missing from most files in the
        # displayed output (they all fall back to -1), so this column is not
        # actually unique -- confirm before using it as a join key.
        "match_id": info.get("match_type_number", -1),
        "match_type": info.get("match_type", ""),
        "season": info.get("season", ""),
        "venue": info.get("venue", ""),
        "team_1": team_1,
        "team_2": team_2,
        "team_1_players": players.get(team_1, []),
        "team_2_players": players.get(team_2, []),
        "toss_winner": toss.get("winner", "unknown"),
        "toss_decision": toss.get("decision", "unknown"),
        "outcome": _describe_outcome(info.get("outcome", {})),
    }


# List to store match data (one dict per match)
match_data = []

# Loop through all JSON files
for file in tqdm(json_files):
    file_path = os.path.join(data_path, file)

    # Explicit encoding so parsing does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        match = json.load(f)

    match_data.append(_match_row(match))

# Convert to DataFrame
df_matches = pd.DataFrame(match_data)

# Display first few rows
df_matches.head()

100%|██████████| 15011/15011 [01:55<00:00, 129.77it/s]


Unnamed: 0,match_id,match_type,season,venue,team_1,team_2,team_1_players,team_2_players,toss_winner,toss_decision,outcome
0,4752,ODI,2024,"R Premadasa Stadium, Colombo",Sri Lanka,India,"[P Nissanka, WIA Fernando, BKG Mendis, S Samar...","[RG Sharma, Shubman Gill, V Kohli, Washington ...",Sri Lanka,bat,tie
1,-1,ODM,2024,"County Ground, Derby",Derbyshire,Worcestershire,"[HRC Came, LM Reece, BD Guest, DL Lloyd, MJ La...","[GH Roderick, EJ Pollock, RP Jones, JD Libby, ...",Worcestershire,field,Worcestershire won by 0 runs
2,-1,T20,2024,"Lord's, London",Oval Invincibles,London Spirit,"[WG Jacks, DJ Malan, JM Cox, SW Billings, SM C...","[MS Pepper, KK Jennings, OJ Pope, DW Lawrence,...",Oval Invincibles,bat,Oval Invincibles won by 30 runs
3,-1,ODM,2024,"The Cooper Associates County Ground, Taunton",Lancashire,Somerset,"[GJ Bell, Harry Singh, JJ Bohannon, GP Balders...","[GW Thomas, ARI Umeed, LP Goldsworthy, JEK Rew...",Somerset,field,Somerset won by 0 runs
4,-1,T20,2024,"Sophia Gardens, Cardiff",Southern Brave,Welsh Fire,"[AL Davies, JM Vince, JL du Plooy, LJ Evans, K...","[LWP Wells, JM Bairstow, JM Clarke, TB Abell, ...",Welsh Fire,field,Southern Brave won by 42 runs


In [None]:
# Preview the first 100 match rows
df_matches.iloc[:100]

Unnamed: 0,match_id,match_type,season,venue,team_1,team_2,team_1_players,team_2_players,toss_winner,toss_decision,outcome
0,4752,ODI,2024,"R Premadasa Stadium, Colombo",Sri Lanka,India,"[P Nissanka, WIA Fernando, BKG Mendis, S Samar...","[RG Sharma, Shubman Gill, V Kohli, Washington ...",Sri Lanka,bat,tie
1,-1,ODM,2024,"County Ground, Derby",Derbyshire,Worcestershire,"[HRC Came, LM Reece, BD Guest, DL Lloyd, MJ La...","[GH Roderick, EJ Pollock, RP Jones, JD Libby, ...",Worcestershire,field,Worcestershire won by 0 runs
2,-1,T20,2024,"Lord's, London",Oval Invincibles,London Spirit,"[WG Jacks, DJ Malan, JM Cox, SW Billings, SM C...","[MS Pepper, KK Jennings, OJ Pope, DW Lawrence,...",Oval Invincibles,bat,Oval Invincibles won by 30 runs
3,-1,ODM,2024,"The Cooper Associates County Ground, Taunton",Lancashire,Somerset,"[GJ Bell, Harry Singh, JJ Bohannon, GP Balders...","[GW Thomas, ARI Umeed, LP Goldsworthy, JEK Rew...",Somerset,field,Somerset won by 0 runs
4,-1,T20,2024,"Sophia Gardens, Cardiff",Southern Brave,Welsh Fire,"[AL Davies, JM Vince, JL du Plooy, LJ Evans, K...","[LWP Wells, JM Bairstow, JM Clarke, TB Abell, ...",Welsh Fire,field,Southern Brave won by 42 runs
...,...,...,...,...,...,...,...,...,...,...,...
95,-1,IT20,2018/19,Bayuemas Oval,Myanmar,Singapore,"[KK Lin Thu, Khin Aye, Y Naing Tun, H Lin Aung...","[Aritra Dutta, S Chandramohan, A Mutreja, CR S...",Myanmar,bat,Singapore won by 0 runs
96,-1,T20,2009/10,Feroz Shah Kotla,Delhi Daredevils,Rajasthan Royals,"[DA Warner, V Sehwag, G Gambhir, PD Collingwoo...","[MJ Lumb, NV Ojha, FY Fazal, AA Jhunjhunwala, ...",Delhi Daredevils,bat,Delhi Daredevils won by 67 runs
97,-1,T20,2024,"Headingley, Leeds",London Spirit,Northern Superchargers,"[MS Pepper, KK Jennings, DW Lawrence, MJJ Crit...","[MW Short, G Clark, OG Robinson, HC Brook, N P...",Northern Superchargers,field,Northern Superchargers won by 21 runs
98,-1,IT20,2018,Gahanga International Cricket Stadium. Rwanda,Tanzania,Kenya,"[Jitin Singh, AR Patwa, R Amarshi, Muhammad Za...","[DM Gondaria, AA Obanda, RR Patel, Gurdeep Sin...",Tanzania,bat,Kenya won by 0 runs


In [None]:
# Define save path in Google Drive
match_csv_path = "/content/drive/My Drive/Colab Notebooks/Zaryab Project/cricket_data/match_data.csv"

# to_csv() does not create missing folders, so make sure the target
# directory exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(match_csv_path), exist_ok=True)

# Save to CSV (index omitted; it is just the default row counter)
df_matches.to_csv(match_csv_path, index=False)

print(f"✅ Match data saved to {match_csv_path}")


✅ Match data saved to /content/drive/My Drive/Colab Notebooks/Zaryab Project/cricket_data/match_data.csv


In [None]:
# Export one row per delivery for the batter and one for the bowler, streamed
# to CSV in batches so the full ball-by-ball table never has to sit in memory.

# Define save path in Google Drive
player_csv_path = "/content/drive/My Drive/Colab Notebooks/Zaryab Project/cricket_data/player_performance.csv"

# Single source of truth for the CSV layout: used for the header row and to
# order the columns of every appended batch (batches are written header-less).
PLAYER_COLUMNS = [
    "match_id", "match_type", "season", "venue", "team", "player", "role",
    "runs_scored", "total_runs", "extras", "dismissal", "player_out",
    "player_of_match", "runs_conceded", "extras_given", "wickets_taken"
]

# Dismissal kinds that count as a wicket for the bowler. Run outs, retirements,
# obstructing the field, timed out, etc. are not credited to the bowler.
BOWLER_CREDITED_DISMISSALS = frozenset({
    "bowled", "caught", "caught and bowled", "lbw", "stumped", "hit wicket"
})

# to_csv() does not create missing folders, so ensure the directory exists.
os.makedirs(os.path.dirname(player_csv_path), exist_ok=True)

# Initialize CSV with headers (mode='w' truncates any previous export)
pd.DataFrame(columns=PLAYER_COLUMNS).to_csv(player_csv_path, index=False, mode='w')

# Process JSON files in batches
batch_size = 500  # Number of matches accumulated before each write
batch = []

for i, file in enumerate(tqdm(json_files)):  # Process all files
    file_path = os.path.join(data_path, file)

    # Explicit encoding so parsing does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        match = json.load(f)

    info = match.get("info", {})
    # NOTE(review): match_type_number is absent from most files in the displayed
    # output (they fall back to -1), so this is not a unique key -- confirm
    # before joining on it.
    match_id = info.get("match_type_number", -1)
    match_type = info.get("match_type", "")
    season = info.get("season", "")
    venue = info.get("venue", "")

    teams = info.get("teams", [])

    # Get Player of the Match (set for O(1) membership tests per delivery)
    player_of_match_list = set(info.get("player_of_match", []))

    # Process innings data
    for inning in match.get("innings", []):
        # `team` on an innings is the batting side.
        team_name = inning.get("team", "unknown")

        # The bowler belongs to the other side. Previously bowler rows were
        # tagged with the batting team (e.g. an India bowler listed under
        # Sri Lanka). Fall back to "unknown" if the opponent is ambiguous.
        opponents = [t for t in teams if t != team_name]
        bowling_team = opponents[0] if len(opponents) == 1 else "unknown"

        for over in inning.get("overs", []):
            for delivery in over.get("deliveries", []):
                batter = delivery.get("batter", "")
                bowler = delivery.get("bowler", "")

                runs = delivery.get("runs", {})
                runs_batter = runs.get("batter", 0)
                total_runs = runs.get("total", 0)
                extras = runs.get("extras", 0)

                # Wicket handling: the batter row records the first wicket
                # event of the delivery, as before.
                wickets = delivery.get("wickets", [])
                if wickets:
                    wicket_info = wickets[0]
                    dismissal_type = wicket_info.get("kind", "")
                    player_out = wicket_info.get("player_out", "")
                else:
                    dismissal_type, player_out = None, None

                # Only bowler-credited dismissal kinds count towards the
                # bowler's tally (every wicket event on the ball is checked).
                wickets_taken = sum(
                    1 for w in wickets
                    if w.get("kind") in BOWLER_CREDITED_DISMISSALS
                )

                # Check if the player is Player of the Match
                batter_potm = 1 if batter in player_of_match_list else 0
                bowler_potm = 1 if bowler in player_of_match_list else 0

                # Store batting stats
                if batter:
                    batch.append({
                        "match_id": match_id, "match_type": match_type,
                        "season": season, "venue": venue,
                        "team": team_name, "player": batter, "role": "batter",
                        "runs_scored": runs_batter, "total_runs": total_runs, "extras": extras,
                        "dismissal": dismissal_type, "player_out": player_out,
                        "player_of_match": batter_potm,
                        "runs_conceded": None, "extras_given": None, "wickets_taken": None
                    })

                # Store bowling stats
                # NOTE(review): runs_conceded/extras_given are the delivery's
                # full total and extras; presumably that also includes byes and
                # leg-byes, which are not normally charged to the bowler --
                # confirm whether that is intended.
                if bowler:
                    batch.append({
                        "match_id": match_id, "match_type": match_type,
                        "season": season, "venue": venue,
                        "team": bowling_team, "player": bowler, "role": "bowler",
                        "runs_scored": None, "total_runs": None, "extras": None,
                        "dismissal": None, "player_out": None,
                        "player_of_match": bowler_potm,
                        "runs_conceded": total_runs, "extras_given": extras,
                        "wickets_taken": wickets_taken
                    })

    # Write batch to CSV every `batch_size` matches and after the last file
    if (i + 1) % batch_size == 0 or i == len(json_files) - 1:
        if batch:  # nothing to append if these matches had no deliveries
            # columns=PLAYER_COLUMNS pins the order to the header written above.
            pd.DataFrame(batch, columns=PLAYER_COLUMNS).to_csv(
                player_csv_path, index=False, mode='a', header=False
            )
        batch = []  # Clear memory

print(f"✅ Player performance data saved to {player_csv_path}")


100%|██████████| 15011/15011 [06:52<00:00, 36.39it/s]

✅ Player performance data saved to /content/drive/My Drive/Colab Notebooks/Zaryab Project/cricket_data/player_performance.csv





In [None]:
# Reload the exported tables from Drive.
match_csv_path = "/content/drive/My Drive/Colab Notebooks/Zaryab Project/cricket_data/match_data.csv"
player_csv_path = "/content/drive/My Drive/Colab Notebooks/Zaryab Project/cricket_data/player_performance.csv"

# `season` holds both plain years (2024) and spans such as "2018/19", and the
# dismissal columns are mostly empty with occasional text. Left to inference,
# pandas can settle on different dtypes for different parts of the file and
# warn about mixed types, so read these columns as strings explicitly.
# Empty cells still come back as NaN.
# Note: team_1_players / team_2_players come back as the text form of the
# original Python lists, not as lists.
df_matches = pd.read_csv(match_csv_path, dtype={"season": str})
df_players = pd.read_csv(
    player_csv_path,
    dtype={"season": str, "dismissal": str, "player_out": str},
)

# Display first few rows
df_matches.head()

  df_players = pd.read_csv(player_csv_path)


Unnamed: 0,match_id,match_type,season,venue,team_1,team_2,team_1_players,team_2_players,toss_winner,toss_decision,outcome
0,4752,ODI,2024,"R Premadasa Stadium, Colombo",Sri Lanka,India,"['P Nissanka', 'WIA Fernando', 'BKG Mendis', '...","['RG Sharma', 'Shubman Gill', 'V Kohli', 'Wash...",Sri Lanka,bat,tie
1,-1,ODM,2024,"County Ground, Derby",Derbyshire,Worcestershire,"['HRC Came', 'LM Reece', 'BD Guest', 'DL Lloyd...","['GH Roderick', 'EJ Pollock', 'RP Jones', 'JD ...",Worcestershire,field,Worcestershire won by 0 runs
2,-1,T20,2024,"Lord's, London",Oval Invincibles,London Spirit,"['WG Jacks', 'DJ Malan', 'JM Cox', 'SW Billing...","['MS Pepper', 'KK Jennings', 'OJ Pope', 'DW La...",Oval Invincibles,bat,Oval Invincibles won by 30 runs
3,-1,ODM,2024,"The Cooper Associates County Ground, Taunton",Lancashire,Somerset,"['GJ Bell', 'Harry Singh', 'JJ Bohannon', 'GP ...","['GW Thomas', 'ARI Umeed', 'LP Goldsworthy', '...",Somerset,field,Somerset won by 0 runs
4,-1,T20,2024,"Sophia Gardens, Cardiff",Southern Brave,Welsh Fire,"['AL Davies', 'JM Vince', 'JL du Plooy', 'LJ E...","['LWP Wells', 'JM Bairstow', 'JM Clarke', 'TB ...",Welsh Fire,field,Southern Brave won by 42 runs


In [None]:
# Preview the first five ball-by-ball player rows
df_players.iloc[:5]

Unnamed: 0,match_id,match_type,season,venue,team,player,role,runs_scored,total_runs,extras,dismissal,player_out,player_of_match,runs_conceded,extras_given,wickets_taken
0,4752,ODI,2024,"R Premadasa Stadium, Colombo",Sri Lanka,P Nissanka,batter,0.0,0.0,0.0,,,0,,,
1,4752,ODI,2024,"R Premadasa Stadium, Colombo",Sri Lanka,Mohammed Siraj,bowler,,,,,,0,0.0,0.0,0.0
2,4752,ODI,2024,"R Premadasa Stadium, Colombo",Sri Lanka,P Nissanka,batter,0.0,0.0,0.0,,,0,,,
3,4752,ODI,2024,"R Premadasa Stadium, Colombo",Sri Lanka,Mohammed Siraj,bowler,,,,,,0,0.0,0.0,0.0
4,4752,ODI,2024,"R Premadasa Stadium, Colombo",Sri Lanka,P Nissanka,batter,0.0,0.0,0.0,,,0,,,
