In [4]:
import os
import json
import pandas as pd
import mysql.connector


# ---------------------------------------------------------------------------
# Parse every T20 match JSON file into two flat record lists:
#   t20_matches    - one dict per match (metadata from the `info` section)
#   t20_deliveries - one dict per delivery (ball-by-ball data from `innings`)
# ---------------------------------------------------------------------------


def _first_wicket(ball_data):
    """Return the first dismissal recorded on a delivery, or ``{}`` if none.

    The JSON layout read below (``overs`` -> ``deliveries`` with a ``batter``
    key) stores dismissals as a list under ``wickets``.  The singular
    ``wicket`` key (a plain dict) is still accepted as a fallback so files in
    that older shape keep working.  Only the first dismissal is returned
    because the target table has a single wicket_kind/player_out pair per
    delivery; any further dismissal on the same ball is not captured.
    """
    wickets = ball_data.get('wickets') or ball_data.get('wicket') or []
    if isinstance(wickets, dict):
        return wickets
    return wickets[0] if wickets else {}


t20_folder = 'data/t20_matches'
# Sorted so that repeated runs process the files in the same order
# (os.listdir returns entries in arbitrary order).
t20_files = sorted(file for file in os.listdir(t20_folder) if file.endswith('.json'))

t20_matches = []
t20_deliveries = []

for file_name in t20_files:
    file_path = os.path.join(t20_folder, file_name)

    # Explicit encoding: JSON text is UTF-8, while the platform default
    # encoding used by open() is not guaranteed to be.
    with open(file_path, 'r', encoding='utf-8') as f:
        match_data = json.load(f)

    info = match_data.get('info', {})
    # The file name without its extension doubles as the match identifier.
    match_id = os.path.splitext(file_name)[0]

    # `or` fallbacks also cover keys that are present but empty/null, which
    # would otherwise raise IndexError when indexed.
    dates = info.get('dates') or [None]
    team1, team2 = (list(info.get('teams') or []) + [None, None])[:2]
    toss = info.get('toss', {})

    match_record = {
        'match_id': match_id,
        'date': dates[0],
        'venue': info.get('venue'),
        'team1': team1,
        'team2': team2,
        'toss_winner': toss.get('winner'),
        'toss_decision': toss.get('decision'),
        # Absent when the outcome has no `winner` entry -> stored as None.
        'winner': info.get('outcome', {}).get('winner'),
        'match_type': 'T20'
    }
    t20_matches.append(match_record)

    # Innings are numbered from 1 in file order.
    for inning_index, inning_data in enumerate(match_data.get("innings", []), start=1):
        batting_team = inning_data.get("team")
        overs = inning_data.get("overs", [])

        for over_info in overs:
            over_number = over_info.get("over")
            deliveries = over_info.get("deliveries", [])

            # `ball` is the 1-based position of the delivery inside the over
            # as listed in the file.
            for ball_index, ball_data in enumerate(deliveries):
                runs = ball_data.get('runs', {})
                wicket = _first_wicket(ball_data)
                record = {
                    'match_id': match_id,
                    'batting_team': batting_team,
                    'innings': inning_index,
                    'over': int(over_number),
                    'ball': ball_index + 1,
                    'batsman': ball_data.get('batter'),
                    'non_striker': ball_data.get('non_striker'),
                    'bowler': ball_data.get('bowler'),
                    'runs_batsman': runs.get('batter', 0),
                    'runs_extras': runs.get('extras', 0),
                    'runs_total': runs.get('total', 0),
                    'wicket_kind': wicket.get('kind'),
                    'player_out': wicket.get('player_out')
                }
                t20_deliveries.append(record)


# Turn the parsed record lists into tables: one row per match and one row
# per delivery.
t20_deliveries_df = pd.DataFrame(data=t20_deliveries)
t20_matches_df = pd.DataFrame(data=t20_matches)

# Attach the match-level columns to every delivery row. A left join keeps
# every delivery even if no match row shares its match_id.
df_combined = pd.merge(
    left=t20_deliveries_df,
    right=t20_matches_df,
    how='left',
    on='match_id',
)

# ---------------------------------------------------------------------------
# Load the combined delivery/match table into MySQL.
#
# Steps: build the row tuples, open the connection, create the target table
# if it does not exist, then insert the rows in committed chunks.
#
# NOTE: the table has no primary/unique key and rows are added with a plain
# INSERT, so running this cell again appends the same deliveries again.
# ---------------------------------------------------------------------------

create_table_query = '''
CREATE TABLE IF NOT EXISTS t20_match_deliveries (
    match_id VARCHAR(100),
    date DATE,
    venue VARCHAR(255),
    team1 VARCHAR(100),
    team2 VARCHAR(100),
    toss_winner VARCHAR(100),
    toss_decision VARCHAR(10),
    winner VARCHAR(100),
    match_type VARCHAR(10),
    batting_team VARCHAR(100),
    innings INT,
    `over` INT,
    ball INT,
    batsman VARCHAR(100),
    non_striker VARCHAR(100),
    bowler VARCHAR(100),
    runs_batsman INT,
    runs_extras INT,
    runs_total INT,
    wicket_kind VARCHAR(100),
    player_out VARCHAR(100)
)
'''

# Column order must match the column list of insert_query below.
required_columns = [
    'match_id', 'date', 'venue', 'team1', 'team2', 'toss_winner', 'toss_decision',
    'winner', 'match_type', 'batting_team', 'innings', 'over', 'ball', 'batsman',
    'non_striker', 'bowler', 'runs_batsman', 'runs_extras', 'runs_total',
    'wicket_kind', 'player_out'
]

insert_query = '''
INSERT INTO t20_match_deliveries (
    match_id, date, venue, team1, team2, toss_winner, toss_decision, winner,
    match_type, batting_team, innings, `over`, ball, batsman, non_striker, bowler,
    runs_batsman, runs_extras, runs_total, wicket_kind, player_out
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
'''

# Prepare the rows before touching the database so that a failure here
# cannot leave a connection open.
df_combined = df_combined[required_columns]  # Reorder columns
# Replace NaN with None (sent to MySQL as NULL). The cast to object comes
# first because a non-object column cannot hold None: without it pandas may
# coerce the replacement straight back to NaN.
df_combined = df_combined.astype(object).where(df_combined.notna(), None)
data_tuples = list(df_combined.itertuples(index=False, name=None))

chunk_size = 20000

# NOTE(review): host and credentials are hard-coded; consider moving them to
# configuration before sharing this notebook.
connection = mysql.connector.connect(
    host='localhost',
    user='root',
    password='root',
    database='cric_sheet',
    ssl_disabled=True
)
try:
    cursor = connection.cursor()
    try:
        cursor.execute(create_table_query)
        connection.commit()

        # Each chunk is committed on its own, so chunks inserted before a
        # failure stay in the table.
        for i in range(0, len(data_tuples), chunk_size):
            chunk = data_tuples[i:i + chunk_size]
            try:
                cursor.executemany(insert_query, chunk)
                connection.commit()
            except Exception as e:
                # Discard whatever part of the failed chunk is still pending
                # in the open transaction, report, and stop loading.
                if connection.is_connected():
                    connection.rollback()
                print(f"❌ Failed at chunk {i}: {e}")
                break
            else:
                print(f"✅ Inserted records {i} to {i + len(chunk) - 1}")
    finally:
        cursor.close()
finally:
    # Always release the connection, including when CREATE TABLE or an
    # unexpected error interrupts the load.
    connection.close()


✅ Inserted records 0 to 19999
✅ Inserted records 20000 to 39999
✅ Inserted records 40000 to 59999
✅ Inserted records 60000 to 79999
✅ Inserted records 80000 to 99999
✅ Inserted records 100000 to 119999
✅ Inserted records 120000 to 139999
✅ Inserted records 140000 to 159999
✅ Inserted records 160000 to 179999
✅ Inserted records 180000 to 199999
✅ Inserted records 200000 to 219999
✅ Inserted records 220000 to 239999
✅ Inserted records 240000 to 259999
✅ Inserted records 260000 to 279999
✅ Inserted records 280000 to 299999
✅ Inserted records 300000 to 319999
✅ Inserted records 320000 to 339999
✅ Inserted records 340000 to 359999
✅ Inserted records 360000 to 379999
✅ Inserted records 380000 to 399999
✅ Inserted records 400000 to 419999
✅ Inserted records 420000 to 439999
✅ Inserted records 440000 to 459999
✅ Inserted records 460000 to 479999
✅ Inserted records 480000 to 499999
✅ Inserted records 500000 to 519999
✅ Inserted records 520000 to 539999
✅ Inserted records 540000 to 559999
✅ Inse