In [1]:
import json
import csv
import re
import os
import pandas as pd
import numpy as np

In [2]:
def extract_lines_with_features(file_path):
    """Collect the index lines of interest from a text file.

    A line is kept when it contains both 'international' and 'Test', does
    not contain 'female', and mentions 'India' or 'Australia'.  For every
    kept line that also matches the dated
    ``<date> - international - Test - male - <digits> -`` layout, the digit
    group is collected as well.

    Args:
        file_path: Path of the text file to scan.

    Returns:
        A ``(lines, values)`` tuple: the kept lines with surrounding
        whitespace stripped, and the captured digit strings.  ``values`` can
        be shorter than ``lines`` because a kept line is not required to
        match the pattern.

    Raises:
        ValueError: If ``file_path`` is not an existing regular file.
    """
    if not os.path.isfile(file_path):
        raise ValueError(f"The path {file_path} is not a file or doesn't exist.")

    id_pattern = re.compile(r'\d{4}-\d{2}-\d{2} - international - Test - male - (\d+) -')
    required_tokens = ('international', 'Test')
    team_tokens = ('India', 'Australia')

    kept_lines = []
    captured_ids = []

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            # Guard clauses: drop the line as soon as one criterion fails.
            if 'female' in raw_line:
                continue
            if not all(token in raw_line for token in required_tokens):
                continue
            if not any(team in raw_line for team in team_tokens):
                continue

            kept_lines.append(raw_line.strip())

            found = id_pattern.search(raw_line)
            if found is not None:
                captured_ids.append(found.group(1))

    return kept_lines, captured_ids


# Scan the index file and keep the captured ids for the conversion step.
file_path = 'Readme.txt'
print(file_path)

# Fail loudly when the index file is unusable.  Previously the extraction was
# silently skipped, so `values` was either undefined (NameError on a fresh
# kernel) or left over from an earlier run (hidden state).
if not (os.path.exists(file_path) and os.access(file_path, os.R_OK)):
    raise FileNotFoundError(
        f"{file_path} does not exist or is not readable; cannot extract match ids."
    )

lines, values = extract_lines_with_features(file_path)
print(len(values))

Readme.txt
368


In [3]:
def json_to_csv(files,
                json_dir=os.path.join('data', 'tests_json'),
                ball_csv_path=os.path.join('data', 'ball_by_ball.csv'),
                match_csv_path=os.path.join('data', 'matches.csv')):
    """Convert match JSON files into a ball-by-ball CSV and a per-match CSV.

    Each name in ``files`` is resolved to ``<json_dir>/<name>.json``.  Every
    file is parsed once; its deliveries are written to ``ball_csv_path`` and
    its ``info`` section is written as a single row to ``match_csv_path``.

    Both CSV files are rewritten from scratch on every call, so re-running
    the function produces exactly one header row and no duplicated data.

    Args:
        files: Iterable of match file names without the ``.json`` extension.
            Each name is also stored in the ``match_id`` column.
        json_dir: Directory that contains the JSON files.
        ball_csv_path: Destination of the ball-by-ball CSV.
        match_csv_path: Destination of the per-match CSV.

    Raises:
        OSError: If a JSON file cannot be opened or a CSV cannot be written.
        json.JSONDecodeError: If a JSON file is malformed.
    """
    fieldnames_ball = [
        'team', 'over', 'batter', 'bowler', 'non_striker', 'batter_runs', 'extras_runs', 'total_runs',
        'wicket_player_out', 'wicket_kind', 'wicket_fielders', 'match_id']
    fieldnames_match = [
        'balls_per_over', 'city', 'dates', 'event_name', 'match_number', 'gender', 'match_type', 'venue',
        'season', 'team_type', 'toss_decision', 'toss_winner', 'outcome_result', 'player_of_match', 'teams',
        'players_team1', 'players_team2', 'match_referees', 'tv_umpires', 'umpires', 'match_id']

    def load_match(name):
        """Parse the JSON document that belongs to ``name``."""
        json_path = os.path.join(json_dir, f'{name}.json')
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(json_path, 'r', encoding='utf-8') as json_file:
            return json.load(json_file)

    def make_dict(name, data):
        """Flatten innings -> overs -> deliveries into one dict per delivery."""
        ball_by_ball_data = []
        for inning in data.get('innings', []):
            team = inning.get('team', 'Unknown')
            for over in inning.get('overs', []):
                over_number = over.get('over', -1)
                for delivery in over.get('deliveries', []):
                    runs = delivery.get('runs', {})
                    ball = {
                        'team': team,
                        'over': over_number,
                        'batter': delivery.get('batter', 'Unknown'),
                        'bowler': delivery.get('bowler', 'Unknown'),
                        'non_striker': delivery.get('non_striker', 'Unknown'),
                        'batter_runs': runs.get('batter', 0),
                        'extras_runs': runs.get('extras', 0),
                        'total_runs': runs.get('total', 0),
                        'wicket_player_out': None,
                        'wicket_kind': None,
                        'wicket_fielders': None,
                        'match_id': name,
                    }

                    # Truthiness check (not just key presence) so an empty
                    # 'wickets' list cannot raise IndexError.  Only the first
                    # wicket of a delivery is recorded.
                    wickets = delivery.get('wickets')
                    if wickets:
                        wicket = wickets[0]
                        ball['wicket_player_out'] = wicket.get('player_out', None)
                        ball['wicket_kind'] = wicket.get('kind', None)
                        ball['wicket_fielders'] = ', '.join(
                            fielder.get('name', '') for fielder in wicket.get('fielders', [])
                        )

                    ball_by_ball_data.append(ball)
        return ball_by_ball_data

    def match_info(name, data):
        """Build the single per-match row from the ``info`` section."""
        info = data.get('info', {})
        event = info.get('event', {})
        toss = info.get('toss', {})
        players = info.get('players', {})
        officials = info.get('officials', {})

        # A missing or short 'teams' list yields empty columns instead of a
        # TypeError / IndexError.
        teams = info.get('teams') or []
        team1 = teams[0] if len(teams) > 0 else None
        team2 = teams[1] if len(teams) > 1 else None

        return {
            'balls_per_over': info.get('balls_per_over', 0),
            'city': info.get('city', 'Unknown'),
            'dates': ', '.join(info.get('dates', [])),
            'event_name': event.get('name', 'Unknown'),
            'match_number': event.get('match_number', 'Unknown'),
            'gender': info.get('gender', 'male'),
            'match_type': info.get('match_type', 'Test'),
            'venue': info.get('venue', 'Unknown'),
            'season': info.get('season', 'Unknown'),
            'team_type': info.get('team_type', 'international'),
            'toss_decision': toss.get('decision', 'Unknown'),
            'toss_winner': toss.get('winner', 'Unknown'),
            'outcome_result': info.get('outcome', {}).get('result', 'Unknown'),
            'player_of_match': ', '.join(info.get('player_of_match', [])),
            'teams': ', '.join(teams),
            'players_team1': ', '.join(players.get(team1, []) if team1 is not None else []),
            'players_team2': ', '.join(players.get(team2, []) if team2 is not None else []),
            'match_referees': ', '.join(officials.get('match_referees', [])),
            'tv_umpires': ', '.join(officials.get('tv_umpires', [])),
            'umpires': ', '.join(officials.get('umpires', [])),
            'match_id': name,
        }

    # 'w' (not 'a'): the header is written unconditionally, so appending would
    # add a header row and a full copy of the data on every re-run.
    with open(ball_csv_path, 'w', newline='', encoding='utf-8') as ball_file, \
            open(match_csv_path, 'w', newline='', encoding='utf-8') as match_file:
        ball_writer = csv.DictWriter(ball_file, fieldnames=fieldnames_ball)
        match_writer = csv.DictWriter(match_file, fieldnames=fieldnames_match)
        ball_writer.writeheader()
        match_writer.writeheader()

        for file in files:
            data = load_match(file)  # parse each JSON once and feed both CSVs
            ball_writer.writerows(make_dict(file, data))
            match_writer.writerow(match_info(file, data))

    print("files have been converted to csv")

# Convert every id captured from Readme.txt in the previous cell; each id is
# used both as the JSON file name and as the `match_id` column value.
json_to_csv(values)

files have been converted to csv


In [30]:
# Load the ball-by-ball CSV written by json_to_csv and sanity-check it.
# os.path.join keeps the path portable (the literal 'data\\...' only works on Windows).
df = pd.read_csv(os.path.join('data', 'ball_by_ball.csv'))

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.info()

# Number of distinct matches present in the file.
print(len(df['match_id'].unique()))

# Only the last expression of a cell is displayed, so the preview goes last.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 722735 entries, 0 to 722734
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   team               722735 non-null  object
 1   over               722735 non-null  int64 
 2   batter             722735 non-null  object
 3   bowler             722735 non-null  object
 4   non_striker        722735 non-null  object
 5   batter_runs        722735 non-null  int64 
 6   extras_runs        722735 non-null  int64 
 7   total_runs         722735 non-null  int64 
 8   wicket_player_out  11883 non-null   object
 9   wicket_kind        11883 non-null   object
 10  wicket_fielders    7480 non-null    object
 11  match_id           722735 non-null  object
dtypes: int64(4), object(8)
memory usage: 66.2+ MB
None
368


In [5]:
# Export the match-level metadata of one match file to its own CSV.
# json, csv and os come from the notebook's import cell at the top, so they
# are not re-imported here.

# Load the JSON file
file_path = os.path.join('data', 'tests_json', '63963.json')  # Replace with your JSON file path
# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with open(file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Resolve each nested section once instead of repeating the .get() chain.
meta = data.get('meta', {})
info = data.get('info', {})
event = info.get('event', {})
toss = info.get('toss', {})
players = info.get('players', {})
officials = info.get('officials', {})

# Extract match-related information
match_info = {
    'data_version': meta.get('data_version', 'Unknown'),
    'created': meta.get('created', 'Unknown'),
    'revision': meta.get('revision', 'Unknown'),
    'balls_per_over': info.get('balls_per_over', 0),
    'city': info.get('city', 'Unknown'),
    'dates': ', '.join(info.get('dates', [])),
    'event_name': event.get('name', 'Unknown'),
    'match_number': event.get('match_number', 'Unknown'),
    'gender': info.get('gender', 'Unknown'),
    'match_type': info.get('match_type', 'Unknown'),
    'venue': info.get('venue', 'Unknown'),
    'season': info.get('season', 'Unknown'),
    'team_type': info.get('team_type', 'Unknown'),
    'toss_decision': toss.get('decision', 'Unknown'),
    'toss_winner': toss.get('winner', 'Unknown'),
    'outcome_result': info.get('outcome', {}).get('result', 'Unknown'),
    'player_of_match': ', '.join(info.get('player_of_match', [])),
    'teams': ', '.join(info.get('teams', [])),
    # NOTE(review): the two squad columns are hard-coded to 'England' and
    # 'India'; they stay empty for any other fixture -- confirm this file is
    # an England v India match before reusing the cell for another id.
    'players_England': ', '.join(players.get('England', [])),
    'players_India': ', '.join(players.get('India', [])),
    'match_referees': ', '.join(officials.get('match_referees', [])),
    'tv_umpires': ', '.join(officials.get('tv_umpires', [])),
    'umpires': ', '.join(officials.get('umpires', []))
}

# Define CSV output path
output_csv_path = os.path.join('data', 'match_info.csv')  # Replace with your desired CSV output path

# Write the extracted data to CSV
with open(output_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
    fieldnames = list(match_info)  # Use all the keys from the match_info dictionary, in insertion order
    csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    csv_writer.writeheader()
    csv_writer.writerow(match_info)

print(f"Match-related information has been saved to {output_csv_path}")


Match-related information has been saved to data\match_info.csv
