In [1]:
import duckdb
import glob
import os
import json
import pandas as pd

from tqdm import tqdm
from dateutil import parser
from datetime import datetime

In [2]:
def parse_dates(date_string):
    """Parse a free-form date string with ``dateutil.parser.parse``.

    Parameters
    ----------
    date_string : str
        Text to parse.

    Returns
    -------
    datetime.datetime or pandas.NaT
        The parsed value, or ``pd.NaT`` when the input cannot be parsed.
    """
    try:
        return parser.parse(date_string)
    except (ValueError, TypeError, OverflowError):
        # ValueError: unparseable text (dateutil's ParserError subclasses it).
        # TypeError: non-string input, e.g. the float NaN pandas uses for an empty CSV field.
        # OverflowError: the parsed date does not fit the platform's integer range.
        return pd.NaT  # Not a Time - pandas' equivalent of NaN for datetime

In [3]:
# Shared connection to the on-disk DuckDB database used by the cells in this notebook.
con = duckdb.connect(database="E:/duckdb/tennis.duckdb")

# Betfair - Market summaries

In [47]:
# Read every Betfair market-summary CSV; market_id is kept as text.
market_summ = []
for path in glob.glob("E:/Data/tennis/market-summaries/*"):
    market = pd.read_csv(path, encoding='latin-1', dtype={'market_id':'str'})
    market_summ.append(market)

# ignore_index gives the combined frame unique row labels instead of repeating
# each file's 0..n-1 labels.
market_summ_df = pd.concat(market_summ, ignore_index=True)

# Right-pad ids shorter than 11 characters with zeros. str.ljust leaves longer
# strings and missing values untouched, so no length mask is needed.
# NOTE(review): presumably restores trailing zeros lost upstream - confirm.
market_summ_df['market_id'] = market_summ_df['market_id'].str.ljust(11, '0')

# Unparseable dates become NaT and are dropped by the comparison in the query below.
market_summ_df['event_date'] = market_summ_df['event_date'].apply(parse_dates)
market_summ_df = market_summ_df.query('event_date <= "2098-01-01"').drop(columns='handicap')

# Rebuild the table from scratch on every run.
con.execute("DROP TABLE IF EXISTS market_summaries")
con.execute("CREATE TABLE market_summaries AS SELECT * FROM market_summ_df")

ConnectionException: Connection Error: Connection already closed!

In [64]:
# con.execute("SELECT * FROM market_summaries LIMIT 100").df()

# Betfair - Competition mappings

In [56]:
# Read every competition-mapping CSV and stack the results into one frame.
comp_mappings = [
    pd.read_csv(path)
    for path in glob.glob("E:/Data/tennis/competition-mapping/*")
]
comp_mappings_df = pd.concat(comp_mappings)

# market_id = "1." followed by MARKET_ID as text, zero-filled on the right to 9 characters.
padded_ids = comp_mappings_df['MARKET_ID'].astype(str).str.pad(9, fillchar='0', side='right')
comp_mappings_df['market_id'] = "1." + padded_ids

# The raw MARKET_ID column is superseded by market_id.
comp_mappings_df = comp_mappings_df.drop(columns='MARKET_ID')

# Rebuild the table from scratch on every run.
con.execute("DROP TABLE IF EXISTS competition_mappings")
con.execute("CREATE TABLE competition_mappings AS SELECT * FROM comp_mappings_df")

<duckdb.duckdb.DuckDBPyConnection at 0x1a864ec09f0>

In [66]:
# con.execute("SELECT * FROM competition_mappings LIMIT 100").df()

# Sofascore - Events

In [4]:
# Directory containing the JSON files
directory = r'E:\Data\tennis\sofascore\events'

# List to store DataFrames from each file
dfs = []

# Function to safely select columns
def safe_select_columns(df, columns):
    """Return ``df`` restricted to ``columns``; columns missing from ``df`` are added filled with ``pd.NA``."""
    return df.reindex(columns=columns, fill_value=pd.NA)

# Loop through all JSON files in the directory
for filename in tqdm(os.listdir(directory)):
    if filename.endswith('.json'):
        # The fetch date is the text between the last '_' and the first '.' after it.
        event_fetch_date = filename.split('_')[-1].split('.')[0]

        # Read as UTF-8 explicitly: without `encoding`, open() uses the platform
        # default (cp1252 on Windows), which mis-decodes non-ASCII text.
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
            data = json.load(f)
        events = data['events']
        df = pd.json_normalize(events)

        # Add event_fetch_date column
        df['event_fetch_date'] = event_fetch_date

        dfs.append(df)

# Concatenate all DataFrames
combined_df = pd.concat(dfs, ignore_index=True)

# NOTE: datetime.fromtimestamp returns naive datetimes in this machine's local
# timezone, so the stored values depend on where the notebook is run.
combined_df['datetime'] = combined_df['startTimestamp'].apply(datetime.fromtimestamp)

# Keep one row per event id, preferring the latest fetch.
# NOTE: GroupBy.last() takes the last non-null value of each column, so a field
# that is missing in the newest fetch falls back to the value from an older one.
# event_fetch_date is compared as text - assumes an ISO-like date in the file
# name so that text order matches date order (TODO confirm).
filtered_df = (
    combined_df.sort_values('event_fetch_date')
    .groupby('id')
    .last()
    .reset_index()
)

100%|██████████| 3921/3921 [04:43<00:00, 13.82it/s]
  combined_df['datetime'] = combined_df['startTimestamp'].apply(datetime.fromtimestamp)


In [10]:
# Select the desired columns
selected_columns = [
    'id', 'startTimestamp', 'slug', 'groundType',
    'tournament.uniqueTournament.name', 'tournament.category.name',
    'tournament.uniqueTournament.tennisPoints',
    'tournament.uniqueTournament.hasEventPlayerStatistics',
    'season.name', 'season.year', 'roundInfo.name', 'status.description',
    'homeTeam.name', 'homeTeam.slug', 'homeTeam.shortName', 'homeTeam.country.name',
    'awayTeam.name', 'awayTeam.slug', 'awayTeam.shortName', 'awayTeam.country.name', 'winnerCode',
    'homeScore.period1', 'homeScore.period2', 'homeScore.period3', 'homeScore.period4', 'homeScore.period5',
    'awayScore.period1', 'awayScore.period2', 'awayScore.period3', 'awayScore.period4', 'awayScore.period5',
    'event_fetch_date', 'datetime',
]

# Map the dotted json_normalize names to snake_case column names.
# rename() silently ignores keys that match no column, so every key must match
# the source column exactly, including case.
# NOTE: this cell overwrites filtered_df; re-running it without first re-running
# the cell that builds filtered_df raises KeyError on the already-renamed columns.
filtered_df = filtered_df[selected_columns].rename(columns={
    'tournament.uniqueTournament.name': 'tournament_name',
    'tournament.category.name': 'tournament_category',
    'tournament.uniqueTournament.tennisPoints': 'tournament_points',
    'tournament.uniqueTournament.hasEventPlayerStatistics': 'tournament_has_stats',
    'season.name': 'season_name',
    'season.year': 'season_year',
    'roundInfo.name': 'tournament_round',
    'status.description': 'match_status',
    'homeTeam.name': 'home_team',
    'homeTeam.slug': 'home_team_slug',
    'homeTeam.shortName': 'home_team_short',
    # Fixed: the key was 'hometeam.country.name' (lower-case t), which matched nothing.
    'homeTeam.country.name': 'home_team_country',
    'awayTeam.name': 'away_team',
    'awayTeam.slug': 'away_team_slug',
    'awayTeam.shortName': 'away_team_short',
    'awayTeam.country.name': 'away_team_country',
    'homeScore.period1': 'home_score_period1',
    'homeScore.period2': 'home_score_period2',
    'homeScore.period3': 'home_score_period3',
    'homeScore.period4': 'home_score_period4',
    'homeScore.period5': 'home_score_period5',
    'awayScore.period1': 'away_score_period1',
    'awayScore.period2': 'away_score_period2',
    'awayScore.period3': 'away_score_period3',
    'awayScore.period4': 'away_score_period4',
    'awayScore.period5': 'away_score_period5'
}).drop(columns='startTimestamp')  # superseded by the derived 'datetime' column

# Rebuild the table from scratch on every run.
con.execute("DROP TABLE IF EXISTS sofascore_events")
con.execute("CREATE TABLE sofascore_events AS SELECT * FROM filtered_df")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x298e218da30>

In [11]:
# Read the first 100 rows of the table back as a DataFrame for a visual check.
(
    con
    .execute("SELECT * FROM sofascore_events LIMIT 100")
    .df()
)

Unnamed: 0,id,slug,groundType,tournament_name,tournament_category,tournament_points,tournament_has_stats,season_name,season_year,tournament_round,...,home_score_period3,home_score_period4,home_score_period5,away_score_period1,away_score_period2,away_score_period3,away_score_period4,away_score_period5,event_fetch_date,datetime
0,4493462.0,czech-republic-netherlands,,Davis Cup,Davis Cup,,True,Davis Cup 2010,2010,Round of 16,...,,,,,,,,,2014-02-01,2014-02-01 01:00:00
1,4493464.0,japan-canada,,Davis Cup,Davis Cup,,False,Davis Cup 2010,2010,Final,...,,,,,,,,,2014-01-31,2014-01-31 17:30:00
2,4493466.0,germany-spain,,Davis Cup,Davis Cup,,False,Davis Cup 2010,2010,Quarterfinals,...,,,,,,,,,2014-02-01,2014-02-01 01:15:00
3,4493468.0,australia-france,,Davis Cup,Davis Cup,,True,Davis Cup 2010,2010,Semifinals,...,,,,,,,,,2014-02-01,2014-01-31 23:45:00
4,4493470.0,great-britain-usa,,Davis Cup,Davis Cup,,True,Davis Cup 2010,2010,Final,...,,,,,,,,,2014-02-01,2014-02-01 07:30:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,4843878.0,de-paula-turini,Clay,"Sao Paulo, Brazil Men Singles",Challenger,,False,,,1/16-finals (R32),...,,,,8.0,6.0,,,,2014-01-01,2013-12-31 23:55:00
96,4843884.0,andreozzi-demoliner-ratiwatana-sa-ratiwatana-so,Clay,"Sao Paulo, Brazil Men Double",Challenger,,False,,,1/8-finals (R16),...,10.0,,,7.0,6.0,4.0,,,2014-01-01,2013-12-31 23:50:00
97,4843894.0,alund-pella-kretschmer-satschko,Clay,"Sao Paulo, Brazil Men Double",Challenger,,True,,,1/8-finals (R16),...,,,,,,,,,2014-01-01,2014-01-01 00:15:00
98,4843896.0,gonzalez-arguello-ghem-souza,Clay,"Sao Paulo, Brazil Men Double",Challenger,,False,,,1/8-finals (R16),...,,,,3.0,4.0,,,,2014-01-01,2014-01-01 01:00:00


# Sofascore - Match stats

In [13]:
import os
import json
import glob
import pandas as pd
from tqdm import tqdm

def flatten_json(json_data, match_id):
    """Flatten one match-statistics payload into a list of row dicts.

    Walks ``statistics`` -> ``groups`` -> ``statisticsItems`` and emits one dict
    per statistics item.

    Parameters
    ----------
    json_data : dict
        Parsed JSON payload. A missing ``statistics``, ``groups`` or
        ``statisticsItems`` key is treated as an empty list.
    match_id : str
        Identifier copied into every row.

    Returns
    -------
    list[dict]
        One dict per item. Every dict has the same 14 keys in the same order;
        item fields that are absent default to ``''``, except ``homeTotal`` and
        ``awayTotal`` which default to ``None``.
    """
    rows = []

    for period in json_data.get('statistics', []):
        period_name = period.get('period', '')

        for group in period.get('groups', []):
            group_name = group.get('groupName', '')

            for item in group.get('statisticsItems', []):
                rows.append({
                    'match_id': match_id,
                    'period': period_name,
                    'group': group_name,
                    'name': item.get('name', ''),
                    'home': item.get('home', ''),
                    'away': item.get('away', ''),
                    'compareCode': item.get('compareCode', ''),
                    'statisticsType': item.get('statisticsType', ''),
                    'valueType': item.get('valueType', ''),
                    'homeValue': item.get('homeValue', ''),
                    'awayValue': item.get('awayValue', ''),
                    'key': item.get('key', ''),
                    # Always emitted (None when the item has no total) so that
                    # every batch of rows produces the same DataFrame columns,
                    # whichever items it happens to contain.
                    'homeTotal': item.get('homeTotal'),
                    'awayTotal': item.get('awayTotal'),
                })

    return rows

def process_files(folder_path, con, chunk_size=10000):
    """Load every ``*.json`` file in ``folder_path`` into ``sofascore_match_stats``.

    Files are flattened with ``flatten_json`` and buffered; the buffer is written
    to DuckDB whenever it holds at least ``chunk_size`` rows, and once more at
    the end. The table is dropped and recreated on the first write, so its
    schema comes from a chunk that actually contains rows. If no file yields any
    row, nothing is written and an existing table is left untouched.

    Parameters
    ----------
    folder_path : str
        Directory containing the JSON files.
    con : duckdb connection
        Open connection used to execute the SQL statements.
    chunk_size : int, default 10000
        Minimum number of buffered rows that triggers a write.

    Returns
    -------
    None

    Notes
    -----
    Files that cannot be read or parsed are reported with ``print`` and skipped.
    The match id is the text after the last ``_`` in the file name, without the
    extension.
    """
    # Fixed column list: every chunk gets exactly these columns in this order,
    # so the positional "INSERT ... SELECT *" stays aligned with the table even
    # when a chunk contains no homeTotal/awayTotal values.
    columns = [
        'match_id', 'period', 'group', 'name', 'home', 'away', 'compareCode',
        'statisticsType', 'valueType', 'homeValue', 'awayValue', 'key',
        'homeTotal', 'awayTotal',
    ]
    all_data = []
    table_created = False

    def flush():
        """Write the buffered rows to DuckDB, creating the table on first use."""
        nonlocal table_created
        # The SQL below refers to this local DataFrame by its variable name.
        df = pd.DataFrame(all_data, columns=columns)

        if not table_created:
            con.execute("DROP TABLE IF EXISTS sofascore_match_stats")
            con.execute("CREATE TABLE sofascore_match_stats AS SELECT * FROM df LIMIT 0")
            table_created = True

        con.execute("INSERT INTO sofascore_match_stats SELECT * FROM df")
        all_data.clear()

    for file_path in tqdm(glob.glob(os.path.join(folder_path, '*.json'))):
        match_id = os.path.splitext(os.path.basename(file_path))[0].split('_')[-1]

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
                all_data.extend(flatten_json(json_data, match_id))
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in file {file_path}: {str(e)}")
        except Exception as e:
            print(f"Error processing file {file_path}: {str(e)}")

        # Only write once there are rows to write: an unreadable or empty first
        # file must not be used to define the table.
        if len(all_data) >= chunk_size:
            flush()

    # Write any remaining data
    if all_data:
        flush()

# Usage: folder holding the match-stats JSON files
folder_path = os.path.join(*('E:/', 'Data', 'tennis', 'sofascore', 'match-stats'))

# Load the files into DuckDB, inserting in chunks
process_files(folder_path=folder_path, con=con)

  2%|▏         | 12212/656787 [00:06<05:25, 1979.07it/s]

Error decoding JSON in file E:/Data\tennis\sofascore\match-stats\match_stat_10182423.json: Expecting value: line 1 column 1 (char 0)


 51%|█████     | 332249/656787 [04:18<04:36, 1175.78it/s]

Error decoding JSON in file E:/Data\tennis\sofascore\match-stats\match_stat_7051288.json: Expecting value: line 1 column 1 (char 0)


 89%|████████▉ | 585916/656787 [07:58<01:07, 1054.05it/s]

Error decoding JSON in file E:/Data\tennis\sofascore\match-stats\match_stat_8570059.json: Expecting value: line 1 column 1 (char 0)


100%|██████████| 656787/656787 [09:00<00:00, 1215.00it/s]


# Sofascore - Point by point

In [21]:
# Read the first 100 rows of the match-stats table back as a DataFrame for a visual check.
(
    con
    .execute("SELECT * FROM sofascore_match_stats LIMIT 100")
    .df()
)

Unnamed: 0,match_id,period,group,name,home,away,compareCode,statisticsType,valueType,homeValue,awayValue,key,homeTotal,awayTotal
0,10000150,ALL,Service,Aces,1,4,2,positive,event,0,2,aces,,
1,10000150,ALL,Service,Double faults,2,5,2,negative,event,0,2,doubleFaults,,
2,10000150,ALL,Service,First serve,26/47 (55%),32/53 (60%),2,positive,team,42,32,firstServeAccuracy,47.0,53.0
3,10000150,ALL,Service,Second serve,19/21 (90%),16/21 (76%),1,positive,team,22,2,secondServeAccuracy,21.0,21.0
4,10000150,ALL,Service,First serve points,16/26 (62%),26/32 (81%),2,positive,team,22,8,firstServePointsAccuracy,26.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,10000152,2ND,Service,Service games played,4,4,3,positive,event,2,6,serviceGamesTotal,,
96,10000152,2ND,Service,Break points saved,2/3 (66%),1/4 (25%),1,positive,team,2,0,breakPointsSaved,3.0,4.0
97,10000152,2ND,Points,Service points won,13,9,1,positive,event,10,16,servicePointsScored,,
98,10000152,2ND,Points,Receiver points won,14,8,1,positive,event,14,14,receiverPointsScored,,


In [13]:
def flatten_json(json_data, match_id):
    """Flatten one point-by-point payload into a list of row dicts.

    For every game of every set under ``pointByPoint`` this emits one row per
    point, followed by one extra row carrying the game score when the game has a
    truthy ``score`` entry.

    Parameters
    ----------
    json_data : dict
        Parsed JSON payload.
    match_id : str
        Identifier copied into every row.

    Returns
    -------
    list[dict]
        Empty when ``pointByPoint`` is missing or empty. Point rows fall back to
        ``'Unknown'`` for absent fields. Game-score rows use ``'GAME'`` for both
        point columns and ``-1`` as ``pointDescription``, store the home/away
        score in ``homePointType``/``awayPointType``, and additionally carry
        ``serving`` and ``scoring``.
    """
    has_points = 'pointByPoint' in json_data and json_data['pointByPoint']
    if not has_points:
        return []

    flattened = []
    for set_entry in json_data['pointByPoint']:
        set_no = set_entry.get('set', 'Unknown')

        for game_entry in set_entry.get('games', []):
            game_no = game_entry.get('game', 'Unknown')
            # Fields shared by every row of this game, kept first in key order.
            shared = {'match_id': match_id, 'set': set_no, 'game': game_no}

            flattened.extend(
                {
                    **shared,
                    'homePoint': pt.get('homePoint', 'Unknown'),
                    'awayPoint': pt.get('awayPoint', 'Unknown'),
                    'pointDescription': pt.get('pointDescription', 'Unknown'),
                    'homePointType': pt.get('homePointType', 'Unknown'),
                    'awayPointType': pt.get('awayPointType', 'Unknown'),
                }
                for pt in game_entry.get('points', [])
            )

            game_score = game_entry.get('score')
            if not game_score:
                continue

            # Marker row for the game result; -1 flags it as a game score.
            flattened.append({
                **shared,
                'homePoint': 'GAME',
                'awayPoint': 'GAME',
                'pointDescription': -1,
                'homePointType': game_score.get('homeScore', 'Unknown'),
                'awayPointType': game_score.get('awayScore', 'Unknown'),
                'serving': game_score.get('serving', 'Unknown'),
                'scoring': game_score.get('scoring', 'Unknown'),
            })

    return flattened

def process_files(folder_path, start=450000):
    """Flatten the ``*.json`` files in ``folder_path`` into one DataFrame.

    Parameters
    ----------
    folder_path : str
        Directory containing the JSON files.
    start : int, default 450000
        Index into the globbed file list at which processing begins; files
        before it are skipped. The default keeps the offset that was previously
        hard-coded. Pass ``0`` to process every file.

    Returns
    -------
    pandas.DataFrame
        One row per dict returned by ``flatten_json`` across all processed files.

    Notes
    -----
    Files that cannot be read or parsed are reported with ``print`` and skipped.
    The match id is the text after the last ``_`` in the file name, without the
    extension.
    """
    all_data = []

    # NOTE: glob.glob makes no ordering guarantee, so which files fall before
    # `start` depends on the order the filesystem lists them in.
    file_paths = glob.glob(os.path.join(folder_path, '*.json'))[start:]

    for file_path in tqdm(file_paths):
        match_id = os.path.splitext(os.path.basename(file_path))[0].split('_')[-1]

        try:
            # Read as UTF-8 explicitly: without `encoding`, open() uses the
            # platform default (cp1252 on Windows).
            with open(file_path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
                all_data.extend(flatten_json(json_data, match_id))
        except json.JSONDecodeError:
            print(f"Warning: Unable to parse JSON in file {file_path}")
        except Exception as e:
            print(f"Error processing file {file_path}: {str(e)}")

    return pd.DataFrame(all_data)

# Usage
folder_path = os.path.join('E:/', 'Data', 'tennis', 'sofascore', 'point-by-point-itf')
df = process_files(folder_path)

# Rows are appended to the existing table rather than replacing it.
# Uncomment the DROP to rebuild the table from scratch.
# con.execute("DROP TABLE IF EXISTS sofascore_point_by_point")

# Create the table (empty, with df's columns) when it does not exist yet, so the
# INSERT below also works against a fresh database file.
con.execute("CREATE TABLE IF NOT EXISTS sofascore_point_by_point AS SELECT * FROM df LIMIT 0")
con.execute("INSERT INTO sofascore_point_by_point SELECT * FROM df")

  8%|▊         | 9381/112517 [00:02<00:32, 3147.53it/s]



 60%|██████    | 67540/112517 [00:21<00:13, 3245.30it/s]



 99%|█████████▉| 111429/112517 [00:35<00:00, 2963.49it/s]



100%|██████████| 112517/112517 [00:35<00:00, 3134.54it/s]


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x1fd1bd0a2b0>

In [16]:
# Close the DuckDB connection. Any con.execute(...) issued after this fails with
# "ConnectionException: Connection Error: Connection already closed!" until a new
# connection is opened with duckdb.connect(...).
con.close()

In [15]:
# Read the first 100 rows of the point-by-point table back as a DataFrame for a visual check.
(
    con
    .execute("SELECT * FROM sofascore_point_by_point LIMIT 100")
    .df()
)

Unnamed: 0,match_id,set,game,homePoint,awayPoint,pointDescription,homePointType,awayPointType,serving,scoring
0,10000273,3,8,0,15,0,5,1,,
1,10000273,3,8,0,30,0,5,1,,
2,10000273,3,8,0,40,0,5,3,,
3,10000273,3,8,GAME,GAME,-1,2,6,2.0,2.0
4,10000273,3,7,0,15,0,5,1,,
...,...,...,...,...,...,...,...,...,...,...
95,10000273,1,13,1,0,0,1,5,,
96,10000273,1,13,1,1,0,5,1,,
97,10000273,1,13,2,1,0,6,5,,
98,10000273,1,13,3,1,0,1,5,,
