In [1]:
import duckdb
import glob
import os
import json
import pandas as pd

from tqdm import tqdm
from dateutil import parser
from datetime import datetime

In [2]:
def parse_dates(date_string):
    """Parse a date string with dateutil, falling back to ``pd.NaT``.

    Parameters
    ----------
    date_string : str
        Raw date text. Non-string values (for example the float NaN pandas
        produces for an empty CSV cell) are accepted and mapped to ``pd.NaT``.

    Returns
    -------
    datetime.datetime or pandas.NaT
        The parsed datetime, or ``pd.NaT`` when the value cannot be parsed.
    """
    try:
        return parser.parse(date_string)
    except (ValueError, TypeError, OverflowError):
        # ValueError: text that is not a date (dateutil's ParserError subclasses it).
        # TypeError: input that is not a string or character stream.
        # OverflowError: a numeric field too large to convert.
        return pd.NaT  # Not a Time - pandas' equivalent of NaN for datetime

In [3]:
# Shared DuckDB connection used by every `con.execute(...)` call in this notebook.
con = duckdb.connect("E:/duckdb/tennis.duckdb")

# Betfair - Market summaries

In [47]:
# Read every Betfair market-summary CSV. market_id is loaded as text so it can be
# right-padded below (presumably restoring trailing zeros lost upstream - confirm).
market_summ = [
    pd.read_csv(path, encoding='latin-1', dtype={'market_id': 'str'})
    for path in glob.glob("E:/Data/tennis/market-summaries/*")
]

# ignore_index gives the combined frame a unique RangeIndex instead of repeating
# each file's own 0..n row labels.
market_summ_df = pd.concat(market_summ, ignore_index=True)

# Right-pad ids with '0' up to 11 characters. Ids that are already 11+ characters
# long and missing ids are left unchanged by str.ljust.
market_summ_df['market_id'] = market_summ_df['market_id'].str.ljust(11, '0')

# Parse the raw event_date text; values that cannot be parsed become NaT.
market_summ_df['event_date'] = market_summ_df['event_date'].apply(parse_dates)

# Keep events dated on or before 2098-01-01 and drop the 'handicap' column.
market_summ_df = market_summ_df.query('event_date <= "2098-01-01"').drop(columns='handicap')

# Rebuild the table from scratch so re-running the cell does not duplicate rows.
con.execute("DROP TABLE IF EXISTS market_summaries")
con.execute("CREATE TABLE market_summaries AS SELECT * FROM market_summ_df")

ConnectionException: Connection Error: Connection already closed!

In [64]:
# con.execute("SELECT * FROM market_summaries LIMIT 100").df()

# Betfair - Competition mappings

In [56]:
# Read every competition-mapping CSV and stack the frames.
comp_mappings = [
    pd.read_csv(csv_path)
    for csv_path in glob.glob("E:/Data/tennis/competition-mapping/*")
]
comp_mappings_df = pd.concat(comp_mappings)

# Build the "1."-prefixed market id: MARKET_ID rendered as text and right-padded
# with '0' to nine characters.
padded_market_ids = (
    comp_mappings_df['MARKET_ID']
    .astype(str)
    .str.pad(9, fillchar='0', side='right')
)
comp_mappings_df['market_id'] = "1." + padded_market_ids

# The raw MARKET_ID column is replaced by market_id.
comp_mappings_df = comp_mappings_df.drop(columns='MARKET_ID')

# Rebuild the table from the combined frame.
con.execute("DROP TABLE IF EXISTS competition_mappings")
con.execute("CREATE TABLE competition_mappings AS SELECT * FROM comp_mappings_df")

<duckdb.duckdb.DuckDBPyConnection at 0x1a864ec09f0>

In [66]:
# con.execute("SELECT * FROM competition_mappings LIMIT 100").df()

# Sofascore - Events

In [4]:
# Directory containing the Sofascore event JSON files; the loading loop below
# reads every '*.json' file in it.
directory = r'E:\Data\tennis\sofascore\events'

# Accumulator: one normalized DataFrame per JSON file, concatenated after the loop.
dfs = []

# Function to safely select columns
def safe_select_columns(df, columns):
    """Return ``df`` restricted to ``columns``, tolerating missing ones.

    Columns absent from ``df`` are created and filled with ``pd.NA`` instead of
    raising ``KeyError``; the result follows the order given in ``columns``.
    """
    selected = df.reindex(columns=columns, fill_value=pd.NA)
    return selected

# Loop through all JSON files in the directory
for filename in tqdm(os.listdir(directory)):
    if not filename.endswith('.json'):
        continue

    # The fetch date is the last '_'-separated token of the file name, minus
    # everything from the first '.' onwards.
    event_fetch_date = filename.split('_')[-1].split('.')[0]

    # Read explicitly as UTF-8 (as the match-stats loader does) instead of
    # relying on the platform's default text encoding.
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Flatten the nested event objects into dotted column names.
    df = pd.json_normalize(data['events'])

    # Add event_fetch_date column
    df['event_fetch_date'] = event_fetch_date

    dfs.append(df)

# Concatenate all DataFrames
combined_df = pd.concat(dfs, ignore_index=True)

# NOTE: datetime.fromtimestamp converts to the local timezone of the machine
# running the notebook, so this column differs between machines.
combined_df['datetime'] = combined_df['startTimestamp'].apply(datetime.fromtimestamp)

# Collapse to one row per event id, preferring the most recent fetch.
# NOTE(review): GroupBy.last() takes the last non-null value of each column
# separately, so a null in the newest fetch is filled from an older fetch -
# confirm this is intended rather than "keep the newest row as-is".
filtered_df = (
    combined_df.sort_values('event_fetch_date')
    .groupby('id')
    .last()
    .reset_index()
)

100%|██████████| 3921/3921 [04:43<00:00, 13.82it/s]
  combined_df['datetime'] = combined_df['startTimestamp'].apply(datetime.fromtimestamp)


In [10]:
# Select the desired columns (dotted names come from pd.json_normalize).
selected_columns = [
    'id', 'startTimestamp', 'slug', 'groundType',
    'tournament.uniqueTournament.name', 'tournament.category.name',
    'tournament.uniqueTournament.tennisPoints',
    'tournament.uniqueTournament.hasEventPlayerStatistics',
    'season.name', 'season.year', 'roundInfo.name', 'status.description',
    'homeTeam.name', 'homeTeam.slug', 'homeTeam.shortName', 'homeTeam.country.name',
    'awayTeam.name', 'awayTeam.slug', 'awayTeam.shortName', 'awayTeam.country.name',
    'winnerCode',
    'homeScore.period1', 'homeScore.period2', 'homeScore.period3', 'homeScore.period4', 'homeScore.period5',
    'awayScore.period1', 'awayScore.period2', 'awayScore.period3', 'awayScore.period4', 'awayScore.period5',
    'event_fetch_date', 'datetime',
]

# NOTE: this overwrites filtered_df, so the cell is not re-runnable on its own;
# re-run the event loading cell first if it has to be executed again.
filtered_df = filtered_df[selected_columns].rename(columns={
    'tournament.uniqueTournament.name': 'tournament_name',
    'tournament.category.name': 'tournament_category',
    'tournament.uniqueTournament.tennisPoints': 'tournament_points',
    'tournament.uniqueTournament.hasEventPlayerStatistics': 'tournament_has_stats',
    'season.name': 'season_name',
    'season.year': 'season_year',
    'roundInfo.name': 'tournament_round',
    'status.description': 'match_status',
    'homeTeam.name': 'home_team',
    'homeTeam.slug': 'home_team_slug',
    'homeTeam.shortName': 'home_team_short',
    # Fixed: the key was spelled 'hometeam.country.name', which matched no column,
    # so the home country was stored under its raw dotted name.
    'homeTeam.country.name': 'home_team_country',
    'awayTeam.name': 'away_team',
    'awayTeam.slug': 'away_team_slug',
    'awayTeam.shortName': 'away_team_short',
    'awayTeam.country.name': 'away_team_country',
    'homeScore.period1': 'home_score_period1',
    'homeScore.period2': 'home_score_period2',
    'homeScore.period3': 'home_score_period3',
    'homeScore.period4': 'home_score_period4',
    'homeScore.period5': 'home_score_period5',
    'awayScore.period1': 'away_score_period1',
    'awayScore.period2': 'away_score_period2',
    'awayScore.period3': 'away_score_period3',
    'awayScore.period4': 'away_score_period4',
    'awayScore.period5': 'away_score_period5'
}).drop(columns='startTimestamp')  # the raw epoch is superseded by 'datetime'

# Rebuild the events table from the cleaned frame.
con.execute("DROP TABLE IF EXISTS sofascore_events")
con.execute("CREATE TABLE sofascore_events AS SELECT * FROM filtered_df")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x298e218da30>

In [11]:
# Preview up to 100 rows of the stored events table as a DataFrame.
con.execute("SELECT * FROM sofascore_events LIMIT 100").df()

Unnamed: 0,id,slug,groundType,tournament_name,tournament_category,tournament_points,tournament_has_stats,season_name,season_year,tournament_round,...,home_score_period3,home_score_period4,home_score_period5,away_score_period1,away_score_period2,away_score_period3,away_score_period4,away_score_period5,event_fetch_date,datetime
0,4493462.0,czech-republic-netherlands,,Davis Cup,Davis Cup,,True,Davis Cup 2010,2010,Round of 16,...,,,,,,,,,2014-02-01,2014-02-01 01:00:00
1,4493464.0,japan-canada,,Davis Cup,Davis Cup,,False,Davis Cup 2010,2010,Final,...,,,,,,,,,2014-01-31,2014-01-31 17:30:00
2,4493466.0,germany-spain,,Davis Cup,Davis Cup,,False,Davis Cup 2010,2010,Quarterfinals,...,,,,,,,,,2014-02-01,2014-02-01 01:15:00
3,4493468.0,australia-france,,Davis Cup,Davis Cup,,True,Davis Cup 2010,2010,Semifinals,...,,,,,,,,,2014-02-01,2014-01-31 23:45:00
4,4493470.0,great-britain-usa,,Davis Cup,Davis Cup,,True,Davis Cup 2010,2010,Final,...,,,,,,,,,2014-02-01,2014-02-01 07:30:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,4843878.0,de-paula-turini,Clay,"Sao Paulo, Brazil Men Singles",Challenger,,False,,,1/16-finals (R32),...,,,,8.0,6.0,,,,2014-01-01,2013-12-31 23:55:00
96,4843884.0,andreozzi-demoliner-ratiwatana-sa-ratiwatana-so,Clay,"Sao Paulo, Brazil Men Double",Challenger,,False,,,1/8-finals (R16),...,10.0,,,7.0,6.0,4.0,,,2014-01-01,2013-12-31 23:50:00
97,4843894.0,alund-pella-kretschmer-satschko,Clay,"Sao Paulo, Brazil Men Double",Challenger,,True,,,1/8-finals (R16),...,,,,,,,,,2014-01-01,2014-01-01 00:15:00
98,4843896.0,gonzalez-arguello-ghem-souza,Clay,"Sao Paulo, Brazil Men Double",Challenger,,False,,,1/8-finals (R16),...,,,,3.0,4.0,,,,2014-01-01,2014-01-01 01:00:00


# Sofascore - Match stats

In [4]:
import os
import json
import glob
import pandas as pd
from tqdm import tqdm

def flatten_json(json_data, match_id):
    """Flatten one match-statistics payload into a list of row dicts.

    The payload is walked as ``statistics -> groups -> statisticsItems`` and one
    row is produced per statistics item.

    Parameters
    ----------
    json_data : dict
        Parsed JSON document; a missing ``'statistics'`` key yields no rows.
    match_id
        Identifier copied into every row's ``'match_id'`` field.

    Returns
    -------
    list[dict]
        Rows that all share the same keys, in the same order. Missing text fields
        default to ``''``; ``'homeTotal'`` / ``'awayTotal'`` default to ``None``.
    """
    rows = []

    for period in json_data.get('statistics', []):
        period_name = period.get('period', '')

        for group in period.get('groups', []):
            group_name = group.get('groupName', '')

            for item in group.get('statisticsItems', []):
                rows.append({
                    'match_id': match_id,
                    'period': period_name,
                    'group': group_name,
                    'name': item.get('name', ''),
                    'home': item.get('home', ''),
                    'away': item.get('away', ''),
                    'compareCode': item.get('compareCode', ''),
                    'statisticsType': item.get('statisticsType', ''),
                    'valueType': item.get('valueType', ''),
                    'homeValue': item.get('homeValue', ''),
                    'awayValue': item.get('awayValue', ''),
                    'key': item.get('key', ''),
                    # Always emitted (None when absent) so every chunk built from
                    # these rows has the same columns; otherwise a chunk without
                    # any totals would not match the table on INSERT ... SELECT *.
                    'homeTotal': item.get('homeTotal'),
                    'awayTotal': item.get('awayTotal'),
                })

    return rows

# Column order of sofascore_match_stats; every chunk is aligned to it so that
# "INSERT ... SELECT *" always sees the same columns in the same positions.
_MATCH_STATS_COLUMNS = [
    'match_id', 'period', 'group', 'name', 'home', 'away', 'compareCode',
    'statisticsType', 'valueType', 'homeValue', 'awayValue', 'key',
    'homeTotal', 'awayTotal',
]


def _insert_match_stats(con, rows):
    """Append ``rows`` to sofascore_match_stats, creating the table if it is missing."""
    # `df` must be a local of the frame that calls con.execute so DuckDB can
    # resolve the name used in the SQL text.
    df = pd.DataFrame(rows).reindex(columns=_MATCH_STATS_COLUMNS)
    con.execute("CREATE TABLE IF NOT EXISTS sofascore_match_stats AS SELECT * FROM df LIMIT 0")
    con.execute("INSERT INTO sofascore_match_stats SELECT * FROM df")


def process_files(folder_path, con, chunk_size=10000, replace=False):
    """Load every ``*.json`` match-stats file in a folder into DuckDB in chunks.

    The match id is the last '_'-separated token of each file's stem. Rows are
    buffered and written to ``sofascore_match_stats`` once at least
    ``chunk_size`` rows are pending, plus a final write for the remainder.

    The previous version initialised ``table_created = True``, which made its
    table-creation branch unreachable, so it only worked when the table already
    existed. The table is now created on demand and existing rows are kept.

    Parameters
    ----------
    folder_path : str
        Folder containing the JSON files.
    con
        Open DuckDB connection.
    chunk_size : int, default 10000
        Minimum number of buffered rows that triggers a write.
    replace : bool, default False
        When True, drop ``sofascore_match_stats`` first so the load starts from
        an empty table. When False, rows are appended.

    Returns
    -------
    None
    """
    all_data = []

    if replace:
        con.execute("DROP TABLE IF EXISTS sofascore_match_stats")

    for file_path in tqdm(glob.glob(os.path.join(folder_path, '*.json'))):
        match_id = os.path.splitext(os.path.basename(file_path))[0].split('_')[-1]

        # Best effort: a broken file is reported and skipped, not fatal.
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
                all_data.extend(flatten_json(json_data, match_id))
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in file {file_path}: {str(e)}")
        except Exception as e:
            print(f"Error processing file {file_path}: {str(e)}")

        # Flush once enough rows are buffered to keep memory bounded.
        if len(all_data) >= chunk_size:
            _insert_match_stats(con, all_data)
            all_data.clear()

    # Write any remaining data
    if all_data:
        _insert_match_stats(con, all_data)

# Usage: folder holding the Sofascore match-stats JSON files
folder_path = os.path.join('E:/', 'Data', 'tennis', 'sofascore', 'match-stats')

# Process files and insert data in chunks (uses the shared `con` connection)
process_files(folder_path, con)

100%|██████████| 5335/5335 [00:03<00:00, 1684.80it/s]


# Sofascore - Point by point

In [7]:
def flatten_json(json_data, match_id):
    """Flatten one point-by-point payload into a list of row dicts.

    One row is emitted per point. For every game that carries a truthy
    ``'score'`` entry, an extra marker row follows its points with
    ``homePoint``/``awayPoint`` set to ``'GAME'``, ``pointDescription`` set to
    ``-1``, the game score stored in ``homePointType``/``awayPointType`` and the
    additional ``serving``/``scoring`` fields. Missing values default to
    ``'Unknown'``. A missing or empty ``'pointByPoint'`` entry gives ``[]``.
    """
    if not ('pointByPoint' in json_data and json_data['pointByPoint']):
        # print(f"Warning: No point-by-point data found for match {match_id}")
        return []

    flattened = []

    for set_entry in json_data['pointByPoint']:
        set_no = set_entry.get('set', 'Unknown')

        for game_entry in set_entry.get('games', []):
            # Fields shared by every row of this game.
            shared = {
                'match_id': match_id,
                'set': set_no,
                'game': game_entry.get('game', 'Unknown'),
            }

            flattened.extend(
                {
                    **shared,
                    'homePoint': pt.get('homePoint', 'Unknown'),
                    'awayPoint': pt.get('awayPoint', 'Unknown'),
                    'pointDescription': pt.get('pointDescription', 'Unknown'),
                    'homePointType': pt.get('homePointType', 'Unknown'),
                    'awayPointType': pt.get('awayPointType', 'Unknown'),
                }
                for pt in game_entry.get('points', [])
            )

            game_score = game_entry.get('score')
            if not game_score:
                continue

            # Marker row for the game score; -1 distinguishes it from real points.
            flattened.append({
                **shared,
                'homePoint': 'GAME',
                'awayPoint': 'GAME',
                'pointDescription': -1,
                'homePointType': game_score.get('homeScore', 'Unknown'),
                'awayPointType': game_score.get('awayScore', 'Unknown'),
                'serving': game_score.get('serving', 'Unknown'),
                'scoring': game_score.get('scoring', 'Unknown'),
            })

    return flattened

def process_files(folder_path):
    """Read every ``*.json`` point-by-point file in a folder into one DataFrame.

    The match id is the last '_'-separated token of each file's stem. Files that
    cannot be parsed or processed are reported with ``print`` and skipped.

    Parameters
    ----------
    folder_path : str
        Folder containing the JSON files.

    Returns
    -------
    pandas.DataFrame
        One row per dict returned by ``flatten_json`` across all files.
    """
    all_data = []

    for file_path in tqdm(glob.glob(os.path.join(folder_path, '*.json'))):
        match_id = os.path.splitext(os.path.basename(file_path))[0].split('_')[-1]

        try:
            # Read explicitly as UTF-8 (as the match-stats loader does) instead of
            # relying on the platform's default text encoding.
            with open(file_path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
                all_data.extend(flatten_json(json_data, match_id))
        except json.JSONDecodeError:
            print(f"Warning: Unable to parse JSON in file {file_path}")
        except Exception as e:
            print(f"Error processing file {file_path}: {str(e)}")

    return pd.DataFrame(all_data)

# Usage
folder_path = os.path.join('E:/', 'Data', 'tennis', 'sofascore', 'point-by-point')
df = process_files(folder_path)

# Rows are appended, so re-running this cell duplicates them. To rebuild from
# scratch, first run: con.execute("DROP TABLE IF EXISTS sofascore_point_by_point")
# The table is created from df's schema when it does not exist yet, so the cell
# also works against a fresh database.
con.execute("CREATE TABLE IF NOT EXISTS sofascore_point_by_point AS SELECT * FROM df LIMIT 0")
con.execute("INSERT INTO sofascore_point_by_point SELECT * FROM df")

100%|██████████| 5335/5335 [00:01<00:00, 2864.38it/s]


<duckdb.duckdb.DuckDBPyConnection at 0x1a57cb73270>

# Cleaned sofascore point by point data

In [7]:
def clean_pbp_data(pbp):
    """Turn raw point-by-point rows into one row per point and player side.

    Parameters
    ----------
    pbp : pandas.DataFrame
        Raw rows. The code reads or drops the columns ``match_id``, ``set``,
        ``game``, ``homePoint``, ``awayPoint``, ``pointDescription``,
        ``homePointType``, ``awayPointType``, ``serving``, ``scoring``,
        ``winnerCode``, ``tournament_category``, ``tournament_points`` and
        ``bo5``, so all of them must be present. NOTE: the frame is sorted in
        place and helper columns are added to it, so the caller's object is
        modified.

    Returns
    -------
    pandas.DataFrame
        The home-perspective rows followed by the away-perspective rows, with
        columns ``match_id``, ``set``, ``game``, ``sets_for``, ``sets_against``,
        ``games_for``, ``games_against``, ``points_for``, ``points_against``,
        ``serving``, ``point_winner``, ``match_winner`` and ``position``
        ('home' or 'away').
    """
    pbp.sort_values(['match_id', 'set', 'game'], inplace=True)

    # Score before each point: the previous row's score within the same game,
    # '0' for the first row of a game.
    pbp['home_prev_point'] = pbp.groupby(['match_id', 'set', 'game'])['homePoint'].shift(1).fillna('0')
    pbp['away_prev_point'] = pbp.groupby(['match_id', 'set', 'game'])['awayPoint'].shift(1).fillna('0')

    # Numeric views of the scores; non-numeric values such as "A" or "GAME" become NaN.
    pbp['homePoint_num'] = pd.to_numeric(pbp['homePoint'], errors='coerce')
    pbp['awayPoint_num'] = pd.to_numeric(pbp['awayPoint'], errors='coerce')
    pbp['home_prev_point_num'] = pd.to_numeric(pbp['home_prev_point'], errors='coerce')
    pbp['away_prev_point_num'] = pd.to_numeric(pbp['away_prev_point'], errors='coerce')

    # Home won the point when: its numeric score increased, it went 40 -> "A",
    # the opponent fell back from "A" to 40, or the row is a "GAME" marker reached
    # from "A" or from 40 while the opponent did not hold "A".
    pbp['homePointWinner'] = (
            (pbp['homePoint_num'] > pbp['home_prev_point_num']) |
            ((pbp['homePoint'] == "A") & (pbp['home_prev_point'] == "40")) |
            ((pbp['away_prev_point'] == "A") & (pbp['awayPoint'] == "40")) |
            ((pbp['homePoint'] == "GAME") & ((pbp['home_prev_point'] == "A") | (
                    (pbp['home_prev_point'] == "40") & (pbp['away_prev_point'] != "A"))))
    )

    # Mirror image of the rule above for the away side.
    pbp['awayPointWinner'] = (
            (pbp['awayPoint_num'] > pbp['away_prev_point_num']) |
            ((pbp['awayPoint'] == "A") & (pbp['away_prev_point'] == "40")) |
            ((pbp['home_prev_point'] == "A") & (pbp['homePoint'] == "40")) |
            ((pbp['awayPoint'] == "GAME") & ((pbp['away_prev_point'] == "A") | (
                    (pbp['away_prev_point'] == "40") & (pbp['home_prev_point'] != "A"))))
    )

    # Rows with pointDescription == -1 are the per-game score markers; on those
    # rows homePointType / awayPointType are used as the games score.
    game_scores = pbp.query('pointDescription == -1')[
        ['match_id', 'set', 'game', 'homePointType', 'awayPointType', 'serving']].drop_duplicates()

    game_scores.sort_values(['match_id', 'set', 'game'], inplace=True)
    # Games score before each game: the previous game's score within the set, 0 for the first.
    game_scores['homeGames'] = game_scores.groupby(['match_id', 'set'])['homePointType'].shift(1).fillna(0)
    game_scores['awayGames'] = game_scores.groupby(['match_id', 'set'])['awayPointType'].shift(1).fillna(0)
    # A set counts as won at 6 with the opponent on 4 or fewer, at 7 with the
    # opponent on 5 or 6, or above 7 with a lead of exactly two games.
    game_scores['homeSetWinner'] = (
            (game_scores['homePointType'] == 6) & (game_scores['awayPointType'] <= 4) |
            (game_scores['homePointType'] == 7) & (game_scores['awayPointType'].isin([5, 6])) |
            (game_scores['homePointType'] > 7) & ((game_scores['homePointType'] - game_scores['awayPointType']) == 2)
    )

    game_scores['awaySetWinner'] = (
            (game_scores['awayPointType'] == 6) & (game_scores['homePointType'] <= 4) |
            (game_scores['awayPointType'] == 7) & (game_scores['homePointType'].isin([5, 6])) |
            (game_scores['awayPointType'] > 7) & ((game_scores['awayPointType'] - game_scores['homePointType']) == 2)
    )

    # Running count of sets won per match, shifted by one game row so the value
    # is the count before the current game (0 for the first game of a match).
    game_scores['homeSets'] = game_scores.groupby('match_id')['homeSetWinner'].cumsum()
    game_scores['homeSets'] = game_scores.groupby('match_id')['homeSets'].shift(1).fillna(0)
    game_scores['awaySets'] = game_scores.groupby('match_id')['awaySetWinner'].cumsum()
    game_scores['awaySets'] = game_scores.groupby('match_id')['awaySets'].shift(1).fillna(0)

    # serving == 1.0 is treated as the home player serving, 2.0 as the away player.
    game_scores['homeServing'] = game_scores['serving'] == 1.0
    game_scores['awayServing'] = game_scores['serving'] == 2.0

    # Attach the per-game context to every row. merge() defaults to an inner join,
    # so rows of games without a score marker are dropped here.
    pbp_merged = pbp.merge(game_scores[
                               ['match_id', 'set', 'game', 'homeGames', 'awayGames', 'homeSets', 'awaySets',
                                'homeServing',
                                'awayServing']], on=['match_id', 'set', 'game'])

    # winnerCode 1.0 marks a home win, 2.0 an away win.
    pbp_merged['homeMatchWinner'] = pbp_merged['winnerCode'] == 1.0
    pbp_merged['awayMatchWinner'] = pbp_merged['winnerCode'] == 2.0

    # Score strings treated as invalid outside game 13 (used in the filter below).
    # Note that '15' is not in the list.
    invalid_points_non_tb = ['1',
                             '2',
                             '3',
                             '4',
                             '5',
                             '6',
                             '7',
                             '8',
                             '9',
                             '10',
                             '11',
                             '12',
                             '13',
                             '14',
                             '16',
                             '17',
                             '18',
                             '19',
                             '20',
                             '21',
                             '22',
                             '23',
                             '41']

    # A match is discarded entirely when any of its rows shows: a non-bo5 match
    # with a side already on 2+ sets, a game number that is not games played + 1,
    # a set number that is not sets played + 1, or an invalid previous score
    # outside game 13.
    invalid_match_ids = pbp_merged[
        ((pbp_merged['bo5'] == False) & ((pbp_merged['homeSets'] >= 2) | (pbp_merged['awaySets'] >= 2))) |
        (pbp_merged['game'] != pbp_merged['homeGames'] + pbp_merged['awayGames'] + 1) |
        (pbp_merged['set'] != pbp_merged['homeSets'] + pbp_merged['awaySets'] + 1) |
        ((pbp_merged['game'] != 13) & (pbp_merged['home_prev_point'].isin(invalid_points_non_tb)))
        ]['match_id'].unique().tolist()

    # In game 13, drop the row that immediately precedes a "GAME" marker row.
    pbp_merged['next_home_pt_is_game'] = pbp_merged.groupby('match_id')['homePoint'].shift(-1) == "GAME"
    pbp_merged = pbp_merged.query('~(game == 13 and next_home_pt_is_game == True)').copy()

    # Flags for rows whose current or previous score looks like a finished tiebreak.
    # NOTE(review): `&` binds tighter than `>=` in Python, so the second operand of
    # each `|` below parses as `((x > 7.0) & (x - y)) >= 2.0`, not as
    # `(x > 7.0) & ((x - y) >= 2.0)`. Confirm which is intended before changing it:
    # these expressions are not limited to game 13, so simply adding the
    # parentheses would also match ordinary game scores such as 15-0.
    pbp_merged['home_win_tiebreak'] = ((pbp_merged['homePoint_num'] == 7.0) & (pbp_merged['awayPoint_num'] <= 5.0)) | (
                (pbp_merged['homePoint_num'] > 7.0) & (
                    pbp_merged['homePoint_num'] - pbp_merged['awayPoint_num']) >= 2.0)
    pbp_merged['away_win_tiebreak'] = ((pbp_merged['awayPoint_num'] == 7.0) & (pbp_merged['homePoint_num'] <= 5.0)) | (
                (pbp_merged['awayPoint_num'] > 7.0) & (
                    pbp_merged['awayPoint_num'] - pbp_merged['homePoint_num']) >= 2.0)
    pbp_merged['home_prev_win_tiebreak'] = ((pbp_merged['home_prev_point_num'] == 7.0) & (
                pbp_merged['away_prev_point_num'] <= 5.0)) | ((pbp_merged['home_prev_point_num'] > 7.0) & (
                pbp_merged['home_prev_point_num'] - pbp_merged['away_prev_point_num']) >= 2.0)
    pbp_merged['away_prev_win_tiebreak'] = ((pbp_merged['away_prev_point_num'] == 7.0) & (
                pbp_merged['home_prev_point_num'] <= 5.0)) | ((pbp_merged['away_prev_point_num'] > 7.0) & (
                pbp_merged['away_prev_point_num'] - pbp_merged['home_prev_point_num']) >= 2.0)
    # Keep only rows where none of the four flags is set.
    pbp_merged = pbp_merged.query(
        "home_win_tiebreak == False and away_win_tiebreak == False and home_prev_win_tiebreak == False and away_prev_win_tiebreak == False").copy()

    # Drop game-13 "GAME" marker rows and all rows of invalid matches, remove the
    # helper / away-side / raw columns, and rename the survivors to snake_case.
    # The previous-score columns become the game score columns.
    pbp_merged = pbp_merged.query(
        '~(homePoint == "GAME" and game == 13) and match_id not in @invalid_match_ids').copy().drop(columns=[
        'away_prev_win_tiebreak', 'home_prev_win_tiebreak', 'away_win_tiebreak', 'home_win_tiebreak',
        'next_home_pt_is_game', 'awayServing', 'home_prev_point_num', 'away_prev_point_num', 'homePoint_num',
        'awayPoint_num',
        'homePoint', 'awayPoint', 'pointDescription', 'homePointType', 'awayPointType', 'serving', 'scoring',
        'winnerCode', 'tournament_category', 'tournament_points', 'bo5', 'awayMatchWinner', 'awayPointWinner',
    ]).rename(columns={
        'home_prev_point': 'home_game_score',
        'away_prev_point': 'away_game_score',
        'homePointWinner': 'home_point_winner',
        'homeGames': 'home_games_won',
        'awayGames': 'away_games_won',
        'homeSets': 'home_sets_won',
        'awaySets': 'away_sets_won',
        'homeServing': 'home_serving',
        'homeMatchWinner': 'home_match_winner'
    })

    # Store counters and boolean flags as integers.
    pbp_merged['home_sets_won'] = pbp_merged['home_sets_won'].astype(int)
    pbp_merged['away_sets_won'] = pbp_merged['away_sets_won'].astype(int)
    pbp_merged['home_games_won'] = pbp_merged['home_games_won'].astype(int)
    pbp_merged['away_games_won'] = pbp_merged['away_games_won'].astype(int)
    pbp_merged['home_point_winner'] = pbp_merged['home_point_winner'].astype(int)
    pbp_merged['home_serving'] = pbp_merged['home_serving'].astype(int)
    pbp_merged['home_match_winner'] = pbp_merged['home_match_winner'].astype(int)

    df = pbp_merged[['match_id', 'set', 'game', 'home_sets_won', 'away_sets_won', 'home_games_won', 'away_games_won',
                       'home_game_score', 'away_game_score', 'home_serving', 'home_point_winner', 'home_match_winner']]

    # Home perspective: home_* columns become *_for, away_* become *_against.
    df_home = df.rename(columns={
        'home_sets_won': 'sets_for',
        'away_sets_won': 'sets_against',
        'home_games_won': 'games_for',
        'away_games_won': 'games_against',
        'home_game_score': 'points_for',
        'away_game_score': 'points_against',
        'home_serving': 'serving',
        'home_point_winner': 'point_winner',
        'home_match_winner': 'match_winner'
    })

    df_home['position'] = 'home'

    # Away perspective: for/against are swapped here; the three flag columns are
    # still home-based at this point and are inverted just below.
    df_away = df.rename(columns={
        'home_sets_won': 'sets_against',
        'away_sets_won': 'sets_for',
        'home_games_won': 'games_against',
        'away_games_won': 'games_for',
        'home_game_score': 'points_against',
        'away_game_score': 'points_for',
        'home_serving': 'serving',
        'home_point_winner': 'point_winner',
        'home_match_winner': 'match_winner'
    })

    df_away['position'] = 'away'
    # Invert the home-based flags: the away side gets 1 wherever the home flag is not 1.
    df_away['serving'] = (df_away['serving'] != 1).astype(int)
    df_away['point_winner'] = (df_away['point_winner'] != 1).astype(int)
    df_away['match_winner'] = (df_away['match_winner'] != 1).astype(int)

    # Stack both perspectives; concat aligns the columns by name.
    df_final = pd.concat([df_home, df_away])

    return df_final


def chunk_list(data, n):
    """Yield consecutive slices of ``data`` holding at most ``n`` items each.

    The last slice is shorter when ``len(data)`` is not a multiple of ``n``;
    an empty ``data`` yields nothing.
    """
    offsets = range(0, len(data), n)
    yield from (data[offset:offset + n] for offset in offsets)

In [9]:
# (Re)create the cleaned point-by-point table with the schema of `pbp_clean`.
# NOTE: `pbp_clean` is only assigned by the chunked cleaning loop further down,
# so on a fresh kernel this cell fails until that loop has produced it once.
# IF EXISTS keeps the drop from failing when the table is not there yet.
con.execute("DROP TABLE IF EXISTS sofascore_point_by_point_clean")
con.execute("CREATE TABLE sofascore_point_by_point_clean AS SELECT * FROM pbp_clean")

<duckdb.duckdb.DuckDBPyConnection at 0x142affc88b0>

In [10]:
# Remove all rows from the cleaned table; the cleaning loop below INSERTs into it.
con.execute("TRUNCATE TABLE sofascore_point_by_point_clean")

<duckdb.duckdb.DuckDBPyConnection at 0x142affc88b0>

In [11]:
events = con.execute("SELECT id, winnerCode, tournament_category, tournament_points FROM sofascore_events").df()

# Process the events in batches of 40,000 ids to bound the size of each query result.
event_ids = events['id'].astype(int).unique().tolist()
chunked_event_ids = list(chunk_list(event_ids, 40000))

for chk in tqdm(chunked_event_ids):
    # The ids are integers read from our own events table, so inlining them into
    # the SQL text is safe here.
    pbp = con.execute(f"""
    SELECT p.*,  e.winnerCode, e.tournament_category, e.tournament_points
    FROM sofascore_point_by_point p 
    INNER JOIN sofascore_events e ON p.match_id = e.id
    WHERE p.match_id IN ({','.join(map(str, chk))})
    """).df()

    # No point-by-point rows for this batch of events: nothing to clean or insert.
    if pbp.empty:
        continue

    # Best-of-five flag: ATP events worth 2000 points (presumably the Grand Slams - confirm).
    pbp['bo5'] = (pbp['tournament_category'] == "ATP") & (pbp['tournament_points'] == 2000.0)

    pbp_clean = clean_pbp_data(pbp)

    # Create the target table from the cleaned frame's schema if it is missing, so
    # the loop no longer depends on a table built from a previous run's pbp_clean.
    con.execute("CREATE TABLE IF NOT EXISTS sofascore_point_by_point_clean AS SELECT * FROM pbp_clean LIMIT 0")
    con.execute("INSERT INTO sofascore_point_by_point_clean SELECT * FROM pbp_clean")

 25%|██▌       | 7/28 [01:00<04:58, 14.22s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 29%|██▊       | 8/28 [01:24<05:41, 17.09s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 86%|████████▌ | 24/28 [06:50<01:24, 21.07s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 89%|████████▉ | 25/28 [07:13<01:05, 21.69s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 93%|█████████▎| 26/28 [07:38<00:45, 22.69s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

100%|██████████| 28/28 [08:06<00:00, 17.37s/it]


In [12]:
# Close the shared connection. Cells executed after this need a new
# duckdb.connect(...) first, otherwise con.execute raises "Connection already closed!".
con.close()