In [1]:
import pandas as pd
import duckdb
import re
import unidecode

In [7]:
def process_name(name, words_to_reverse, slug=True, first_name_initial=None):
    name = name.lower()
    name = unidecode.unidecode(name)
    name = re.sub('-', ' ', name)
    name = re.sub(r'[^a-z\s]', '', name)
    
    words = name.split()
    
    if words_to_reverse > 0:
        if len(words) > words_to_reverse:
            words = words[words_to_reverse:] + words[0:words_to_reverse]
            if first_name_initial:
                words[0] = words[0][0]
        
    if slug:
        name = '-'.join(words)
    else:
        name = ' '.join(words)
        
    return name

def unpivot_events(df):
    # Create a DataFrame for home teams
    home_df = df[['id', 'datetime', 'home_clean_name', 'index_home']]
    home_df['team_type'] = 'home'
    home_df = home_df.rename(columns={'home_clean_name': 'team_name', 'index_home': 'index'})

    # Create a DataFrame for away teams
    away_df = df[['id', 'datetime', 'away_clean_name', 'index_away']]
    away_df['team_type'] = 'away'
    away_df = away_df.rename(columns={'away_clean_name': 'team_name', 'index_away': 'index'})

    # Combine the two DataFrames
    result = pd.concat([home_df, away_df], ignore_index=True)

    # Sort the result
    result = result.sort_values(['id', 'team_type']).reset_index(drop=True)

    return result

In [5]:
con = duckdb.connect("E:/duckdb/tennis.duckdb", read_only=True)
tennis_markets = con.execute("""
SELECT *

FROM competition_mappings c
INNER JOIN market_summaries m
ON c.market_id = m.market_id
""").df()

excluded_selection_names = pd.read_csv('../mappings/excluded_selection_names.csv', header=None)[0].tolist()

tennis_markets = tennis_markets[~tennis_markets['selection_name'].str.contains("/")]
tennis_markets = tennis_markets[~tennis_markets['selection_name'].isin(excluded_selection_names)]
tennis_markets['bf_name'] = [process_name(x, 0, False) for x in tennis_markets['selection_name']]
tennis_markets['FORMATTED_DATE'] = pd.to_datetime(tennis_markets['FORMATTED_DATE'])

sofascore_events = con.execute("SELECT * FROM sofascore_events WHERE tournament_category IN ('ATP','WTA','Challenger','ITF Men','ITF Women','WTA 125')").df()

sofascore_events = sofascore_events[~sofascore_events['home_team'].str.contains('/')]
sofascore_events = sofascore_events[~sofascore_events['away_team'].str.contains('/')]
sofascore_events = sofascore_events[sofascore_events['match_status'] != 'Not started']
sofascore_events['event_fetch_date'] = pd.to_datetime(sofascore_events['event_fetch_date'])

sofascore_events['home_clean_name'] = [process_name(x, 1, False) for x in sofascore_events['home_team_slug']]
sofascore_events['away_clean_name'] = [process_name(x, 1, False) for x in sofascore_events['away_team_slug']]

con.close()

In [8]:
name_mapping = pd.read_csv('../mappings/player_name_mapping.csv')

tennis_markets_with_mapping = tennis_markets.merge(name_mapping, left_on='bf_name', right_on='name')
events_short = sofascore_events.merge(name_mapping, left_on='home_clean_name', right_on='name').merge(name_mapping,
                                                                                                      left_on='away_clean_name',
                                                                                                      right_on='name',
                                                                                                      suffixes=('_home',
                                                                                                                '_away'))[
    ['id', 'datetime', 'home_clean_name', 'away_clean_name', 'index_home', 'index_away']]
events_short['id'] = events_short['id'].astype(int)

events_short_unpiv = unpivot_events(events_short)
tennis_markets_with_mapping_short = tennis_markets_with_mapping[['market_id', 'event_date', 'name', 'index']]
tennis_markets_with_mapping_short['index'] = tennis_markets_with_mapping_short['index'].astype(int)
group_counts = tennis_markets_with_mapping_short.groupby('market_id').size()
valid_market_ids = group_counts[group_counts == 2].index
tennis_markets_with_mapping_short = tennis_markets_with_mapping_short[
    tennis_markets_with_mapping_short['market_id'].isin(valid_market_ids)]
tennis_markets_with_mapping_short = tennis_markets_with_mapping_short.reset_index(drop=True)

events_short_unpiv['datetime'] = pd.to_datetime(events_short_unpiv['datetime'])
tennis_markets_with_mapping_short['event_date'] = pd.to_datetime(tennis_markets_with_mapping_short['event_date'])

# Create sorted index pairs for joining
events_short_unpiv['sort_key'] = events_short_unpiv.groupby('id')['index'].transform(
    lambda x: ','.join(map(str, sorted(x))))
tennis_markets_with_mapping_short['sort_key'] = tennis_markets_with_mapping_short.groupby('market_id')[
    'index'].transform(lambda x: ','.join(map(str, sorted(x))))

# Perform the join
merged = pd.merge(events_short_unpiv, tennis_markets_with_mapping_short, on='sort_key')
# Calculate time difference
merged['time_diff'] = abs(merged['datetime'] - merged['event_date']).dt.days

merged[['id', 'market_id', 'time_diff']].drop_duplicates().query('time_diff == 0').to_csv(
    '../mappings/market_match_mapping.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  home_df['team_type'] = 'home'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  away_df['team_type'] = 'away'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tennis_markets_with_mapping_short['index'] = tennis_markets_with_mapping_short['index'].astype(int)
