In [1]:
import duckdb
import pandas as pd
import re
import unidecode
# import numpy as np
from rapidfuzz import fuzz
# from phonetics import metaphone, soundex
# from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
# from Levenshtein import distance as edit_distance
# import torch

In [2]:
def process_name(name, words_to_reverse, slug=True, first_name_initial=None):
    name = name.lower()
    name = unidecode.unidecode(name)
    name = re.sub('-', ' ', name)
    name = re.sub(r'[^a-z\s]', '', name)
    
    words = name.split()
    
    if words_to_reverse > 0:
        if len(words) > words_to_reverse:
            words = words[words_to_reverse:] + words[0:words_to_reverse]
            if first_name_initial:
                words[0] = words[0][0]
        
    if slug:
        name = '-'.join(words)
    else:
        name = ' '.join(words)
        
    return name

In [3]:
con = duckdb.connect("E:/duckdb/tennis.duckdb", read_only=True)
tennis_markets = con.execute("""
SELECT *

FROM competition_mappings c
INNER JOIN market_summaries m
ON c.market_id = m.market_id
""").df()

sofascore_events = con.execute("SELECT * FROM sofascore_events WHERE tournament_category IN ('ATP','WTA','Challenger','ITF Men','ITF Women')").df()
con.close()

excluded_selection_names = pd.read_csv('../mappings/excluded_selection_names.csv', header=None)[0].tolist()

In [4]:
tennis_markets = tennis_markets[~tennis_markets['selection_name'].str.contains("/")]
tennis_markets = tennis_markets[~tennis_markets['selection_name'].isin(excluded_selection_names)]
tennis_markets['bf_name'] = [process_name(x, 0, False) for x in tennis_markets['selection_name']]
tennis_markets['FORMATTED_DATE'] = pd.to_datetime(tennis_markets['FORMATTED_DATE'])
betfair_names = tennis_markets[['bf_name']].drop_duplicates()

In [21]:
sofascore_events = sofascore_events[~sofascore_events['home_team'].str.contains('/')]
sofascore_events = sofascore_events[~sofascore_events['away_team'].str.contains('/')]
sofascore_events = sofascore_events[sofascore_events['match_status'] != 'Not started']
sofascore_events['event_fetch_date'] = pd.to_datetime(sofascore_events['event_fetch_date'])

sofascore_events['home_clean_name'] = [process_name(x, 1, False) for x in sofascore_events['home_team_slug']]
sofascore_events['away_clean_name'] = [process_name(x, 1, False) for x in sofascore_events['away_team_slug']]

In [6]:
sofascore_names = pd.concat([sofascore_events[['home_team_slug', 'home_team']],sofascore_events[['away_team_slug', 'away_team']].rename(columns={'away_team_slug': 'home_team_slug','away_team': 'home_team'})]).drop_duplicates().rename(columns={
    'home_team_slug': 'ss_slug',
    'home_team': 'ss_name'
})

In [7]:
sofascore_names['ss_clean_slug'] = [process_name(x, 0, False) for x in sofascore_names['ss_slug']]
sofascore_names['ss_clean_slug_1_rev'] = [process_name(x, 1, False) for x in sofascore_names['ss_slug']]
sofascore_names['ss_clean_slug_2_rev'] = [process_name(x, 2, False) for x in sofascore_names['ss_slug']]
sofascore_names['ss_clean_slug_3_rev'] = [process_name(x, 3, False) for x in sofascore_names['ss_slug']]
sofascore_names['ss_clean_slug_4_rev'] = [process_name(x, 4, False) for x in sofascore_names['ss_slug']]

sofascore_names['ss_clean_slug_1_rev_init'] = [process_name(x, 1, False, True) for x in sofascore_names['ss_slug']]
sofascore_names['ss_clean_slug_2_rev_init'] = [process_name(x, 2, False, True) for x in sofascore_names['ss_slug']]
sofascore_names['ss_clean_slug_3_rev_init'] = [process_name(x, 3, False, True) for x in sofascore_names['ss_slug']]
sofascore_names['ss_clean_slug_4_rev_init'] = [process_name(x, 4, False, True) for x in sofascore_names['ss_slug']]

In [8]:
exact_1_rev = sofascore_names.merge(betfair_names, left_on='ss_clean_slug_1_rev', right_on='bf_name', how='left')
exact_2_rev = sofascore_names.merge(betfair_names, left_on='ss_clean_slug_2_rev', right_on='bf_name', how='left')
exact_3_rev = sofascore_names.merge(betfair_names, left_on='ss_clean_slug_3_rev', right_on='bf_name', how='left')
exact_4_rev = sofascore_names.merge(betfair_names, left_on='ss_clean_slug_4_rev', right_on='bf_name', how='left')

exact_1_rev_init = sofascore_names.merge(betfair_names, left_on='ss_clean_slug_1_rev_init', right_on='bf_name', how='left')
exact_2_rev_init = sofascore_names.merge(betfair_names, left_on='ss_clean_slug_2_rev_init', right_on='bf_name', how='left')
exact_3_rev_init = sofascore_names.merge(betfair_names, left_on='ss_clean_slug_3_rev_init', right_on='bf_name', how='left')
exact_4_rev_init = sofascore_names.merge(betfair_names, left_on='ss_clean_slug_4_rev_init', right_on='bf_name', how='left')

In [9]:
exact_matches = pd.concat([exact_1_rev, exact_2_rev, exact_3_rev, exact_4_rev, exact_1_rev_init, exact_2_rev_init, exact_3_rev_init, exact_4_rev_init]).drop(columns='ss_name').drop_duplicates()
exact_matches['index'] = range(len(exact_matches))
exact_matches['index'] = exact_matches.groupby('ss_slug')['index'].transform('first')

In [10]:
name_mapping_initial = pd.melt(exact_matches, id_vars=['index'], value_vars=['ss_clean_slug','ss_clean_slug_1_rev','ss_clean_slug_2_rev', 'ss_clean_slug_3_rev','ss_clean_slug_4_rev', 'bf_name'], var_name='column', value_name='name')
name_mapping_initial = name_mapping_initial[~name_mapping_initial['name'].isna()].drop(columns='column').sort_values('index')
name_mapping_initial['name'] = [process_name(name,0,False) for name in name_mapping_initial['name']]
name_mapping_initial = name_mapping_initial.drop_duplicates().reset_index(drop=True)

In [11]:
bf_names_exact = exact_matches.dropna()['bf_name'].tolist()
betfair_names_remaining = betfair_names[~betfair_names['bf_name'].isin(bf_names_exact)]

In [12]:
duplicated_names = name_mapping_initial.groupby('name')['index'].count().reset_index().query('index > 1')['name']
name_mapping_initial = name_mapping_initial.query('name not in @duplicated_names')

In [15]:
name_mapping_initial.to_csv('../processing-files/name_mapping_initial.csv', index=False)
betfair_names_remaining.to_csv('../processing-files/betfair_names_remaining.csv', index=False)

In [19]:
fuzzy_matched = pd.read_csv('../processing-files/fuzzy_name_matches.csv')

# Matching based on event dates

In [22]:
sofascore_events_w_mapping = sofascore_events.merge(fuzzy_matched, left_on='home_clean_name', right_on='name', how='left').merge(fuzzy_matched, left_on='away_clean_name', right_on='name', how='left',suffixes=('_home','_away'))
sofascore_events_w_mapping = sofascore_events_w_mapping[~sofascore_events_w_mapping['index_home'].isna() | ~sofascore_events_w_mapping['index_away'].isna()]

In [24]:
home_fuzzy_match = sofascore_events_w_mapping.merge(tennis_markets, left_on='bf_name_home', right_on='bf_name')

In [25]:
min_bf_date = min(tennis_markets['FORMATTED_DATE'])
home_fuzzy_match = home_fuzzy_match.query('event_fetch_date >= @min_bf_date')

In [26]:
home_fuzzy_match['time_diff'] = home_fuzzy_match['event_fetch_date'] - home_fuzzy_match['FORMATTED_DATE']
home_fuzzy_match['time_diff'] = home_fuzzy_match['time_diff'].dt.days

In [27]:
similar_dates = home_fuzzy_match.query("abs(time_diff) < 2")

In [28]:
similar_dates[['index_home','name_home','bf_name_home']].drop_duplicates().to_csv('../processing-files/bf_home_match_fuzzy.csv')
### MANUAL CHECK ###

In [35]:
home_fuzzy_match = pd.read_csv('../processing-files/bf_home_match_fuzzy.csv')
home_fuzzy_match = pd.melt(home_fuzzy_match, id_vars=['index']).drop(columns='variable').rename(columns={'value':'name'})

In [38]:
name_mapping_w_home = pd.concat([name_mapping_initial,home_fuzzy_match]).drop_duplicates()
updated_bf_names_remaining = [x for x in betfair_names_remaining['bf_name'].tolist() if x not in name_mapping_w_home['name'].tolist()]
betfair_names_remaining_w_home = pd.DataFrame({'bf_name':updated_bf_names_remaining})

In [56]:
name_mapping_w_home.to_csv('../processing-files/name_mapping_w_home.csv', index=False)
betfair_names_remaining_w_home.to_csv('../processing-files/betfair_names_remaining_w_home.csv', index=False)

In [61]:
fuzzy_matched_w_home = pd.read_csv('../processing-files/fuzzy_name_matches_w_home.csv')
# Matching based on event dates
sofascore_events_w_mapping_w_home = sofascore_events.merge(fuzzy_matched_w_home, left_on='away_clean_name',
                                                                      right_on='name', how='inner')
away_fuzzy_match = sofascore_events_w_mapping_w_home.merge(tennis_markets, left_on='bf_name', right_on='bf_name')

away_fuzzy_match = away_fuzzy_match.query('event_fetch_date >= @min_bf_date')
away_fuzzy_match['time_diff'] = away_fuzzy_match['event_fetch_date'] - away_fuzzy_match['FORMATTED_DATE']
away_fuzzy_match['time_diff'] = away_fuzzy_match['time_diff'].dt.days
similar_dates_away = away_fuzzy_match.query("abs(time_diff) < 2")
similar_dates_away[['index', 'name', 'bf_name']].drop_duplicates().to_csv(
    '../processing-files/bf_away_match_fuzzy.csv')
### MANUAL CHECK ###

In [62]:
away_fuzzy_match = pd.read_csv('../processing-files/bf_away_match_fuzzy.csv')
away_fuzzy_match = pd.melt(away_fuzzy_match, id_vars=['index']).drop(columns='variable').rename(columns={'value':'name'})

In [66]:
name_mapping_w_away = pd.concat([name_mapping_w_home,away_fuzzy_match]).drop_duplicates()
updated_bf_names_remaining = [x for x in updated_bf_names_remaining if x not in name_mapping_w_away['name'].tolist()]
betfair_names_remaining_w_away = pd.DataFrame({'bf_name':updated_bf_names_remaining})

In [187]:
def process_name(name, bf_names, threshold):
    matches = []
    for bf_name in bf_names:
        similarity = fuzz.ratio(name, bf_name)
        if similarity >= threshold:
            matches.append((bf_name, similarity))
    matches.sort(key=lambda x: x[1], reverse=True)
    return matches

matched_via_fixture = []

for name in tqdm(betfair_names_remaining_w_away['bf_name']):
    bf_matches = tennis_markets.query('bf_name == @name')
    matches_w_score = process_name(name, name_mapping_w_away['name'], 70)
    
    matches = [x[0] for x in matches_w_score]
    
    matched_indices = name_mapping_w_away.query('name in @matches')['index'].unique().tolist()
    all_matched_names = name_mapping_w_away.query('index in @matched_indices')['name'].tolist()
    
    ss_matches = sofascore_events.query('home_clean_name in @all_matched_names or away_clean_name in @all_matched_names').copy()
    ss_matches.loc[ss_matches['home_clean_name'].isin(all_matched_names),'matched_name'] = ss_matches['home_clean_name']
    ss_matches.loc[ss_matches['away_clean_name'].isin(all_matched_names),'matched_name'] = ss_matches['away_clean_name']
    
    cross_joined_matches = bf_matches.merge(ss_matches,how='cross')
    cross_joined_matches['time_diff'] = (cross_joined_matches['datetime'] - cross_joined_matches['event_date']).dt.days
    cross_joined_matches = cross_joined_matches.query('abs(time_diff) < 2')
    
    if len(cross_joined_matches) < 1:
        continue
    
    cj_summ = cross_joined_matches.groupby(['matched_name'])['market_id'].count().reset_index().sort_values('market_id',ascending=False)
    matched_name = cj_summ.iloc[0]['matched_name']
    matches_count = cj_summ.iloc[0]['market_id']
    matched_via_fixture.append(
        (name, matched_name, matches_count)
    )


100%|██████████| 3521/3521 [09:31<00:00,  6.17it/s]


In [189]:
matched_via_fixture_df = pd.DataFrame(matched_via_fixture, columns=['bf_name', 'matched_name', 'matches_count'])
matched_via_fixture_df.to_csv('../processing-files/matched_via_fixture.csv')

# Build combined mapping

In [204]:
matched_via_fixture_df_manual = pd.read_csv('../processing-files/manually-checked/matched_via_fixture.csv')

bf_away_match_fuzzy_manual = pd.read_csv('../processing-files/manually-checked/bf_away_match_fuzzy.csv')
bf_away_match_fuzzy_manual = pd.melt(bf_away_match_fuzzy_manual, id_vars=['index']).drop(columns='variable').rename(columns={'value':'name'})

bf_home_match_fuzzy_manual = pd.read_csv('../processing-files/manually-checked/bf_home_match_fuzzy.csv')
bf_home_match_fuzzy_manual = pd.melt(bf_home_match_fuzzy_manual, id_vars=['index']).drop(columns='variable').rename(columns={'value':'name'})

name_mapping_initial = pd.read_csv('../processing-files/name_mapping_initial.csv')

In [205]:
name_mapping = pd.concat([name_mapping_initial,bf_home_match_fuzzy_manual,bf_away_match_fuzzy_manual]).drop_duplicates()
exclude_repeating_names = name_mapping.groupby('name').count().reset_index().query('index > 1')['name'].tolist()
name_mapping = name_mapping.query('name not in @exclude_repeating_names')

In [220]:
fix_mapping = matched_via_fixture_df_manual.merge(name_mapping, left_on='matched_name', right_on='name')[['index','bf_name']].rename(columns={'bf_name':'name'})

In [221]:
name_mapping = pd.concat([name_mapping,fix_mapping])

In [280]:
name_mapping.to_csv('../mappings/player_name_mapping.csv',index=False)

In [249]:
tennis_markets_with_mapping = tennis_markets.merge(name_mapping, left_on='bf_name',right_on='name')

In [250]:
events_short = sofascore_events.merge(name_mapping, left_on='home_clean_name', right_on='name').merge(name_mapping, left_on='away_clean_name', right_on='name',suffixes=('_home','_away'))[['id','datetime','home_clean_name','away_clean_name','index_home','index_away']]
events_short['id'] = events_short['id'].astype(int)

In [256]:
def unpivot_events(df):
    # Create a DataFrame for home teams
    home_df = df[['id', 'datetime', 'home_clean_name', 'index_home']]
    home_df['team_type'] = 'home'
    home_df = home_df.rename(columns={'home_clean_name': 'team_name', 'index_home': 'index'})

    # Create a DataFrame for away teams
    away_df = df[['id', 'datetime', 'away_clean_name', 'index_away']]
    away_df['team_type'] = 'away'
    away_df = away_df.rename(columns={'away_clean_name': 'team_name', 'index_away': 'index'})

    # Combine the two DataFrames
    result = pd.concat([home_df, away_df], ignore_index=True)

    # Sort the result
    result = result.sort_values(['id', 'team_type']).reset_index(drop=True)

    return result

events_short_unpiv = unpivot_events(events_short)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  home_df['team_type'] = 'home'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  away_df['team_type'] = 'away'


In [252]:
tennis_markets_with_mapping_short = tennis_markets_with_mapping[['market_id','event_date','name','index']]
tennis_markets_with_mapping_short['index'] = tennis_markets_with_mapping_short['index'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tennis_markets_with_mapping_short['index'] = tennis_markets_with_mapping_short['index'].astype(int)


In [254]:
group_counts = tennis_markets_with_mapping_short.groupby('market_id').size()
valid_market_ids = group_counts[group_counts == 2].index
tennis_markets_with_mapping_short = tennis_markets_with_mapping_short[tennis_markets_with_mapping_short['market_id'].isin(valid_market_ids)]
tennis_markets_with_mapping_short = tennis_markets_with_mapping_short.reset_index(drop=True)
tennis_markets_with_mapping_short

Unnamed: 0,market_id,event_date,name,index
0,1.230333344,2024-07-02 14:33:35,kristina dmitruk,8025
1,1.230333344,2024-07-02 14:33:35,antonia schmidt,6956
2,1.230219852,2024-07-02 09:46:00,constant lestienne,1443
3,1.230219852,2024-07-02 09:46:00,lorenzo musetti,4378
4,1.230232771,2024-07-02 09:50:00,tomas martin etcheverry,4322
...,...,...,...,...
508455,1.139564037,2018-01-31 12:58:00,a vrljic,426
508456,1.139564024,2018-01-31 14:14:00,c paquet,245
508457,1.139564024,2018-01-31 14:14:00,k zimmermann,3435
508458,1.139564860,2018-01-02 01:45:00,andrew whittington,305


In [263]:
events_short_unpiv['datetime'] = pd.to_datetime(events_short_unpiv['datetime'])
tennis_markets_with_mapping_short['event_date'] = pd.to_datetime(tennis_markets_with_mapping_short['event_date'])

# Create sorted index pairs for joining
events_short_unpiv['sort_key'] = events_short_unpiv.groupby('id')['index'].transform(lambda x: ','.join(map(str, sorted(x))))
tennis_markets_with_mapping_short['sort_key'] = tennis_markets_with_mapping_short.groupby('market_id')['index'].transform(lambda x: ','.join(map(str, sorted(x))))

# Perform the join
merged = pd.merge(events_short_unpiv, tennis_markets_with_mapping_short, on='sort_key')

In [267]:
# Calculate time difference
merged['time_diff'] = abs(merged['datetime'] - merged['event_date']).dt.days

In [275]:
merged[['id','market_id','time_diff']].drop_duplicates().query('time_diff == 0')

Unnamed: 0,id,market_id,time_diff
76380,6673851,1.120373498,0
108428,6758863,1.121059085,0
108430,6758863,1.121059124,0
126496,6851285,1.121716372,0
136924,6885407,1.120484438,0
...,...,...,...
1910560,12764624,1.232479597,0
1910572,12765349,1.232479595,0
1910580,12765728,1.232479596,0
1910592,12766264,1.232479594,0


In [281]:
merged[['id','market_id','time_diff']].drop_duplicates().query('time_diff == 0').to_csv('../mappings/market_match_mapping.csv',index=False)

In [282]:
merged

Unnamed: 0,id,datetime,team_name,index_x,team_type,sort_key,market_id,event_date,name,index_y,time_diff
0,4842848,2014-01-01 00:40:00,aleksandr nedovyesov,3436,away,13436,1.132590579,2017-07-12 08:53:00,aleksandr nedovyesov,3436,1288
1,4842848,2014-01-01 00:40:00,aleksandr nedovyesov,3436,away,13436,1.132590579,2017-07-12 08:53:00,guillermo garcia lopez,1,1288
2,4842848,2014-01-01 00:40:00,aleksandr nedovyesov,3436,away,13436,1.132590587,2017-07-12 08:53:00,aleksandr nedovyesov,3436,1288
3,4842848,2014-01-01 00:40:00,aleksandr nedovyesov,3436,away,13436,1.132590587,2017-07-12 08:53:00,guillermo garcia lopez,1,1288
4,4842848,2014-01-01 00:40:00,aleksandr nedovyesov,3436,away,13436,1.134974934,2017-10-05 06:01:00,guillermo garcia lopez,1,1373
...,...,...,...,...,...,...,...,...,...,...,...
1913899,12898374,2024-09-26 12:50:00,haruna arakawa,4848,home,48488756,1.227968344,2024-04-23 01:12:02,haruna arakawa,4848,156
1913900,12898406,2024-09-26 08:25:00,momoko kobori,8737,away,873710500,1.217874356,2023-09-03 05:11:17,m kuramochi,10500,389
1913901,12898406,2024-09-26 08:25:00,momoko kobori,8737,away,873710500,1.217874356,2023-09-03 05:11:17,m kobori,8737,389
1913902,12898406,2024-09-26 08:25:00,miho kuramochi,10500,home,873710500,1.217874356,2023-09-03 05:11:17,m kuramochi,10500,389
