In [5]:
import sys
from pathlib import Path
import os

# Determine the current working directory of the notebook
notebook_dir = Path(os.getcwd())

# Add the parent directory to sys.path
parent_dir = notebook_dir.parent
sys.path.append(str(parent_dir))

# Now you can import aux.py
from aux import connect_to_db
import pandas as pd
import psycopg2
from flask import jsonify
from psycopg2.extras import RealDictCursor
from db_values import HOST, DBNAME, USER, PASSWORD

In [12]:
conn = psycopg2.connect(
            dbname=DBNAME,  
            user=USER,         
            password=PASSWORD,     
            host="localhost",             
        )

In [13]:
sql_query = "SELECT * FROM players"  
sql_data = pd.read_sql(sql_query, conn)

  sql_data = pd.read_sql(sql_query, conn)


In [19]:
csv_data = pd.read_csv("../data/players_data.csv")  # Columns: 'name', 'other_data'

In [20]:
# Preprocess CSV names
csv_data['first_name'] = csv_data['name'].apply(lambda x: x.split()[0])  # First word
csv_data['last_name'] = csv_data['name'].apply(lambda x: x.split()[-1])  # Last word
csv_data['key'] = csv_data['last_name'] + " " + csv_data['first_name'].str[0]  # 'LastName FirstInitial'

# Preprocess SQL names
sql_data['last_name'] = sql_data['name'].apply(lambda x: x.split()[0])  # First word
sql_data['initial'] = sql_data['name'].apply(lambda x: x.split()[-1].strip('.'))  # Last character
sql_data['key'] = sql_data['last_name'] + " " + sql_data['initial']  # 'LastName Initial'


In [23]:
# Merge CSV and SQL data on the key
merged_data = pd.merge(
    csv_data, sql_data, on='key', how='left', suffixes=('_csv', '_sql')
)

# Identify unmatched rows
unmatched_csv = merged_data[merged_data['name_sql'].isna()]
unmatched_sql = sql_data[~sql_data['key'].isin(merged_data['key'])]


In [25]:
unmatched_csv

Unnamed: 0,rank,name_csv,age,points,nationality,atp_code,first_name,last_name_csv,key,player_id,name_sql,last_name_sql,initial
8,9,Alex de Minaur,25,3745,AUS,dh58,Alex,Minaur,Minaur A,,,,
30,31,Giovanni Mpetshi Perricard,21,1561,FRA,m0gz,Giovanni,Perricard,Perricard G,,,,
42,43,Jan-Lennard Struff,34,1240,GER,sl28,Jan-Lennard,Struff,Struff J,,,,
44,45,Zhizhen Zhang,28,1155,CHN,z371,Zhizhen,Zhang,Zhang Z,,,,
45,46,Roberto Bautista Agut,36,1151,ESP,bd06,Roberto,Agut,Agut R,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,496,Michiel De Krom,26,79,NED,dh55,Michiel,Krom,Krom M,,,,
496,497,Andrew Paulson,23,78,CZE,p0ev,Andrew,Paulson,Paulson A,,,,
497,498,Tyler Zink,23,78,USA,z0ak,Tyler,Zink,Zink T,,,,
498,499,Patrick Maloney,24,78,USA,m0re,Patrick,Maloney,Maloney P,,,,


In [26]:
unmatched_sql

Unnamed: 0,player_id,name,last_name,initial,key
11,12,Kubler J.,Kubler,J,Kubler J
19,20,Van De Zandschulp B.,Van,B,Van B
21,22,Broady L.,Broady,L,Broady L
23,24,Struff J.L.,Struff,J.L,Struff J.L
24,25,Bautista Agut R.,Bautista,R,Bautista R
25,26,Huesler M.A.,Huesler,M.A,Huesler M.A
32,33,O Connell C.,O,C,O C
42,43,Van Assche L.,Van,L,Van L
47,48,Carballes Baena R.,Carballes,R,Carballes R
54,55,De Jong J.,De,J,De J


In [28]:
# Fuzz
from rapidfuzz import fuzz, process

In [64]:
# Create a mapping table
mapping = []

for csv_key in csv_data['key']:
    match = process.extractOne(
        csv_key, 
        sql_data['key'], 
        scorer=fuzz.ratio
    )
    if match and match[1] > 50:  # 50% similarity threshold
        mapping.append({
            'csv_name': csv_key,
            'sql_name': match[0],
            'similarity': match[1]
        })

mapping_df = pd.DataFrame(mapping)


In [65]:
# Find unmatched names
unmatched_csv = csv_data[~csv_data['key'].isin(mapping_df['csv_name'])]
unmatched_sql = sql_data[~sql_data['key'].isin(mapping_df['sql_name'])]



In [66]:
unmatched_sql

Unnamed: 0,player_id,name,last_name,initial,key
32,33,O Connell C.,O,C,O C
42,43,Van Assche L.,Van,L,Van L
101,102,Zapata Miralles B.,Zapata,B,Zapata B
115,116,Moreno De Alboran N.,Moreno,N,Moreno N
121,122,Murray A.,Murray,A,Murray A
141,142,Thiem D.,Thiem,D,Thiem D
159,160,Mpetshi G.,Mpetshi,G,Mpetshi G
189,190,Tseng C. H.,Tseng,H,Tseng H
221,222,Valkusz M.,Valkusz,M,Valkusz M
225,226,Sandgren T.,Sandgren,T,Sandgren T


In [67]:
unmatched_csv

Unnamed: 0,rank,name,age,points,nationality,atp_code,first_name,last_name,key
64,65,Christopher O'Connell,30,795,AUS,o483,Christopher,O'Connell,O'Connell C
79,80,Botic van de Zandschulp,29,712,NED,v812,Botic,Zandschulp,Zandschulp B
127,128,Luca Van Assche,20,471,FRA,v0dz,Luca,Assche,Assche L
135,136,Jerome Kym,21,451,SUI,k0ep,Jerome,Kym,Kym J
151,152,Nishesh Basavareddy,19,390,USA,b0nn,Nishesh,Basavareddy,Basavareddy N
156,157,Felipe Meligeni Alves,26,377,BRA,mw75,Felipe,Alves,Alves F
187,188,Valentin Royer,23,313,FRA,r0eb,Valentin,Royer,Royer V
192,193,Sho Shimabukuro,27,297,JPN,sy67,Sho,Shimabukuro,Shimabukuro S
214,215,Nikoloz Basilashvili,32,269,GEO,bg23,Nikoloz,Basilashvili,Basilashvili N
241,242,Gonzalo Oliveira,29,235,VEN,o482,Gonzalo,Oliveira,Oliveira G
