In [2]:
# create a big dataset for similarity search
import numpy as np
import pandas as pd
import os
import faiss

In [3]:
dir_path = "..\\tennis_atp"

In [4]:
# create matches df
matches_df = pd.read_csv(os.path.join(dir_path, "atp_matches_2001.csv"))

for year in range(2002, 2025):
    matches_df = pd.concat([matches_df, pd.read_csv(os.path.join(dir_path, f"atp_matches_{year}.csv") )])

In [5]:
matches_df.columns

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points'],
      dtype='object')

In [6]:
new_column_names = {
    'tourney_id': 'Tournament ID',
    'tourney_name': 'Tournament Name',
    'surface': 'Surface',
    'draw_size': 'Draw Size',
    'tourney_level': 'Tournament Level',
    'tourney_date': 'Tournament Date',
    'match_num': 'Match Number',
    'winner_id': 'Winner ID',
    'winner_seed': 'Winner Seed',
    'winner_entry': 'Winner Entry',
    'winner_name': 'Winner Name',
    'winner_hand': 'Winner Hand',
    'winner_ht': 'Winner Height',
    'winner_ioc': 'Winner Country',
    'winner_age': 'Winner Age',
    'loser_id': 'Loser ID',
    'loser_seed': 'Loser Seed',
    'loser_entry': 'Loser Entry',
    'loser_name': 'Loser Name',
    'loser_hand': 'Loser Hand',
    'loser_ht': 'Loser Height',
    'loser_ioc': 'Loser Country',
    'loser_age': 'Loser Age',
    'score': 'Score',
    'best_of': 'Best of',
    'round': 'Round',
    'minutes': 'Minutes',
    'w_ace': 'Winner Aces',
    'w_df': 'Winner Double Faults',
    'w_svpt': 'Winner Serve Points',
    'w_1stIn': 'Winner First Serves In',
    'w_1stWon': 'Winner First Serves Won',
    'w_2ndWon': 'Winner Second Serves Won',
    'w_SvGms': 'Winner Serve Games',
    'w_bpSaved': 'Winner Break Points Saved',
    'w_bpFaced': 'Winner Break Points Faced',
    'l_ace': 'Loser Aces',
    'l_df': 'Loser Double Faults',
    'l_svpt': 'Loser Serve Points',
    'l_1stIn': 'Loser First Serves In',
    'l_1stWon': 'Loser First Serves Won',
    'l_2ndWon': 'Loser Second Serves Won',
    'l_SvGms': 'Loser Serve Games',
    'l_bpSaved': 'Loser Break Points Saved',
    'l_bpFaced': 'Loser Break Points Faced',
    'winner_rank': 'Winner Rank',
    'winner_rank_points': 'Winner Rank Points',
    'loser_rank': 'Loser Rank',
    'loser_rank_points': 'Loser Rank Points'
}

matches_df.rename(columns=new_column_names, inplace=True)

In [7]:
matches_df.dropna(subset=["Winner Name", "Loser Name", "Score", "Tournament Date"], inplace=True)
matches_df.drop(columns=["Tournament Level", "Winner Hand", "Loser Hand", "Loser Rank Points",
                         "Winner Rank Points", "Winner Country", "Loser Country", "Draw Size", "Tournament ID",
                         "Winner Entry", "Loser Entry", "Match Number", "Best of"
                         ,"Winner Seed", "Loser Seed"], inplace=True)

In [8]:
rankings_df = pd.read_csv(os.path.join(dir_path, "atp_rankings_00s.csv"))

for year in range(0, 3):
    rankings_df = pd.concat([rankings_df, pd.read_csv(os.path.join(dir_path, f"atp_rankings_{year}0s.csv"))])
rankings_df = pd.concat([rankings_df, pd.read_csv(os.path.join(dir_path, "atp_rankings_current.csv"))])

In [9]:
rankings_df.columns

Index(['ranking_date', 'rank', 'player', 'points'], dtype='object')

In [10]:
players_df = pd.read_csv(os.path.join(dir_path, "atp_players.csv"))
players_df['full_name'] = players_df['name_first'] + ' ' + players_df['name_last']
players_df.drop(columns=["name_first", "name_last"], inplace=True)

In [11]:
rankings_df = pd.merge(rankings_df, players_df, left_on='player', right_on="player_id", how='left')

In [11]:
rankings_df.head()

Unnamed: 0,ranking_date,rank,player,points,player_id,hand,dob,ioc,height,wikidata_id,full_name
0,20000110,1,101736,4135.0,101736,R,19700429.0,USA,180.0,Q7407,Andre Agassi
1,20000110,2,102338,2915.0,102338,R,19740218.0,RUS,190.0,Q207705,Yevgeny Kafelnikov
2,20000110,3,101948,2419.0,101948,R,19710812.0,USA,185.0,Q9446,Pete Sampras
3,20000110,4,103017,2184.0,103017,R,19770705.0,GER,183.0,Q76778,Nicolas Kiefer
4,20000110,5,102856,2169.0,102856,R,19760910.0,BRA,190.0,Q190723,Gustavo Kuerten


In [12]:
rankings_df.drop(columns=["player", "wikidata_id"], inplace=True)

In [13]:
def format_date(date):
    date = str(date)
    date = date[:4] + '-' + date[4:6] + '-' + date[6:]
    return date

rankings_df["ranking_date"] = rankings_df["ranking_date"].apply(lambda x: format_date(x))

In [14]:
rankings_df.head()

Unnamed: 0,ranking_date,rank,points,player_id,hand,dob,ioc,height,full_name
0,2000-01-10,1,4135.0,101736,R,19700429.0,USA,180.0,Andre Agassi
1,2000-01-10,2,2915.0,102338,R,19740218.0,RUS,190.0,Yevgeny Kafelnikov
2,2000-01-10,3,2419.0,101948,R,19710812.0,USA,185.0,Pete Sampras
3,2000-01-10,4,2184.0,103017,R,19770705.0,GER,183.0,Nicolas Kiefer
4,2000-01-10,5,2169.0,102856,R,19760910.0,BRA,190.0,Gustavo Kuerten


In [15]:
matches_df.head()

Unnamed: 0,Tournament Name,Surface,Tournament Date,Winner ID,Winner Name,Winner Height,Winner Age,Loser ID,Loser Name,Loser Height,...,Loser Double Faults,Loser Serve Points,Loser First Serves In,Loser First Serves Won,Loser Second Serves Won,Loser Serve Games,Loser Break Points Saved,Loser Break Points Faced,Winner Rank,Loser Rank
0,Auckland,Hard,20010108,102905,Stefan Koubek,175.0,24.0,102644,Franco Squillari,183.0,...,5.0,66.0,31.0,19.0,15.0,10.0,4.0,8.0,52.0,14.0
1,Auckland,Hard,20010108,102231,Alex Calatrava,190.0,27.5,102860,Agustin Calleri,183.0,...,3.0,63.0,37.0,33.0,11.0,12.0,1.0,3.0,53.0,64.0
2,Auckland,Hard,20010108,102257,Greg Rusedski,193.0,27.3,103066,Mark Nielsen,185.0,...,5.0,72.0,45.0,31.0,12.0,10.0,4.0,6.0,73.0,260.0
3,Auckland,Hard,20010108,101647,Byron Black,175.0,31.2,102287,Albert Portas,188.0,...,0.0,69.0,35.0,31.0,11.0,13.0,3.0,7.0,35.0,49.0
4,Auckland,Hard,20010108,102998,Jan Michael Gambill,190.0,23.6,103454,Nicolas Massu,183.0,...,6.0,81.0,40.0,27.0,15.0,11.0,5.0,9.0,33.0,67.0


In [14]:
matches_df.columns

Index(['Tournament Name', 'Surface', 'Tournament Date', 'Winner ID',
       'Winner Name', 'Winner Height', 'Winner Age', 'Loser ID', 'Loser Name',
       'Loser Height', 'Loser Age', 'Score', 'Round', 'Minutes', 'Winner Aces',
       'Winner Double Faults', 'Winner Serve Points', 'Winner First Serves In',
       'Winner First Serves Won', 'Winner Second Serves Won',
       'Winner Serve Games', 'Winner Break Points Saved',
       'Winner Break Points Faced', 'Loser Aces', 'Loser Double Faults',
       'Loser Serve Points', 'Loser First Serves In', 'Loser First Serves Won',
       'Loser Second Serves Won', 'Loser Serve Games',
       'Loser Break Points Saved', 'Loser Break Points Faced', 'Winner Rank',
       'Loser Rank'],
      dtype='object')

In [17]:
columns_with_nan = matches_df.columns[matches_df.isnull().any()].tolist()
print(columns_with_nan)

['Surface', 'Winner Height', 'Winner Age', 'Loser Height', 'Loser Age', 'Minutes', 'Winner Aces', 'Winner Double Faults', 'Winner Serve Points', 'Winner First Serves In', 'Winner First Serves Won', 'Winner Second Serves Won', 'Winner Serve Games', 'Winner Break Points Saved', 'Winner Break Points Faced', 'Loser Aces', 'Loser Double Faults', 'Loser Serve Points', 'Loser First Serves In', 'Loser First Serves Won', 'Loser Second Serves Won', 'Loser Serve Games', 'Loser Break Points Saved', 'Loser Break Points Faced', 'Winner Rank', 'Loser Rank']


In [18]:
# MAP NAMES TO UNIQUE PLAYER EMBEDDINGS

In [19]:
# NAMED ENTITY RECOGNITION - STANZA 

mat

In [15]:
# TURN THE MATCH DATA INTO SENTENCES FOR THE SIMILARITY SEARCH 

def turn_features_into_sentence(row):
    sentence = (f"{row['Winner Name']} won against {row['Loser Name']} on {row['Surface']} in the {row['Tournament Name']} on {row['Tournament Date']} with a score of {row['Score']}. {row['Winner Name']} served {row['Winner Aces']} aces and committed {row['Winner Double Faults']} double faults, winning {row['Winner Serve Points']} serve points and saving {row['Winner Break Points Saved']} out of {row['Winner Break Points Faced']} break points. {row['Loser Name']} served {row['Loser Aces']} aces and committed {row['Loser Double Faults']} double faults. {row['Winner Name']} ranked {row['Winner Rank']} defeated {row['Loser Name']}, ranked {row['Loser Rank']}.")
    return sentence

matches_df["Sentence"] = matches_df.apply(lambda x: turn_features_into_sentence(x), axis=1)
    

In [20]:
from sklearn.preprocessing import OneHotEncoder

categorical_columns = ['Tournament Name', 'Surface', 'Score', "Winner Name", "Loser Name"]
encoder = OneHotEncoder(sparse_output=False)
matches_df[categorical_columns] = matches_df[categorical_columns].fillna('Missing')
encoded_categorical = encoder.fit_transform(matches_df[categorical_columns])

In [21]:
from sklearn.preprocessing import StandardScaler

numerical_columns = ['Winner Height', 'Loser Height', 'Winner Age', 'Loser Age', 'Minutes',
                     'Winner Aces', 'Loser Aces', 'Winner Double Faults', 'Loser Double Faults',
                     'Winner Serve Points', 'Loser Serve Points', 'Winner Rank', 'Loser Rank']
matches_df[numerical_columns] = matches_df[numerical_columns].fillna(matches_df[numerical_columns].mean())
scaler = StandardScaler()
scaled_numerical = scaler.fit_transform(matches_df[numerical_columns])

In [22]:
combined_features = np.hstack((scaled_numerical, encoded_categorical)) #stacking features horizontally

In [23]:
combined_features

array([[-1.67083988, -0.40855757, -0.58966212, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.55892142, -0.40855757,  0.28909893, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.00487367, -0.10556441,  0.23888401, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        , -0.40855757,  0.23888401, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.13845418, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , -0.40855757, -0.79052179, ...,  0.        ,
         0.        ,  0.        ]])

In [24]:
dimension = combined_features.shape[1]
print("dimensionality of the index: ", dimension)
combined_features_f32 = combined_features.astype("float32")
combined_features_f32.dtype

dimensionality of the index:  17786


dtype('float32')

In [19]:
from transformers import BertModel, BertTokenizer
import torch

model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
def get_embeddings(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy().astype("float32") # sentence embedding by aggregating token level embeddings

In [43]:
import torch
from torch.utils.data import DataLoader

def get_batch_embeddings(sentences, batch_size=64, max_length=128, device='cpu'):
    model.to(device)
    dataloader = DataLoader(sentences, batch_size=batch_size)
    embeddings = []
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    for batch in dataloader:
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy().astype("float32")
        embeddings.extend(batch_embeddings)
    
    return np.vstack(embeddings)

In [25]:
argument = "of"
argument_embedding = get_embeddings(argument) # 1d numpy array

In [16]:
index = faiss.IndexFlatL2(768)

In [31]:
sentences = matches_df["Sentence"].tolist()

In [45]:
resulting_embeddings = get_batch_embeddings(sentences)
np.save('embeddings.npy', resulting_embeddings)

In [46]:
index.add(resulting_embeddings)

In [28]:
from sklearn.decomposition import PCA

pca = PCA(n_components=embedding_dimension)
pca.fit(combined_features_f32)
reduced_combined_features_f32 = pca.transform(combined_features_f32) # well actually kinda increased, i am getting very confused

MemoryError: Unable to allocate 4.63 GiB for an array with shape (69869, 17786) and data type float32

In [None]:
# and the vector database is built on RAM
index.add(reduced_combined_features_f32)

In [47]:
# Finally the similarity search
D, I = index.search(argument_embedding.reshape(1, -1).astype('float32'), k=10) # -1 basically substitues 768
print("Indices of similar rows:", I)

: 

In [None]:
matches_df.iloc[I[0]]