In [45]:
# See: https://github.com/maladeep/Name-Matching-In-Python/blob/master/Surprisingly%20Effective%20Way%20To%20Name%20Matching%20In%20Python.ipynb


In [17]:
import pandas as pd
import re
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
from dynaconf import LazySettings
from dynaconf.utils.boxing import DynaBox
from scipy.sparse import csr_matrix
from typing import List
import sparse_dot_topn.sparse_dot_topn as ct  # Cosine Similarity
import os

# pd.set_option('display.max_colwidth', -1)


In [18]:
config_file = "/home/tiziano/workspaces/fantasAi_football/config/conf.yaml"
config_mode = "default"


In [19]:
params = LazySettings(settings_files=[config_file])
params = params[config_mode]


In [20]:
tm_path = os.path.join(
    params["PATHS"]["ROOT_FOLDER"],
    f"{params['PATHS']['STAGES']['TM_DATASET']}.pkl",
)
tm_dataset = pd.read_pickle(tm_path)


votes_ita_path = os.path.join(
    params["PATHS"]["ROOT_FOLDER"],
    f"{params['PATHS']['STAGES']['VOTES_ITA']}.pkl",
)
votes_ita = pd.read_pickle(votes_ita_path)


In [21]:
def ngrams(string, n=3):
    string = re.sub(r"[,-./]|\sBD", r"", string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return ["".join(ngram) for ngram in ngrams]


In [61]:
votes_players = votes_ita[
    [
        params["FEATURES"]["PIANETAFANTA_NAME"],
        params["FEATURES"]["PIANETAFANTA_TEAM"],
    ]
].copy()
votes_players["_name_and_club"] = (
    votes_players[params["FEATURES"]["PIANETAFANTA_NAME"]]
    + " "
    + votes_players[params["FEATURES"]["PIANETAFANTA_TEAM"]]
)
votes_players = votes_players.drop_duplicates(
    subset=[params["FEATURES"]["PIANETAFANTA_NAME"]]
).reset_index(drop=False)


In [62]:
tm_players = tm_dataset.loc[
    tm_dataset[params["FEATURES"]["COMPETITION"]]
    == params["SETTINGS"]["ITALIAN_FANTA_COMPETITION"]
]
tm_players = tm_players[
    [
        params["FEATURES"]["PLAYER"],
        params["FEATURES"]["PRETTY_NAME"],
        params["FEATURES"]["CLUB_PRETTY_NAME"],
    ]
].copy()
tm_players["_name_and_club"] = (
    tm_players[params["FEATURES"]["PRETTY_NAME"]]
    + " "
    + tm_players[params["FEATURES"]["CLUB_PRETTY_NAME"]]
)
tm_players = tm_players.drop_duplicates(
    subset=[params["FEATURES"]["PRETTY_NAME"]]
).reset_index(drop=False)


In [74]:
tm_names = tm_players[params["FEATURES"]["PRETTY_NAME"]]
tm_names = tm_names.str.lower()
votes_names = votes_players[params["FEATURES"]["PIANETAFANTA_NAME"]]
votes_names = tm_names.str.lower()

names = pd.concat([tm_names, votes_names], ignore_index=True)
names = names.apply(unidecode)


In [75]:
# After having each words split (token or  lemmas (n-gram generated items) ) into a vector and
# Scikit-learn’s  Tfidfvectorizer aim to do the same thing, which is to convert a collection of raw documents to a matrix of TF-IDF features.
# Generate the matrix of TF-IDF (Term Frequency-Inverse Document frequency)values for each
# vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
vectorizer = TfidfVectorizer(min_df=1, analyzer="word")
vectorizer = vectorizer.fit(names)

tm_names_t = vectorizer.transform(tm_names)
votes_names_t = vectorizer.transform(votes_names)
test_names_t = vectorizer.transform(["ronaldo", "mancini"])


In [76]:
cosine_similarities = linear_kernel(tm_names_t, votes_names_t)


In [77]:
cosine_similarities.shape


(1485, 1485)

In [78]:
# Find the top 5 related documents
related_docs_indices = cosine_similarities.argsort()[1, -5:-1]


In [79]:
related_docs_indices.shape

array([1276,  184,   36,   76])

In [82]:
tm_players.loc[related_docs_indices[:]]['pretty_name']

1276      Luis Binks
184      Luis Muriel
36      Luis Alberto
76             Pedro
Name: pretty_name, dtype: string