In [None]:
# See: https://github.com/maladeep/Name-Matching-In-Python/blob/master/Surprisingly%20Effective%20Way%20To%20Name%20Matching%20In%20Python.ipynb

In [123]:
import pandas as pd 
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from dynaconf import LazySettings
from dynaconf.utils.boxing import DynaBox
from scipy.sparse import csr_matrix
from typing import List
import sparse_dot_topn.sparse_dot_topn as ct  #Cosine Similarity
import time
#pd.set_option('display.max_colwidth', -1)

In [124]:
config_file = "/home/tiziano/workspaces/fantasAi_football/config/conf.yaml"
config_mode = 'default'

In [143]:
def data_ingestion_basics(params: DynaBox, table: str, duplicate_key_err:str='raise') -> pd.DataFrame:
    """Performs some basic harmonization steps on a input dataframe.

    Note that after the basic harmonization the columns will have the names
    stated in settings.COLS, and not those appearing in the input data.

    Args:
        table (pd.DataFrame): id of the table to lead, it must appear in
            config file
        duplicate_key_err ('raise' or 'drop'): how to handle duplicated key
        errors. If 'raise' an error will be return in case of duplicated keys,
        if 'drop' all duplicated rows will be dropped from data.

    Raises:
        ValueError: If table registry dataframe keys are missing
        KeyError: If table registry dataframe keys are duplicated

    Returns:
        pd.DataFrame: dataframe in input, harmonized
    """
    # Load parameters
    t_par = params[table]
    t_cols_dict = t_par["COLS"].to_dict()
    keys: List[str] = [t_par["COLS"][c] for c in t_par.KEY]
    cols: List[str] = list(t_cols_dict.values())
    dtype_map = {
        c_name: params["FEATURES"]["DTYPES"][c]
        for c, c_name in t_cols_dict.items()
    }
    name_conversion = {
        c_from: params["FEATURES"][c] for c, c_from in t_cols_dict.items()
    }
    
    path = os.path.join(
        params["PATHS"]["ROOT_FOLDER"], 
        params["PATHS"]["INPUT"]["FOLDER"],
         params["PATHS"]["INPUT"][table]
    )
    data: pd.DataFrame = pd.read_csv(path)

    # Keep only selected columns
    data = data[cols]

    # Ugly but functional call to ensure correct type conversion
    data = data.convert_dtypes().astype(dtype_map, errors='ignore').convert_dtypes()  # type: ignore

    # Ensure that the product registry dataframe keys are valid
    if not data[keys].notna().all(axis=1).all():
        raise ValueError(f"{table} dataframe keys are missing")
    if not data.value_counts(keys).eq(1).all():
        if duplicate_key_err == 'raise':
            raise KeyError(f"{table} keys are duplicated")
        else:
            print(f"WARN: dupliated keys will be removed from {table}")
            data = data.drop_duplicates(subset=keys, keep=False)

    # Sort product product registry by DIVISION,PRODUCT and reset index
    data = data.sort_values(keys).reset_index(drop=True)

    data = data.rename(columns=name_conversion)

    if "FILTER" in params:
        if table in params["FILTER"]:
            for column, values in params["FILTER"].get(table).to_dict().items():
                column_name = params["FEATURES"].get(column)
                data = data.loc[data[column_name].isin(values)]

    return data

In [144]:
params = LazySettings(settings_files=[config_file])
params = params[config_mode]

In [145]:
players = data_ingestion_basics(params, "PLAYERS")
votes_ita = data_ingestion_basics(params, "VOTES_ITA", duplicate_key_err='drop')

players['date_of_birth'] = pd.to_datetime(players['date_of_birth'], errors='coerce')



WARN: dupliated keys will be removed from VOTES_ITA


In [146]:
def ngrams(string, n=3):
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [184]:
names = pd.concat(
    [players[params["FEATURES"]["NAME"]], 
    votes_ita[params["FEATURES"]["PIANETAFANTA_NAME"]]],
    ignore_index=True
    )
names = names.str.lower()

In [185]:

# After having each words split (token or  lemmas (n-gram generated items) ) into a vector and
# Scikit-learn’s  Tfidfvectorizer aim to do the same thing, which is to convert a collection of raw documents to a matrix of TF-IDF features. 
# Generate the matrix of TF-IDF (Term Frequency-Inverse Document frequency)values for each
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
vectorizer = vectorizer.fit(names)
tm_names = vectorizer.transform(players[params["FEATURES"]["NAME"]]) 
pf_names = vectorizer.transform(votes_ita[params["FEATURES"]["PIANETAFANTA_NAME"]]) 

In [186]:
# View sparse CSR matrix.
print(pf_names[0])

  (0, 7193)	0.7498043525871992
  (0, 597)	0.6616596049641319


In [191]:
# calculate the similarity between two vectors of TF-IDF values the Cosine Similarity is usually used.
# result matrix in a very sparse terms and Scikit-learn deals with this nicely by returning a sparse CSR matrix.
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))

In [195]:
#  Run the optimized cosine similarity function. 
#  Only stores the top 10 most similar items with a similarity above 0.8

t1 = time.time()
matches = awesome_cossim_top(
    tm_names, 
    pf_names.transpose(),
    10,
    0.8
    )
t = time.time() - t1
print("SELFTIMED:", t)

SELFTIMED: 0.009179353713989258


In [179]:
# unpacks the resulting sparse matrix

def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                          'right_side': right_side,
                           'similairity': similairity})

In [180]:
# store the  matches into new dataframe called matched_df and printing 10 samples

matches_df = get_matches_df(matches, names, top=200)
matches_df = matches_df[matches_df['similairity'] < 0.99999] # For removing all exact matches


IndexError: index 67 is out of bounds for axis 0 with size 67

In [115]:
matches_df

Unnamed: 0,left_side,right_side,similairity
38,Douglas,Douglao,0.848212
88,Mario Fernandes,Mario Fernandez,0.88237
102,Wellington,Wellington Nem,0.819218
103,Wellington,Wellington Luis,0.814134
161,Lyle Taylor,Kyle Taylor,0.82082
165,Andre,Andre Andre,0.849399
178,Chris Maxwell,Maxwell,0.812869


In [116]:
matches_df.sort_values(['similairity'], ascending=False).head(10)

Unnamed: 0,left_side,right_side,similairity
88,Mario Fernandes,Mario Fernandez,0.88237
165,Andre,Andre Andre,0.849399
38,Douglas,Douglao,0.848212
161,Lyle Taylor,Kyle Taylor,0.82082
102,Wellington,Wellington Nem,0.819218
103,Wellington,Wellington Luis,0.814134
178,Chris Maxwell,Maxwell,0.812869
