# Votes Matching
Matches the transfermarkt dataset with italian votes manually downloaded from "PianetaFanta" website.

In [1]:
# See: https://github.com/maladeep/Name-Matching-In-Python/blob/master/Surprisingly%20Effective%20Way%20To%20Name%20Matching%20In%20Python.ipynb

In [2]:
import pandas as pd
import re
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
from dynaconf import LazySettings
from dynaconf.utils.boxing import DynaBox
from scipy.sparse import csr_matrix
from typing import List
import sparse_dot_topn.sparse_dot_topn as ct  # Cosine Similarity
import os

# pd.set_option('display.max_colwidth', -1)


In [3]:
config_file = "/home/tiziano/workspaces/fantasAi_football/config/conf.yaml"
config_mode = "default"


In [4]:
params = LazySettings(settings_files=[config_file])
params = params[config_mode]


Read serialized datatets to merge

In [5]:
# Transfermarkt dataset
tm_path = os.path.join(
    params["PATHS"]["ROOT_FOLDER"],
    f"{params['PATHS']['STAGES']['TM_DATASET']}.pkl",
)
tm_dataset = pd.read_pickle(tm_path)

# PianetaFAnta data with votes
votes_ita_path = os.path.join(
    params["PATHS"]["ROOT_FOLDER"],
    f"{params['PATHS']['STAGES']['VOTES_ITA']}.pkl",
)
votes_ita = pd.read_pickle(votes_ita_path)


In [6]:
def ngrams(string: str, n: int=3) -> List[str]:
    """Splits the function n-grams.

    args:
    - string (str): the string to split in ngrams
    - n (int): number "n" of characters 

    returns (List[str]) the list of n-grams
    """
    string = re.sub(r"[,-./]|\sBD", r"", string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return ["".join(ngram) for ngram in ngrams]

Define series with the names

In [7]:
votes_players = votes_ita.drop_duplicates(
    subset=[params["FEATURES"]["PIANETAFANTA_NAME"]]
).reset_index(drop=False)

tm_players = tm_dataset.drop_duplicates(
    subset=[params["FEATURES"]["PRETTY_NAME"]]
).reset_index(drop=False)

def simplify(string_series: pd.Series) -> pd.Series:
    """Simplifies the string in a Series removing all the special characters."""
    string_series = string_series.str.lower()
    string_series = string_series.apply(unidecode)
    string_series = string_series.replace(r'[^\w\s]|_', '', regex=True)

    return string_series

tm_names = tm_players[params["FEATURES"]["PRETTY_NAME"]]
tm_names = simplify(tm_names)

votes_names = votes_players[params["FEATURES"]["PIANETAFANTA_NAME"]]
votes_names = simplify(votes_names)

names = pd.concat([tm_names, votes_names], ignore_index=True)


In [8]:
# After having each words split (token or  lemmas (n-gram generated items) )
# into a vector and Scikit-learn’s  Tfidfvectorizer aim to do the same thing, 
# which is to convert a collection of raw documents to a matrix of 
# TF-IDF features. Generate the matrix of TF-IDF (Term Frequency-Inverse 
# Document frequency)values for each 
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
vectorizer = vectorizer.fit(names)

tm_names_t = vectorizer.transform(tm_names)
votes_names_t = vectorizer.transform(votes_names)

In [13]:
# Calcuate the similarities between the encoded names
cosine_similarities = linear_kernel(tm_names_t, votes_names_t)

# Find the best match
related_docs_indices = cosine_similarities.argmax(axis=1)

# Get the matched players
matched = votes_players.loc[related_docs_indices].reset_index(drop=True)

# Concat the transfermarkt players with the matched pianetafanta names
players_matched = pd.concat([tm_players, matched], axis=1)

# Define the trustworty mathing as the ones that have a minimum distance from 
# best match
trustworty = cosine_similarities.max(axis=1) > 0.4
players_matched["_is_reliable"] = trustworty

In [14]:
players_matched = players_matched[[params["FEATURES"]["PLAYER"], params["FEATURES"]["PIANETAFANTA_NAME"], "_is_reliable"]]

In [15]:
players_matched

Unnamed: 0,player_id,pt_name,_is_reliable
0,10,KLOSE,True
1,102192,CAVANDA,True
2,111196,DE VRIJ S.,True
3,12149,CANA,True
4,12906,CIANI,True
...,...,...,...
15825,56324,BASHA,True
15826,584548,MACHIS D.,False
15827,597931,GIORGI,False
15828,423960,PLASIL,False


In [16]:
tm_dataset_j = tm_dataset.merge(players_matched, on=[params["FEATURES"]["PLAYER"]], how='left')
tm_dataset_j = tm_dataset_j.merge(votes_ita, on=[params["FEATURES"]["PLAYER"]], how='left')
# tm_dataset_j = tm_dataset_j.merge(votes_ita, on=[params["FEATURES"]])
# tm_dataset_j = tm_dataset_j.loc[
#     tm_dataset_j[params["FEATURES"]["COMPETITION"]]
#     == params["SETTINGS"]["ITALIAN_FANTA_COMPETITION"]
# ]

In [None]:
votes_ita[params["FEATURES"]["PIANETAFANTA_TEAM"]].unique()

<StringArray>
[  'piacenza',      'milan',      'inter',      'parma',     'empoli',
    'udinese',     'modena',       'roma',       'como',     'chievo',
    'bologna',    'perugia',    'brescia',      'lazio',   'juventus',
     'torino',    'reggina',   'atalanta',      'lecce',     'ancona',
  'sampdoria',      'siena',   'cagliari',    'palermo',    'livorno',
    'messina', 'fiorentina',    'treviso',     'ascoli',    'catania',
      'genoa',     'napoli',       'bari',     'cesena',     'novara',
    'pescara',   'sassuolo',     'verona',  'frosinone',      'carpi',
    'crotone',  'benevento',       'spal',     'spezia']
Length: 44, dtype: string

In [24]:
tm_dataset[params["FEATURES"]["SEASON"]].unique()

<IntegerArray>
[2014, 2015, 2017, 2019, 2020, 2016, 2018, 2021, 2013]
Length: 9, dtype: Int64

In [22]:
tm_dataset.loc[tm_dataset[params["FEATURES"]["COMPETITION"]].str.contains("IT")][params["FEATURES"]["CLUB_PRETTY_NAME"]].unique()

<StringArray>
[           'Lazio Rom',      'Cagliari Calcio',      'Sampdoria Genua',
           'Ssc Neapel',            'Genua Cfc',             'Fc Turin',
        'Hellas Verona',    'Parma Calcio 1913',               'As Rom',
           'Ac Florenz',        'Chievo Verona',       'Udinese Calcio',
        'Inter Mailand',            'Fc Empoli',           'Palermo Fc',
           'Ac Mailand',       'Juventus Turin',          'Us Sassuolo',
     'Atalanta Bergamo',            'Cesena Fc',           'Fc Bologna',
        'Carpi Fc 1909',     'Frosinone Calcio',       'Brescia Calcio',
             'Us Lecce',                 'Spal',           'Fc Crotone',
     'Benevento Calcio',        'Spezia Calcio', 'Delfino Pescara 1936',
       'Acn Siena 1904',  'Us Salernitana 1919',           'Venezia Fc',
           'As Livorno']
Length: 34, dtype: string

In [None]:
tm_dataset_j.sort_values(by=['market_value'], ascending=False).head(10)

Unnamed: 0,player_id,name,pretty_name,country_of_citizenship,date_of_birth,position,sub_position,foot,height_in_cm,season,...,minutes_played,market_value,market_value_delta,_club_value,club_value_ratio,market_value_ratio,on_field_index,club_pretty_name,pt_name,TRUSTWORTH
3056,206050,paulo-dybala,Paulo Dybala,Argentina,15-11-93,Attack,attack - Second Striker,Left,177,2018,...,2137,99000000,0.409091,694800000,7.404881,31.428571,0.624854,Juventus Turin,DYBALA P.,True
3072,8198,cristiano-ronaldo,Cristiano Ronaldo,Portugal,05-02-85,Attack,attack - Centre-Forward,Right,187,2018,...,2689,90000000,0.0,694800000,7.404881,28.571429,0.786257,Juventus Turin,RONALDO,True
2442,68863,mauro-icardi,Mauro Icardi,Argentina,19-02-93,Attack,attack - Centre-Forward,Right,181,2018,...,2269,85500000,0.473684,494370000,5.268784,27.142857,0.66345,Inter Mailand,ICARDI M.,True
166,266302,sergej-milinković-savić,Sergej Milinković Savić,Serbia,27-02-95,Midfield,midfield - Central Midfield,Right,191,2018,...,2405,81000000,0.755556,293760000,3.130768,25.714286,0.703216,Lazio Rom,MILINKOVIC S,True
2975,8198,cristiano-ronaldo,Cristiano Ronaldo,Portugal,05-02-85,Attack,attack - Centre-Forward,Right,187,2019,...,2919,81000000,-0.111111,704250000,4.489357,18.0,0.876577,Juventus Turin,RONALDO,True
2960,206050,paulo-dybala,Paulo Dybala,Argentina,15-11-93,Attack,attack - Second Striker,Left,177,2019,...,2165,76500000,-0.294118,704250000,4.489357,17.0,0.65015,Juventus Turin,DYBALA P.,True
2376,96341,romelu-lukaku,Romelu Lukaku,Belgium,13-05-93,Attack,attack - Centre-Forward,Left,191,2020,...,2978,76500000,0.117647,612000000,4.862353,21.25,0.975698,Inter Mailand,LUKAKU R.,True
2982,206050,paulo-dybala,Paulo Dybala,Argentina,15-11-93,Attack,attack - Second Striker,Left,177,2020,...,1134,72000000,-0.0625,637178000,5.062392,20.0,0.371538,Juventus Turin,DYBALA P.,True
2461,406625,lautaro-martinez,Lautaro Martinez,Argentina,22-08-97,Attack,attack - Centre-Forward,Right,174,2021,...,1777,72000000,0.125,465300000,3.295606,16.0,0.626808,Inter Mailand,MARTINEZ G.,True
2963,326031,matthijs-de-ligt,Matthijs De Ligt,Netherlands,12-08-99,Defender,Defender - Centre-Back,Right,189,2019,...,2449,67500000,0.466667,704250000,4.489357,15.0,0.735435,Juventus Turin,DE LIGT M.,True


In [None]:
len(tm_dataset_j["TRUSTWORTH"])

4021

In [None]:
len(tm_dataset_j)

4021