# Votes Matching
Matches the Transfermarkt dataset with the Italian votes manually downloaded from the "PianetaFanta" website.

In [1]:
# See: https://github.com/maladeep/Name-Matching-In-Python/blob/master/Surprisingly%20Effective%20Way%20To%20Name%20Matching%20In%20Python.ipynb

In [2]:
import pandas as pd
import re
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
from dynaconf import LazySettings
from dynaconf.utils.boxing import DynaBox
from scipy.sparse import csr_matrix
from typing import List
import os

# pd.set_option('display.max_colwidth', -1)


In [3]:
# Location of the dynaconf YAML file and the name of the section to load.
# Both values can be overridden through environment variables so the notebook
# is not tied to one machine's absolute Windows path; when the variables are
# unset the original hard-coded values are used unchanged.
config_file = os.environ.get(
    "FANTASAI_CONFIG_FILE",
    "C://workspaces//learning//fantasAi_football//config//conf.yaml",
)
config_mode = os.environ.get("FANTASAI_CONFIG_MODE", "default")


In [4]:
# Load the settings file and keep only the section selected by `config_mode`;
# every later cell reads its paths, feature names and thresholds from `params`.
params = LazySettings(settings_files=[config_file])[config_mode]

Read the serialized datasets to merge

In [5]:
# Both inputs are pickled DataFrames stored under the configured root folder;
# the file name of each one is its stage name from the settings plus ".pkl".

# Transfermarkt dataset
tm_path = os.path.join(
    params["PATHS"]["ROOT_FOLDER"],
    "{}.pkl".format(params["PATHS"]["STAGES"]["TM_DATASET"]),
)
tm_dataset = pd.read_pickle(tm_path)

# PianetaFanta data with the votes
votes_ita_path = os.path.join(
    params["PATHS"]["ROOT_FOLDER"],
    "{}.pkl".format(params["PATHS"]["STAGES"]["VOTES_ITA"]),
)
votes_ita = pd.read_pickle(votes_ita_path)

In [6]:
# Rewrite the team names of the votes dataset using the translation table from
# the settings (presumably to the spelling used by transfermarkt - TODO confirm).
# Values that are not keys of the table are left untouched by `replace`.
team_col = params["FEATURES"]["PIANETAFANTA_TEAM"]
team_translator = params["VOTES_ITA"]["TEAM_TRANSLATOR"].to_dict()
votes_ita[team_col] = votes_ita[team_col].replace(team_translator)

In [7]:
def ngrams(string: str, n: int = 3) -> List[str]:
    """Split a string into its overlapping character n-grams.

    Before splitting, every ",", "-", "." and "/" character and every
    occurrence of a whitespace character followed by "BD" is removed.

    args:
    - string (str): the string to split in n-grams
    - n (int): number of characters of each n-gram

    returns (List[str]) the n-grams in order of appearance; the list is empty
    when the cleaned string is shorter than ``n``
    """
    cleaned = re.sub(r"[,-./]|\sBD", r"", string)
    # Zipping the string with itself shifted by 0..n-1 characters yields one
    # tuple of n consecutive characters per starting position.
    shifted = (cleaned[offset:] for offset in range(n))
    return list(map("".join, zip(*shifted)))

Define series with the names

In [8]:
# One row per distinct pianetafanta name. The index is rebuilt as 0..n-1 and
# the previous index is kept as a column.
votes_players = (
    votes_ita
    .drop_duplicates(subset=[params["FEATURES"]["PIANETAFANTA_NAME"]])
    .reset_index()
)

# One row per distinct transfermarkt pretty name, re-indexed the same way.
tm_players = (
    tm_dataset
    .drop_duplicates(subset=[params["FEATURES"]["PRETTY_NAME"]])
    .reset_index()
)

def simplify(string_series: pd.Series) -> pd.Series:
    """Normalise a Series of strings so that spelling variants compare equal.

    Each value is lower-cased, passed through ``unidecode`` and then stripped
    of underscores and of every character that is neither a word character
    nor whitespace.

    args:
    - string_series (pd.Series): the strings to normalise

    returns (pd.Series) the normalised strings, with the same index
    """
    lowered = string_series.str.lower()
    transliterated = lowered.apply(unidecode)
    return transliterated.replace(r'[^\w\s]|_', '', regex=True)

# Normalised player names of each source, in the row order of `tm_players`
# and `votes_players` respectively.
tm_names = simplify(tm_players[params["FEATURES"]["PRETTY_NAME"]])
votes_names = simplify(votes_players[params["FEATURES"]["PIANETAFANTA_NAME"]])

# Union of both name lists under a fresh 0..n-1 index.
names = pd.concat([tm_names, votes_names], ignore_index=True)


In [9]:
# Each name is split into character n-grams by `ngrams`, and scikit-learn's
# TfidfVectorizer turns the collection of names into a matrix of TF-IDF
# (Term Frequency - Inverse Document Frequency) features. The vectorizer is
# fitted on the union of both name lists so that the two sources share one
# vocabulary, then each list is encoded separately.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams).fit(names)

tm_names_t = vectorizer.transform(tm_names)
votes_names_t = vectorizer.transform(votes_names)

In [10]:
# Similarity between every transfermarkt name (rows) and every pianetafanta
# name (columns), computed as the dot product of their TF-IDF vectors.
cosine_similarities = linear_kernel(tm_names_t, votes_names_t)

# For each transfermarkt name, the column position of its best match.
related_docs_indices = cosine_similarities.argmax(axis=1)

# `argmax` returns positions, so the matched rows are looked up positionally.
matched = votes_players.iloc[related_docs_indices].reset_index(drop=True)

player_col = params["FEATURES"]["PLAYER"]
pt_name_col = params["FEATURES"]["PIANETAFANTA_NAME"]
competition_col = params["FEATURES"]["COMPETITION"]

# Put each transfermarkt player side by side with the matched pianetafanta
# name; both frames carry a 0..n-1 index, so rows align one to one.
players_matched = pd.concat(
    [tm_players[[player_col]], matched[[pt_name_col]]], axis=1
)

# A match is considered trustworthy only when the similarity of the best match
# exceeds a minimum value.
trustworthy = cosine_similarities.max(axis=1) > 0.4

# The flag has one entry per row of `tm_players`, so it must be attached
# before duplicated players are dropped: assigning it afterwards raises a
# length-mismatch error as soon as one duplicate is removed.
players_matched["_is_reliable"] = trustworthy
players_matched = players_matched.drop_duplicates(subset=[player_col])
players_matched[competition_col] = params["SETTINGS"]["ITALIAN_FANTA_COMPETITION"]

# Attach the pianetafanta name to the transfermarkt dataset. The join also
# uses the competition column, so only rows of the Italian fantasy competition
# receive a name.
tm_dataset_with_pt_names = tm_dataset.merge(
    players_matched, on=[player_col, competition_col], how='left'
)

In [11]:
# Override the automatic match for the players listed in the settings: every
# row whose transfermarkt pretty name equals a key of MANUAL_FIXES gets the
# corresponding pianetafanta name.
for tm_name, pt_name in params["MANUAL_FIXES"].items():
    rows_to_fix = (
        tm_dataset_with_pt_names[params["FEATURES"]["PRETTY_NAME"]] == tm_name
    )
    tm_dataset_with_pt_names.loc[
        rows_to_fix, params["FEATURES"]["PIANETAFANTA_NAME"]
    ] = pt_name

In [12]:
# Give the votes dataset the same join columns as the transfermarkt dataset:
# the team column is renamed to the club column and a constant competition
# column is added.
votes_ita_m = votes_ita.rename(
    columns={
        params["FEATURES"]["PIANETAFANTA_TEAM"]: params["FEATURES"]["CLUB_PRETTY_NAME"]
    }
)
votes_ita_m[params["FEATURES"]["COMPETITION"]] = params["SETTINGS"]["ITALIAN_FANTA_COMPETITION"]

# Left join: every transfermarkt row is kept, and the votes columns are filled
# only where player name, club, competition and season all agree.
merge_keys = [
    params["FEATURES"]["PIANETAFANTA_NAME"],
    params["FEATURES"]["CLUB_PRETTY_NAME"],
    params["FEATURES"]["COMPETITION"],
    params["FEATURES"]["SEASON"],
]
matched_dataset = pd.merge(
    tm_dataset_with_pt_names, votes_ita_m, on=merge_keys, how='left'
)

In [13]:
# Relative tolerance allowed between the goals reported by the two sources.
gd = params["SETTINGS"]["GOALS_DIFF_ITA_VOTES_MATCHING_NOT_RELIABLE"]

# After the merge, "goals_x" comes from the left (transfermarkt) frame and
# "goals_y" from the right (votes) frame.
tm_goals = matched_dataset["goals_x"]
votes_goals = matched_dataset["goals_y"]

# A row with votes attached is marked unreliable when the transfermarkt goals
# fall outside votes_goals * (1 - gd) .. votes_goals * (1 + gd).
goals_disagree = (tm_goals > votes_goals * (1 + gd)) | (
    tm_goals < votes_goals * (1 - gd)
)
matched_dataset.loc[votes_goals.notna() & goals_disagree, "_is_reliable"] = False

In [14]:
# Names of the goals/assists columns that come from the left side of the merge.
goals_x = params["FEATURES"]["GOALS"] + "_x"
assist_x = params["FEATURES"]["ASSISTS"] + "_x"

# Discard the helper column and the right-hand copies of goals and assists;
# columns that are not present are ignored.
matched_dataset = matched_dataset.drop(
    columns=[
        "_season_starts",
        params["FEATURES"]["GOALS"] + "_y",
        params["FEATURES"]["ASSISTS"] + "_y",
    ],
    errors='ignore',
)

# Give the left-hand copies their plain names back.
matched_dataset = matched_dataset.rename(
    columns={
        goals_x: params["FEATURES"]["GOALS"],
        assist_x: params["FEATURES"]["ASSISTS"],
    }
)

Display top unmatched players

In [15]:
# Rows of competition "IT1", up to season 2020, with at least five full
# matches played (90 * 5 minutes) and no "pt_role" value (presumably meaning
# that no votes row was merged for them - TODO confirm).
unmatched_mask = (
    (matched_dataset["competition_id"] == "IT1")
    & (matched_dataset["pt_role"].isna())
    & (matched_dataset["season"] <= 2020)
    # & (matched_dataset["goals"] >= 1)
    & (matched_dataset["minutes_played"] >= 90 * 5)
)

# Average the numeric columns per (transfermarkt name, matched name) pair and
# list the most valuable players first. `numeric_only=True` is required on
# pandas >= 2.0, where `mean()` raises on non-numeric columns instead of
# silently dropping them.
check = (
    matched_dataset.loc[unmatched_mask]
    .groupby(["pretty_name", "pt_name"])
    .mean(numeric_only=True)
    .sort_values(by=['market_value'], ascending=False)
)
check.head(20)


Unnamed: 0_level_0,Unnamed: 1_level_0,height_in_cm,season,goals,assists,minutes_played,_permanence_week,_season_weeks,_minutes_available,_season_played,_season_teams,...,foot_Both,foot_Left,foot_Right,side_0,side_1,side_2,played_matches,started_matches,quotation,vote_average
pretty_name,pt_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Fabian Ruiz,FABIANO,189.0,2019.0,3.666667,3.333333,2419.0,42.52381,43.571429,3480.0,1.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,,,,
Tiemoue Bakayoko,BAKAYOKO,189.0,2019.0,1.5,1.0,2135.5,36.142857,41.071429,3510.0,1.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,,,,
Steven Nzonzi,MONZON,196.0,2018.0,1.0,2.0,2620.0,38.714286,40.0,3420.0,1.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,,,,
Emre Can,EMRE Belozog,186.0,2018.0,4.0,1.0,1807.0,41.142857,40.0,3420.0,1.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,,,,
Sandro Tonali,SANDRO R.,181.0,2019.5,0.5,3.5,2132.0,41.071429,45.357143,3510.0,1.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,,,,
Lorenzo Pellegrini,PELLEGRINI L,186.0,2018.0,3.0,4.0,1830.0,39.857143,40.0,3420.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,,,,
Arturo Vidal,VIDAL,180.0,2017.0,4.0,3.0,1614.5,30.142857,40.642857,3510.0,1.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,,,,
Konstantinos Manolas,CONSTANT,189.0,2017.0,1.285714,0.571429,2522.0,41.510204,41.020408,3445.714286,1.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,,,,
Andre Silva,JOAO SILVA,185.0,2017.0,2.0,0.0,924.0,40.0,39.0,3420.0,1.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,,,,
Nahitan Nandez,HERNANDEZ,172.0,2019.5,2.0,3.0,2821.0,46.0,45.357143,3510.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,,,,


In [16]:
# Persist the matched dataset as a pickle under the configured root folder,
# named after its stage in the settings.
path = os.path.join(
    params["PATHS"]["ROOT_FOLDER"],
    "{}.pkl".format(params["PATHS"]["STAGES"]["MATCHED_DATASET"]),
)
matched_dataset.to_pickle(path)