# Votes Matching
Matches the Transfermarkt dataset with the Italian player votes manually downloaded from the "PianetaFanta" website.

In [1]:
# See: https://github.com/maladeep/Name-Matching-In-Python/blob/master/Surprisingly%20Effective%20Way%20To%20Name%20Matching%20In%20Python.ipynb

In [2]:
import pandas as pd
import re
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
from dynaconf import LazySettings
from dynaconf.utils.boxing import DynaBox
from scipy.sparse import csr_matrix
from typing import List
import os

# pd.set_option('display.max_colwidth', -1)


In [3]:
# Location of the YAML configuration and name of the section to load from it.
# The defaults are the original hard-coded values; set the environment
# variables FANTASAI_CONFIG_FILE / FANTASAI_CONFIG_MODE to run the notebook on
# another machine without editing this cell.
config_file = os.environ.get(
    "FANTASAI_CONFIG_FILE",
    "C://workspaces//learning//fantasAi_football//config//conf.yaml",
)
config_mode = os.environ.get("FANTASAI_CONFIG_MODE", "default")


In [4]:
# Load the configuration file lazily, then keep only the section selected by
# `config_mode` so the rest of the notebook can index `params` directly.
all_settings = LazySettings(settings_files=[config_file])
params = all_settings[config_mode]

Read the serialized datasets to merge.

In [5]:
# Both inputs are pickles stored in the configured root folder and named
# after their stage entry in the configuration.
root_folder = params["PATHS"]["ROOT_FOLDER"]
stage_names = params["PATHS"]["STAGES"]

# Transfermarkt dataset
tm_path = os.path.join(root_folder, f"{stage_names['TM_DATASET']}.pkl")
tm_dataset = pd.read_pickle(tm_path)

# PianetaFanta data with votes
votes_ita_path = os.path.join(root_folder, f"{stage_names['VOTES_ITA']}.pkl")
votes_ita = pd.read_pickle(votes_ita_path)

In [6]:
# Replace the PianetaFanta team names through the TEAM_TRANSLATOR mapping
# stored in the configuration.
team_column = params["FEATURES"]["PIANETAFANTA_TEAM"]
team_translator = params["VOTES_ITA"]["TEAM_TRANSLATOR"].to_dict()
votes_ita[team_column] = votes_ita[team_column].replace(team_translator)

In [7]:
def ngrams(string: str, n: int=3) -> List[str]:
    """Splits a string into its overlapping character n-grams.

    Commas, hyphens, dots, slashes and any "BD" preceded by a whitespace
    character are removed before the string is split.

    args:
    - string (str): the string to split in n-grams
    - n (int): number "n" of characters in each n-gram

    returns (List[str]) the list of n-grams; it is empty when the cleaned
    string is shorter than n
    """
    # The hyphen is escaped so that it is matched literally instead of being
    # read as a character range inside the class (the matched set is the same).
    cleaned = re.sub(r"[,\-./]|\sBD", "", string)
    # Zip n progressively shifted copies of the string: zip stops at the
    # shortest copy, so only complete windows of n characters are produced.
    windows = zip(*[cleaned[i:] for i in range(n)])
    return ["".join(window) for window in windows]

Define series with the names

In [8]:
# One row per distinct PianetaFanta name; reset_index(drop=False) keeps the
# previous index as a column and gives the frame a fresh 0..n-1 index.
pianetafanta_name_col = params["FEATURES"]["PIANETAFANTA_NAME"]
votes_players = votes_ita.drop_duplicates(subset=[pianetafanta_name_col])
votes_players = votes_players.reset_index(drop=False)

# One row per distinct Transfermarkt pretty name, re-indexed the same way.
pretty_name_col = params["FEATURES"]["PRETTY_NAME"]
tm_players = tm_dataset.drop_duplicates(subset=[pretty_name_col])
tm_players = tm_players.reset_index(drop=False)

def simplify(string_series: pd.Series) -> pd.Series:
    """Normalises every string of a Series before the names are compared.

    The values are lower-cased, passed through `unidecode` and finally
    stripped of underscores and of every character that is neither a word
    character nor whitespace.
    """
    transliterated = string_series.str.lower().apply(unidecode)
    return transliterated.replace(r'[^\w\s]|_', '', regex=True)

# Simplified names of the two sources
tm_names = simplify(tm_players[params["FEATURES"]["PRETTY_NAME"]])
votes_names = simplify(votes_players[params["FEATURES"]["PIANETAFANTA_NAME"]])

# All the names stacked in a single Series with a fresh index
names = pd.concat([tm_names, votes_names], ignore_index=True)


In [9]:
# Encode the names as TF-IDF (Term Frequency - Inverse Document Frequency)
# vectors. The tokens of each name are the character n-grams produced by
# `ngrams`, and the vectorizer is fitted on the names of both sources so that
# the two matrices below share the same feature space.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams).fit(names)

tm_names_t, votes_names_t = (
    vectorizer.transform(name_series) for name_series in (tm_names, votes_names)
)

In [10]:
player_col = params["FEATURES"]["PLAYER"]
pt_name_col = params["FEATURES"]["PIANETAFANTA_NAME"]
competition_col = params["FEATURES"]["COMPETITION"]

# Calculate the similarities between the encoded names: one row per
# transfermarkt player, one column per pianetafanta player
cosine_similarities = linear_kernel(tm_names_t, votes_names_t)

# Find the best match (position of the most similar pianetafanta name)
related_docs_indices = cosine_similarities.argmax(axis=1)

# Get the matched players. argmax returns positions, so select positionally
# instead of relying on the index labels being 0..n-1
matched = votes_players.iloc[related_docs_indices].reset_index(drop=True)

# Concat the transfermarkt players with the matched pianetafanta names
players_matched = pd.concat([tm_players[[player_col]], matched[[pt_name_col]]], axis=1)

# Define the trustworthy matches as the ones whose best similarity exceeds a
# minimum threshold. The flag is assigned BEFORE dropping duplicates: the
# similarity array has one entry per row of tm_players, so assigning it to an
# already de-duplicated (shorter) frame would fail with a length mismatch.
trustworthy = cosine_similarities.max(axis=1) > 0.4
players_matched["_is_reliable"] = trustworthy
players_matched = players_matched.drop_duplicates(subset=[player_col])
players_matched[competition_col] = params["SETTINGS"]["ITALIAN_FANTA_COMPETITION"]

# Attach pianetafanta name to transfermarkt dataset. Every matched row carries
# the italian competition, so the left join leaves the name empty for the rows
# of the other competitions
tm_dataset_with_pt_names = tm_dataset.merge(players_matched, on=[player_col, competition_col], how='left')

In [62]:
# Perform some manual fixes: overwrite the pianetafanta name of every row whose
# transfermarkt pretty name is listed in the MANUAL_FIXES configuration
pretty_name_col = params["FEATURES"]["PRETTY_NAME"]
pt_name_col = params["FEATURES"]["PIANETAFANTA_NAME"]
for tm_name, pt_name in params["MANUAL_FIXES"].items():
    rows_to_fix = tm_dataset_with_pt_names[pretty_name_col] == tm_name
    tm_dataset_with_pt_names.loc[rows_to_fix, pt_name_col] = pt_name

In [63]:
# Give the votes the key columns used for the merge: the team column is
# renamed to the club column and every row is tagged with the italian
# competition
votes_ita_m = votes_ita.rename(
    columns={params["FEATURES"]["PIANETAFANTA_TEAM"]: params["FEATURES"]["CLUB_PRETTY_NAME"]}
)
votes_ita_m[params["FEATURES"]["COMPETITION"]] = params["SETTINGS"]["ITALIAN_FANTA_COMPETITION"]

# Left join: every row of the transfermarkt dataset is kept, the votes are
# attached where name, club, competition and season all agree
merge_keys = [
    params["FEATURES"][feature_key]
    for feature_key in ("PIANETAFANTA_NAME", "CLUB_PRETTY_NAME", "COMPETITION", "SEASON")
]
matched_dataset = tm_dataset_with_pt_names.merge(votes_ita_m, on=merge_keys, how='left')

In [64]:
# Relative tolerance on the goals: a match is flagged as not reliable when the
# goals of the two sources differ by more than this fraction of the votes goals
gd = params["SETTINGS"]["GOALS_DIFF_ITA_VOTES_MATCHING_NOT_RELIABLE"]

# The goals column exists in both merged frames, so the merge suffixed it:
# "_x" comes from the left (transfermarkt) frame, "_y" from the votes. The
# names are derived from the configuration, like in the cell that renames them
goals_tm = matched_dataset[params["FEATURES"]["GOALS"] + "_x"]
goals_votes = matched_dataset[params["FEATURES"]["GOALS"] + "_y"]

goals_out_of_band = (goals_tm > goals_votes * (1 + gd)) | (goals_tm < goals_votes * (1 - gd))

# Only rows that actually received a vote row can be contradicted by it
matched_dataset.loc[goals_votes.notna() & goals_out_of_band, "_is_reliable"] = False

In [65]:
# Keep the "_x" goals/assists under their plain names and discard the "_y"
# copies together with the "_season_starts" helper column. Missing columns
# are ignored by the drop.
goals_col = params["FEATURES"]["GOALS"]
assists_col = params["FEATURES"]["ASSISTS"]
goals_x = goals_col + "_x"
assist_x = assists_col + "_x"

columns_to_drop = ["_season_starts", goals_col + "_y", assists_col + "_y"]
matched_dataset = matched_dataset.drop(columns=columns_to_drop, errors='ignore')
matched_dataset = matched_dataset.rename(columns={goals_x: goals_col, assist_x: assists_col})

In [70]:
# Inspect the votes rows whose pianetafanta name contains "BASTONI"
bastoni_rows = votes_ita["pt_name"].str.contains("BASTONI")
votes_ita.loc[bastoni_rows]

Unnamed: 0,season,pt_name,pt_team,pt_role,played_matches,started_matches,quotation,vote_average,goals,assists
7734,2016,BASTONI A.,Atalanta Bergamo,D,2,0,4.2,6.0,0,0
8300,2017,BASTONI A.,Atalanta Bergamo,D,3,2,2.2,6.0,0,0
8896,2018,BASTONI A.,Parma Calcio 1913,D,20,18,10.3,6.18,1,0
9461,2019,BASTONI A.,Inter Mailand,D,25,21,12.3,6.18,2,0
10047,2020,BASTONI A.,Inter Mailand,D,35,35,15.3,6.1,0,2
10048,2020,BASTONI S.,Spezia Calcio,D,22,20,10.3,6.34,1,6


In [67]:
# Sanity check: rows of competition "IT1" up to season 2020 with at least
# 90*5 minutes played that did not receive any vote (pt_role is missing)
without_votes = (
    (matched_dataset["competition_id"] == "IT1")
    & (matched_dataset["pt_role"].isna())
    & (matched_dataset["season"]<=2020)
    #& (matched_dataset["goals"]>=1)
    & (matched_dataset["minutes_played"]>=90*5)
)
check = matched_dataset.loc[without_votes]
# numeric_only=True: the frame also holds text columns, and since pandas 2.0
# mean() raises on them instead of silently dropping them
check = check.groupby(["pretty_name", "pt_name"]).mean(numeric_only=True)
# Most valuable players first: they are the unmatched ones that matter most
check = check.sort_values(by=['market_value'], ascending=False)
check.head(20)


Unnamed: 0_level_0,Unnamed: 1_level_0,height_in_cm,season,goals,assists,minutes_played,on_field_index,assist_ratio,goals_ratio,fanta_points_ratio,market_value,...,market_value_ratio,position_index,side_index,foot_Both,foot_Left,foot_Right,played_matches,started_matches,quotation,vote_average
pretty_name,pt_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Fabian Ruiz,FABIANO,189.0,2019.0,3.666667,3.333333,2419.0,0.694313,0.125709,0.143419,0.269129,36000000.0,...,9.520202,4.0,0.0,0.0,1.0,0.0,,,,
Tiemoue Bakayoko,BAKAYOKO,189.0,2019.0,1.5,1.0,2135.5,0.61,0.042393,0.065213,0.107606,29475000.0,...,9.248106,3.0,0.0,0.0,0.0,1.0,,,,
Steven Nzonzi,MONZON,196.0,2018.0,1.0,2.0,2620.0,0.766082,0.068702,0.034351,0.103053,27000000.0,...,9.090909,3.0,0.0,0.0,0.0,1.0,,,,
Emre Can,EMRE Belozog,186.0,2018.0,4.0,1.0,1807.0,0.528363,0.049806,0.199225,0.249032,27000000.0,...,9.090909,3.0,0.0,0.0,0.0,1.0,,,,
Sandro Tonali,SANDRO R.,181.0,2019.5,0.5,3.5,2132.0,0.667105,0.107582,0.015369,0.122951,25650000.0,...,6.5,3.0,0.0,0.0,0.0,1.0,,,,
Arturo Vidal,ARTUR,180.0,2017.0,4.0,3.0,1614.5,0.46375,0.165141,0.190226,0.355367,25200000.0,...,10.375,4.0,0.0,0.0,0.0,1.0,,,,
Lorenzo Pellegrini,PELLEGRINI L,186.0,2018.0,3.0,4.0,1830.0,0.535088,0.196721,0.147541,0.344262,25200000.0,...,8.484848,5.0,0.0,1.0,0.0,0.0,,,,
Konstantinos Manolas,CONSTANT,189.0,2017.0,1.285714,0.571429,2522.0,0.732435,0.0175,0.049377,0.066876,25071428.571429,...,8.9174,1.0,0.0,0.0,0.0,1.0,,,,
Andre Silva,JOAO SILVA,185.0,2017.0,2.0,0.0,924.0,0.270175,0.0,0.194805,0.194805,19800000.0,...,8.8,8.0,0.0,0.0,0.0,1.0,,,,
Alessandro Florenzi,D`ALESSANDRO,173.0,2016.5,2.666667,2.833333,1890.5,0.552778,0.156565,0.098096,0.254661,19350000.0,...,7.611868,2.0,2.0,0.0,0.0,1.0,,,,


In [15]:
# Show the first row of the matched dataset that has an average vote
has_avg_vote = matched_dataset[params["FEATURES"]["AVG_VOTE"]].notna()
matched_dataset.loc[has_avg_vote].iloc[0]

player_id                                      10
name                               miroslav-klose
pretty_name                        Miroslav Klose
country_of_citizenship                    Germany
date_of_birth                            09-06-78
position                                   Attack
sub_position              attack - Centre-Forward
foot                                        Right
height_in_cm                                  184
season                                       2014
competition_id                                IT1
club_id                                       398
goals                                          13
assists                                         7
minutes_played                               1835
on_field_index                            0.53655
assist_ratio                             0.343324
goals_ratio                              0.637602
fanta_points_ratio                       0.980926
market_value                               900000


In [16]:
# Serialize the matched dataset as a pickle in the configured root folder,
# named after its stage entry in the configuration
matched_file_name = f"{params['PATHS']['STAGES']['MATCHED_DATASET']}.pkl"
path = os.path.join(params["PATHS"]["ROOT_FOLDER"], matched_file_name)
matched_dataset.to_pickle(path)