In [207]:
import pandas as pd
from dynaconf import LazySettings
from dynaconf.utils.boxing import DynaBox
from typing import List
import os

In [208]:
config_file = "/home/tiziano/workspaces/fantasAi_football/config/conf.yaml"
config_mode = 'default'

In [209]:
def data_ingestion_basics(params: DynaBox, table: str, duplicate_key_err:str='raise') -> pd.DataFrame:
    import os 
    
    """Performs some basic harmonization steps on a input dataframe.

    Note that after the basic harmonization the columns will have the names
    stated in settings.COLS, and not those appearing in the input data.

    Args:
        table (pd.DataFrame): id of the table to lead, it must appear in
            config file
        duplicate_key_err ('raise' or 'drop'): how to handle duplicated key
        errors. If 'raise' an error will be return in case of duplicated keys,
        if 'drop' all duplicated rows will be dropped from data.

    Raises:
        ValueError: If table registry dataframe keys are missing
        KeyError: If table registry dataframe keys are duplicated

    Returns:
        pd.DataFrame: dataframe in input, harmonized
    """
    # Load parameters
    t_par = params[table]
    t_cols_dict = t_par["COLS"].to_dict()
    keys: List[str] = [t_par["COLS"][c] for c in t_par.KEY]
    cols: List[str] = list(t_cols_dict.values())
    dtype_map = {
        c_name: params["FEATURES"]["DTYPES"][c]
        for c, c_name in t_cols_dict.items()
    }
    name_conversion = {
        c_from: params["FEATURES"][c] for c, c_from in t_cols_dict.items()
    }
    
    path = os.path.join(
        params["PATHS"]["ROOT_FOLDER"], 
        params["PATHS"]["INPUT"]["FOLDER"],
         params["PATHS"]["INPUT"][table]
    )
    data: pd.DataFrame = pd.read_csv(path)

    # Keep only selected columns
    data = data[cols]

    # Ugly but functional call to ensure correct type conversion
    data = data.convert_dtypes().astype(dtype_map, errors='ignore').convert_dtypes()  # type: ignore

    # Ensure that the product registry dataframe keys are valid
    if not data[keys].notna().all(axis=1).all():
        raise ValueError(f"{table} dataframe keys are missing")
    if not data.value_counts(keys).eq(1).all():
        if duplicate_key_err == 'raise':
            raise KeyError(f"{table} keys are duplicated")
        else:
            print(f"WARN: dupliated keys will be removed from {table}")
            data = data.drop_duplicates(subset=keys, keep=False)

    # Sort product product registry by DIVISION,PRODUCT and reset index
    data = data.sort_values(keys).reset_index(drop=True)

    data = data.rename(columns=name_conversion)

    if "FILTER" in params:
        if table in params["FILTER"]:
            for column, values in params["FILTER"].get(table).to_dict().items():
                column_name = params["FEATURES"].get(column)
                data = data.loc[data[column_name].isin(values)]

    return data

In [210]:
params = LazySettings(settings_files=[config_file])
params = params[config_mode]

In [211]:
players = data_ingestion_basics(params, "PLAYERS")
clubs = data_ingestion_basics(params, "CLUBS")
competitions = data_ingestion_basics(params, "COMPETITIONS")
games = data_ingestion_basics(params, "GAMES")
#international_competitions_stats = data_ingestion_basics(params, "INTERNATIONAL_COMPETITIONS_STATS")
player_valuations = data_ingestion_basics(params, "PLAYER_VALUATIONS")
players = data_ingestion_basics(params, "PLAYERS")
appearances = data_ingestion_basics(params, "APPEARANCES")

## De-normalize data


### Find a player club for a given year
A player is part of a club for a given season if he played at least one match for that club in that season.

In [212]:
# Get appearencs of players in games
# For some reason competition is specified also on appereances table, this
# is not necessary and annoying since it generates suffixed names when merged.
apps_sel = appearances.drop(columns=[params["FEATURES"]["COMPETITION"]])
app_game = apps_sel.merge(games, on=[params["FEATURES"]["GAME"]])
#teams = app_game.loc[app_game[params["FEATURES"]["SEASON"]] == season]
teams = app_game[
    [
        params["FEATURES"]["PLAYER"],
        params["FEATURES"]["SEASON"],
        params["FEATURES"]["COMPETITION"],
        params["FEATURES"]["CLUB"],
        params["FEATURES"]["GOALS"],
        params["FEATURES"]["ASSISTS"],
        params["FEATURES"]["MINUTES_PLAYED"]
    ]
]
teams = teams.groupby(
    [
        params["FEATURES"]["PLAYER"],
        params["FEATURES"]["SEASON"],
        params["FEATURES"]["COMPETITION"],
        params["FEATURES"]["CLUB"],
    ],
    as_index=False,
).sum()
teams = teams.rename(columns={params["FEATURES"]["GAME"]: params["FEATURES"]["MINUTES_PLAYED"]})

# For each player, keep only the team with most played games
teams = teams.loc[
    teams.groupby(
        [
            params["FEATURES"]["PLAYER"],
            params["FEATURES"]["SEASON"],
        ]
    )[params["FEATURES"]["MINUTES_PLAYED"]].idxmax()
]



In [213]:
players_team = players.merge(teams, on=[params["FEATURES"]["PLAYER"]])

In [214]:
players_team["year"] = players_team[params["FEATURES"]["SEASON"]]
players_team["month"] = 9
players_team["day"] = 1
players_team["_season_starts"] = pd.to_datetime(players_team[["year", "month", "day"]])
players_team = players_team.drop(columns=["year", "month", "day"])

In [215]:
players_vals = players_team.merge(player_valuations, on=[params["FEATURES"]["PLAYER"]])
players_vals.loc[players_vals.player_id == '341092']
players_vals["_date_diff"] = (players_vals[params["FEATURES"]["DATE"]] - players_vals["_season_starts"]).dt.days
players_vals = players_vals.loc[players_vals["_date_diff"] < 0]

In [216]:
pv_key = [params["FEATURES"]["PLAYER"], params["FEATURES"]["SEASON"]]
players_vals = players_vals[players_vals.groupby(pv_key)["_date_diff"].transform(max) == players_vals["_date_diff"]]
players_lost = len(players_team) - len(players_vals)
players_vals = players_vals.drop(columns=["_date_diff", "_season_starts", params["FEATURES"]["DATE"]])
print(f"{players_lost}/{len(players_team)} players/seasons has been lost when trying to calculate the value.")

3696/55649 players/seasons has been lost when trying to calculate the value.


In [217]:
# Add informationa about total minutage available in the competition
total_minutes = games.groupby(
    [
        params["FEATURES"]["SEASON"],
        params["FEATURES"]["COMPETITION"],
        params["FEATURES"]["HOME_CLUB"],
    ],
    as_index=False,
).count()
total_minutes = total_minutes.groupby(
    [params["FEATURES"]["SEASON"], params["FEATURES"]["COMPETITION"]],
    as_index=False,
).mean()
total_minutes[params["FEATURES"]["MINUTES_AVAILABLE"]] = total_minutes[params["FEATURES"]["GAME"]] * 90 * 2
total_minutes = total_minutes[
    [
        params["FEATURES"]["SEASON"],
        params["FEATURES"]["COMPETITION"],
        params["FEATURES"]["MINUTES_AVAILABLE"],
    ]
]

In [218]:

players_vals = players_vals.sort_values(by=pv_key)
players_vals[params["FEATURES"]["VALUE_DELTA"]] = players_vals[params["FEATURES"]["MARKET_VALUE"]].diff()
players_vals[params["FEATURES"]["VALUE_DELTA"]] = players_vals[params["FEATURES"]["VALUE_DELTA"]]/players_vals[params["FEATURES"]["MARKET_VALUE"]]

# Calculate club value
players_vals["_club_value"] = players_vals.groupby(
    [params["FEATURES"]["CLUB"], params["FEATURES"]["SEASON"]]
)[params["FEATURES"]["MARKET_VALUE"]].transform("sum")

# Express club value as ration from media club value of same competition
players_vals["_median_club_value"] = players_vals.groupby(
    [params["FEATURES"]["COMPETITION"], params["FEATURES"]["SEASON"]]
)["_club_value"].transform("median")
players_vals[params["FEATURES"]["CLUB_VALUE_RATIO"]] = (
    players_vals["_club_value"] / players_vals["_median_club_value"]
)

# Express players value as ratio from median player value of same competition
players_vals["_player_median_value"] = players_vals.groupby(
    [params["FEATURES"]["COMPETITION"], params["FEATURES"]["SEASON"]]
)[params["FEATURES"]["MARKET_VALUE"]].transform("median")

players_vals[params["FEATURES"]["MARKET_VALUE_RATIO"]] = (
    players_vals[params["FEATURES"]["MARKET_VALUE"]]
    / players_vals["_player_median_value"]
)


players_vals = players_vals.merge(total_minutes, on=[params["FEATURES"]["COMPETITION"], params["FEATURES"]["SEASON"]])

players_vals[params["FEATURES"]["ON_FIELD_INDEX"]] = players_vals["minutes_played"]/players_vals[params["FEATURES"]["MINUTES_AVAILABLE"]]

players_vals = players_vals.drop(columns=["_player_median_value", "_median_club_value"])

In [222]:
players_vals.loc[players_vals.player_id.isin(['341092', '368482'])]

Unnamed: 0,player_id,name,pretty_name,country_of_citizenship,date_of_birth,position,sub_position,foot,height_in_cm,season,...,goals,assists,minutes_played,market_value,market_value_delta,_club_value,club_value_ratio,market_value_ratio,minutes_available,ON_FIELD_INDEX
27305,341092,federico-chiesa,Federico Chiesa,Italy,25-10-97,Attack,attack - Right Winger,Right,175,2019,...,10,6,2559,54000000,0.25,233100000,1.485934,12.0,3330.0,0.768468
27334,368482,riccardo-orsolini,Riccardo Orsolini,Italy,24-01-97,Attack,attack - Right Winger,Left,183,2019,...,8,8,2693,13500000,0.766667,102430000,0.652957,3.0,3330.0,0.808709
27800,341092,federico-chiesa,Federico Chiesa,Italy,25-10-97,Attack,attack - Right Winger,Right,175,2020,...,8,8,2227,43200000,-0.25,637178000,5.062392,12.0,3052.173913,0.729644
27834,368482,riccardo-orsolini,Riccardo Orsolini,Italy,24-01-97,Attack,attack - Right Winger,Left,183,2020,...,7,4,1755,19800000,0.318182,125865000,1.0,5.5,3052.173913,0.575
33204,341092,federico-chiesa,Federico Chiesa,Italy,25-10-97,Attack,attack - Right Winger,Right,175,2016,...,3,2,1495,90000,0.244444,129015000,1.625283,0.04,3420.0,0.437135
33694,341092,federico-chiesa,Federico Chiesa,Italy,25-10-97,Attack,attack - Right Winger,Right,175,2017,...,6,6,3022,9000000,0.99,106200000,1.344729,4.0,3420.0,0.883626
33717,368482,riccardo-orsolini,Riccardo Orsolini,Italy,24-01-97,Attack,attack - Right Winger,Left,183,2017,...,0,0,340,3150000,0.985714,68180000,0.863311,1.4,3420.0,0.099415
34176,341092,federico-chiesa,Federico Chiesa,Italy,25-10-97,Attack,attack - Right Winger,Right,175,2018,...,6,3,3037,40500000,0.777778,204210000,2.176383,12.857143,3420.0,0.888012
34203,368482,riccardo-orsolini,Riccardo Orsolini,Italy,24-01-97,Attack,attack - Right Winger,Left,183,2018,...,8,4,1953,3150000,0.0,79605000,0.848396,1.0,3420.0,0.571053
45004,341092,federico-chiesa,Federico Chiesa,Italy,25-10-97,Attack,attack - Right Winger,Right,175,2021,...,2,3,867,63000000,0.314286,529650000,3.751381,14.0,2835.0,0.30582
