In [105]:
import os
from typing import List, Optional

import pandas as pd
from dynaconf import LazySettings
from dynaconf.utils.boxing import DynaBox


In [106]:
# Location of the dynaconf YAML settings file. The FANTASAI_CONFIG_FILE
# environment variable overrides the default, so the notebook can run on a
# machine where the repository is checked out somewhere else.
config_file = os.environ.get(
    "FANTASAI_CONFIG_FILE",
    "/home/tiziano/workspaces/fantasAi_football/config/conf.yaml",
)
# Key of the settings section to use.
config_mode = 'default'

In [107]:
# Load the settings file and keep only the section selected by config_mode.
params = LazySettings(settings_files=[config_file])[config_mode]

In [108]:
def data_ingestion_basics(
    params: DynaBox, table: str, date_format: Optional[str] = None
) -> pd.DataFrame:
    """Load one configured input table from CSV and harmonize it.

    The CSV path is built from ``params["PATHS"]``. The frame is reduced to
    the columns listed in ``params[table]["COLS"]``, cast to the dtypes found
    in ``params["FEATURES"]["DTYPES"]``, validated on its key columns, sorted
    by key (index reset), renamed to the names found in ``params["FEATURES"]``
    and finally filtered with ``params["FILTER"][table]`` when such an entry
    exists.

    Note that the returned columns carry the names configured in
    ``params["FEATURES"]``, not those appearing in the input file. The index
    is reset before the filters run, so a filtered result can have gaps in
    its index.

    Args:
        params (DynaBox): configuration holding the ``PATHS``, ``FEATURES``,
            ``FILTER`` and per-table sections read by this function.
        table (str): id of the table to load; it must appear in the config
            file.
        date_format (Optional[str]): ``strftime``-style format (for example
            ``"%d-%m-%y"``) used to parse every column whose configured dtype
            is a datetime. When ``None`` (default) datetime columns are cast
            with ``astype`` like any other column, which only accepts
            ISO-like strings.

    Raises:
        ValueError: If any key column contains missing values, or if a column
            cannot be converted to its configured dtype.
        KeyError: If the table keys are duplicated.

    Returns:
        pd.DataFrame: the harmonized table.
    """
    # Load parameters
    t_par = params[table]
    t_cols_dict = t_par["COLS"].to_dict()
    keys: List[str] = [t_par["COLS"][c] for c in t_par.KEY]
    cols: List[str] = list(t_cols_dict.values())
    # Source column name -> target dtype
    dtype_map = {
        c_name: params["FEATURES"]["DTYPES"][c]
        for c, c_name in t_cols_dict.items()
    }
    # Source column name -> harmonized column name
    name_conversion = {
        c_from: params["FEATURES"][c] for c, c_from in t_cols_dict.items()
    }

    path = os.path.join(
        params["PATHS"]["ROOT_FOLDER"],
        params["PATHS"]["INPUT"]["FOLDER"],
        params["PATHS"]["INPUT"][table],
    )
    data: pd.DataFrame = pd.read_csv(path)

    # Keep only selected columns
    data = data[cols].convert_dtypes()

    # Non-ISO date strings (e.g. "24-06-85") cannot be cast with astype, so
    # when a format is supplied the datetime columns are parsed explicitly
    # and removed from the generic cast below.
    if date_format is not None:
        date_cols = [
            col
            for col, dtype in dtype_map.items()
            if pd.api.types.is_datetime64_any_dtype(dtype)
        ]
        data = data.assign(
            **{
                col: pd.to_datetime(data[col], format=date_format)
                for col in date_cols
            }
        )
        dtype_map = {
            col: dtype
            for col, dtype in dtype_map.items()
            if col not in date_cols
        }

    # Ugly but functional call to ensure correct type conversion
    data = data.astype(dtype_map).convert_dtypes()  # type: ignore

    # Ensure that the table keys are valid: no missing values, no duplicates
    if not data[keys].notna().all(axis=1).all():
        raise ValueError(f"{table} dataframe keys are missing")
    if data.duplicated(subset=keys).any():
        raise KeyError(f"{table} keys are duplicated")

    # Sort the table by its keys and reset the index
    data = data.sort_values(keys).reset_index(drop=True)

    data = data.rename(columns=name_conversion)

    # Optional per-table row filters: keep rows whose value is in the
    # configured list, one feature at a time
    if table in params["FILTER"]:
        for column, values in params["FILTER"].get(table).to_dict().items():
            column_name = params["FEATURES"].get(column)
            data = data.loc[data[column_name].isin(values)]

    return data

In [109]:
# Load every input table through the shared ingestion routine
# (each table is read exactly once).
players = data_ingestion_basics(params, "PLAYERS")
clubs = data_ingestion_basics(params, "CLUBS")
competitions = data_ingestion_basics(params, "COMPETITIONS")
games = data_ingestion_basics(params, "GAMES")
international_competitions_stats = data_ingestion_basics(
    params, "INTERNATIONAL_COMPETITIONS_STATS"
)
player_valuations = data_ingestion_basics(params, "PLAYER_VALUATIONS")



ValueError: Day out of range in datetime string "24-06-85"

## De-normalize data


In [16]:
# Give the players' club key the name used for the join, then attach the
# club attributes to every player (a left join keeps players without a
# matching club).
players = players.rename(columns={"current_club_id": "club_id"})
dataset = pd.merge(players, clubs, how="left", on="club_id")

In [25]:
# Inspect the column names available on the players table.
players.columns

Index(['player_id', 'last_season', 'club_id', 'name', 'pretty_name',
       'country_of_birth', 'country_of_citizenship', 'date_of_birth',
       'position', 'sub_position', 'foot', 'height_in_cm',
       'market_value_in_gbp', 'highest_market_value_in_gbp', 'url'],
      dtype='object')

In [27]:
# Display the row of a single player, selected by id.
players[players["player_id"] == 342229]

Unnamed: 0,player_id,last_season,club_id,name,pretty_name,country_of_birth,country_of_citizenship,date_of_birth,position,sub_position,foot,height_in_cm,market_value_in_gbp,highest_market_value_in_gbp,url
19170,342229,2019,583,kylian-mbappe,Kylian Mbappe,France,France,20-12-98,Attack,attack - Centre-Forward,Right,178,,,https://www.transfermarkt.co.uk/kylian-mbappe/...


In [30]:
# Keep the rows of player 342229 for later use.
mbappe = players[players["player_id"] == 342229]

In [43]:
# Display the player valuations table (the notebook repr truncates long frames).
player_valuations

Unnamed: 0,player_id,date,market_value
0,38790,2007-04-06,338000
1,38790,2007-09-06,270000
2,38790,2010-05-02,675000
3,38790,2011-08-12,360000
4,38790,2012-03-29,675000
...,...,...,...
326816,251029,2021-03-16,1800000
326817,251029,2021-05-21,1350000
326818,251029,2021-10-12,3060000
326819,251029,2022-01-04,3330000
