In [1]:
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore
from sklearn.preprocessing import LabelEncoder

from pandas.plotting import register_matplotlib_converters
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.seasonal import DecomposeResult

# Register pandas' datetime converters with matplotlib so date-indexed data
# can be plotted directly.
register_matplotlib_converters()
# Notebook-wide plot defaults: seaborn "darkgrid" theme, 16x12 figures and a
# 13pt base font.
sns.set_style("darkgrid")
plt.rc("figure", figsize=(16, 12))
plt.rc("font", size=13)


In [2]:
# Load the shop 1 transactions; the file is a semicolon-separated CSV.
df1 = pd.read_csv('./data/shop_1_transactions.csv',sep=';')
# Parse 'date' into datetimes so the column can serve as a time index
# (e.g. get_daily_ts resamples on a 'date' index).
df1['date'] = pd.to_datetime(df1['date'])

In [None]:
def get_daily_ts(df, number_of_items=15, zscore_threshold=2):
    """
    Extract daily sales time series for the best-selling items.

    Items are ranked by their total ``quantity``. For each of the top
    ``number_of_items`` items, transactions whose quantity is an outlier
    (absolute z-score at or above ``zscore_threshold``) are discarded and the
    remaining quantities are summed per calendar day.

    Args:
      df: DataFrame with 'item_id', 'quantity' and 'date' columns. 'date' must
        be datetime-like because it becomes the index used for resampling.
      number_of_items: Maximum number of items to return. If the data holds
        fewer distinct items, all of them are returned.
      zscore_threshold: Rows with |z-score| >= this value are treated as
        outliers and removed before the daily aggregation.

    Returns:
      Dict mapping item_id -> DataFrame with a single 'quantity' column and a
      daily DatetimeIndex, ordered from most to least sold item.
    """
    # Rank the items by total quantity sold, best sellers first.
    totals = df.groupby(['item_id'])['quantity'].sum().sort_values(ascending=False)

    # Slicing (instead of positional lookups) caps the selection at the number
    # of items actually present, so a small dataset no longer raises IndexError.
    popular_item_ids = totals.index[:max(number_of_items, 0)]

    result = {}
    for item_id in popular_item_ids:
        # all transactions of the current item
        item_df = df[df['item_id'] == item_id]

        # Remove outliers with the z-score method. The z-score is NaN when the
        # quantities have zero variance (constant values or a single sale) or
        # contain NaN; such rows are not outliers, so they are kept rather than
        # silently dropped (NaN >= threshold is False).
        with np.errstate(divide='ignore', invalid='ignore'):
            scores = np.abs(np.asarray(zscore(item_df['quantity'].to_numpy())))
            is_outlier = scores >= zscore_threshold
        item_df = item_df[~is_outlier]

        # total quantity per day; days without sales yield 0
        result[item_id] = item_df.set_index('date').resample('D')[['quantity']].sum()

    return result


def split_groupped_data_to_slices(df, time_steps, input_cols, target_col, time_col, id_col):
    """
    Cut a frame into overlapping fixed-length windows for a fixed-input-size model.

    A window of ``time_steps`` consecutive rows is taken at every row offset,
    advancing one row at a time. Each window contributes one numpy array to
    each of the four returned lists:

      * inputs:      the ``input_cols`` columns (2-D array)
      * outputs:     the ``target_col`` column, kept 2-D (one column)
      * time:        the ``time_col`` column (1-D array)
      * identifiers: the ``id_col`` column (1-D array)

    When the frame has fewer than ``time_steps`` rows, all four lists are empty.
    """
    # How many full windows fit; range() yields nothing if this is <= 0.
    window_count = len(df) - time_steps + 1

    # Positional row windows [offset, offset + time_steps).
    windows = [df.iloc[offset:offset + time_steps] for offset in range(window_count)]

    # Pull the requested columns out of every window.
    inputs_list = [window[input_cols].to_numpy() for window in windows]
    outputs_list = [window[[target_col]].to_numpy() for window in windows]
    time_list = [window[time_col].to_numpy() for window in windows]
    identifiers_list = [window[id_col].to_numpy() for window in windows]

    return inputs_list, outputs_list, time_list, identifiers_list


def numpy_normalised_quantile_loss(y, y_pred, quantile):
    """Computes normalised quantile loss for numpy arrays.
    Uses the q-Risk metric as defined in the "Training Procedure" section of the
    main TFT paper.
    Pandas Series/DataFrames are accepted as well; every operation used here is
    valid for both numpy and pandas inputs.
    Args:
      y: Targets
      y_pred: Predictions
      quantile: Quantile to use for loss calculations (between 0 & 1)
    Returns:
      Float for normalised quantile loss.
    """
    # Positive where the prediction is below the target (under-prediction).
    prediction_underflow = y - y_pred
    # Pinball loss: under-predictions are weighted by `quantile`,
    # over-predictions by `1 - quantile`.
    weighted_errors = quantile * np.maximum(prediction_underflow, 0.) \
        + (1. - quantile) * np.maximum(-prediction_underflow, 0.)

    quantile_loss = weighted_errors.mean()
    # np.abs works for ndarrays and pandas objects alike; the former `y.abs()`
    # exists only on pandas and raised AttributeError for numpy arrays.
    normaliser = np.abs(y).mean()

    return 2 * quantile_loss / normaliser


# Calendar data

In [1]:
import requests
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def get_calendar_data(timeout=10):
    """
    Download the French public holidays and encode them as daily integer labels.

    Args:
      timeout: Seconds to wait for the HTTP server before giving up. Without a
        timeout, `requests` can block indefinitely on an unresponsive server.

    Returns:
      A tuple ``(df, le)`` where ``df`` is a daily-indexed DataFrame with
      columns 'calendar_event' (event name, "" on days without an event) and
      'calendar_embedding' (integer label of the event), and ``le`` is the
      fitted LabelEncoder, which can map the labels back to event names.

    Raises:
      requests.HTTPError: if the server answers with an error status.
    """
    # request the json file with all of the calendar events in France
    res = requests.get(
        'https://calendrier.api.gouv.fr/jours-feries/metropole.json',
        timeout=timeout,
    )
    # fail early on 4xx/5xx instead of trying to parse an error page as JSON
    res.raise_for_status()

    # convert the data to a dataframe (one row per date key of the JSON object)
    df = pd.DataFrame.from_dict(res.json(), orient='index')
    df.columns = ['calendar_event']

    # parse the date index
    df.index = pd.to_datetime(df.index)

    # resample by day and fill the days without any event with ""
    df = df.resample('D').first().fillna("")

    # use the label encoder to convert to numerical values.
    # LabelEncoder sorts its classes, so "" (no event) gets label 0.
    le = LabelEncoder()
    df['calendar_embedding'] = le.fit_transform(df['calendar_event'])

    # return also the label encoder for parsing
    return df, le


In [4]:
# Fetch the calendar once (the function performs an HTTP request) and show
# only the days that carry an event; label 0 is the empty "no event" value.
calendar_df, calendar_label_encoder = get_calendar_data()
calendar_df[calendar_df['calendar_embedding'] != 0]

Unnamed: 0,calendar_event,calendar_embedding
2003-01-01,1er janvier,3
2003-04-21,Lundi de Pâques,10
2003-05-01,1er mai,4
2003-05-08,8 mai,5
2003-05-29,Ascension,6
...,...,...
2028-07-14,14 juillet,2
2028-08-15,Assomption,7
2028-11-01,Toussaint,11
2028-11-11,11 novembre,1


# Weather

In [21]:
def get_weather_file(filename):
    """
    Load a daily weather CSV and return it with short, lower-case column names.

    Only the columns listed in the mapping below are kept, in that order, and
    'date' is parsed into datetimes. A KeyError is raised if the file lacks
    any of the expected source columns.
    """
    # source column -> name used in the rest of the notebook
    column_names = {
        "DATE": 'date',
        "MIN_TEMPERATURE_C": 'temp_min',        # daily minimum temperature (°C)
        "MAX_TEMPERATURE_C": 'temp_max',        # daily maximum temperature (°C)
        "TEMPERATURE_NOON_C": 'temp_avg',       # temperature at noon (°C)
        "PRECIP_TOTAL_DAY_MM": 'rain',          # total precipitation of the day (mm)
        "HUMIDITY_MAX_PERCENT": 'humidity',     # maximum humidity of the day (%)
        "WINDSPEED_MAX_KMH": 'wind',            # maximum wind speed of the day (km/h)
        "CLOUDCOVER_AVG_PERCENT": 'cloud_cov',  # average cloud cover of the day (%)
    }
    kept_columns = list(column_names.values())

    # read, rename, keep the mapped columns only, then parse the dates
    return (
        pd.read_csv(filename, sep=',')
        .rename(columns=column_names)[kept_columns]
        .assign(date=lambda frame: pd.to_datetime(frame['date']))
    )

# Preview the cleaned weather table of shop 1 (shown as the cell output).
get_weather_file('./data/shop_1_weather.csv')

Unnamed: 0,date,temp_min,temp_max,temp_avg,rain,humidity,wind,cloud_cov
0,2009-01-01,-5,1,1,0.0,90,10,12.000
1,2009-01-02,-5,1,0,0.1,91,9,46.000
2,2009-01-03,-4,1,1,0.1,97,13,32.625
3,2009-01-04,-4,1,1,0.0,91,19,53.125
4,2009-01-05,-1,0,0,6.0,98,28,98.250
...,...,...,...,...,...,...,...,...
5228,2023-04-26,3,12,10,0.0,74,9,47.750
5229,2023-04-27,8,18,16,0.2,87,13,67.625
5230,2023-04-28,13,18,16,1.0,97,22,83.875
5231,2023-04-29,9,17,16,0.0,88,9,42.500
