# Образец результата

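Прогноз на апрель – август 2023 г., сделанный на данных по март 2023 г. включительно. В примере ниже последние 5 месяцев доступного факта отложены для валидации, поэтому наивный прогноз строится именно на эти месяцы.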

In [1]:
from  pathlib import Path

from datetime import datetime
import pandas as pd

from dateutil.relativedelta import relativedelta

In [2]:
# All input and output CSV files live in the "data" folder under the current working directory
data_folder = Path.cwd() / "data"

In [10]:
# Fact (actuals) provided for the task; "period" is converted to timestamps
fact = pd.read_csv(
    data_folder / "fact_train_test.csv",
    sep=";",
    decimal=",",
    encoding="windows-1251",
).astype({"period": "datetime64[ns]"})

In [11]:
# Train/test split: everything later than (latest period - 5 months) is held out
split_point = fact["period"].max() - relativedelta(months=5)

train = fact[fact["period"] <= split_point]
test = fact[fact["period"] > split_point]

# Persist the held-out part so it can be used for validation
validation_path = data_folder / "fact_validation.csv"
test.to_csv(
    validation_path,
    sep=";",
    decimal=",",
    encoding="windows-1251",
    index=False,
)

In [12]:
# Example forecast: a naive baseline that repeats the last known month of fact
# for every month of the forecast horizon.
forecast_horizon = 5  # number of future months to forecast
last_known_fact_month = train["period"].max()

# `inclusive="right"` drops the start point itself, so horizon + 1 points are
# requested to end up with `forecast_horizon` future month starts.
# NOTE(review): this relies on "period" values being month starts - confirm.
test_periods = pd.date_range(
    start=last_known_fact_month,
    periods=forecast_horizon + 1,
    freq="MS",
    inclusive="right",
)

# The last-month slice does not depend on the target period, so select it once
# instead of re-filtering `train` on every iteration.
last_month_fact = train[train["period"] == last_known_fact_month].reset_index(drop=True)

monthly_forecasts = []
for period in test_periods:
    print(period)
    # `assign` returns a fresh copy, so the shared slice is never mutated
    monthly_forecasts.append(last_month_fact.assign(period=period))

result = pd.concat(monthly_forecasts).reset_index(drop=True)

# The scoring code expects forecast-specific column names
result = result.rename(
    columns={
        "real_wagon_count": "forecast_wagon_count",
        "real_weight": "forecast_weight",
    }
)
result.to_csv(
    data_folder / "forecast_example.csv",
    index=False,
    sep=";",
    decimal=",",
    encoding="windows-1251",
)
print('Baseline forecast saved')

2022-11-01 00:00:00
2022-12-01 00:00:00
2023-01-01 00:00:00
2023-02-01 00:00:00
2023-03-01 00:00:00
Baseline forecast saved


# Оценка результата

In [13]:
import pandas as pd
import numpy as np

In [14]:
def add_master_data_mappings(df: pd.DataFrame) -> pd.DataFrame:
    """Enrich ``df`` with client, freight and station master data.

    Every mapping is read from a CSV file in ``data_folder`` and attached with
    a left join, so all rows of ``df`` are kept; keys missing from a mapping
    produce NaN in the added columns.

    Args:
        df: Frame containing the join keys ``client_sap_id``, ``freight_id``,
            ``sender_station_id`` and ``recipient_station_id``.

    Returns:
        A new frame with the mapping columns appended. Station columns are
        added twice, prefixed with ``sender_`` and ``recipient_``.
    """

    def _read_mapping(file_name: str) -> pd.DataFrame:
        # Mapping locations - adjust here if the real system stores them elsewhere
        return pd.read_csv(
            data_folder / file_name,
            sep=";",
            decimal=",",
            encoding="windows-1251",
        )

    # Client -> holding
    enriched = df.merge(_read_mapping("client_mapping.csv"), how="left", on="client_sap_id")

    # Freight
    enriched = enriched.merge(_read_mapping("freight_mapping.csv"), how="left", on="freight_id")

    # Stations: the same mapping is joined once per role, with prefixed columns
    stations = _read_mapping("station_mapping.csv")
    station_roles = (
        ("sender_", "sender_station_id"),
        ("recipient_", "recipient_station_id"),
    )
    for prefix, key in station_roles:
        enriched = enriched.merge(stations.add_prefix(prefix), how="left", on=key)

    return enriched


def evaluate(fact: pd.DataFrame, forecast: pd.DataFrame, public: bool = True) -> float:
    """Return the weighted forecast-accuracy score of ``forecast`` against ``fact``.

    Both frames are enriched with ``add_master_data_mappings``, summed up to
    the granularity period / rps / holding / sender department / recipient
    department and outer-joined (combinations present on one side only count
    as 0 on the other). For every period::

        accuracy = 1 - sum(|forecast - fact|) / sum(max(forecast, fact))

    The final score is the weighted sum of the per-period accuracies, with the
    weights applied positionally to the periods in ascending order.

    Args:
        fact: Actual shipments; must contain ``real_wagon_count``.
        forecast: Predicted shipments; must contain ``forecast_wagon_count``.
        public: If True, only the second period contributes (weight 1.0);
            otherwise the five periods are weighted 0.1 / 0.6 / 0.1 / 0.1 / 0.1.

    Returns:
        The weighted accuracy score.

    Raises:
        ValueError: If the number of distinct periods in the compared data
            differs from the number of metric weights.
    """
    # = Metric parameters =
    accuracy_granularity = [
        "period",
        "rps",
        "holding_name",
        "sender_department_name",
        "recipient_department_name",
    ]
    fact_value, forecast_value = "real_wagon_count", "forecast_wagon_count"
    if public:
        metric_weight = np.array([0.0, 1.0, 0.0, 0.0, 0.0])
    else:
        metric_weight = np.array([0.1, 0.6, 0.1, 0.1, 0.1])

    # = Metric calculation =
    # 1. Add the higher-granularity entities from the master-data mappings
    fact = add_master_data_mappings(fact)
    forecast = add_master_data_mappings(forecast)

    # 2. KPI calculation: aggregate both sides and align them with an outer join
    compare_data = pd.merge(
        fact.groupby(accuracy_granularity, as_index=False)[fact_value].sum(),
        forecast.groupby(accuracy_granularity, as_index=False)[forecast_value].sum(),
        how="outer",
        on=accuracy_granularity,
    ).fillna(0)
    # Anti-gaming measure: fractional wagon-shipment forecasts are not accepted,
    # so the aggregated values are rounded to integers
    compare_data[fact_value] = np.around(compare_data[fact_value]).astype(int)
    compare_data[forecast_value] = np.around(compare_data[forecast_value]).astype(int)

    # 3. Compute the metric components for every month in the sample
    compare_data["ABS_ERR"] = abs(
        compare_data[forecast_value] - compare_data[fact_value]
    )
    compare_data["MAX"] = abs(compare_data[[forecast_value, fact_value]].max(axis=1))
    summary = compare_data.groupby("period")[
        [forecast_value, fact_value, "ABS_ERR", "MAX"]
    ].sum()

    # The weights are matched to periods by position, so the counts must agree;
    # fail with a clear message instead of an opaque broadcasting error.
    if len(summary) != len(metric_weight):
        raise ValueError(
            f"Expected exactly {len(metric_weight)} distinct periods to score, "
            f"got {len(summary)}: {list(summary.index)}"
        )

    summary["Forecast Accuracy"] = 1 - summary["ABS_ERR"] / summary["MAX"]

    # 4. Weight the monthly metrics to get a single score
    score = (
        summary["Forecast Accuracy"].sort_index(ascending=True) * metric_weight
    ).sum()

    return score


def calc_score_public(fact: pd.DataFrame, forecast: pd.DataFrame) -> float:
    """Score ``forecast`` against ``fact`` with the public weighting of ``evaluate``."""
    public_score = evaluate(fact=fact, forecast=forecast, public=True)
    return public_score


def calc_score_private(fact: pd.DataFrame, forecast: pd.DataFrame) -> float:
    """Score ``forecast`` against ``fact`` with the private weighting of ``evaluate``."""
    private_score = evaluate(fact=fact, forecast=forecast, public=False)
    return private_score



In [15]:
# = Example files to check =
validation_file = data_folder / "fact_validation.csv"
forecast_file = data_folder / "forecast_example.csv"

# Both files share the same CSV dialect
csv_options = {"sep": ";", "decimal": ",", "encoding": "windows-1251"}

# Validation dataset
fact = pd.read_csv(validation_file, **csv_options)
# Forecast
forecast = pd.read_csv(forecast_file, **csv_options)

# Scoring
score_public = calc_score_public(fact, forecast)
score_private = calc_score_private(fact, forecast)
print(f"Public score: {score_public}")
print(f"Private score: {score_private}")


Public score: 0.639683424247945
Private score: 0.6364087430299136
