## Прогноз виручки (three-phase linear)

Щомісячний прогноз виручки по категоріях із повторним використанням пайплайну.

In [9]:
from pathlib import Path

import numpy as np
import pandas as pd

from three_phase_linear import ForecastConfig, run_three_phase_forecast

# --- Notebook configuration ---------------------------------------------
# CSV read at the start of the data-preparation cell.
DATA_PATH = Path('forecast_revenue_dataset.csv')
# CSV that receives the held-out-period forecast table.
OUTPUT_PATH = Path('money_three_phase_forecast_accuracy_calculation.csv')

# Grouping key columns (passed to ForecastConfig as group_cols).
GROUP_COLS = ['category_id']

# Column being forecast (passed to ForecastConfig as target_col).
TARGET_COLUMN = 'revenue'

# Extra input columns (passed to ForecastConfig as additional_regressors).
REGRESSORS = [
    'is_sale_prohibition',
    'cos_month',
    'sin_month',
    'cos_quarter',
    'sin_quarter',
    'unique_brand_count',
]


In [10]:
# Load the semicolon-separated dataset, parse day-first dates, and order the
# rows by group and date.
df = pd.read_csv(DATA_PATH, sep=';')
df['date'] = pd.to_datetime(df['date'], dayfirst=True)
df = df.sort_values([*GROUP_COLS, 'date']).reset_index(drop=True)

# Coerce every regressor to numeric; values that cannot be parsed become NaN.
df = df.assign(**{name: pd.to_numeric(df[name], errors='coerce') for name in REGRESSORS})

# Collapse to one row per (date, category): revenue is summed, the sale
# prohibition flag takes the max, and the remaining regressors are averaged.
aggregations = {
    'revenue_sum': ('revenue', 'sum'),
    'revenue_count': ('revenue', 'count'),
    'is_sale_prohibition': ('is_sale_prohibition', 'max'),
    **{
        name: (name, 'mean')
        for name in ('cos_month', 'sin_month', 'cos_quarter', 'sin_quarter', 'unique_brand_count')
    },
}
agg_df = df.groupby(['date', 'category_id', 'category_title'], as_index=False).agg(**aggregations)

# A sum over zero non-null values is 0; turn it back into NaN so that such
# rows are still recognised as "no revenue recorded" below.
agg_df.loc[agg_df['revenue_count'].eq(0), 'revenue_sum'] = np.nan
df = (
    agg_df
    .drop(columns=['revenue_count'])
    .rename(columns={'revenue_sum': 'revenue'})
    .sort_values(GROUP_COLS + ['date'])
    .reset_index(drop=True)
)

# Rows without a target value define the horizon: the largest number of such
# rows in any group, with a fallback of 12 when there are none.
future_mask = df[TARGET_COLUMN].isna()
future_counts = df.loc[future_mask].groupby(GROUP_COLS).size()
if future_counts.empty:
    forecast_horizon = 12
else:
    forecast_horizon = int(future_counts.max())
    if forecast_horizon <= 0:
        forecast_horizon = 12

# Keep only rows with an observed target.
history_df = (
    df.loc[~future_mask]
    .copy()
    .sort_values(GROUP_COLS + ['date'])
    .reset_index(drop=True)
)

# Flag the last `forecast_horizon` rows of each group (or the whole group if
# it is shorter) as the evaluation period.
rows_from_end = history_df.groupby(GROUP_COLS).cumcount(ascending=False)
history_df['is_evaluation_period'] = rows_from_end < forecast_horizon

# Preserve the true values, then blank the target in the evaluation period.
history_df['revenue_actual'] = history_df[TARGET_COLUMN]
history_df.loc[history_df['is_evaluation_period'], TARGET_COLUMN] = np.nan

df = history_df

print(f'Forecast horizon for accuracy: {forecast_horizon} periods')


Forecast horizon for accuracy: 18 periods


In [11]:
# Columns handed to the pipeline, de-duplicated while keeping their order.
input_cols = list(dict.fromkeys(['date', *GROUP_COLS, TARGET_COLUMN, *REGRESSORS]))

config = ForecastConfig(
    # Data layout
    time_col='date',
    target_col=TARGET_COLUMN,
    group_cols=GROUP_COLS,
    freq='MS',
    # Horizon taken from the data-preparation cell
    forecast_horizon=forecast_horizon,
    seasonal_periods=12,
    min_history=24,
    # Feature settings
    lags=(1, 2, 3, 6, 12, 18, 24),
    rolling_windows=(3, 6, 12, 24),
    additional_regressors=REGRESSORS,
    # Search / validation settings
    random_search_iterations=10,
    n_splits=4,
    random_state=46,
)

# Run the pipeline on a copy so the notebook's frame is not touched by it.
preds, summaries = run_three_phase_forecast(df.loc[:, input_cols].copy(), config)

# Rename the pipeline's output columns to the names used in later cells.
column_renames = {
    'prediction': 'revenue_forecast',
    f'{TARGET_COLUMN}_holtwinters': 'revenue_baseline',
}
preds = preds.rename(columns=column_renames)

# One row per summary object; group_key is indexed with [0] to take its first
# element.
summary_fields = {
    'group_key': lambda s: s.group_key[0],
    'train_rows': lambda s: s.train_rows,
    'cv_mae': lambda s: s.best_score,
    'skipped_reason': lambda s: s.skipped_reason,
}
summary_report = pd.DataFrame({
    column: [getter(s) for s in summaries]
    for column, getter in summary_fields.items()
})
summary_report.head()

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Unnamed: 0,group_key,train_rows,cv_mae,skipped_reason
0,1,68,22961160.0,
1,2,68,23247750.0,
2,3,68,73435200.0,
3,4,68,17221330.0,
4,5,68,12233020.0,


In [12]:
# Attach the forecasts to the evaluation frame and export the forecast values
# for the held-out periods.
merge_cols = [*GROUP_COLS, 'date']
result_df = df.copy()
# Rows whose target is NaN at this point are the ones that get a forecast.
result_df['is_evaluation_period'] = result_df[TARGET_COLUMN].isna()
result_df = result_df.merge(preds[merge_cols + ['revenue_forecast']], on=merge_cols, how='left')
# Fill only the missing targets with the forecast; observed values are kept.
result_df[TARGET_COLUMN] = result_df[TARGET_COLUMN].astype(float)
result_df[TARGET_COLUMN] = result_df[TARGET_COLUMN].fillna(result_df['revenue_forecast'])

output_columns = ['date', 'category_id', 'category_title', TARGET_COLUMN]
final_output = result_df.loc[result_df['is_evaluation_period'], output_columns]
# min_count=1: a period that received no forecast stays NaN in the export
# instead of being written as a revenue of 0.0 (the default sum of no values).
final_output = (
    final_output
    .groupby(['date', 'category_id', 'category_title'], as_index=False)[TARGET_COLUMN]
    .sum(min_count=1)
)
final_output = final_output.sort_values(['category_id', 'date']).reset_index(drop=True)
final_output.to_csv(OUTPUT_PATH, index=False)

final_output.tail()


Unnamed: 0,date,category_id,category_title,revenue
85,2024-10-01,5,Vodka,107735288.0
86,2024-11-01,5,Vodka,95446872.0
87,2024-12-01,5,Vodka,95991840.0
88,2025-01-01,5,Vodka,109749864.0
89,2025-02-01,5,Vodka,111444008.0


MAE — середня абсолютна помилка

In [13]:
# Mean absolute error over evaluation rows that have both an actual value
# and a forecast.
evaluation_df = result_df.loc[result_df['is_evaluation_period']].copy()
actual = evaluation_df['revenue_actual'].astype(float)
forecast = evaluation_df['revenue_forecast'].astype(float)
mask = ~(actual.isna() | forecast.isna())
if not mask.any():
    print('MAE: not enough data to calculate')
else:
    abs_error = (actual[mask] - forecast[mask]).abs()
    mae = abs_error.mean()
    print(f'MAE: {mae:.4f}')


MAE: 39311132.0367


MAPE — середня абсолютна відсоткова помилка

In [14]:
# Mean absolute percentage error over evaluation rows that have both values;
# rows with an actual of zero are excluded because they cannot be divided by.
evaluation_df = result_df.loc[result_df['is_evaluation_period']].copy()
actual = evaluation_df['revenue_actual'].astype(float)
forecast = evaluation_df['revenue_forecast'].astype(float)
mask = actual.notna() & forecast.notna() & actual.ne(0)
if not mask.any():
    print('MAPE: not enough data to calculate')
else:
    pct_error = ((actual[mask] - forecast[mask]) / actual[mask]).abs() * 100
    mape = pct_error.mean()
    print(f'MAPE: {mape:.4f}%')


MAPE: 19.1904%


WMAPE — зважена середня абсолютна відсоткова помилка

In [15]:
# Weighted MAPE: total absolute error divided by total absolute actual value,
# over evaluation rows that have both an actual value and a forecast.
evaluation_df = result_df.loc[result_df['is_evaluation_period']].copy()
actual = evaluation_df['revenue_actual'].astype(float)
forecast = evaluation_df['revenue_forecast'].astype(float)
mask = ~(actual.isna() | forecast.isna())
denominator = actual[mask].abs().sum()
if not mask.any() or not (denominator > 0):
    print('WMAPE: not enough data to calculate')
else:
    total_abs_error = (actual[mask] - forecast[mask]).abs().sum()
    wmape = total_abs_error / denominator * 100
    print(f'WMAPE: {wmape:.4f}%')


WMAPE: 18.4956%


In [16]:
# Display the whole per-group summary table (the forecast cell only showed
# its head()).
summary_report

Unnamed: 0,group_key,train_rows,cv_mae,skipped_reason
0,1,68,22961160.0,
1,2,68,23247750.0,
2,3,68,73435200.0,
3,4,68,17221330.0,
4,5,68,12233020.0,
