## Revenue forecast (three-phase linear model)

Forecast monthly revenue per category with the reusable pipeline.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from three_phase_linear import ForecastConfig, run_three_phase_forecast

DATA_PATH = Path('forecast_revenue_dataset.csv')
OUTPUT_PATH = Path('money_three_phase_forecast.csv')
GROUP_COLS = ['category_id']
TARGET_COLUMN = 'revenue'
REGRESSORS = ['is_sale_prohibition', 'cos_month', 'sin_month', 'cos_quarter', 'sin_quarter', 'unique_brand_count']

In [2]:
df = pd.read_csv(DATA_PATH)
df['date'] = pd.to_datetime(df['date'])
for col in REGRESSORS:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.sort_values(GROUP_COLS + ['date']).reset_index(drop=True)

future_counts = df[df[TARGET_COLUMN].isna()].groupby(GROUP_COLS).size()
forecast_horizon = int(future_counts.max()) if not future_counts.empty else 12
if forecast_horizon <= 0:
    forecast_horizon = 12

print(f'Forecast horizon: {forecast_horizon} periods')

Forecast horizon: 72 periods


In [3]:
input_cols = ['date', *GROUP_COLS, TARGET_COLUMN, *REGRESSORS]
input_cols = list(dict.fromkeys(input_cols))
config = ForecastConfig(
    time_col='date',
    target_col=TARGET_COLUMN,
    group_cols=GROUP_COLS,
    freq='MS',
    forecast_horizon=forecast_horizon,
    seasonal_periods=12,
    min_history=24,
    lags=(1, 2, 3, 6, 12, 18, 24),
    rolling_windows=(3, 6, 12, 24),
    additional_regressors=REGRESSORS,
    random_search_iterations=30,
    n_splits=4,
    random_state=46,
)

preds, summaries = run_three_phase_forecast(df[input_cols].copy(), config)
preds = preds.rename(columns={
    'prediction': 'revenue_forecast',
    f'{TARGET_COLUMN}_holtwinters': 'revenue_baseline',
})
summary_report = pd.DataFrame({
    'group_key': [s.group_key[0] for s in summaries],
    'train_rows': [s.train_rows for s in summaries],
    'cv_mae': [s.best_score for s in summaries],
    'skipped_reason': [s.skipped_reason for s in summaries],
})
summary_report.head()

  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


  self._init_dates(dates, freq)


  return get_prediction_index(
  return get_prediction_index(


  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Unnamed: 0,group_key,train_rows,cv_mae,skipped_reason
0,1,408,13519820.0,
1,2,408,12581970.0,
2,3,408,46843740.0,
3,4,408,10902580.0,
4,5,408,7471469.0,


In [4]:
merge_cols = [*GROUP_COLS, 'date']
result_df = df.copy()
result_df = result_df.merge(preds[merge_cols + ['revenue_forecast']], on=merge_cols, how='left')
result_df['revenue'] = result_df['revenue'].astype(float)
result_df['revenue'] = result_df['revenue'].fillna(result_df['revenue_forecast'])

output_columns = ['date', 'category_id', 'category_title', 'revenue']
final_output = result_df[output_columns].sort_values(['category_id', 'date']).reset_index(drop=True)
final_output = final_output.drop_duplicates(['date', 'category_id'])
final_output.to_csv(OUTPUT_PATH, index=False)

final_output.tail()


Unnamed: 0,date,category_id,category_title,revenue
3400,2026-04-01,5,Vodka,90530896.0
3416,2026-05-01,5,Vodka,90935576.0
3432,2026-06-01,5,Vodka,96083184.0
3448,2026-07-01,5,Vodka,95405472.0
3464,2026-08-01,5,Vodka,95513080.0


In [5]:
summary_report

Unnamed: 0,group_key,train_rows,cv_mae,skipped_reason
0,1,408,13519820.0,
1,2,408,12581970.0,
2,3,408,46843740.0,
3,4,408,10902580.0,
4,5,408,7471469.0,
