In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

In [2]:
REL_CLIP = 0.3
ENSEMBLE_W_REL = 0.55
ENSEMBLE_W_LOG = 0.45
MAX_PRED_BY_PROD_Q = 0.999
SMOOTH_ALPHA = 0.35
RANDOM_STATE = 42

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')

In [4]:
if 'Unnamed: 0' in train.columns:
    train = train.rename(columns={'Unnamed: 0': 'id'})

train = train[train['store_location_rk'] != 309].reset_index(drop=True)
train['period_start_dt'] = pd.to_datetime(train['period_start_dt'], dayfirst=False, errors='coerce')
test['period_start_dt'] = pd.to_datetime(test['period_start_dt'], dayfirst=True, errors='coerce')

for df in [train, test]:
    df['series_id'] = df['product_rk'].astype(str) + '_' + df['store_location_rk'].astype(str)

promo_mode = train['PROMO1_FLAG'].mode().iloc[0] if 'PROMO1_FLAG' in train.columns else 0
train['PROMO1_FLAG'] = train.get('PROMO1_FLAG', promo_mode).fillna(promo_mode)
if 'PROMO1_FLAG' in test.columns:
    test['PROMO1_FLAG'] = test['PROMO1_FLAG'].fillna(promo_mode)
else:
    test['PROMO1_FLAG'] = promo_mode

for col in ['PRICE_REGULAR','PRICE_AFTER_DISC','AUTORIZATION_FLAG','PROMO2_FLAG','NUM_CONSULTANT']:
    if col in train.columns:
        train[col] = train.groupby(['product_rk','store_location_rk'])[col].transform(lambda s: s.ffill().bfill())
        prod_med = train.groupby('product_rk')[col].transform('median')
        train[col] = train[col].fillna(prod_med)
        prod_med_map = train.groupby('product_rk')[col].median().to_dict()
        test[col] = test['product_rk'].map(prod_med_map).fillna(0.0).values if 'product_rk' in test.columns else 0.0

test = sample[['id']].merge(test, on='id', how='left')
test['series_id'] = test['product_rk'].astype(str) + '_' + test['store_location_rk'].astype(str)

In [5]:
key_cols = ['product_rk','store_location_rk','period_start_dt']
test_new = test.merge(train[key_cols].drop_duplicates().assign(_in_train=1), on=key_cols, how='left')
test_new = test_new[test_new['_in_train'].isna()].drop(columns=['_in_train'])

df = pd.concat([train, test_new], sort=False).reset_index(drop=True)
df = df.sort_values(['series_id','period_start_dt']).reset_index(drop=True)

df['week'] = df['period_start_dt'].dt.isocalendar().week.astype('Int64')
df['month'] = df['period_start_dt'].dt.month.astype('Int64')
df['weekday'] = df['period_start_dt'].dt.weekday.astype('Int64')
df['year'] = df['period_start_dt'].dt.year.astype('Int64')

df['demand'] = df['demand'].astype(float)
df['demand_log'] = np.log1p(df['demand'])
df['demand_relative'] = df.groupby('series_id')['demand'].pct_change().replace([np.inf,-np.inf], 0).fillna(0)

In [6]:
lags = [1,2,3,4,8,12,52]
for lag in lags:
    df[f'd_lag_{lag}'] = df.groupby('series_id')['demand'].shift(lag)
    df[f'log_lag_{lag}'] = df.groupby('series_id')['demand_log'].shift(lag)
    df[f'rel_lag_{lag}'] = df.groupby('series_id')['demand_relative'].shift(lag)

for w in [4,8,12]:
    df[f'd_roll_mean_{w}'] = df.groupby('series_id')['demand'].shift(1).rolling(w, min_periods=1).mean()
    df[f'd_roll_median_{w}'] = df.groupby('series_id')['demand'].shift(1).rolling(w, min_periods=1).median()
    df[f'rel_roll_mean_{w}'] = df.groupby('series_id')['demand_relative'].shift(1).rolling(w, min_periods=1).mean()
    df[f'rel_roll_std_{w}'] = df.groupby('series_id')['demand_relative'].shift(1).rolling(w, min_periods=1).std()

if 'PRICE_REGULAR' in df.columns and 'PRICE_AFTER_DISC' in df.columns:
    df['price_ratio'] = df['PRICE_AFTER_DISC'] / (df['PRICE_REGULAR'].replace(0, np.nan))
    df['price_ratio'] = df['price_ratio'].replace([np.inf,-np.inf],1.0).fillna(1.0)
    for lag in [1,2,4]:
        df[f'price_ratio_lag_{lag}'] = df.groupby('series_id')['price_ratio'].shift(lag)
else:
    df['price_ratio'] = 1.0

In [7]:
if 'PROMO1_FLAG' in df.columns:
    df['promo_prev_mean_4'] = df.groupby('series_id')['PROMO1_FLAG'].shift(1).rolling(4,min_periods=1).mean()

series_stats = train.groupby('series_id')['demand'].agg(['median','mean','count','std']).rename(columns={'median':'series_median','mean':'series_mean','count':'series_count','std':'series_std'})
df = df.merge(series_stats, left_on='series_id', right_index=True, how='left')

prod_stats = train.groupby('product_rk')['demand'].agg(['median','mean','std']).rename(columns={'median':'prod_median','mean':'prod_mean','std':'prod_std'})
df = df.merge(prod_stats, left_on='product_rk', right_index=True, how='left')

feature_cols = [
    'month','weekday','week','PRICE_REGULAR','PRICE_AFTER_DISC','price_ratio','promo_prev_mean_4',
] + [f'd_lag_{l}' for l in lags] + [f'log_lag_{l}' for l in lags] + [f'rel_lag_{l}' for l in lags] + \
[f'd_roll_mean_{w}' for w in [4,8,12]] + [f'd_roll_median_{w}' for w in [4,8,12]] + [f'rel_roll_mean_{w}' for w in [4,8,12]]

for c in feature_cols:
    if c in df.columns:
        df[c] = df.groupby('series_id')[c].transform(lambda s: s.fillna(method='ffill').fillna(method='bfill'))
        df[c] = df[c].fillna(df['prod_median'])
        df[c] = df[c].fillna(df['series_median'])
        df[c] = df[c].fillna(0.0)

train_proc = df[df['demand'].notna()].copy()
test_proc = df[df['demand'].isna()].copy()

print(f"Processed: train rows {len(train_proc)}, test rows {len(test_proc)}")

rel_features = ['PRICE_REGULAR','PRICE_AFTER_DISC','price_ratio','promo_prev_mean_4',
    'series_median','series_mean','prod_median','prod_mean'] + \
    [f'rel_lag_{l}' for l in lags] + [f'rel_roll_mean_{w}' for w in [4,8,12]] + ['month','weekday']

log_features = ['PRICE_REGULAR','PRICE_AFTER_DISC','price_ratio','promo_prev_mean_4',
    'series_median','series_mean','prod_median','prod_mean'] + \
    [f'log_lag_{l}' for l in lags] + [f'd_roll_mean_{w}' for w in [4,8,12]] + ['month','weekday']

Processed: train rows 34129, test rows 1200


In [8]:
X_rel = train_proc[rel_features].fillna(0)
y_rel = train_proc['demand_relative'].fillna(0)

X_log = train_proc[log_features].fillna(0)
y_log = train_proc['demand_log'].fillna(0)

m_rel = HistGradientBoostingRegressor(max_iter=800, learning_rate=0.08, max_depth=10, min_samples_leaf=20, random_state=RANDOM_STATE)
m_rel.fit(X_rel, y_rel)

lgb_train = lgb.Dataset(X_log, label=y_log)
lgb_params = {
    'objective':'regression','metric':'l2','learning_rate':0.05,'num_leaves':128,
    'min_data_in_leaf':20,'feature_fraction':0.8,'bagging_fraction':0.8,'bagging_freq':5,
    'verbosity':-1,'seed':RANDOM_STATE
}
m_log = lgb.train(lgb_params, lgb_train, num_boost_round=3000, valid_sets=[lgb_train], callbacks=[lgb.log_evaluation(period=500), lgb.early_stopping(stopping_rounds=200)])

last_values = train_proc.groupby('series_id')['demand'].last()

[500]	training's l2: 0.162435
[1000]	training's l2: 0.0770931
[1500]	training's l2: 0.0393143
[2000]	training's l2: 0.0207897
[2500]	training's l2: 0.0114401
[3000]	training's l2: 0.00645121


In [9]:
def get_baseline_value(row):
    s = row['series_id']
    if 'd_lag_52' in row.index and not pd.isna(row['d_lag_52']):
        return row['d_lag_52']
    if s in last_values.index:
        return last_values.loc[s]
    if not pd.isna(row.get('prod_median', np.nan)):
        return row['prod_median']
    return train_proc['demand'].median()

X_rel_test = test_proc[rel_features].fillna(0)
X_log_test = test_proc[log_features].fillna(0)

rel_pred = m_rel.predict(X_rel_test)
rel_pred = np.clip(rel_pred, -REL_CLIP, REL_CLIP)

log_pred = m_log.predict(X_log_test, num_iteration=m_log.best_iteration)
abs_from_log = np.expm1(log_pred)
abs_from_log = np.clip(abs_from_log, 0, None)

baseline_vals = test_proc.apply(get_baseline_value, axis=1).values.astype(float)
pred_from_rel = baseline_vals * (1.0 + rel_pred)

In [10]:
ensemble_pred = ENSEMBLE_W_REL * pred_from_rel + ENSEMBLE_W_LOG * abs_from_log

series_median_map = train_proc.groupby('series_id')['demand'].median().to_dict()
series_median_arr = test_proc['series_id'].map(series_median_map).fillna(train_proc['demand'].median()).values

ensemble_pred = (1 - SMOOTH_ALPHA) * ensemble_pred + SMOOTH_ALPHA * series_median_arr

prod_995 = train_proc.groupby('product_rk')['demand'].quantile(MAX_PRED_BY_PROD_Q).to_dict()
prod_upper = test_proc['product_rk'].map(prod_995).fillna(train_proc['demand'].quantile(0.99)).values
ensemble_pred = np.minimum(ensemble_pred, prod_upper)

mask_no_history = test_proc['series_count'].fillna(0) < 2
ensemble_pred[mask_no_history.values] = test_proc.loc[mask_no_history, 'prod_median'].fillna(train_proc['demand'].median()).values

train_med = train_proc['demand'].median()
pred_med = np.median(ensemble_pred) if len(ensemble_pred) > 0 else 1.0
if pred_med > 0:
    scale = min(1.05, max(0.95, (train_med / pred_med) ** 0.5))
    ensemble_pred = ensemble_pred * scale

ensemble_pred = np.clip(ensemble_pred, 0, None)
ensemble_pred = np.round(ensemble_pred).astype(int)

In [11]:
submission = test_proc[['id']].copy().reset_index(drop=True)
submission['predicted'] = ensemble_pred
submission = sample[['id']].merge(submission, on='id', how='left')
submission['predicted'] = submission['predicted'].fillna(train_proc['demand'].median()).astype(int)
submission.to_csv('submission_original.csv', index=False)