In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
train = pd.read_csv('/kaggle/input/kazakhstan-ai-respa-take-home/train.csv', parse_dates=['submitted_date'])
train['week_start'] = train['submitted_date'] - pd.to_timedelta(
    train['submitted_date'].dt.weekday, unit='d'
)
train_weekly = train.groupby(['category','week_start'], as_index=False)['num_papers'].sum()

df = train_weekly.copy()
df['year']         = df['week_start'].dt.year
df['week_of_year'] = df['week_start'].dt.isocalendar().week.astype(int)
df['month']        = df['week_start'].dt.month
df['quarter']      = df['week_start'].dt.quarter
df['day_of_year']  = df['week_start'].dt.dayofyear

df['sin_w'] = np.sin(2*np.pi*df['week_of_year']/52)
df['cos_w'] = np.cos(2*np.pi*df['week_of_year']/52)
df['sin_m'] = np.sin(2*np.pi*df['month']/12)
df['cos_m'] = np.cos(2*np.pi*df['month']/12)

cat_mean = df.groupby('category')['num_papers'].mean().to_dict()
df['cat_te'] = df['category'].map(cat_mean)

df.sort_values(['category','week_start'], inplace=True)

max_lag = 52
for lag in range(1, max_lag+1):
    df[f'lag_{lag}'] = df.groupby('category')['num_papers'].shift(lag)
for w in [4, 12, 26, 52]:
    grp = df.groupby('category')['num_papers']
    a = grp.shift(1).rolling(w).mean()
    b = grp.shift(1).rolling(w).var()
    c = grp.shift(1).rolling(w).std()
    d = grp.shift(1).rolling(w).median()
    e = grp.shift(1).rolling(w).min()
    f = grp.shift(1).rolling(w).max()
    df[f'roll_form_{w}'] = ((a - d) / b / np.sqrt(c) * (e + 1))

df.fillna(0, inplace=True)

df['y_log'] = np.log1p(df['num_papers'])

drop_cols = ['category','week_start','num_papers','y_log']
X = df.drop(columns=drop_cols)
y = df['y_log']

base_learners = [
    ('xgb', XGBRegressor(
        n_estimators=800, learning_rate=0.03, max_depth=6,
        subsample=0.8, colsample_bytree=0.8,
        random_state=42, tree_method='hist',
        eval_metric='rmse', use_label_encoder=False
    )),
    ('lgbm', LGBMRegressor(
        n_estimators=800, learning_rate=0.03, max_depth=7,
        subsample=0.9, colsample_bytree=0.8,
        random_state=42
    )),
    ('cat', CatBoostRegressor(
        iterations=800, learning_rate=0.03, depth=6,
        eval_metric='RMSE', random_state=42, verbose=False
    ))
]
kf = KFold(n_splits=5, shuffle=False)

stack = StackingRegressor(
    estimators=base_learners,
    final_estimator=RidgeCV(alphas=[0.1,1.0,10.0]),
    passthrough=True,
    cv=kf,
    n_jobs=-1
)

stack.fit(X, y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.484667 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14842
[LightGBM] [Info] Number of data points in the train set: 140956, number of used features: 66
[LightGBM] [Info] Start training from score 2.372866
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.552221 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14839
[LightGBM] [Info] Number of data points in the train set: 140956, number of used features: 66
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.422023 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14840
[LightGBM] [Info] Number of data points in the train set: 140956, number of used features: 66
[LightGBM] [Info] Start training from score 2.264213
[LightGBM] [Info] Sta

In [2]:
test = pd.read_csv('/kaggle/input/kazakhstan-ai-respa-take-home/test.csv', parse_dates=['week_start','week_end'])
test['year']         = test['week_start'].dt.year
test['week_of_year'] = test['week_start'].dt.isocalendar().week.astype(int)
test['month']        = test['week_start'].dt.month
test['quarter']      = test['week_start'].dt.quarter
test['day_of_year']  = test['week_start'].dt.dayofyear
test['sin_w']     = np.sin(2 * np.pi * test['week_of_year'] / 52)
test['cos_w']     = np.cos(2 * np.pi * test['week_of_year'] / 52)
test['sin_m']    = np.sin(2 * np.pi * test['month'] / 12)
test['cos_m']    = np.cos(2 * np.pi * test['month'] / 12)
test['cat_te']       = test['category'].map(cat_mean)

agg = train_weekly.set_index(['category','week_start'])['num_papers'].to_dict()

preds = []
for _, row in test.sort_values(['category','week_start']).iterrows():
    cat = row['category']
    ws  = row['week_start']
    feat = {
        'year':          row['year'],
        'week_of_year':  row['week_of_year'],
        'month':         row['month'],
        'quarter':       row['quarter'],
        'day_of_year':   row['day_of_year'],
        'sin_w':         row['sin_w'],
        'cos_w':         row['cos_w'],
        'sin_m':         row['sin_m'],
        'cos_m':         row['cos_m'],
        'cat_te':        row['cat_te']
    }
    for lag in range(1, max_lag+1):
        feat[f'lag_{lag}'] = agg.get((cat, ws - timedelta(days=7*lag)), 0)
    for window in [4, 12, 26, 52]:
        vals = [agg.get((cat, ws - timedelta(days=7*i)), 0) for i in range(1, window+1)]
        a = np.mean(vals)
        b = np.var(vals)
        c = np.std(vals)
        d = np.median(vals)
        e = np.min(vals)
        f = np.max(vals)
        feat[f'roll_form_{window}'] = ((a - d) / b / np.sqrt(c)) * (e + 1)

    x = pd.DataFrame([feat])
    y_pred_log = stack.predict(x)[0]
    y_pred = np.expm1(y_pred_log)
    preds.append(max(0, y_pred))
    agg[(cat, ws)] = y_pred
test['num_papers'] = np.rint(preds).astype(int)
test['id']         = test['category'] + '__' + test['week_id'].astype(str)
submission = test[['id','num_papers']]
submission.to_csv('submission.csv', index=False)

print("THE END")

THE END


In [3]:
submission

Unnamed: 0,id,num_papers
0,astro-ph__1,346
1,astro-ph__2,340
2,astro-ph__3,348
3,astro-ph__4,334
4,astro-ph__5,312
...,...,...
1115,stat.TH - Statistics Theory__4,45
1116,stat.TH - Statistics Theory__5,43
1117,stat.TH - Statistics Theory__6,44
1118,stat.TH - Statistics Theory__7,45
