In [1]:
%pylab inline
plt.style.use("bmh")

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pathlib
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_log_error as msle
from sklearn.model_selection import TimeSeriesSplit

In [3]:
DATA_DIR = pathlib.Path(".")
DATA_FILE = "sc2021_train_deals.csv"
AGG_COLS = ["material_code", "company_code", "country", "region", "manager_code"]
RS = 82736

GROUPS = 941 # Number of "material-company-country-region-manager" groups

# Загрузка данных

In [4]:
data = pd.read_csv(DATA_DIR.joinpath(DATA_FILE), parse_dates=["month", "date"])
group_ts = data.groupby(AGG_COLS + ["month"])["volume"].sum().unstack(fill_value=0)

По умолчанию, для каждого целевого месяца резервируются предыдущие 12 месяцев для генерации признаков.

In [5]:
target_full_range = pd.date_range("2019-01-01", "2020-07-01", freq="MS")

# Baseline #1: последнее известное значение

In [6]:
last_value_datasets = []
for target_month in target_full_range:
    previous_month = target_month - pd.offsets.MonthBegin(1)
    features = pd.DataFrame([], index=group_ts.index)
    features[['vol_tm1', 'target']] = group_ts.loc[:, previous_month:target_month].copy()
    features['target_month'] = target_month
    last_value_datasets.append(features.reset_index())

last_value_features = pd.concat(last_value_datasets, ignore_index=True)

# Safety Check 1: make sure, that rows are ordered by time ('target_month')
assert last_value_features['target_month'].is_monotonic

# Safety Check 2: make sure, that there are exactly 941 groups for every month
assert (last_value_features['target_month'].value_counts() == GROUPS).all()

In [7]:
last_value_features.sample(n=5)

Unnamed: 0,material_code,company_code,country,region,manager_code,vol_tm1,target,target_month
10567,197,0,Россия,Томская обл.,17506,0.0,42.0,2019-12-01
16368,286,1378,Венгрия,Венгрия,15986,20.0,19.0,2020-06-01
17193,214,0,Россия,Респ. Саха (Якутия),16656,128.0,172.0,2020-07-01
15453,303,0,Россия,Омская обл.,16656,0.0,20.0,2020-05-01
12111,717,8837,Россия,Нижегородская обл.,10670,0.0,21.0,2020-01-01


### Кросс-валидация для временных рядов

**Как это работает**: статья [Cross validation of time series data](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-of-time-series-data) из scikit-learn User Guide

Требование к данным: строки должны быть упорядочены по времени.

Для такой простой модели шаг обучения будет тривиальным: запомнить значение для предыдущего месяца. Кросс-валидация сводится к тому, чтобы проверить модель на нескольких тестовых датасетах.

Будем использовать 5 последовательных разбиенией на обучающую и тестовую выборки, причём в каждом разбиении тестовая выборка содержит данные за 3 месяца, следущие после обучающей выборки:

 1. Обучение: `2019-01-01` - `2019-04-01`, Тест: `2019-05-01` - `2019-07-01` 
 2. Обучение: `2019-01-01` - `2019-07-01`, Тест: `2019-08-01` - `2019-10-01` 
 3. Обучение: `2019-01-01` - `2019-10-01`, Тест: `2019-11-01` - `2020-01-01` 
 4. Обучение: `2019-01-01` - `2020-01-01`, Тест: `2020-02-01` - `2020-04-01`
 5. Обучение: `2019-01-01` - `2020-04-01`, Тест: `2020-05-01` - `2020-07-01`

In [8]:
ts_cv = TimeSeriesSplit(n_splits=5, test_size=GROUPS*3)
ts_cv_losses = []
for train_idx, test_idx in ts_cv.split(last_value_features):
    X_train = last_value_features.drop(columns=['target']).iloc[train_idx]
    X_test = last_value_features.drop(columns=['target']).iloc[test_idx]
    y_train = last_value_features['target'].iloc[train_idx]
    y_test = last_value_features['target'].iloc[test_idx]
        
    train_loss = msle(y_train, X_train['vol_tm1'], squared=False)
    test_loss = msle(y_test, X_test['vol_tm1'], squared=False)
        
    ts_cv_losses.append([train_loss, test_loss])
        
    print("Train range: {} - {}.".format(X_train['target_month'].min(), X_train['target_month'].max()),
            "Train Loss (RMSLE): {:.6f}".format(train_loss))
    print("Test range:  {} - {}.".format(X_test['target_month'].min(),X_test['target_month'].max()),
            "Test Loss (RMSLE):  {:.6f}\n".format(test_loss))
        
print("Avg. Train Loss: {:.6f}, Avg. Test Loss: {:.6f}".format(*np.mean(ts_cv_losses, axis=1)))

Train range: 2019-01-01 00:00:00 - 2019-04-01 00:00:00. Train Loss (RMSLE): 1.792478
Test range:  2019-05-01 00:00:00 - 2019-07-01 00:00:00. Test Loss (RMSLE):  1.757021

Train range: 2019-01-01 00:00:00 - 2019-07-01 00:00:00. Train Loss (RMSLE): 1.777368
Test range:  2019-08-01 00:00:00 - 2019-10-01 00:00:00. Test Loss (RMSLE):  1.737792

Train range: 2019-01-01 00:00:00 - 2019-10-01 00:00:00. Train Loss (RMSLE): 1.765589
Test range:  2019-11-01 00:00:00 - 2020-01-01 00:00:00. Test Loss (RMSLE):  1.963820

Train range: 2019-01-01 00:00:00 - 2020-01-01 00:00:00. Train Loss (RMSLE): 1.813259
Test range:  2020-02-01 00:00:00 - 2020-04-01 00:00:00. Test Loss (RMSLE):  2.046455

Train range: 2019-01-01 00:00:00 - 2020-04-01 00:00:00. Train Loss (RMSLE): 1.859212
Test range:  2020-05-01 00:00:00 - 2020-07-01 00:00:00. Test Loss (RMSLE):  2.027146

Avg. Train Loss: 1.774749, Avg. Test Loss: 1.757580


# Baseline #2: среднее за последние три месяца

In [9]:
last_3_avg_datasets = []
for target_month in target_full_range:
    start_period = target_month - pd.offsets.MonthBegin(3)
    end_period = target_month - pd.offsets.MonthBegin(1)
    features = pd.DataFrame([], index=group_ts.index)
    features['last_3m_avg'] = group_ts.loc[:, start_period:end_period].mean(axis=1)
    features['target'] = group_ts[target_month]
    features['target_month'] = target_month
    last_3_avg_datasets.append(features.reset_index())

last_3_avg_features = pd.concat(last_3_avg_datasets, ignore_index=True)

# Safety Check 1: make sure, that rows are ordered by time ('target_month')
assert last_3_avg_features['target_month'].is_monotonic

# Safety Check 2: make sure, that there are exactly 941 groups for every month
assert (last_3_avg_features['target_month'].value_counts() == GROUPS).all()

In [10]:
last_3_avg_features.sample(n=5)

Unnamed: 0,material_code,company_code,country,region,manager_code,last_3m_avg,target,target_month
17472,486,1851,Россия,Россия,12444,97.666667,42.0,2020-07-01
2362,443,0,Россия,г. Санкт-Петербург,14956,13.333333,20.0,2019-03-01
5214,451,0,Россия,Нижегородская обл.,15938,0.0,0.0,2019-06-01
1888,133,0,Китай,Китай,16079,128.333333,145.0,2019-03-01
6329,583,0,Россия,Пермский край,16788,40.0,0.0,2019-07-01


### Кросс-валидация для временных рядов

Аналогично, как и для предыдущей модели

In [11]:
ts_cv = TimeSeriesSplit(n_splits=5, test_size=GROUPS*3)
ts_cv_losses = []
for train_idx, test_idx in ts_cv.split(last_3_avg_features):
    X_train = last_3_avg_features.drop(columns=['target']).iloc[train_idx]
    X_test = last_3_avg_features.drop(columns=['target']).iloc[test_idx]
    y_train = last_3_avg_features['target'].iloc[train_idx]
    y_test = last_3_avg_features['target'].iloc[test_idx]
        
    train_loss = msle(y_train, X_train['last_3m_avg'], squared=False)
    test_loss = msle(y_test, X_test['last_3m_avg'], squared=False)
        
    ts_cv_losses.append([train_loss, test_loss])
        
    print("Train range: {} - {}.".format(X_train['target_month'].min(), X_train['target_month'].max()),
            "Train Loss (RMSLE): {:.6f}".format(train_loss))
    print("Test range:  {} - {}.".format(X_test['target_month'].min(),X_test['target_month'].max()),
            "Test Loss (RMSLE):  {:.6f}\n".format(test_loss))
        
print("Avg. Train Loss: {:.6f}, Avg. Test Loss: {:.6f}".format(*np.mean(ts_cv_losses, axis=1)))

Train range: 2019-01-01 00:00:00 - 2019-04-01 00:00:00. Train Loss (RMSLE): 1.705125
Test range:  2019-05-01 00:00:00 - 2019-07-01 00:00:00. Test Loss (RMSLE):  1.652521

Train range: 2019-01-01 00:00:00 - 2019-07-01 00:00:00. Train Loss (RMSLE): 1.682782
Test range:  2019-08-01 00:00:00 - 2019-10-01 00:00:00. Test Loss (RMSLE):  1.681824

Train range: 2019-01-01 00:00:00 - 2019-10-01 00:00:00. Train Loss (RMSLE): 1.682494
Test range:  2019-11-01 00:00:00 - 2020-01-01 00:00:00. Test Loss (RMSLE):  1.916876

Train range: 2019-01-01 00:00:00 - 2020-01-01 00:00:00. Train Loss (RMSLE): 1.739388
Test range:  2020-02-01 00:00:00 - 2020-04-01 00:00:00. Test Loss (RMSLE):  1.917399

Train range: 2019-01-01 00:00:00 - 2020-04-01 00:00:00. Train Loss (RMSLE): 1.774126
Test range:  2020-05-01 00:00:00 - 2020-07-01 00:00:00. Test Loss (RMSLE):  1.892962

Avg. Train Loss: 1.678823, Avg. Test Loss: 1.682303


# Baseline #3: CatBoostRegressor

Признаки:

- оригинальные категориальные признаки,
- месяц, для которого предсказываем,
- среднее, минимум и максимум за год,
- последние 6 месяцев до месяца, для которого предсказываем.

In [12]:
def get_features(df: pd.DataFrame, month: pd.Timestamp) -> pd.DataFrame:
    """Calculate features for `month`."""

    start_period = month - pd.offsets.MonthBegin(6)
    end_period = month - pd.offsets.MonthBegin(1)

    df = df.loc[:, :end_period]

    features = pd.DataFrame([], index=df.index)
    features['month'] = month.month
    features[[f"vol_tm{i}" for i in range(6, 0, -1)]] = df.loc[:, start_period:end_period].copy()

    rolling = df.rolling(12, axis=1, min_periods=1)
    features = features.join(rolling.mean().iloc[:, -1].rename('last_year_avg'))
    features = features.join(rolling.min().iloc[:, -1].rename('last_year_min'))
    features = features.join(rolling.max().iloc[:, -1].rename('last_year_max'))
    return features

In [13]:
catboost_datasets = []
for target_month in target_full_range:
    features = get_features(group_ts, target_month)
    features['target'] = group_ts[target_month]
    catboost_datasets.append(features.reset_index())
catboost_features = pd.concat(catboost_datasets, ignore_index=True)

### Кросс-валидация

Для оценки качества модели будем использовать такую же кросс-валидацию, как и для предыдущих двух примерах. На каждом этапе будем заново обучать и тестировать модель.

In [14]:
CAT_COLS = ['material_code', 'company_code', 'country', 'region', 'manager_code', 'month']
FTS_COLS = ['material_code', 'company_code', 'country', 'region', 'manager_code', 'month',
            'vol_tm6', 'vol_tm5', 'vol_tm4', 'vol_tm3', 'vol_tm2', 'vol_tm1',
            'last_year_avg', 'last_year_min', 'last_year_max']
TARGET = 'target'

In [15]:
base_model = CatBoostRegressor(iterations=1000, early_stopping_rounds=30,
                               depth=6, cat_features=CAT_COLS,
                               random_state=RS, verbose=False)

ts_cv = TimeSeriesSplit(n_splits=5, test_size=GROUPS*3)
ts_cv_losses = []
for train_idx, test_idx in ts_cv.split(catboost_features):
    
    X_train = catboost_features[FTS_COLS].iloc[train_idx]
    X_test = catboost_features[FTS_COLS].iloc[test_idx]
    y_train = catboost_features[TARGET].iloc[train_idx]
    y_test = catboost_features[TARGET].iloc[test_idx]
    
    model = base_model.copy()
    model.fit(X_train, y_train, eval_set=(X_test, y_test))
    
    train_loss = msle(y_train, np.clip(model.predict(X_train), 0, None), squared=False)
    test_loss = msle(y_test, np.clip(model.predict(X_test), 0, None), squared=False)
        
    ts_cv_losses.append([train_loss, test_loss])
        
    print("Train Loss (RMSLE): {:.6f}".format(train_loss))
    print("Test Loss (RMSLE):  {:.6f}\n".format(test_loss))
        
print("Avg. Train Loss: {:.6f}, Avg. Test Loss: {:.6f}".format(*np.mean(ts_cv_losses, axis=1)))

Train Loss (RMSLE): 1.932906
Test Loss (RMSLE):  1.947802

Train Loss (RMSLE): 2.053931
Test Loss (RMSLE):  1.908815

Train Loss (RMSLE): 2.226333
Test Loss (RMSLE):  2.250038

Train Loss (RMSLE): 2.171322
Test Loss (RMSLE):  2.211588

Train Loss (RMSLE): 2.078048
Test Loss (RMSLE):  2.221429

Avg. Train Loss: 1.940354, Avg. Test Loss: 1.981373


За основу взят [оригинальный Jupyther Notebook](https://sibur.ai-community.com/files/uploads/1e0024b8c6f92420a524903c2c6d71ff/[SC2021]%20Baseline.ipynb) c Baseline-решением для соревнования [Sibur Challenge 2021](https://sibur.ai-community.com/competitions/5)