In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pathlib
import pandas as pd
import numpy as np
from lstm.dataset import SiburDataset
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm
import pickle
from sklearn.metrics import mean_squared_log_error
import matplotlib.pyplot as plt

In [3]:
# Location and file name of the training deals CSV.
DATA_DIR = pathlib.Path(".")
DATA_FILE = "sc2021_train_deals.csv"

# Key columns used to group deals into one series per row.
AGG_COLS = [
    "material_code",
    "company_code",
    "country",
    "region",
    "manager_code",
]

# NOTE(review): presumably a random seed; it is not referenced in the visible cells.
RS = 82736

In [4]:
# Load the raw deals; "month" and "date" are parsed into datetimes.
data = pd.read_csv(DATA_DIR / DATA_FILE, parse_dates=["month", "date"])

In [5]:
# Preview the first rows of the loaded deals table.
data.head()

Unnamed: 0,material_code,company_code,country,region,manager_code,month,material_lvl1_name,material_lvl2_name,material_lvl3_name,contract_type,date,volume
0,134,0,Литва,Литва,12261,2018-01-01,Базовые полимеры,ПЭ,ПЭНП,Спот,2018-01-01,43.0
1,197,0,Китай,Китай,16350,2018-01-01,Базовые полимеры,ПЭ,ПЭНП,Спот,2018-01-02,95.0
2,794,2162,Казахстан,Атырауская обл.,10942,2018-01-01,Базовые полимеры,ПП,ПП,Контракт,2018-01-02,57.0
3,134,0,Литва,Литва,12261,2018-01-01,Базовые полимеры,ПЭ,ПЭНП,Спот,2018-01-02,21.0
4,133,0,Китай,Китай,17745,2018-01-01,Базовые полимеры,ПЭ,ПЭНП,Спот,2018-01-02,150.0


In [6]:
# One row per AGG_COLS combination, one column per month; cells hold the
# summed volume, with 0 where a group has no deals in that month.
group_ts = (
    data
    .groupby([*AGG_COLS, "month"])["volume"]
    .sum()
    .unstack(fill_value=0)
)

In [7]:
# (number of groups, number of months) in the pivoted table.
group_ts.shape

(941, 31)

In [15]:
# Inspect the monthly volume series of the first group.
group_ts.iloc[0]

month
2018-01-01    340.0
2018-02-01    340.0
2018-03-01    260.0
2018-04-01    240.0
2018-05-01    220.0
2018-06-01    220.0
2018-07-01    220.0
2018-08-01    220.0
2018-09-01    220.0
2018-10-01    280.0
2018-11-01    280.0
2018-12-01    280.0
2019-01-01    200.0
2019-02-01    200.0
2019-03-01    200.0
2019-04-01    185.0
2019-05-01    103.0
2019-06-01     62.0
2019-07-01      0.0
2019-08-01      0.0
2019-09-01      0.0
2019-10-01      0.0
2019-11-01      0.0
2019-12-01      0.0
2020-01-01      0.0
2020-02-01      0.0
2020-03-01      0.0
2020-04-01      0.0
2020-05-01      0.0
2020-06-01      0.0
2020-07-01      0.0
Name: (124, 7278, Россия, Респ. Татарстан, 17460), dtype: float64

In [43]:
# Extended grouping keys: the five base keys plus the material hierarchy
# names and the contract type.
agg_cols = [
    "material_code",
    "company_code",
    "country",
    "region",
    "manager_code",
    "material_lvl1_name",
    "material_lvl2_name",
    "material_lvl3_name",
    "contract_type",
]

# Rebuild the groups-by-months volume table with the extended keys.
group_ts = (
    data
    .groupby([*agg_cols, "month"])["volume"]
    .sum()
    .unstack(fill_value=0)
)
group_ts.shape

(943, 31)

In [44]:
# Take the first group's series; row.name holds its group-key tuple and
# row.index holds the month labels.
row = group_ts.iloc[0]

In [45]:
# Categorical feature vector for the last month of `row`: the group keys
# (row.name) followed by that month's number.
# The previous loop rebuilt `vector` on every iteration and never used the
# volume value, so only the final month survived; index the last label directly.
vector = [*row.name, row.index[-1].month]

In [46]:
# Show the feature vector built in the previous cell.
vector

[124,
 7278,
 'Россия',
 'Респ. Татарстан',
 17460,
 'Базовые полимеры',
 'ПЭ',
 'ПЭНП',
 'Contract + Spot',
 7]

In [49]:
# Month number taken from each deal's date; added to `data` as an extra
# categorical column (this mutates `data` in place).
data['month_'] = data['date'].dt.month

agg_cols = [
    "material_code",
    "company_code",
    "country",
    "region",
    "manager_code",
    "material_lvl1_name",
    "material_lvl2_name",
    "material_lvl3_name",
    "contract_type",
    "month_",
]

# Fit a one-hot encoder on the categorical columns; the fitted encoder is the
# value displayed by the cell.
encoder = OneHotEncoder()
encoder.fit(data[agg_cols])

OneHotEncoder()

In [50]:
# Length of the flattened one-hot encoding of a single feature vector.
encoder.transform([vector]).toarray().flatten().shape

(538,)

In [119]:
# Restore a previously saved encoder from disk.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load artifacts you produced yourself.
with pathlib.Path('./lstm/ohe_encoder.pkl').open('rb') as encoder_file:
    encoder = pickle.load(encoder_file)

# Build the dataset from the deals table, the encoder and a date period.
dataset = SiburDataset(
    data=data,
    encoder=encoder,
    period={'start': '2018-01-01', 'end': '2019-03-01'},
)

In [23]:
# Persist the current encoder next to the notebook.
with pathlib.Path('ohe_encoder.pkl').open('wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

# inference

In [8]:
from lstm.model import SiburModel
from lstm.dataset import get_loader
from pathlib import Path
from sklearn.metrics import mean_squared_log_error
import torch

In [9]:
def load_model(model_weights, hidden_dim=2048, num_layers=2):
    """Build a SiburModel and load its weights from ``model_weights`` if present.

    Args:
        model_weights: Path (str or Path) to the checkpoint file.
        hidden_dim: Forwarded to ``SiburModel``; defaults to the previously
            hard-coded 2048.
        num_layers: Forwarded to ``SiburModel``; defaults to the previously
            hard-coded 2.

    Returns:
        The SiburModel instance. When the checkpoint file does not exist the
        model is returned exactly as constructed and a warning is emitted, so
        that a missing file no longer goes unnoticed.
    """
    import warnings

    model = SiburModel(hidden_dim=hidden_dim, num_layers=num_layers)
    model_path = Path(model_weights)
    if model_path.exists():
        model.load_model(model_path)
        print('Модель загружена с', model_path)
    else:
        # Keep the best-effort behaviour (still return a model) but make the
        # missing checkpoint visible instead of silently continuing.
        warnings.warn(
            f"Checkpoint {model_path} not found; returning a model without loaded weights.",
            stacklevel=2,
        )
    return model

def predict_month(df, num_workers=2, model_weights='lstm/experiment/last.pth'):
    """Predict one value per group and aggregate it to the five key columns.

    Args:
        df: Deals table containing the grouping columns, "month" and "volume".
        num_workers: Forwarded to ``get_loader``.
        model_weights: Checkpoint path forwarded to ``load_model``; defaults
            to the previously hard-coded location.

    Returns:
        DataFrame with the five key columns and a ``prediction`` column holding
        the per-key sum of the model outputs.
    """
    key_cols = ['material_code', 'company_code', 'country', 'region', 'manager_code']
    agg_cols = key_cols + ["material_lvl1_name", "material_lvl2_name",
                           "material_lvl3_name", "contract_type"]

    model = load_model(model_weights)
    model = torch.quantization.quantize_dynamic(
        model,
        {torch.nn.Linear, torch.nn.ReLU},  # a set of layers to dynamically quantize
        dtype=torch.qint8
        )

    dataloader = get_loader(
        df,
        shuffle=False,
        period=None,
        num_workers=num_workers
        )
    preds = model.predict(dataloader)

    # NOTE(review): assumes `preds` is ordered like the rows of this grouped
    # table (i.e. get_loader iterates groups in the same order) — TODO confirm.
    test = df.groupby(agg_cols + ["month"])["volume"].sum().unstack(fill_value=0)
    test['prediction'] = preds

    # Sum only the prediction column over the key levels. A pivot_table without
    # `values` aggregates every remaining column, so the string columns can end
    # up in the result depending on the pandas version.
    preds_df = test['prediction'].groupby(level=key_cols).sum().reset_index()
    return preds_df

In [10]:
def get_gt_df(df, month):
    """Return the actual volume of ``month`` summed per key-column combination.

    Args:
        df: Deals table containing the grouping columns, "month" and "volume".
        month: Month to extract (string or Timestamp). It was previously
            ignored in favour of a hard-coded '2020-07-01'.

    Returns:
        DataFrame with the five key columns followed by one value column
        (labelled with the selected month) holding the summed volume.

    Raises:
        KeyError: if ``month`` is not one of the months present in ``df``.
    """
    key_cols = ['material_code', 'company_code', 'country', 'region', 'manager_code']
    agg_cols = key_cols + ["material_lvl1_name", "material_lvl2_name",
                           "material_lvl3_name", "contract_type"]

    monthly = df.groupby(agg_cols + ["month"])["volume"].sum().unstack(fill_value=0)

    # Normalise to a Timestamp for datetime columns so the lookup is an exact
    # label match and always yields a single Series.
    target = pd.Timestamp(month) if isinstance(monthly.columns, pd.DatetimeIndex) else month
    gt = monthly.loc[:, target]

    # Sum only the selected month over the key levels; a pivot_table without
    # `values` would also try to aggregate the string columns.
    gt_df = gt.groupby(level=key_cols).sum().reset_index()
    return gt_df

def measure(preds_df, gt_df):
    """Return the root of the mean squared log error between two frames.

    The ground-truth frame is left-joined with the predictions on the five key
    columns; keys with no prediction get 0 via ``fillna(0)``.

    Args:
        preds_df: Frame with the key columns and the predicted values.
        gt_df: Frame with the key columns and the actual values.

    Returns:
        Square root of ``mean_squared_log_error`` computed on the last two
        columns of the merged frame.
    """
    key_cols = ['material_code', 'company_code', 'country', 'region', 'manager_code']
    merged = gt_df.merge(preds_df, how='left', on=key_cols).fillna(0)

    # Columns are taken by position: second to last is passed as the true
    # values, last as the predictions.
    # NOTE(review): assumes each input has exactly one value column after the
    # keys — TODO confirm for other callers.
    y_true = merged.iloc[:, -2]
    y_pred = merged.iloc[:, -1]
    return mean_squared_log_error(y_true, y_pred)**0.5

In [None]:
from lstm.predict import predict

In [12]:
# Predict from the deals dated strictly before the held-out month.
month = '2020-07-01'
df_pred = predict(data.loc[data['date'] < month], None)

100%|█████████████████████████████████████████| 943/943 [09:19<00:00,  1.69it/s]


In [13]:
# Build the ground-truth frame and score the predictions against it; the
# value displayed is what measure() returns.
df_gt = get_gt_df(data, month)
measure(df_pred, df_gt)

1.7419878178098591

Полный датасет: 1.7313655040851463

In [15]:
from lstm.dataset import get_loader

In [19]:
# Reload the deals from disk and build a shuffled training loader over the
# given period, using the encoder pickled earlier in this notebook.
df = pd.read_csv('sc2021_train_deals.csv', parse_dates=["month", "date"])
train_dataloader = get_loader(
    df,
    shuffle=True,
    period=dict(start='2018-01-01', end='2020-07-01'),
    num_workers=0,
    train=True,
    encoder_path='ohe_encoder.pkl',
)

In [20]:
# Length of the training loader.
# NOTE(review): presumably the number of batches per epoch — confirm against get_loader.
len(train_dataloader)

943

In [None]:
# months = pd.date_range(start='2019-06-01', end='2020-07-01', freq='MS')  
# results = []
# for m in months:
#     preds_df = predict_month(data[data['month'] < m])
#     gt_df = get_gt_df(data, month=m)
#     score = measure(preds_df, gt_df)
#     results.append(score)
#     print(m, f'score: {score}')

# mean = sum(results) / len(results)
# print(f'Mean score on test_data: {mean}')