In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from tqdm import tqdm

from get_data_all_sources import get_data, make_mpstats_request
from source.net_bilstm import BiLSTMAutoencoder # old solution
from source.net_trend_bi import PredictTrendBi # new solution
from source.net_expect import ExpectParams # new exponenta solution
from source.net_seq2seq import Seq2Seq 
from source.exponenta import get_stat

from datetime import datetime, timedelta

import torch
import torch.nn as nn
import requests

DATA_PATH = 'data/'
PRODUCTS_DIR = 'products/'
SOURCE_PATH = 'source/'
WEIGHTS_DIR = 'weights/'
PRODUCTS_PRICES_DIR = 'products_prices/'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def make_tg_report(text) -> None:
    token = '6498069099:AAFtdDZFR-A1h1F-8FvOpt6xIzqjCbdLdsc'
    method = 'sendMessage'
    chat_id = 324956476
    _ = requests.post(
            url='https://api.telegram.org/bot{0}/{1}'.format(token, method),
            data={'chat_id': chat_id, 'text': text} 
        ).json()

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# new_products/products/products_prices
CURRENT_PRODUCTS_DIR = DATA_PATH + 'new_products/'
NUM_DAYS_FOR_CHOICE = 5
NUM_DAYS_TO_PREDICT = 14
NUM_METHODS_TO_TEST = 2 # лучший, среднее

In [4]:
model_trend_v1 = BiLSTMAutoencoder(2, 128, 1)
model_trend_v1.load_state_dict(torch.load(WEIGHTS_DIR+'trend_bilstm_cpu', weights_only=False, map_location=device));

model_trend_v2 = PredictTrendBi(3, 128, 2)
model_trend_v2.load_state_dict(torch.load(WEIGHTS_DIR+'trend_2enc_1dec_128_bi', weights_only=False, map_location=device));   

model_exp = ExpectParams(3, 128)
model_exp.load_state_dict(torch.load(WEIGHTS_DIR+'expect_128_1', weights_only=False, map_location=device));  

model_predict = Seq2Seq(2, 256, 1)
model_predict.load_state_dict(torch.load(WEIGHTS_DIR+'trend_gru', map_location=device));

In [5]:
def find_best_exp_model(df, num_days=NUM_DAYS_FOR_CHOICE):
    data = df.iloc[:,1:4].to_numpy(copy=True)
    sum_exp, sum_v2, real = 0, 0, 0
    for day in range(num_days):
        p = data[:len(data)-num_days+day+1, 1] # добавляем единицу чтобы завтрашняя цена была в нормировке
        x = (p - p.min())/(p.max()-p.min())
        tomm_price = x[-1]
        x = x[:-1].copy() # убираем обратно
        order = data[:-num_days+day, 0].copy()
        balance = data[:-num_days+day, 2]
        bal = np.ones_like(balance)
        # bal[balance<order] = 0 
        LEN = x.shape[0]
        X = torch.zeros(1,LEN,3,dtype=torch.float32)
        X[:,:,0] = torch.tensor(bal)
        X[:,:,1] = torch.tensor(x)
        X[:,:,2] = torch.tensor(order)
        # expect
        exp_params = model_exp(X)
        exp_params = exp_params.detach().squeeze().numpy()
        predict_exp = np.exp(exp_params[1]-tomm_price*exp_params[0])
        sum_exp += predict_exp
        # new v2 
        out_v2 = model_trend_v2(X)
        out_v2 = out_v2.detach().squeeze().numpy()
        discount_v2 = order/out_v2*out_v2[-1]
        v2_params = get_stat(x, discount_v2)
        predict_v2 = np.exp(v2_params[0]+tomm_price*v2_params[1])
        sum_v2 += predict_v2
        
        real += data[-num_days+day, 0].copy()
    exp_res = np.abs(sum_exp-real)/real
    v2_res = np.abs(sum_v2-real)/real
    # finding best
    v2 = lambda x: np.exp(v2_params[0]+x*v2_params[1])
    exp = lambda x: np.exp(exp_params[1]-x*exp_params[0])
    if v2_res < exp_res:
        # print('v2 best')
        return v2, exp
    else:
        # print('exp best')
        return exp, v2

In [6]:
def make_torch_data(df):
    order = df.sales.to_numpy(copy=True)
    balance = df.balance.to_numpy(copy=True)
    bal = np.ones_like(df.balance.to_numpy(copy=True))
    price = df.price.to_numpy(copy=True)
    price_norm = (price - price.min())/(price.max()-price.min())
    # extract trend
    data = torch.zeros(1, price_norm.size, 3, dtype=torch.float32)
    data[:,:,0] = torch.tensor(bal)
    data[:,:,1] = torch.tensor(price_norm)
    data[:,:,2] = torch.tensor(order)
    return data

In [7]:
import warnings
warnings.filterwarnings("ignore")

In [63]:
L = os.listdir(CURRENT_PRODUCTS_DIR)#[-10:]
Z = np.zeros((len(L), NUM_METHODS_TO_TEST+2+1))
for k, ls in enumerate(tqdm(L)): 
    df = pd.read_csv(CURRENT_PRODUCTS_DIR+ls)
    train_df = df[:-NUM_DAYS_TO_PREDICT].copy()
    test_df = df[-NUM_DAYS_TO_PREDICT:].copy()
    # sku = int(ls[:-4])
    # print(f'{sku}')
    # отрезаем данные для теста и предикта
    try:
        best_func, sec_func = find_best_exp_model(train_df)
        data = make_torch_data(train_df)
        data = data.permute([1,0,2])
        data = data[:,:,1:]
        # preds = model_predict(data, num_step=NUM_DAYS_TO_PREDICT).detach().squeeze().numpy()
        preds = np.ones(NUM_DAYS_TO_PREDICT)
        test_prices = test_df.price.values
        min_price, max_price = df.price.min(), df.price.max()
        norm_test_prices = (test_prices - min_price)/(max_price - min_price)
        test_orders = test_df.sales.values.sum()
        best_preds = best_func(norm_test_prices)
        sec_preds = sec_func(norm_test_prices)
        Z[k, 0] = test_orders - (best_preds * preds).sum() 
        Z[k, 1] = test_orders - ((best_preds+sec_preds)/2 * preds).sum()
        Z[k, 2] = Z[k, 0] / test_orders
        Z[k, 3] = Z[k, 1] / test_orders
        Z[k, 4] = test_orders
    except:
        sku = int(ls[:-4])
        print(f'problems with {sku}')
# Z = np.round(Z, 3)

  1%|          | 4/668 [00:00<01:20,  8.20it/s]

problems with 100470447


 40%|███▉      | 265/668 [00:37<00:45,  8.93it/s]

problems with 176777205


 53%|█████▎    | 354/668 [00:49<00:27, 11.51it/s]

problems with 207925692


 68%|██████▊   | 454/668 [00:58<00:16, 13.33it/s]

problems with 231046298


100%|██████████| 668/668 [01:26<00:00,  7.69it/s]


In [64]:
res = pd.DataFrame(Z[Z[:, -1] >= NUM_DAYS_TO_PREDICT]).abs()
res.columns = ['best', 'mean', 'best_perc', 'mean_perc', 'real']
res.replace([np.inf, -np.inf], np.nan, inplace=True)
res.dropna(inplace=True)
res.loc['means'] = res.mean(axis=0)
res.tail().style.highlight_min(axis=1, props='color:red;')

Unnamed: 0,best,mean,best_perc,mean_perc,real
413,6.4098,37.551161,0.032049,0.187756,200.0
414,70.826744,92.708879,0.267271,0.349845,265.0
415,92.409805,102.268943,0.315392,0.349041,293.0
416,37.299072,42.21506,0.731354,0.827746,51.0
means,48.81708,53.412427,0.563042,0.611583,153.402878


In [20]:
res = pd.DataFrame(Z[Z[:, -1] >= NUM_DAYS_TO_PREDICT]).abs()
res.columns = ['best', 'mean', 'best_perc', 'mean_perc', 'real']
res.replace([np.inf, -np.inf], np.nan, inplace=True)
res.dropna(inplace=True)
res.loc['means'] = res.mean(axis=0)
res.tail().style.highlight_min(axis=1, props='color:red;')

Unnamed: 0,best,mean,best_perc,mean_perc,real
413,68.692522,37.551161,0.343463,0.187756,200.0
414,70.826744,92.708879,0.267271,0.349845,265.0
415,112.128081,102.268943,0.38269,0.349041,293.0
416,37.299072,42.21506,0.731354,0.827746,51.0
means,52.418637,53.412427,0.580621,0.611583,153.402878


# Посмотрим, а нужна ли линэксп вообще

In [8]:
from source.lin_exp import lin_exponenta, lin_exp

In [9]:
def find_trend(data):
    trend = model_trend_v2(data)
    trend = trend.detach().numpy().squeeze()
    trend = trend/trend[-1]
    return trend

In [10]:
L = os.listdir(CURRENT_PRODUCTS_DIR) #[-10:]
ls = L[11]
df = pd.read_csv(CURRENT_PRODUCTS_DIR+ls)

data = make_torch_data(df)
order = df.sales.to_numpy(copy=True)
trend = find_trend(data)
discount_order = order/trend
price = df.price.to_numpy(copy=True)
price_norm = (price - price.min())/(price.max()-price.min())

In [11]:
def get_func(price_norm, discount_order):
    """returns `linexp` or `exp` function whichever best for this case"""
    a0,b0,c0,I,out = lin_exponenta(price_norm, discount_order)
    func_lin = True if out*I else False
    b = get_stat(price_norm, discount_order)
    func_name = 'exp'
    f = lambda x: np.exp(b[0]+b[1]*x)
    # if func_lin:
    o1 = lin_exp(price_norm,a0,b0,c0)  
    s1 = ((o1 - discount_order)**2).sum()
    o2 = np.exp(b[0]+b[1]*price_norm)
    s2 = ((o2 - discount_order)**2).sum()
    if s1 < s2: # лучше горка
        f = lambda x: lin_exp(x,a0,b0,c0) 
        func_name = 'linexp'
    return f, func_name, s1, s2

In [12]:
f, func_name, s1, s2 = get_func(price_norm, discount_order)
func_name, s1, s2

('exp', nan, 79286.00581781377)

In [13]:
L = os.listdir(CURRENT_PRODUCTS_DIR)#[-10:]
Z = np.zeros((len(L), NUM_METHODS_TO_TEST+2+1))
s1_all, s2_all = list(), list()
ctr = np.zeros(2)
for k, ls in enumerate(tqdm(L)): 
    try:
        df = pd.read_csv(CURRENT_PRODUCTS_DIR+ls)
        data = make_torch_data(df)
        order = df.sales.to_numpy(copy=True)
        trend = find_trend(data)
        discount_order = order/trend
        price = df.price.to_numpy(copy=True)
        price_norm = (price - price.min())/(price.max()-price.min())
        f, func_name, s1, s2 = get_func(price_norm, discount_order)
        s1_all.append(s1)
        s2_all.append(s2)
        if func_name == 'linexp':
            ctr[1]+=1 
        else:
            ctr[0]+=1
    except:
        sku = int(ls[:-4])
        print(f'problems with {sku}')

 40%|███▉      | 265/668 [02:11<01:29,  4.49it/s]

problems with 176777205


 53%|█████▎    | 352/668 [02:44<01:23,  3.80it/s]

problems with 207925692


100%|██████████| 668/668 [04:40<00:00,  2.38it/s]


In [18]:
diffs = pd.DataFrame([s1_all, s2_all]).T.dropna().reset_index(drop=True)
diffs.columns = ['linexp', 'exp']

In [45]:
subs = diffs[diffs.linexp < diffs.exp].copy()
subs['perc'] = ((subs.exp - subs.linexp) / subs.exp * 100).sort_values(ascending=False)
subs = subs.sort_values('perc', ascending=False)

In [50]:
subs[subs.perc>5].shape[0] / subs.shape[0]

0.2196969696969697

In [52]:
subs.shape[0]/ diffs.shape[0], diffs.shape[0]

(0.5581395348837209, 473)