In [None]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import math

from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from scipy import signal
from scipy.fft import fftshift

In [None]:
# Weekly "enter" counts.
# Only the cached aggregate CSV is read here. The recipe that produced it from the
# raw export is left commented out below for reference:
# enter_raw = pd.read_csv('../data/marking/enter.csv', parse_dates=['dt'])
# enter_int = enter_raw.rename(columns={ 'cnt': 'enter_cnt' }).set_index('dt')
# enter_agg = enter_int.groupby('dt').sum(numeric_only=True)
# enter = enter_agg.resample('W').sum(numeric_only=True)[:-1]
# enter.to_csv('../data/marking/enter-aggregate.csv')
enter = pd.read_csv('../data/marking/enter-aggregate.csv', index_col='dt', parse_dates=['dt'])

# Scale the series by its own standard deviation (no mean-centering).
# enter_std is kept as a separate name because the prediction cell multiplies
# by it to convert model output back to raw counts.
enter_std = enter['enter_cnt'].std()
enter_norm = enter.assign(enter_cnt=enter['enter_cnt'] / enter_std)

In [None]:
# Weekly "leave" counts.
# Only the cached aggregate CSV is read here. The recipe that produced it from the
# raw export (which also drops the 'price' column) is left commented out below:
# leave_raw = pd.read_csv('../data/marking/leave.csv', parse_dates=['dt'])
# leave_int = leave_raw.drop(['price'], axis=1).rename(columns={ 'cnt': 'leave_cnt' }).set_index('dt')
# leave_agg = leave_int.groupby('dt').sum(numeric_only=True)
# leave = leave_agg.resample('W').sum(numeric_only=True)[:-1]
# leave.to_csv('../data/marking/leave-aggregate.csv')
leave = pd.read_csv('../data/marking/leave-aggregate.csv', index_col='dt', parse_dates=['dt'])

# Scale the series by its own standard deviation (no mean-centering).
# leave_std is kept as a separate name because the prediction cell multiplies
# by it to convert model output back to raw counts.
leave_std = leave['leave_cnt'].std()
leave_norm = leave.assign(leave_cnt=leave['leave_cnt'] / leave_std)

In [None]:
# Weekly search-interest series.
# Only the cached aggregate CSV is read here. The recipe that produced it from the
# Google Trends export is left commented out below (the source column names are
# renamed to 'dt' and 'search_cnt', and the series is cut to a fixed date range):
# search_raw = pd.read_csv('../dictionary/google-trends/парфюм.csv', parse_dates=['Неделя'])
# search_int = search_raw.rename(columns={ 'Неделя': 'dt', 'Баллы': 'search_cnt' }).set_index('dt')
# search = search_int.loc['2021-11-28':'2022-11-20']
# search.to_csv('../data/marking/search-aggregate.csv')
search = pd.read_csv('../data/marking/search-aggregate.csv', index_col='dt', parse_dates=['dt'])

# @NOTE: values range from 0 to 100, so a fixed scale is used instead of the sample std.
# NOTE(review): the divisor 6 presumably treats the full range as ~6 standard
# deviations - confirm with the author.
search_std = 100 / 6
search_norm = search.assign(search_cnt=search['search_cnt'] / search_std)

# Last expression of the cell: renders the normalised series.
px.line(search_norm)

In [None]:
# Join the three raw weekly series on 'dt'. merge() defaults to an inner join,
# so only weeks present in all three frames survive.
# Resulting column order: enter_cnt, leave_cnt, search_cnt.
data = enter.merge(leave, on='dt').merge(search, on='dt')

# Last expression of the cell: renders the raw (un-normalised) series together.
px.line(data)

In [None]:
# Same join as for the raw data, but on the normalised frames. The column order
# (enter_cnt, leave_cnt, search_cnt) is what the windowing cell flattens into
# model inputs.
data_norm = enter_norm.merge(leave_norm, on='dt').merge(search_norm, on='dt')

# Last expression of the cell: renders the normalised series on a common scale.
px.line(data_norm)

In [450]:
HISTORY_SIZE = 4 * 3 # @NOTE: 3 months of weekly rows used as model input
HORIZON_SIZE = 4 * 3 # @NOTE: 3 months of weekly rows predicted by the model
SPLIT_SEED = 42 # @NOTE: fixes the train/val/test partition so re-runs are comparable

# Every run of HISTORY_SIZE + HORIZON_SIZE consecutive rows is one sample.
# rolling() also yields shorter leading windows; those are filtered out.
window_size = HISTORY_SIZE + HORIZON_SIZE
windows = [window for window in data_norm.rolling(window_size) if len(window) == window_size]

# @NOTE: these tuples are used in this cell only for their element counts.
# The flattened vectors built below are time-major (all columns of week 0, then
# all columns of week 1, ...), i.e. the row-major flattening of a
# (weeks, columns) array, not of a (columns, weeks) array.
x_shape = (3, HISTORY_SIZE)
x_elm = x_shape[0] * x_shape[1]
y_shape = (2, HORIZON_SIZE)
y_elm = y_shape[0] * y_shape[1]

# Inputs: first HISTORY_SIZE rows, all three columns.
# Targets: remaining HORIZON_SIZE rows without 'search_cnt' (enter_cnt, leave_cnt).
x = np.array(
    [window.iloc[:HISTORY_SIZE].values.reshape(x_elm) for window in windows],
    'float32',
)
y = np.array(
    [window.iloc[HISTORY_SIZE:].drop(columns=['search_cnt']).values.reshape(y_elm) for window in windows],
    'float32',
)

# 10% test, then 20% of the remainder as validation.
# random_state makes the partition reproducible across kernel restarts.
# NOTE(review): the windows overlap heavily, so a shuffled split places nearly
# identical samples in train and val/test; the validation numbers are optimistic.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state=SPLIT_SEED)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state=SPLIT_SEED)

activation = 'relu'
model = keras.Sequential([
    # shape must be a tuple: `(x_elm)` is just the int x_elm, which some Keras
    # versions reject; `(x_elm,)` is accepted by all of them.
    keras.Input(shape=(x_elm,)),
    layers.Dense(24, activation=activation),
    layers.Dense(y_elm),
])

# NOTE(review): "accuracy" is a classification metric and is not a meaningful
# score for this MSE regression. It is kept because the history columns and the
# plots in later cells are keyed on 'accuracy' / 'val_accuracy'; switch all of
# them together (e.g. to "mae") if this is changed.
model.compile(loss="mse", optimizer="adamax", metrics=["accuracy"])

# Row 0 of `history` holds the metrics of the untrained model (time 0, no batch
# size); the training cell appends one row per epoch after it.
[initial_loss, initial_accuracy] = model.evaluate(x_train, y_train)
[initial_val_loss, initial_val_accuracy] = model.evaluate(x_val, y_val)
history = pd.DataFrame(dict(
    loss=[initial_loss],
    accuracy=[initial_accuracy],
    val_loss=[initial_val_loss],
    val_accuracy=[initial_val_accuracy],
    time=[0],
    batch_size=[math.nan],
))

model.summary()

Model: "sequential_72"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_125 (Dense)           (None, 24)                888       
                                                                 
 dense_126 (Dense)           (None, 24)                600       
                                                                 
Total params: 1,488
Trainable params: 1,488
Non-trainable params: 0
_________________________________________________________________


In [442]:
# NOTE(review): project-local import in the middle of the notebook; it belongs
# with the other imports in the first cell.
from lib import TimeCallback

batch_size = 8
epochs = 250

# @NOTE: Keras does not have the "time" metric, so we have to track it manually
# (TimeCallback is assumed to collect one entry per epoch in `.times` - confirm in lib).
time_callback = TimeCallback()
hist = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val, y_val),
    callbacks=[time_callback],
)

# One row per epoch: the Keras metrics joined by position with the timing and
# the batch size used for this run.
epoch_metrics = pd.DataFrame(hist.history)
epoch_timing = pd.DataFrame(dict(time=time_callback.times, batch_size=batch_size))
history_chunk = epoch_metrics.merge(epoch_timing, left_index=True, right_index=True)

# Append to the running history. Re-running this cell continues training the
# same model and appends another chunk of rows.
history = pd.concat((history, history_chunk), ignore_index=True)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

In [443]:
# One log-log figure per metric pair (training vs. validation), in this order:
# loss first, then accuracy.
# NOTE(review): x is the row index of `history`, which starts at 0; a log axis
# cannot place x=0, so the initial (pre-training) row is likely not visible.
for metric_columns in (['loss', 'val_loss'], ['accuracy', 'val_accuracy']):
    px.line(history, y=metric_columns, log_y=True, log_x=True).show()

In [449]:
OFFSET = 20 # @NOTE: the input slice ends this many rows before the end of `data`

out = data.copy()

# Model input: the HISTORY_SIZE rows that end OFFSET rows before the end of the
# data, scaled exactly like the training data. The explicit .copy() makes tx an
# independent frame, so the in-place divisions below neither touch `out` nor
# trigger a SettingWithCopyWarning.
tx = out.iloc[len(data) - OFFSET - HISTORY_SIZE:len(data) - OFFSET].copy()
tx['enter_cnt'] /= enter_std
tx['leave_cnt'] /= leave_std
tx['search_cnt'] /= search_std
mx = tx.values.reshape((1, x_elm))
my = model.predict(mx)

# The training targets were flattened row by row from (HORIZON_SIZE, 2) frames,
# i.e. [enter_0, leave_0, enter_1, leave_1, ...]. The prediction must be decoded
# with the same layout. Reshaping to (2, HORIZON_SIZE) and transposing, as this
# cell did before, interleaves enter and leave values in both output columns.
my = my.reshape((HORIZON_SIZE, y_shape[0]))

# The forecast covers the HORIZON_SIZE weekly dates that follow the last input
# row. The range starts one day after that row and spans HORIZON_SIZE * 7 days.
ty_index = pd.date_range(tx.index[-1] + pd.DateOffset(1), tx.index[-1] + pd.DateOffset(HORIZON_SIZE * 7), freq='W')
ty = pd.DataFrame(my, columns=['enter_cnt', 'leave_cnt'], index=ty_index.rename('dt'))

# Undo the normalisation so the forecast is in raw counts.
ty['enter_cnt'] *= enter_std
ty['leave_cnt'] *= leave_std

# The outer join keeps every observed week and every forecast week. Forecast
# columns get the '_model' suffix; observed columns keep their names.
out = out.merge(ty, on='dt', how='outer', suffixes=('', '_model'))

px.line(out, y=['enter_cnt', 'enter_cnt_model']).show()
px.line(out, y=['leave_cnt', 'leave_cnt_model']).show()


