In [None]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import math

from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from scipy import signal
from scipy.fft import fftshift

In [463]:
# enter_raw = pd.read_csv('../data/marking/enter.csv', parse_dates=['dt'])
# enter_int = enter_raw.rename(columns={ 'cnt': 'enter_cnt' }).set_index('dt')
# enter_agg = enter_int.groupby('dt').sum(numeric_only=True)
# enter = enter_agg.resample('D').sum(numeric_only=True)[:-1]
# enter.to_csv('../data/marking/enter-aggregate.csv')
enter = pd.read_csv('../data/marking/enter-aggregate.csv', parse_dates=['dt'], index_col='dt')
enter_smooth = enter.rolling(7, center=True).mean()
enter_std = enter_smooth['enter_cnt'].std()
enter_norm = enter_smooth.copy()
enter_norm['enter_cnt'] = enter_norm['enter_cnt'] / enter_std

In [464]:
tmp = enter.copy()
tmp['enter_cnt_smooth'] = enter_smooth['enter_cnt']
px.line(tmp)

In [465]:
# leave_raw = pd.read_csv('../data/marking/leave.csv', parse_dates=['dt'])
# leave_int = leave_raw.drop(['price'], axis=1).rename(columns={ 'cnt': 'leave_cnt' }).set_index('dt')
# leave_agg = leave_int.groupby('dt').sum(numeric_only=True)
# leave = leave_agg.resample('D').sum(numeric_only=True)[:-1]
# leave.to_csv('../data/marking/leave-aggregate.csv')
leave = pd.read_csv('../data/marking/leave-aggregate.csv', parse_dates=['dt'], index_col='dt')
leave_smooth = leave.rolling(7, center=True).mean()
leave_std = leave_smooth['leave_cnt'].std()
leave_norm = leave_smooth.copy()
leave_norm['leave_cnt'] = leave_norm['leave_cnt'] / leave_std

In [466]:
tmp = leave.copy()
tmp['leave_cnt_smooth'] = leave_smooth['leave_cnt']
px.line(tmp)

In [241]:
# search_raw = pd.read_csv('../dictionary/google-trends/парфюм.csv', parse_dates=['Неделя'])
# search_int = search_raw.rename(columns={ 'Неделя': 'dt', 'Баллы': 'search_cnt' }).set_index('dt')
# search_int2 = search_int.resample('D').interpolate('pchip')
# search = search_int2.loc['2021-11-28':'2022-11-20']
# search.to_csv('../data/marking/search-aggregate.csv')
search = pd.read_csv('../data/marking/search-aggregate.csv', parse_dates=['dt'], index_col='dt')
search_std = 100 / 6 # @NOTE: Диапазон значений от 0 до 100
search_norm = search.copy()
search_norm['search_cnt'] = search_norm['search_cnt'] / search_std
px.line(search_norm)
# search_stdо

In [641]:
secondary_data_raw = pd.read_csv('../data/marking/secondary_data.csv', parse_dates=['dt'])
secondary_data_int = secondary_data_raw.set_index('dt')
secondary_data_int2 = secondary_data_int.resample('D').interpolate('cubic')
secondary_data = secondary_data_int2[['physical_loans']] # @NOTE: leave
# secondary_data = secondary_data_int2[[]] # @NOTE: enter
secondary_data_std = secondary_data.std()
secondary_data_norm = secondary_data / secondary_data_std

2021-10-28
2021-10-29
2021-10-30
2021-10-31
2021-11-01
...
2023-02-24
2023-02-25
2023-02-26
2023-02-27
2023-02-28


In [642]:
day_data_raw = pd.read_csv('../data/marking/days.csv', parse_dates=['dt'])
day_data = day_data_raw.groupby('dt').max().astype(np.float64)
day_data['day_type'] /= 4 # @NOTE: 0..4
day_data['day_type'] = day_data['day_type'].shift(-45)
day_data['day_of_year'] = day_data.index.day_of_year / 365.0
# day_data['day_of_week'] = day_data.index.day_of_week / 7

In [643]:
data = enter_smooth.merge(leave_smooth, on='dt')
data = data.merge(search, on='dt')
data = data.merge(secondary_data, on='dt')
data = data.merge(day_data, on='dt')
data = data.dropna()

In [644]:
data_norm = enter_norm.merge(leave_norm, on='dt')
data_norm = data_norm.merge(search_norm, on='dt')
data_norm = data_norm.merge(secondary_data_norm, on='dt')
data_norm = data_norm.merge(day_data, on='dt')
data_norm = data_norm.dropna()
px.scatter(data_norm)

In [651]:
HISTORY_SIZE = 3 * 4 * 7 # @NOTE: 3 месяца
HORIZON_SIZE = 3 * 4 * 7 # @NOTE: 3 месяца

window_size = HISTORY_SIZE + HORIZON_SIZE
windows = [window for window in data_norm.rolling(window_size) if len(window) == window_size]

x_shape = (len(data_norm.columns), HISTORY_SIZE)
x_elm = x_shape[0] * x_shape[1]
y_shape = (1, HORIZON_SIZE)
y_elm = y_shape[0] * y_shape[1]

x = []
y = []
for window in windows:
    x.append(window[0:HISTORY_SIZE].values.reshape((x_elm)))
    # y.append(window[HISTORY_SIZE:][['enter_cnt']].values.reshape((y_elm)))
    y.append(window[HISTORY_SIZE:][['leave_cnt']].values.reshape((y_elm)))

x = np.array(x, 'float32')
y = np.array(y, 'float32')

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2)

activation = 'relu'
model = keras.Sequential([
    keras.Input(shape=(x_elm)),
    layers.Dropout(0.25),
    layers.Dense(x_elm, activation=activation, kernel_regularizer=keras.regularizers.L1L2(0.0001, 0.0001)),
    layers.Dropout(0.25),
    layers.Dense(y_elm),
])

model.compile(loss="mse", optimizer="adamax", metrics=["accuracy"])

[initial_loss, initial_accuracy] = model.evaluate(x_train, y_train)
[initial_val_loss, initial_val_accuracy] = model.evaluate(x_val, y_val)
history = pd.DataFrame(dict(
    loss=[initial_loss],
    accuracy=[initial_accuracy],
    val_loss=[initial_val_loss],
    val_accuracy=[initial_val_accuracy],
    time=[0],
    batch_size=[math.nan],
))

params = pd.DataFrame(dict(
    history_size=HISTORY_SIZE,
    horizon_size=HORIZON_SIZE,
    train_len=len(x_train),
    val_len=len(x_val),
    test_len=len(x_test),
    search_std=[search_std],
    # y_std=[enter_std],
    y_std=[leave_std],
))

model.summary()
print("Data set [train, val, test]:", [len(x_train), len(x_test), len(x_val)])
print("Shapes [x, y]:", [x_shape, y_shape])

Model: "sequential_39"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dropout_73 (Dropout)        (None, 420)               0         
                                                                 
 dense_80 (Dense)            (None, 420)               176820    
                                                                 
 dropout_74 (Dropout)        (None, 420)               0         
                                                                 
 dense_81 (Dense)            (None, 84)                35364     
                                                                 
Total params: 212,184
Trainable params: 212,184
Non-trainable params: 0
_________________________________________________________________
Data set [train, val, test]: [135, 19, 34]
Shapes [x, y]: [(5, 84), (1, 84)]


In [662]:
from lib import TimeCallback

batch_size = 2
epochs = 4000

model.summary()
# @NOTE: Keras does not have the "time" metric, so we have to track it manually
time_callback = TimeCallback()
hist = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_val, y_val), callbacks=[time_callback])
history_chunk = pd.merge(
    pd.DataFrame(hist.history),
    pd.DataFrame(dict(time=time_callback.times, batch_size=batch_size)),
    left_index=True, right_index=True,
)
history = pd.concat((history, history_chunk), ignore_index=True)

Model: "sequential_39"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dropout_73 (Dropout)        (None, 420)               0         
                                                                 
 dense_80 (Dense)            (None, 420)               176820    
                                                                 
 dropout_74 (Dropout)        (None, 420)               0         
                                                                 
 dense_81 (Dense)            (None, 84)                35364     
                                                                 
Total params: 212,184
Trainable params: 212,184
Non-trainable params: 0
_________________________________________________________________
Epoch 1/4000
Epoch 2/4000
Epoch 3/4000
Epoch 4/4000
Epoch 5/4000
Epoch 6/4000
Epoch 7/4000
Epoch 8/4000
Epoch 9/4000
Epoch 10/4000
Epoch 11/4000
Epoch 12/4000
Epoch 13/4000
Epoch 

In [663]:
px.line(history, y=['loss', 'val_loss'], log_y=True, log_x=True).show()
px.line(history, y=['accuracy', 'val_accuracy'], log_y=True, log_x=True).show()

In [665]:
def test_model(offset):
    out = data.copy()
    tx = out.copy().iloc[len(data) - offset - HISTORY_SIZE:len(data) - offset]
    tx['enter_cnt'] /= enter_std
    tx['leave_cnt'] /= leave_std
    tx['search_cnt'] /= search_std
    mx = tx.values.reshape((1, x_elm))
    my = model.predict(mx)
    my = my.reshape(y_shape).transpose()
    ty = pd.DataFrame(my)
    ty_index = pd.date_range(tx.index[-1] + pd.DateOffset(1), tx.index[-1] + pd.DateOffset(HORIZON_SIZE), freq='D')
    # ty = pd.DataFrame({ 'dt': ty_index, 'enter_cnt': ty[0], 'leave_cnt': ty[1] }).set_index('dt')
    ty = pd.DataFrame({ 'dt': ty_index, 'leave_cnt': ty[0] }).set_index('dt')
    # ty = pd.DataFrame({ 'dt': ty_index, 'enter_cnt': ty[0] }).set_index('dt')
    # ty['enter_cnt'] *= enter_std
    ty['leave_cnt'] *= leave_std
    out = out.merge(ty, on='dt', how='outer', suffixes=['', '_model'])

    res = out
    # px.line(res, y=['enter_cnt', 'enter_cnt_model'], title=f"offset = {offset}").show()
    px.line(res, y=['leave_cnt', 'leave_cnt_model'], title=f"offset = {offset}").show()

test_model(0*7)
test_model(5*7)
test_model(10*7)
test_model(15*7)
test_model(20*7)
test_model(25*7)
test_model(30*7)
test_model(35*7)



















In [661]:
# SAVE_MODEL_PATH = f'../data/models/enter_v1'
#
# model.save(f'{SAVE_MODEL_PATH}/model')
# params.to_csv(f'{SAVE_MODEL_PATH}/params.csv', index=False)
# history.to_csv(f'{SAVE_MODEL_PATH}/history.csv', index=False)
# data_norm.to_csv(f'{SAVE_MODEL_PATH}/dataset.csv')