In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd # 2.x to support pyarrow
import pyarrow as pa
import openpyxl # for reading xlsx with structure
import numpy as np
import plotly.express as px
import math
import re
from datetime import timedelta, datetime
import os

import tensorflow as tf
from tensorflow import keras

In [None]:
# Structure tables produced by the pipeline. In this notebook the index of x_structure
# supplies the feature column names and its columns are iterated as machine names;
# the index of y_structure is iterated as target places and its columns as machine names.
x_structure = pd.read_parquet('../data/pipeline/x_structure.parquet')
y_structure = pd.read_parquet('../data/pipeline/y_structure.parquet')

In [None]:
# Train/test feature tables and the training targets. Each of them carries the
# 'ИМЯ МАШИНЫ' and 'DT' columns that later cells turn into a (machine, timestamp) index.
# NOTE(review): the "1h" suffix presumably means hourly sampling — confirm against the pipeline.
x_train_pretty_1h = pd.read_parquet('../data/pipeline/x_train_pretty_1h.parquet')
x_test_pretty_1h = pd.read_parquet('../data/pipeline/x_test_pretty_1h.parquet')
y_train_tte_1h = pd.read_parquet('../data/pipeline/y_train_tte_1h.parquet')

In [None]:
# Test features indexed by (machine, timestamp), sorted, restricted to the columns
# named in x_structure.index; gaps are kept as NaN in the "raw" variant.
left_test_raw = (
    x_test_pretty_1h
    .set_index(['ИМЯ МАШИНЫ', 'DT'])
    .astype('float64')
    .sort_index()[x_structure.index]
)
# Forward-filled variant of the same frame.
# NOTE(review): ffill runs over the whole sorted frame, so leading gaps of one machine
# can be filled from the last rows of the previous machine — confirm this is acceptable.
left_test = left_test_raw.ffill()

# Same preparation for the training features.
left_train = (
    x_train_pretty_1h
    .set_index(['ИМЯ МАШИНЫ', 'DT'])
    .astype('float64')
    .sort_index()[x_structure.index]
    .ffill()
)

# Scale both sets by the per-column standard deviation of the training set (no centering).
# NOTE(review): a column with zero std would produce inf/NaN here — confirm none exist.
left_stats = left_train.describe()
left_train = left_train.div(left_stats.loc['std'], axis='columns')
left_test = left_test.div(left_stats.loc['std'], axis='columns')

# Divisor used to normalise targets; numerically the number of seconds in 31 days.
MAX_TTE = 31 * 24 * 60 * 60

In [None]:
def train_model(target_place_name_and_type):
    """Fit a small dense network for one target column and return the training history.

    The requested column of ``y_train_tte_1h`` is divided by ``MAX_TTE`` and joined to
    ``left_train`` on the (machine, timestamp) index. For each machine listed in
    ``x_structure.columns`` the joined rows are cut into windows of 24*7 rows, one
    window every 24 rows, and the per-machine datasets are concatenated in order.

    Args:
        target_place_name_and_type: Name of the target column in ``y_train_tte_1h``,
            e.g. ``'РОТОР TTE M3'``.

    Returns:
        The object returned by ``model.fit`` (its ``.history`` is plotted by the caller).
    """
    # Model caching is currently switched off:
    # model_path = f'../dist/models/{target_place_name_and_type}.h5'
    # if os.path.isfile(model_path):
    #     return

    window_length = 24 * 7
    feature_names = x_structure.index

    targets = y_train_tte_1h.set_index(['ИМЯ МАШИНЫ', 'DT'])[[target_place_name_and_type]]
    targets = targets.astype('float64') / MAX_TTE
    joined = pd.merge(left_train, targets, left_index=True, right_index=True)

    batches = None
    for machine in x_structure.columns:
        # Per-machine sequence: time-ordered, gaps forward-filled, remaining NaN set to 0.
        machine_rows = joined.loc[machine].sort_index().astype('float64').ffill().fillna(0)
        # NOTE(review): per the Keras docs, targets[i] is paired with the window that
        # STARTS at row i — confirm that this alignment is the intended one.
        machine_examples = keras.utils.timeseries_dataset_from_array(
            machine_rows[feature_names],
            machine_rows.drop(columns=feature_names),
            sequence_length=window_length,
            sequence_stride=24,
            seed=1337,
        )
        batches = machine_examples if batches is None else batches.concatenate(machine_examples)

    # Dense layers act on the last axis, so the network emits one value per window row.
    layers = [
        keras.Input((window_length, len(feature_names))),
        keras.layers.Dense(24 * 3, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid'),
    ]
    model = keras.Sequential(layers)
    model.compile(optimizer='adam', loss='mse')
    # model.summary()

    print(str(datetime.now()) + ' -- ' + target_place_name_and_type)
    history = model.fit(batches, epochs=1000)
    # px.line(history.history).show()

    # Saving is switched off as well:
    # model.save(model_path, save_format='h5')
    return history


# Train a model for a single target column and plot the per-epoch values
# recorded in the returned history.
history = train_model('РОТОР TTE M3')
px.line(history.history).show()

In [None]:
def evaluate_model(target_place_name_and_type, machine):
    """Run a saved model over one machine's training features and collect the forecasts.

    The model is loaded from ``../dist/models/<target_place_name_and_type>.h5``.
    A window of 24*7 rows is slid over the machine's features in steps of 24 rows;
    the last 24 outputs of each window are written to the 24 rows that follow it.

    Args:
        target_place_name_and_type: Target column name, also the model file stem.
        machine: First-level index key selecting the machine in ``left_train`` and
            in the targets.

    Returns:
        A DataFrame indexed like the machine's targets, with a single float column
        ``'ПРОГНОЗ'``; rows that no window wrote to keep the initial value 0.0.
        (The previous version built this frame but returned ``None``.)
    """
    window_length = 24 * 7
    step = 24

    model = keras.models.load_model(f'../dist/models/{target_place_name_and_type}.h5', compile=False)
    targets = y_train_tte_1h.set_index(['ИМЯ МАШИНЫ', 'DT'])[[target_place_name_and_type]]

    input_df = left_train.loc[machine].astype('float64').ffill()
    # NOTE(review): rows are written by position, which assumes the target index and
    # the feature index of this machine line up row for row — confirm.
    result = pd.DataFrame(index=targets.loc[machine].index)
    result['ПРОГНОЗ'] = 0.0  # float from the start: forecasts are sigmoid outputs
    forecast_col = result.columns.get_loc('ПРОГНОЗ')

    n_rows = len(input_df)
    for start in range(0, n_rows - window_length - step, step):
        batch = np.array([input_df.iloc[start:start + window_length]])
        output = model.predict(batch, verbose=0)
        # Single positional assignment on the frame itself; chained
        # ``result[col].iloc[...] = ...`` is a silent no-op under pandas Copy-on-Write.
        first_out = start + window_length
        result.iloc[first_out:first_out + step, forecast_col] = output[0][-step:].reshape(step)

        print("{:3.2f}%".format(100 * start / n_rows))

    return result

# evaluate_model('УЛИТА TTE M3', 'ЭКСГАУСТЕР А/М №4')

In [None]:
# Sample submission files. Only submission2_ref is used by the code visible in this notebook:
# apply_model checks its columns to decide which targets to predict.
submission1_ref = pd.read_excel('../data/source/sample_submission/submission_1.xlsx', index_col=0)
submission2_ref = pd.read_parquet('../data/source/sample_submission/sample_submission_2.parquet')
submission3_ref = pd.read_parquet('../data/source/sample_submission/sample_submission_3.parquet')

In [None]:
# Test features from the source data directory; apply_model uses only the index of this
# frame (as the row index of the submission slices).
x_test = pd.read_parquet('../data/source/X_test.parquet')

In [None]:
# submission1 = submission1_ref.copy()
# submission1['machine'] = np.NaN
# submission1['tm'] = np.NaN
# submission2 = pd.DataFrame(index=x_test.index, columns=submission2_ref.columns)
# submission3 = pd.DataFrame(index=x_test.index, columns=submission3_ref.columns)

def apply_model(target_place_name):
    """Predict the M1/M3 targets of one place for every machine on the test features.

    Both saved models of the place (``'<place> TTE M1'`` and ``'<place> TTE M3'``) are
    loaded once. For each machine whose target name appears in
    ``submission2_ref.columns``, a window of 24*7 rows is slid over the machine's test
    features in steps of 24 rows and the last 24 outputs of each window are written
    to the 24 rows that follow it. The forecasts are then smoothed with a 72-row
    rolling mean, resampled to a 10-second grid with interpolation, and remaining
    gaps are filled with 1.

    Args:
        target_place_name: Row label of ``y_structure`` and prefix of the model names.

    Returns:
        ``(submission2_slice, submission3_slice)``, both indexed by ``x_test.index``
        with one column per predicted target: the first holds 1 where the smoothed
        M3 forecast is below 0.2 and 0 otherwise, the second holds the smoothed M1
        forecast multiplied by ``MAX_TTE``. (The previous version discarded both.)
    """
    # Writing the slices to disk is currently switched off:
    # submission2_slice_path = f'../dist/submission2/{place}.parquet'
    # submission3_slice_path = f'../dist/submission3/{place}.parquet'
    # if os.path.isfile(submission2_slice_path) and os.path.isfile(submission3_slice_path):
    #     return
    window_length = 24 * 7
    step = 24

    submission2_slice = pd.DataFrame(index=x_test.index)
    submission3_slice = pd.DataFrame(index=x_test.index)

    print(str(datetime.now()) + ' -- ' + target_place_name)

    # The model files depend only on the place, so load them once instead of per machine.
    prediction_field_m1 = f'{target_place_name} TTE M1'
    prediction_field_m3 = f'{target_place_name} TTE M3'
    model_m1 = keras.models.load_model(f'../dist/models/{prediction_field_m1}.h5', compile=False)
    model_m3 = keras.models.load_model(f'../dist/models/{prediction_field_m3}.h5', compile=False)

    for machine in y_structure.columns:
        y_name = y_structure[machine].loc[target_place_name]
        if y_name not in submission2_ref.columns:
            continue

        input_df = left_test.loc[machine].astype('float64').ffill()
        # Float columns from the start; rows never written stay NaN until fillna(1) below.
        result = pd.DataFrame(
            index=input_df.index,
            columns=[prediction_field_m1, prediction_field_m3],
            dtype='float64',
        )

        for start in range(0, len(input_df) - window_length - step, step):
            batch = np.array([input_df.iloc[start:start + window_length]])
            output_m1 = model_m1.predict(batch, verbose=0)
            output_m3 = model_m3.predict(batch, verbose=0)
            # Single positional assignment on the frame itself; chained
            # ``result[col].iloc[...] = ...`` is a silent no-op under pandas Copy-on-Write.
            out_rows = slice(start + window_length, start + window_length + step)
            result.iloc[out_rows, 0] = output_m1[0][-step:].reshape(step)
            result.iloc[out_rows, 1] = output_m3[0][-step:].reshape(step)

        upsampled = result.rolling(72).mean().resample('10s').interpolate().fillna(1)
        submission2_slice[y_name] = (upsampled[prediction_field_m3] < 0.2).astype('int64')
        submission3_slice[y_name] = upsampled[prediction_field_m1] * MAX_TTE

    # submission2_slice.to_parquet(submission2_slice_path)
    # submission3_slice.to_parquet(submission3_slice_path)
    return submission2_slice, submission3_slice

# Run test-set inference for every place listed in the index of y_structure.
for place in y_structure.index:
    apply_model(place)

# for place in ['РОТОР']:
#     apply_model(place)

In [None]:
def prepare_results(target_place_name):
    """Predict M1/M3 for one place over train+test features and store them as parquet.

    Nothing is done when both output files already exist. Otherwise the two saved
    models of the place are applied to consecutive, non-overlapping windows of 24*7
    rows per machine; every output of a window is written to the rows of that same
    window. Rows not covered by any window are filled with 1, and all values are
    multiplied by ``MAX_TTE`` before being written.

    Args:
        target_place_name: Row label of ``y_structure``; also the output column name
            and the stem of the two parquet files.

    Returns:
        None. Writes ``../data/pipeline/db/tms/m1/<place>.parquet`` and
        ``../data/pipeline/db/tms/m3/<place>.parquet``.
    """
    m1_slice_path = f'../data/pipeline/db/tms/m1/{target_place_name}.parquet'
    m3_slice_path = f'../data/pipeline/db/tms/m3/{target_place_name}.parquet'

    if os.path.isfile(m1_slice_path) and os.path.isfile(m3_slice_path):
        return

    # Create the output directories up front so a missing folder cannot make
    # to_parquet fail after the whole inference run.
    for slice_path in (m1_slice_path, m3_slice_path):
        os.makedirs(os.path.dirname(slice_path), exist_ok=True)

    window_length = 24 * 7

    left = pd.concat([left_train, left_test])

    m1_slice = pd.DataFrame(index=left.index)
    m3_slice = pd.DataFrame(index=left.index)

    prediction_field_m1 = f'{target_place_name} TTE M1'
    prediction_field_m3 = f'{target_place_name} TTE M3'
    model_m1 = keras.models.load_model(f'../dist/models/{prediction_field_m1}.h5', compile=False)
    model_m3 = keras.models.load_model(f'../dist/models/{prediction_field_m3}.h5', compile=False)

    print(str(datetime.now()) + ' -- ' + target_place_name)
    for machine in y_structure.columns:
        input_df = left.loc[machine].astype('float64').ffill()
        # Float columns from the start; rows never written stay NaN until fillna(1) below.
        result = pd.DataFrame(
            index=input_df.index,
            columns=[prediction_field_m1, prediction_field_m3],
            dtype='float64',
        )

        # @NOTE: step of one week (windows do not overlap)
        for start in range(0, len(input_df) - window_length - 24, window_length):
            rows = slice(start, start + window_length)
            batch = np.array([input_df.iloc[rows]])
            output_m1 = model_m1.predict(batch, verbose=0)
            output_m3 = model_m3.predict(batch, verbose=0)
            # Single positional assignment on the frame itself; chained
            # ``result[col].iloc[...] = ...`` is a silent no-op under pandas Copy-on-Write.
            result.iloc[rows, 0] = output_m1[0].reshape((window_length,))
            result.iloc[rows, 1] = output_m3[0].reshape((window_length,))

        # Smoothing alternative kept for reference:
        # upsampled = result.rolling(72).mean().fillna(1)
        upsampled = result.fillna(1)
        m1_slice.loc[machine, target_place_name] = upsampled[prediction_field_m1].values * MAX_TTE
        m3_slice.loc[machine, target_place_name] = upsampled[prediction_field_m3].values * MAX_TTE

    m1_slice.to_parquet(m1_slice_path)
    m3_slice.to_parquet(m3_slice_path)

# Produce (or skip, when already on disk) the prediction slices of every place
# listed in the index of y_structure.
for target_place_name in y_structure.index:
    prepare_results(target_place_name)

# for place in ['РОТОР']:
#     prepare_results(place)

In [None]:
def glue_results():
    """Join the per-place prediction slices into one M1 table and one M3 table.

    Places are taken from the index of ``y_structure``. A place is skipped unless
    both of its parquet files exist. Slices are joined on their index with the
    default (inner) merge.

    Returns:
        ``(m1_predictions, m3_predictions)``; both are ``None`` when no place had
        both files on disk.
    """
    def _joined(accumulated, piece):
        # The first slice becomes the table; later ones are merged onto it by index.
        if accumulated is None:
            return piece
        return accumulated.merge(piece, left_index=True, right_index=True)

    m1_predictions = m3_predictions = None

    for target_place_name in y_structure.index:
        m1_slice_path = f'../data/pipeline/db/tms/m1/{target_place_name}.parquet'
        m3_slice_path = f'../data/pipeline/db/tms/m3/{target_place_name}.parquet'

        if not os.path.isfile(m1_slice_path) or not os.path.isfile(m3_slice_path):
            continue

        m1_predictions = _joined(m1_predictions, pd.read_parquet(m1_slice_path))
        m3_predictions = _joined(m3_predictions, pd.read_parquet(m3_slice_path))

    return m1_predictions, m3_predictions

m1_predictions, m3_predictions = glue_results()


def _export_predictions(predictions, csv_path, smooth_csv_path):
    """Write a prediction table to CSV, plain and smoothed, keyed by numeric machine id.

    The machine name index level is replaced by ``machine_id``, the digits following
    '№' in the name. The smoothed file holds the centered 14-row rolling 0.1-quantile.
    NOTE(review): the rolling window runs over consecutive rows of the whole frame,
    so close to a machine boundary it may mix rows of adjacent machines — confirm.

    Returns the flattened (reset-index) frame that was written to ``csv_path``.
    """
    flat = predictions.copy().reset_index()
    flat['machine_id'] = flat['ИМЯ МАШИНЫ'].str.extract('№(\\d+)').astype(int)
    flat = flat.drop(columns=['ИМЯ МАШИНЫ'])
    flat.to_csv(csv_path)

    smoothed = flat.set_index(['machine_id', 'DT']).rolling(14, center=True).quantile(0.1)
    smoothed.reset_index().to_csv(smooth_csv_path)
    return flat


tmp_m1 = _export_predictions(
    m1_predictions,
    '../data/pipeline/db/tm_m1_predictions.csv',
    '../data/pipeline/db/tm_m1_predictions_smooth.csv',
)
tmp_m3 = _export_predictions(
    m3_predictions,
    '../data/pipeline/db/tm_m3_predictions.csv',
    '../data/pipeline/db/tm_m3_predictions_smooth.csv',
)

In [None]:
# One dict of metric values is appended per place by the loop later in this cell.
metrics = []

def wrmse_metric(y_true, y_pred):
    """Weighted root-mean-squared error between two pandas Series.

    Each squared error is multiplied by ``1 / max(30 * y_true, 1)`` before averaging,
    so errors at larger true values count less.

    Args:
        y_true: Series of true values (must support ``.clip(lower=...)``).
        y_pred: Series of predictions; subtraction aligns it with ``y_true`` by index.

    Returns:
        The square root of the mean weighted squared error.
    """
    scale = 30
    # @NOTE: clip keeps the denominator at 1 or more, so the weight never becomes
    # infinite as the true value approaches 0
    weights = 1 / (scale * y_true).clip(lower=1)
    squared_error = (y_pred - y_true) ** 2
    return (squared_error * weights).mean() ** 0.5

# Training targets indexed and sorted by (machine, timestamp), used to look up the
# matching rows of the prediction tables.
tmp = y_train_tte_1h.set_index(['ИМЯ МАШИНЫ', 'DT']).sort_index()

# Seeded generator: without it the random baseline (and the *_random_metric columns)
# would change on every run of this cell.
_baseline_rng = np.random.default_rng(1337)

for place in y_structure.index:
    # Targets and predictions are both brought to the normalised scale.
    m1_true = tmp[f'{place} TTE M1'] / MAX_TTE
    m1_pred = m1_predictions.loc[tmp.index, place] / MAX_TTE

    m3_true = tmp[f'{place} TTE M3'] / MAX_TTE
    m3_pred = m3_predictions.loc[tmp.index, place] / MAX_TTE

    m1_metric = wrmse_metric(m1_true, m1_pred)
    m3_metric = wrmse_metric(m3_true, m3_pred)

    # Trivial baselines for comparison: all ones, all zeros, uniform noise in [0, 1).
    ones = pd.Series(np.ones(m1_pred.shape), index=m1_pred.index)
    m1_ones_metric = wrmse_metric(m1_true, ones)
    m3_ones_metric = wrmse_metric(m3_true, ones)

    zeros = pd.Series(np.zeros(m1_pred.shape), index=m1_pred.index)
    m1_zeros_metric = wrmse_metric(m1_true, zeros)
    m3_zeros_metric = wrmse_metric(m3_true, zeros)

    random = pd.Series(_baseline_rng.uniform(0, 1, m1_pred.shape), index=m1_pred.index)
    m1_random_metric = wrmse_metric(m1_true, random)
    m3_random_metric = wrmse_metric(m3_true, random)

    metrics.append(dict(
        m1_metric=m1_metric, m3_metric=m3_metric,
        m1_ones_metric=m1_ones_metric, m3_ones_metric=m3_ones_metric,
        m1_zeros_metric=m1_zeros_metric, m3_zeros_metric=m3_zeros_metric,
        m1_random_metric=m1_random_metric, m3_random_metric=m3_random_metric,
    ))

In [None]:
# Summary statistics of every metric across places (the 'count' row is dropped),
# shown with one row per metric, sorted by metric name.
pd.DataFrame(metrics).describe().drop(['count']).transpose().sort_index()

In [None]:
# Plot the smoothed M3 prediction against the true M3 target for one machine and place.
# NOTE(review): rolling(14) is applied to the whole multi-machine table before the machine
# is selected, so the first rows of a machine may be computed from rows of the preceding
# machine — confirm this is acceptable.
# NOTE: this cell rebinds `tmp`, `m3_pred` and `m3_true`, which the metrics cell also assigns.
tmp0 = m3_predictions.rolling(14).quantile(0.1)
m3_pred = tmp0.loc['ЭКСГАУСТЕР А/М №4']['РОТОР']
m3_true = y_train_tte_1h.set_index(['ИМЯ МАШИНЫ', 'DT']).loc['ЭКСГАУСТЕР А/М №4']['РОТОР TTE M3']
# Outer join keeps timestamps present in only one of the two series.
tmp = pd.merge(m3_true, m3_pred, left_index=True, right_index=True, how='outer')
px.line(tmp)