In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd # 2.x to support pyarrow
import pyarrow as pa
import openpyxl # for reading xlsx with structure
import numpy as np
import plotly.express as px
import math
import re
from datetime import timedelta, datetime
import os

import tensorflow as tf
from tensorflow import keras

In [4]:
# Structure tables. Elsewhere in this notebook x_structure.index is used as the
# list of feature columns and the columns of both tables are iterated as
# machine names.
x_structure, y_structure = (
    pd.read_parquet(path)
    for path in (
        '../data/pipeline/x_structure.parquet',
        '../data/pipeline/y_structure.parquet',
    )
)

In [5]:
# Pre-processed train/test features and train targets produced by the pipeline
# (the `_1h` suffix suggests hourly resampling -- TODO confirm).
x_train_pretty_1h, x_test_pretty_1h, y_train_tte_1h = (
    pd.read_parquet(path)
    for path in (
        '../data/pipeline/x_train_pretty_1h.parquet',
        '../data/pipeline/x_test_pretty_1h.parquet',
        '../data/pipeline/y_train_tte_1h.parquet',
    )
)

In [6]:
def _indexed_float_features(pretty_frame, forward_fill):
    """Index a feature table by ('ИМЯ МАШИНЫ', 'DT'), cast to float64, sort, and keep the x_structure columns.

    When ``forward_fill`` is true, gaps are forward-filled before the column
    selection.
    """
    frame = pretty_frame.set_index(['ИМЯ МАШИНЫ', 'DT']).astype('float64').sort_index()
    if forward_fill:
        # NOTE(review): ffill runs over the whole sorted frame, not per machine,
        # so leading gaps of one machine can inherit the previous machine's
        # last values -- confirm this is intended.
        frame = frame.ffill()
    return frame[x_structure.index]


left_train = _indexed_float_features(x_train_pretty_1h, forward_fill=True)
left_test = _indexed_float_features(x_test_pretty_1h, forward_fill=True)
left_test_raw = _indexed_float_features(x_test_pretty_1h, forward_fill=False)

# Scale both splits by the per-column standard deviation of the (unscaled)
# training features; no mean is subtracted.
left_stats = left_train.describe()
train_std = left_stats.loc['std']
left_train, left_test = left_train / train_std, left_test / train_std

# 31 days expressed in seconds; targets are divided by this before training.
MAX_TTE = 31 * 24 * 60 * 60

In [None]:
def train_model(target_place_name_and_type):
    """Train and save the model for one target column unless it is already on disk.

    Parameters
    ----------
    target_place_name_and_type : str
        Column of ``y_train_tte_1h`` to learn. It is also used as the file
        name of the saved model under ``../dist/models``.

    Returns
    -------
    keras.Model or None
        The freshly trained model, or ``None`` when the ``.h5`` file already
        exists (in that case nothing is trained or loaded).
    """
    model_path = f'../dist/models/{target_place_name_and_type}.h5'
    if os.path.isfile(model_path):
        return None

    window_length = 24 * 7
    feature_names = x_structure.index

    # Target divided by MAX_TTE, joined to the features on the shared
    # ('ИМЯ МАШИНЫ', 'DT') index.
    target = y_train_tte_1h.set_index(['ИМЯ МАШИНЫ', 'DT'])[[target_place_name_and_type]]
    target = target.astype('float64') / MAX_TTE
    data = pd.merge(left_train, target, left_index=True, right_index=True)

    # One windowed dataset per machine so that no window spans two machines.
    # NOTE(review): per the Keras docs, targets[i] is paired with the window
    # that STARTS at row i -- confirm that is the intended alignment.
    per_machine = []
    for machine in x_structure.columns:
        seq = data.loc[machine].sort_index().astype('float64').ffill().fillna(0)
        per_machine.append(
            keras.utils.timeseries_dataset_from_array(
                seq[feature_names],
                seq.drop(feature_names, axis=1),
                sequence_length=window_length,
                sequence_stride=24,
                seed=1337,
            )
        )

    batches = None
    for machine_examples in per_machine:
        batches = machine_examples if batches is None else batches.concatenate(machine_examples)

    # Dense layers act on the last axis, so the network emits one value per
    # time step of the window.
    model = keras.Sequential([
        keras.Input((window_length, len(feature_names))),
        keras.layers.Dense(24 * 3, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='mse', optimizer='adam')

    print(f'{datetime.now()} -- {target_place_name_and_type}')
    model.fit(batches, epochs=1000, verbose=0)

    model.save(model_path, save_format='h5')
    return model

# for place in y_structure.index:
#     for type in ['TTE M1', 'TTE M3']:
#         train_model(place + ' ' + type)

In [None]:
def validate_model(target_place_name_and_type, machine):
    """Replay a saved model over one machine's training features and return its forecasts.

    A window of 24*7 rows is moved over the features in steps of 24 rows. For
    each window the model's last 24 outputs are written to the 24 rows that
    immediately follow the window.

    Parameters
    ----------
    target_place_name_and_type : str
        Name of the saved model under ``../dist/models`` (without ``.h5``).
    machine : str
        First-level key of ``left_train`` / ``y_train_tte_1h`` to evaluate.

    Returns
    -------
    pandas.DataFrame
        Indexed like the machine's rows of ``y_train_tte_1h`` with a single
        float column ``'ПРОГНОЗ'``. Values are raw model outputs (training
        targets were divided by ``MAX_TTE``); rows that received no forecast
        stay at 0.0.
    """
    window_length = 24 * 7
    step = 24

    model = keras.models.load_model(f'../dist/models/{target_place_name_and_type}.h5', compile=False)
    target_index = y_train_tte_1h.set_index(['ИМЯ МАШИНЫ', 'DT']).loc[machine].index

    input_df = left_train.loc[machine].astype('float64').ffill()
    result = pd.DataFrame(index=target_index)
    # Float from the start so writing predictions never has to upcast an int column.
    result['ПРОГНОЗ'] = 0.0
    forecast_col = result.columns.get_loc('ПРОГНОЗ')

    x = 0
    while x < len(input_df) - window_length - step:
        window = input_df.iloc[x:x + window_length]
        batch = np.array([window])
        output = model.predict(batch, verbose=0)
        # A single positional .iloc write on the frame itself; chained
        # `result[col].iloc[...] = ...` may write to a temporary copy.
        result.iloc[x + window_length:x + window_length + step, forecast_col] = (
            output[0][-step:].reshape(step)
        )

        print("{:3.2f}%".format(100 * x / len(input_df)))
        x += step

    return result

# validate_model('УЛИТА TTE M3', 'ЭКСГАУСТЕР А/М №4')

In [7]:
# Sample submission files; the column sets of the parquet references decide
# which targets are written to the generated submissions.
submission1_ref = pd.read_excel(
    '../data/source/sample_submission/submission_1.xlsx',
    index_col=0,
)
submission2_ref, submission3_ref = (
    pd.read_parquet(path)
    for path in (
        '../data/source/sample_submission/sample_submission_2.parquet',
        '../data/source/sample_submission/sample_submission_3.parquet',
    )
)

In [8]:
# Raw test features; only its index is used below, as the row index of the
# generated submission slices.
x_test = pd.read_parquet('../data/source/X_test.parquet')

In [9]:
# submission1 = submission1_ref.copy()
# submission1['machine'] = np.NaN
# submission1['tm'] = np.NaN
# submission2 = pd.DataFrame(index=x_test.index, columns=submission2_ref.columns)
# submission3 = pd.DataFrame(index=x_test.index, columns=submission3_ref.columns)

def apply_model(target_place_name):
    """Build and cache the submission-2 and submission-3 slices for one target place.

    For every machine in ``y_structure`` whose target column for this place
    appears in ``submission2_ref``, the saved ``TTE M1`` and ``TTE M3`` models
    are run over the scaled test features with a 24*7-row window moved in
    steps of 24 rows. The forecasts are smoothed with a 72-row rolling mean,
    upsampled to a 10-second grid by interpolation, and gaps are filled with 1.

    * submission 2 gets ``1`` where the smoothed M3 forecast is below 0.2,
      else ``0``;
    * submission 3 gets the smoothed M1 forecast multiplied by ``MAX_TTE``.

    Both slices are written to ``../dist/submission{2,3}/<place>.parquet``.
    If both files already exist the function returns immediately.

    Parameters
    ----------
    target_place_name : str
        Row label of ``y_structure`` and prefix of the model file names.

    Returns
    -------
    None
    """
    window_length = 24 * 7
    step = 24
    smoothing_rows = 72
    alarm_threshold = 0.2

    # Paths depend on the argument, not on whatever global the caller loops with.
    submission2_slice_path = f'../dist/submission2/{target_place_name}.parquet'
    submission3_slice_path = f'../dist/submission3/{target_place_name}.parquet'
    if os.path.isfile(submission2_slice_path) and os.path.isfile(submission3_slice_path):
        return

    # Fail before the long inference run rather than at the final write.
    for slice_path in (submission2_slice_path, submission3_slice_path):
        os.makedirs(os.path.dirname(slice_path), exist_ok=True)

    submission2_slice = pd.DataFrame(index=x_test.index)
    submission3_slice = pd.DataFrame(index=x_test.index)

    print(str(datetime.now()) + ' -- ' + target_place_name)

    # The models depend only on the place, so load them once for all machines.
    prediction_field_m1 = f'{target_place_name} TTE M1'
    prediction_field_m3 = f'{target_place_name} TTE M3'
    models = [
        keras.models.load_model(f'../dist/models/{prediction_field_m1}.h5', compile=False),
        keras.models.load_model(f'../dist/models/{prediction_field_m3}.h5', compile=False),
    ]

    for machine in y_structure.columns:
        y_name = y_structure[machine].loc[target_place_name]
        if y_name not in submission2_ref.columns:
            continue

        input_df = left_test.loc[machine].astype('float64').ffill()
        # Float columns so the forecasts are stored numerically and the
        # rolling mean below operates on numeric data.
        result = pd.DataFrame(
            index=input_df.index,
            columns=[prediction_field_m1, prediction_field_m3],
            dtype='float64',
        )

        for start in range(0, len(input_df) - window_length - step, step):
            batch = np.array([input_df.iloc[start:start + window_length]])
            target_rows = slice(start + window_length, start + window_length + step)
            for column_position, model in enumerate(models):
                output = model.predict(batch, verbose=0)
                # Single positional write on the frame (no chained indexing).
                result.iloc[target_rows, column_position] = output[0][-step:].reshape(step)

        upsampled = result.rolling(smoothing_rows).mean().resample('10s').interpolate().fillna(1)
        submission2_slice[y_name] = (upsampled[prediction_field_m3] < alarm_threshold).astype('int64')
        submission3_slice[y_name] = upsampled[prediction_field_m1] * MAX_TTE

    submission2_slice.to_parquet(submission2_slice_path)
    submission3_slice.to_parquet(submission3_slice_path)

# Generate (or skip, when already cached on disk) the submission slices for
# every target place listed in y_structure's index.
for place in y_structure.index:
    apply_model(place)

# for place in ['РОТОР']:
#     for machine in ['ЭКСГАУСТЕР А/М №4']:
#         apply_model(place, machine)

2023-05-28 19:29:38.212650 -- МАСЛОПРОВОДЫ
2023-05-28 19:34:21.399769 -- МАСЛОСТАНЦИЯ ЖИДКОЙ СМАЗКИ
2023-05-28 19:35:09.868420 -- МАСЛЯНЫЙ ФИЛЬТР
2023-05-28 19:36:45.419027 -- МЕТРАН-100 ДАТЧИКИ ДАВЛЕНИЯ
2023-05-28 19:39:09.380750 -- ПОДШИПНИК ОПОРНО-УПОРНЫЙ


KeyboardInterrupt: 