# Восстановление показаний датчиков на приеме (P, T) с помощью ML

In [None]:
import os
import sys
sys.path.append('../../../')
import pandas as pd
import datetime
from multiprocessing import Pool

In [None]:
import plot_workflow.plotly_option as pltl_opt
import plot_workflow.plotly_workflow as pltl_wf

from preproc_p import workflow_cs_data
from preproc_p import workflow_chess_data
from preproc_p import preproc_tool
from preproc_p import workflow_calc_data
from preproc_p import workflow_tr_data
from preproc_p import filtration
from proc_p import processor as proc

from ml import calibr_restore as calibr_restore

from postproc_p import result_and_metrics as result_and_metrics

In [None]:
# Well identifier and the name of the spreadsheet built from it.
well_name = '1982'
chess_file_name = f'Скв. {well_name} (01.01.2020-29.02.2020).xls'
# Boolean switches; of these, the visible cells read only create_input_data
# and auto_open_html.
# TODO: remove the flags.
read_initial_data = plot_initial_data = create_input_data = True
auto_open_html = True
# TODO: maybe remove the flags altogether - which way is better?
multiprocessing_on = True

In [None]:
# Look for "*static_data.xlsx*" starting from the current working directory and
# keep the first hit.
# NOTE(review): presumably the helper returns a list of paths; the [0] lookup
# fails if it comes back empty - confirm against preproc_tool.
static_data_full_path = preproc_tool.find_full_path_by_pattern(os.getcwd(), "*static_data.xlsx*")[0]

In [None]:
# Directory layout, rooted at the current working directory:
#   <cwd>/data/                        - root that holds the per-well folders
#   <cwd>/data/<well_name>/            - working directory of this well
#   <cwd>/data/<well_name>/init_edit/  - edited CSV/HTML files
# os.path.join replaces the hard-coded "\\" separators so the paths are valid
# on any OS. The trailing "" keeps a trailing separator, because other cells
# append file names to these strings by plain concatenation.
current_path = os.getcwd()
# Timestamp suffix for output directories; currently disabled (empty string).
time_mark = ''  # datetime.datetime.today().strftime('%Y_%m_%d_%H_%M_%S')
path_to_data = os.path.join(current_path, "data", "")
path_to_work_dir = os.path.join(path_to_data, well_name, "")
save_dir_name = 'init_edit'
path_to_save = os.path.join(path_to_work_dir, save_dir_name, "")
# Collect the immediate sub-directories of the data folder: only the first
# os.walk step is consumed, hence the unconditional break.
dirnames_list = []
for (dirpath, dirnames, filenames) in os.walk(path_to_data):
    dirnames_list.extend(dirnames)
    break
print(dirnames_list)
dynamic_data_full_path = path_to_save + well_name + "_first_edit.csv"

In [None]:
# Previously used boundary sets, kept for reference:
#left_boundary = [datetime.datetime(2018,8,1), datetime.datetime(2018,11,29)]
#right_boundary = [datetime.datetime(2018,11,5), datetime.datetime(2019,2,28)]
#left_boundary = [datetime.datetime(2018,8,3), datetime.datetime(2018,11,29),   datetime.datetime(2019,2,19)]
#right_boundary = [datetime.datetime(2018,11,6), datetime.datetime(2019,2,4),  datetime.datetime(2019,2,28)]
#left_boundary = [datetime.datetime(2019,1,30)]
#right_boundary = [datetime.datetime(2019,2,28)]
# Active boundaries (single-element lists); not read by the visible cells.
left_boundary = [datetime.datetime(year=2018, month=6, day=27)]
right_boundary = [datetime.datetime(year=2020, month=7, day=27)]

In [None]:
def run_calculation(thread_option_list):
    """Run ``proc.calc`` over ``thread_option_list`` in a process pool.

    The pool is only started when this module is executed as ``__main__``;
    otherwise the call is a no-op. The values produced by ``Pool.map`` are
    discarded and the function always returns ``None``.

    NOTE(review): ``amount_of_threads`` is not defined in the visible cells -
    confirm it is set before this function is called.
    """
    if __name__ != '__main__':
        return
    with Pool(amount_of_threads) as pool:
        pool.map(proc.calc, thread_option_list)

In [None]:
# Shared name registry: the cells below read column names from its attributes
# (e.g. p_intake_atm, t_intake_c) and call return_essential_parameters() on it.
global_names = preproc_tool.GlobalNames()

В начале 3. Подготовка данных к адаптации

In [None]:
# NOTE(review): 'time_to_resamle' looks like a misspelling of 'time_to_resample',
# which a later cell sets to '1800s'; this value is not read in the visible cells.
time_to_resamle = '1h'
input_data_dir_name = 'adapt_input_' + time_mark
path_to_input_data = path_to_work_dir + input_data_dir_name + '\\'
plot_file_path = path_to_input_data + well_name
if create_input_data:
    # exist_ok=True tolerates a directory left over from a previous run, while
    # genuine failures (permissions, invalid path) are raised instead of being
    # silently swallowed by a bare except.
    os.makedirs(path_to_work_dir + input_data_dir_name, exist_ok=True)

In [None]:
# Load "<well_name>_second_edit.csv" from path_to_save. The 'Время' column
# becomes the index and is parsed as dates in day-first order.
prepared_dynamic_data = pd.read_csv(path_to_save + well_name + "_second_edit.csv", index_col = 'Время', parse_dates = True, dayfirst = True)

In [None]:
# Rows older than this timestamp are dropped before filtering and training.
left_border_to_train = datetime.datetime(year=2020, month=1, day=21)
# Arguments forwarded to filtration.check_medfit.
items_to_median_filtration = 25
flot_filtration = True
# Resampling rule for DataFrame.resample (1800 s = 30 min).
time_to_resample = '1800s'
# Forwarded to calibr_restore.predict_parameter_in_df.
use_time_as_int_in_column = True

In [None]:
# Keep the essential parameters plus the motor current column, restricted to
# rows at or after left_border_to_train. The explicit .copy() makes the frame
# independent of prepared_dynamic_data, so the column assignments in the loop
# below are not chained assignments on a slice (no SettingWithCopyWarning).
essential_df = prepared_dynamic_data[global_names.return_essential_parameters() + [global_names.i_a_motor_a]]
essential_df = essential_df[essential_df.index >= left_border_to_train].copy()

# Replace every essential parameter with the '<name> (median)' column that
# filtration.check_medfit returns for it.
for i in global_names.return_essential_parameters():
    smooth_df = filtration.check_medfit(essential_df, i, items_to_median_filtration, flot_filtration)
    essential_df[i] = smooth_df[i + ' (median)']
# Average the filtered data over time_to_resample bins.
essential_df_resampled = essential_df.resample(time_to_resample).mean()

In [None]:
# Feature columns the intake readings are restored from.
parameters_to_restore_intake = [
    global_names.p_buf_atm,
    global_names.cos_phi_d,
    global_names.u_motor_v,
    global_names.motor_load_perc,
    global_names.freq_hz,
    global_names.active_power_kwt,
]
# Target columns: the intake pressure and temperature.
predicted_intake_parameters = [
    global_names.p_intake_atm,
    global_names.t_intake_c,
]

Проверка метода восстановления на исходных данных

In [None]:
# Training rows: every resampled row that has an intake pressure value.
train_data = essential_df_resampled.dropna(subset=[global_names.p_intake_atm])
# Forecast rows: everything from the last training timestamp onwards
# (inclusive), reduced to the feature columns and stripped of incomplete rows.
forecast_data = (
    essential_df_resampled[essential_df_resampled.index >= train_data.index[-1]]
    [parameters_to_restore_intake]
    .dropna()
)

In [None]:
# Keep only feature and target columns and discard rows with any gap in them.
train_data = train_data[parameters_to_restore_intake + predicted_intake_parameters].dropna()
# Split the cleaned frame into model inputs and targets.
feature_data = train_data[parameters_to_restore_intake]
intake_data = train_data[predicted_intake_parameters]

In [None]:
# Split features and targets into train and test parts.
# NOTE(review): the split rule lives in calibr_restore.get_test_train_drop_2_points
# and is not visible here - confirm how the test points are chosen.
x_train, x_test, y_train, y_test = calibr_restore.get_test_train_drop_2_points(feature_data,
                                                                               intake_data)

In [None]:
# Predict each target separately for the test features; the same feature
# frames are used for both, only the training target column differs.
p_intake_predicted = calibr_restore.predict_parameter_in_df(x_train, x_test, y_train[global_names.p_intake_atm], use_time_as_int_in_column) 
t_intake_predicted = calibr_restore.predict_parameter_in_df(x_train, x_test, y_train[global_names.t_intake_c], use_time_as_int_in_column)


In [None]:
# Store the predictions next to the measured test values in '<name> (PREDICTION)' columns.
y_test[global_names.p_intake_atm + ' (PREDICTION)'] = p_intake_predicted
y_test[global_names.t_intake_c + ' (PREDICTION)'] = t_intake_predicted
# NOTE(review): 'y' is not read in the visible cells. merge() without 'on' joins
# on the columns the two frames share and does not keep the original index -
# confirm this is the intended way to combine test and train targets.
y = y_test.merge(y_train, how = 'outer')

In [None]:
# Plot measured vs. predicted intake pressure on the test rows.
y_test[[global_names.p_intake_atm, global_names.p_intake_atm + ' (PREDICTION)']].plot()

In [None]:
# Mean absolute value of result_and_metrics.relative_error_perc for the intake
# pressure on the test rows (displayed as the cell output).
result_and_metrics.relative_error_perc(y_test[global_names.p_intake_atm],
                                       y_test[global_names.p_intake_atm + ' (PREDICTION)']).abs().mean()

In [None]:
# Plot measured vs. predicted intake temperature on the test rows.
y_test[[global_names.t_intake_c, global_names.t_intake_c + ' (PREDICTION)']].plot()

In [None]:
# Mean absolute value of result_and_metrics.relative_error_perc for the intake
# temperature on the test rows (displayed as the cell output).
result_and_metrics.relative_error_perc(y_test[global_names.t_intake_c],
                                       y_test[global_names.t_intake_c + ' (PREDICTION)']).abs().mean()

## Прогнозирование давления и температуры на приеме

In [None]:
# For the forecast, train on all cleaned training rows (overwrites the x_train
# produced by the earlier train/test split).
x_train = train_data[parameters_to_restore_intake]

In [None]:
# Targets for the same rows (overwrites the y_train of the earlier split).
y_train = train_data[predicted_intake_parameters]

In [None]:
# Features of the period to forecast; this binds the same object, not a copy.
x_test = forecast_data

In [None]:
# Predict intake pressure and temperature for the forecast rows, one target per call.
p_intake_forecast = calibr_restore.predict_parameter_in_df(x_train, x_test, y_train[global_names.p_intake_atm], use_time_as_int_in_column) 
t_intake_forecast = calibr_restore.predict_parameter_in_df(x_train, x_test, y_train[global_names.t_intake_c], use_time_as_int_in_column)

In [None]:
# Work on a copy so that forecast_data (aliased by x_test) stays untouched,
# then attach the predictions as '<name> (PREDICTION)' columns.
forecast = x_test.copy()
forecast[global_names.p_intake_atm + ' (PREDICTION)'] = p_intake_forecast
forecast[global_names.t_intake_c + ' (PREDICTION)'] = t_intake_forecast

In [None]:
# Stack the training rows and the forecast rows into one frame; sort=True
# orders the union of their columns. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
result = pd.concat([train_data, forecast], sort=True)

In [None]:
# Plot the measured intake pressure together with its forecast.
result[[global_names.p_intake_atm, global_names.p_intake_atm + ' (PREDICTION)']].plot()

In [None]:
# Plot the measured intake temperature together with its forecast.
result[[global_names.t_intake_c, global_names.t_intake_c + ' (PREDICTION)']].plot()

## Добавление результатов прогнозирования в общий набор данных

In [None]:
# Measured intake temperatures followed by the predicted ones, NaNs removed.
# NOTE(review): new_values is not read in the visible cells.
new_values = [
    *result[global_names.t_intake_c].dropna().values,
    *result[global_names.t_intake_c + ' (PREDICTION)'].dropna().values,
]

In [None]:
# Fill the gaps in the measured intake temperature with the predicted value of
# the same row. Unlike concatenating two positional lists, this does not depend
# on measured rows preceding predicted rows, nor on the two non-NaN counts
# adding up exactly to the frame length.
result[global_names.t_intake_c] = result[global_names.t_intake_c].fillna(
    result[global_names.t_intake_c + ' (PREDICTION)'])

In [None]:
# Fill the gaps in the measured intake pressure with the predicted value of
# the same row. Unlike concatenating two positional lists, this does not depend
# on measured rows preceding predicted rows, nor on the two non-NaN counts
# adding up exactly to the frame length.
result[global_names.p_intake_atm] = result[global_names.p_intake_atm].fillna(
    result[global_names.p_intake_atm + ' (PREDICTION)'])

In [None]:
# First three essential parameters plus the motor current, taken from the
# filtered (not resampled) frame. The .copy() detaches the selection from
# essential_df so that later column assignments on essential_low_freq are not
# chained assignments on a slice.
essential_low_freq = essential_df[global_names.return_essential_parameters()[0:3] +
                                  [global_names.i_a_motor_a]].copy()

In [None]:
# NOTE(review): the motor current column was already selected from essential_df
# when essential_low_freq was built, so this re-assigns the same data - confirm
# whether the cell is still needed.
essential_low_freq[global_names.i_a_motor_a] = essential_df[global_names.i_a_motor_a]

In [None]:
# Outer-join the extra columns onto the result by index.
# NOTE(review): DataFrame.join raises on overlapping column names when no
# suffix is given - confirm essential_low_freq shares no column with result.
result = result.join(essential_low_freq, how = 'outer')

In [None]:
# Constant 32 wherever the intake pressure is present; rows where it is NaN
# stay NaN because NaN * 0 + 32 is NaN.
# NOTE(review): 32 is presumably the choke diameter in mm (per the column name) - confirm.
result[global_names.d_choke_mm] = result[global_names.p_intake_atm]* 0 + 32

In [None]:
# Plot the non-NaN intake pressure values currently stored in prepared_dynamic_data.
prepared_dynamic_data[global_names.p_intake_atm].dropna().plot()

In [None]:
# Intake pressure column as loaded (NaNs included); only the plot drops NaNs.
old_p_intake = prepared_dynamic_data[global_names.p_intake_atm]
old_p_intake.dropna().plot()

In [None]:
# Predicted intake pressure for the forecast rows.
new_p_intake = forecast[global_names.p_intake_atm + ' (PREDICTION)']
new_p_intake.plot()

In [None]:
# Stack the loaded and the predicted intake pressure into one series.
# pd.concat is used because Series.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
# NOTE(review): old_p_intake still contains its NaN rows, so a timestamp that
# occurs in both series appears twice in the combined index - confirm that the
# outer join performed on this series later tolerates duplicate labels.
combined_p_intake = pd.concat([old_p_intake, new_p_intake])
combined_p_intake.dropna().plot()

In [None]:
# Plot the stored intake pressure once more, before the column is replaced.
prepared_dynamic_data[global_names.p_intake_atm].dropna().plot()

In [None]:
# Remove the original intake pressure column; the combined series is joined
# back under the same name in a later cell.
del prepared_dynamic_data[global_names.p_intake_atm]

In [None]:
# The series name becomes the column name when the series is joined to a frame.
combined_p_intake.name = global_names.p_intake_atm

In [None]:
# Outer join on the index: timestamps present in only one of the two operands are kept.
prepared_dynamic_data = prepared_dynamic_data.join(combined_p_intake, how = 'outer')

In [None]:
# Plot the intake pressure after the combined series has been joined in.
prepared_dynamic_data[global_names.p_intake_atm].dropna().plot()

In [None]:
# Constant 32 wherever the intake pressure is present; rows where it is NaN
# stay NaN because NaN * 0 + 32 is NaN.
# NOTE(review): 32 is presumably the choke diameter in mm (per the column name) - confirm.
prepared_dynamic_data[global_names.d_choke_mm] = prepared_dynamic_data[global_names.p_intake_atm]* 0 + 32

In [None]:
# Save the updated table as "<well_name>_third_edit.csv" in path_to_save.
prepared_dynamic_data.to_csv(path_to_save + well_name + "_third_edit.csv")

In [None]:
# Build the plot layout with the 'second_edit_data' report type and write the
# HTML report next to the CSV; auto_open_html is passed through as auto_open.
# NOTE(review): the report type says 'second_edit_data' while the output file is
# "_third_edit_data.html" - confirm the layout is meant to be reused here.
all_banches = pltl_opt.create_banches_for_report(report_type = 'second_edit_data')
pltl_wf.create_report_html(prepared_dynamic_data, all_banches, path_to_save + well_name +
                           "_third_edit_data.html",  auto_open = auto_open_html)