In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
pd.set_option('display.float_format', '{:.3f}'.format)

# Загрузка
df = pd.read_csv("../data/processed/daily_merged.csv", parse_dates=['date'], index_col='date')
print(f"Исходно: {len(df)} строк | {df.index.min().date()} → {df.index.max().date()}")
df.head()

Исходно: 1096 строк | 2022-01-01 → 2024-12-31


Unnamed: 0_level_0,Vgas_m3,Qboiler_prod_Gcal,air_temp_daily,Qgas_ccal_m3,T1_c_mean,T2_c_mean
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-01-01,13143.0,83.245,-1.896,8121,91.273,50.827
2022-01-02,14814.0,103.855,-7.458,8121,90.949,50.599
2022-01-03,14523.0,106.003,-9.796,8121,90.043,50.138
2022-01-04,13295.0,101.535,-9.225,8121,89.821,50.01
2022-01-05,12394.0,94.071,-7.954,8121,89.108,49.662


In [2]:
df['month'] = df.index.month
df['day_of_year'] = df.index.dayofyear
df['day_of_week'] = df.index.dayofweek
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# Сезон: 0-зима, 1-весна, 2-лето, 3-осень
df['season'] = ((df['month'] % 12 + 3) // 3 - 1) % 4

# Циклические
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
df['dow_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['dow_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

print("Временные признаки добавлены")

Временные признаки добавлены


In [3]:
# Лаги Vgas
for lag in [1, 3, 7, 14]:
    df[f'Vgas_lag_{lag}d'] = df['Vgas_m3'].shift(lag)

# Лаги температуры
for lag in [1, 3, 7]:
    df[f'air_temp_lag_{lag}d'] = df['air_temp_daily'].shift(lag)

# Скользящие
df['Vgas_roll_mean_7d'] = df['Vgas_m3'].rolling(7).mean()
df['Vgas_roll_std_7d'] = df['Vgas_m3'].rolling(7).std()
df['air_temp_roll_mean_7d'] = df['air_temp_daily'].rolling(7).mean()

print("Лаги и скользящие добавлены")

Лаги и скользящие добавлены


In [4]:
# 1. Эффективность (Gcal / м³ газа) — с защитой от деления на 0
denominator = df['Vgas_m3'] * df['Qgas_ccal_m3'] / 1_000_000
denominator = denominator.replace(0, np.nan)  # избегаем деления на 0
df['efficiency_Gcal_per_m3'] = df['Qboiler_prod_Gcal'] / denominator

# 2. ΔT в сети
df['delta_T'] = df['T1_c_mean'] - df['T2_c_mean']

# 3. Температурный напор
df['temp_head'] = df['T1_c_mean'] - df['air_temp_daily']

# 4. Относительная нагрузка
df['load_ratio'] = df['Qboiler_prod_Gcal'] / df['Qboiler_prod_Gcal'].max()

print("Доменные признаки добавлены")

Доменные признаки добавлены


In [5]:
print("Очистка от inf / NaN / огромных значений...")

# Заменяем inf на NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Ограничиваем efficiency (реальные значения ~0.5–1.5)
df['efficiency_Gcal_per_m3'] = df['efficiency_Gcal_per_m3'].clip(lower=0, upper=2)

# Удаляем строки с NaN
df_clean = df.dropna()

print(f"После очистки: {len(df_clean)} строк ({len(df_clean)/len(df)*100:.1f}%)")

Очистка от inf / NaN / огромных значений...
После очистки: 1044 строк (95.3%)


In [6]:
os.makedirs("../data/processed", exist_ok=True)
df_clean.to_csv("../data/processed/daily_features.csv")

print("Сохранено: daily_features.csv")
df_clean.head()

Сохранено: daily_features.csv


Unnamed: 0_level_0,Vgas_m3,Qboiler_prod_Gcal,air_temp_daily,Qgas_ccal_m3,T1_c_mean,T2_c_mean,month,day_of_year,day_of_week,is_weekend,...,air_temp_lag_1d,air_temp_lag_3d,air_temp_lag_7d,Vgas_roll_mean_7d,Vgas_roll_std_7d,air_temp_roll_mean_7d,efficiency_Gcal_per_m3,delta_T,temp_head,load_ratio
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-15,12402.0,88.268,-2.646,8121,84.521,47.345,1,15,5,1,...,0.163,-15.25,-7.029,15217.0,3890.625,-7.754,0.876,37.176,87.167,0.021
2022-01-16,12749.0,93.779,-5.158,8121,83.483,46.812,1,16,6,1,...,-2.646,-13.421,-5.5,14984.714,3996.376,-7.705,0.906,36.671,88.641,0.023
2022-01-17,12701.0,91.013,-3.446,8121,83.255,46.64,1,17,0,0,...,-5.158,0.163,-7.267,14445.143,4016.085,-7.159,0.882,36.616,86.701,0.022
2022-01-18,16622.0,101.157,-4.521,8121,82.523,46.309,1,18,1,0,...,-3.446,-2.646,-10.354,14123.429,3678.157,-6.326,0.749,36.214,87.044,0.025
2022-01-19,12059.0,113.111,-9.158,8121,82.507,46.29,1,19,2,0,...,-4.521,-5.158,-15.25,12781.429,1784.449,-5.455,1.155,36.218,91.666,0.027


In [7]:
corr = df_clean.corr()['Vgas_m3'].abs().sort_values(ascending=False)
print("Топ-15 признаков по корреляции с Vgas_m3:")
display(corr.head(15))

Топ-15 признаков по корреляции с Vgas_m3:


Vgas_m3                 1.000
Vgas_lag_1d             0.966
Vgas_roll_mean_7d       0.961
air_temp_daily          0.941
air_temp_lag_1d         0.928
Vgas_lag_3d             0.922
air_temp_roll_mean_7d   0.922
Vgas_lag_7d             0.881
air_temp_lag_3d         0.879
air_temp_lag_7d         0.843
Vgas_lag_14d            0.834
Qboiler_prod_Gcal       0.777
load_ratio              0.777
month_cos               0.777
Vgas_roll_std_7d        0.767
Name: Vgas_m3, dtype: float64