In [43]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
import plotly.graph_objs as go
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller

In [2]:
# Read the CSV and parse its first column as datetimes.
df = pd.read_csv(
    'energy_consumption.csv',
    parse_dates=[0],
)

In [4]:
# Print the frame's column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145366 entries, 0 to 145365
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   Datetime  145366 non-null  datetime64[ns]
 1   PJME_MW   145366 non-null  float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 2.2 MB


In [3]:
# Summarise the timestamp column to see the time span covered by the data.
df['Datetime'].describe()

count                           145366
mean     2010-04-18 03:25:25.999202048
min                2002-01-01 01:00:00
25%                2006-02-23 14:15:00
50%                2010-04-18 04:30:00
75%                2014-06-10 18:45:00
max                2018-08-03 00:00:00
Name: Datetime, dtype: object

Возьму данные за 2016 год.

In [5]:
# Keep only the rows stamped in calendar year 2016, then re-check the range.
in_2016 = df['Datetime'].dt.year.eq(2016)
df = df[in_2016]
df['Datetime'].describe()

count                             8784
mean     2016-07-02 00:09:00.573770496
min                2016-01-01 00:00:00
25%                2016-04-01 12:45:00
50%                2016-07-02 00:30:00
75%                2016-10-01 12:15:00
max                2016-12-31 23:00:00
Name: Datetime, dtype: object

Выполню ресемплирование данных по дню с суммированием.

In [6]:
# Aggregate to one row per calendar day by summing the readings of that day,
# then move the resampled index back into a regular 'Datetime' column.
df = (
    df.resample('1D', on='Datetime')
    .sum()
    .reset_index()
)
df.head()

Unnamed: 0,Datetime,PJME_MW
0,2016-01-01,670751.0
1,2016-01-02,696432.0
2,2016-01-03,698803.0
3,2016-01-04,821012.0
4,2016-01-05,918751.0


In [10]:
# Replace each timestamp with its proleptic Gregorian ordinal (an integer day
# number) so the column can be fed to the regressors as a numeric feature.
# The dtype guard makes the cell safe to re-run: once converted, the column
# holds integers, and calling Timestamp.toordinal on them would fail.
if df['Datetime'].dtype.kind == 'M':
    df['Datetime'] = df['Datetime'].apply(pd.Timestamp.toordinal)
df.head()

Unnamed: 0,Datetime,PJME_MW
0,735964,670751.0
1,735965,696432.0
2,735966,698803.0
3,735967,821012.0
4,735968,918751.0


## Проверка ряда на стационарность

In [44]:
# Augmented Dickey-Fuller test on the daily series.
# The first two entries of the result are the test statistic and the p-value.
result = adfuller(df['PJME_MW'])
adf_stat, p_value = result[0], result[1]

print('ADF Statistic:', adf_stat)
print('p-value:', p_value)

# Reject the null hypothesis at the 5% level when the p-value is below 0.05.
verdict = (
    "Ряд стационарен, отвергаем нулевую гипотезу."
    if p_value < 0.05
    else "Ряд нестационарен, не отвергаем нулевую гипотезу."
)
print(verdict)

ADF Statistic: -1.7476193839802703
p-value: 0.4067594447199813
Ряд нестационарен, не отвергаем нулевую гипотезу.


Разобью данные на 2 выборки в соотношении 80:20.

In [14]:
# Chronological 80/20 split: shuffle=False keeps the rows in their original
# order, so the last 20% of the rows form the test set.
train, test = train_test_split(df, test_size=0.2, shuffle=False)

# Features are every column except the target; the target is 'PJME_MW'.
X_train, y_train = train.drop(columns='PJME_MW'), train['PJME_MW']
X_test, y_test = test.drop(columns='PJME_MW'), test['PJME_MW']

## Модель полиномиальной регрессии 4-й степени (Ridge, МНК с L2-регуляризацией)

In [51]:
# Degree-4 polynomial regression with an L2 penalty (Ridge).
#
# The feature is a day ordinal of magnitude ~7e5, so its 4th power is ~1e23.
# Fitting on such columns is severely ill-conditioned and makes the default
# Ridge penalty negligible. Rescale the inputs to roughly [0, 1] first, using
# statistics from the training set only so nothing leaks from the test set.
x_min = X_train.min()
x_span = (X_train.max() - x_min).replace(0, 1)  # guard against constant columns
X_train_scaled = (X_train - x_min) / x_span
X_test_scaled = (X_test - x_min) / x_span

poly = PolynomialFeatures(degree=4)
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

ridge = Ridge()
ridge.fit(X_train_poly, y_train)
y_pred_ridge = ridge.predict(X_test_poly)

## Модель KNN

In [16]:
# k-nearest-neighbours regressor (k=5) fitted on the training split.
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

## Модель Linear Regression

In [17]:
# Ordinary least-squares line fitted on the training split.
linear_reg = LinearRegression().fit(X_train, y_train)
y_pred_lr = linear_reg.predict(X_test)

## Модель Decision Tree Regressor

In [18]:
# Decision-tree regressor with default hyper-parameters, fitted on the training split.
dtr = DecisionTreeRegressor().fit(X_train, y_train)
y_pred_dtr = dtr.predict(X_test)

## Модель ARIMA

In [34]:
# Walk-forward (rolling-origin) one-step-ahead ARIMA(5, 2, 0) forecast.
#
# The history is seeded with the TRAINING targets only. Seeding it with the
# test targets would let the model see the values it is asked to predict.
history = list(y_train)
predictions_arima = []

# One forecast per test observation, so the predictions line up one-to-one
# with the test rows.
for i in range(len(y_test)):
    model = sm.tsa.arima.ARIMA(history, order=(5, 2, 0))
    model_fit = model.fit()

    # forecast() with no arguments returns a single out-of-sample step.
    yhat = model_fit.forecast()[0]
    predictions_arima.append(yhat)

    # Reveal the true value only after it has been predicted.
    obs = y_test.iloc[i]
    history.append(obs)

## Графики для полученных значений

In [54]:
# Overlay the observed series, its rolling statistics and every model's
# test-period forecast on one interactive figure.
ROLLING_WINDOW = 15  # window length in rows (one row per day)

# 'Datetime' holds integer day ordinals at this point; map them back to
# timestamps so the x-axis shows real dates.
dates_all = df['Datetime'].map(pd.Timestamp.fromordinal)
dates_test = X_test['Datetime'].map(pd.Timestamp.fromordinal)

fig = go.Figure()

# Observed data (train and test periods together)
fig.add_trace(go.Scatter(x=dates_all, y=df['PJME_MW'], mode='lines',
                         line=dict(color='blue', width=2), name='Actual Data'))

# Rolling mean and standard deviation. They are computed over the whole frame,
# so they are plotted against the whole date axis, not only the training dates.
rolling = df['PJME_MW'].rolling(ROLLING_WINDOW)
fig.add_trace(go.Scatter(x=dates_all, y=rolling.mean(), mode='lines', name='Скользящее среднее'))
fig.add_trace(go.Scatter(x=dates_all, y=rolling.std(), mode='lines', name='Скользящее СКО'))

# Model forecasts over the test period
forecasts = {
    'Polynomial Ridge': y_pred_ridge,
    'KNN': y_pred_knn,
    'Linear Regression': y_pred_lr,
    'Decision Tree Regressor': y_pred_dtr,
    'ARIMA': predictions_arima,
}
for label, values in forecasts.items():
    fig.add_trace(go.Scatter(x=dates_test, y=values, mode='lines', name=label))

# Figure layout: the series is aggregated per day, so label it as daily.
fig.update_layout(title='Energy Consumption Predictions (Daily)',
                  xaxis_title='Date',
                  yaxis_title='Daily Energy Consumption',
                  legend_title='Models')

fig.show()