In [99]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

import plotly.express as px

from scipy.stats import shapiro

## Шаг 1. Загрузим данные.

In [2]:
data = pd.read_csv('https://code.s3.yandex.net/datasets/taxi.csv', index_col=[0], parse_dates=[0])
data.head()

Unnamed: 0_level_0,num_orders
datetime,Unnamed: 1_level_1
2018-03-01 00:00:00,9
2018-03-01 00:10:00,14
2018-03-01 00:20:00,28
2018-03-01 00:30:00,20
2018-03-01 00:40:00,32


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 26496 entries, 2018-03-01 00:00:00 to 2018-08-31 23:50:00
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   num_orders  26496 non-null  int64
dtypes: int64(1)
memory usage: 414.0 KB


Проверим данные на монотонность

In [4]:
if data.index.is_monotonic_increasing:
    print('Монотонный временной ряд, данные распределены от прошлого к будущему.')
else:
    print('Данные в датасете перемешаны во времени.')

Монотонный временной ряд, данные распределены от прошлого к будущему.


Проверим, что в данных нет одинаковых моментов времени

In [11]:
if data.index.is_unique:
    print('Временные метки уникальны')
else:
    print('Временные метки не уникальные')

Временные метки уникальны


In [15]:
data_by_hour = data.resample('1h').sum()

In [19]:
fig = px.histogram(
    data_by_hour,
    title='Распределение количества заказов'
)

fig.update_xaxes(
    title='Количество заказов '
)

fig.update_yaxes(
    title='Количество'
)

fig.update_layout(
    showlegend=False,
    bargap=0.1,
)

fig.show()

In [84]:
def make_features(data, max_lag, rolling_mean_size):
    data['year'] = data.index.year
    data['month'] = data.index.month
    data['day'] = data.index.day
    data['dayofweek'] = data.index.dayofweek

    for lag in range(1, max_lag + 1):
        data['lag_{}'.format(lag)] = data['num_orders'].shift(lag)

    data['rolling_mean'] = data['num_orders'].shift().rolling(rolling_mean_size).mean()
    return data

data_by_hour = make_features(data_by_hour, 200, 200).dropna()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [85]:
train, test = train_test_split(data_by_hour, test_size=.1, shuffle=False)

train_features = train.drop('num_orders', axis=1)
train_target = train['num_orders']

test_features = test.drop('num_orders', axis=1)
test_target = test['num_orders']

In [100]:
model = RandomForestRegressor(max_depth=10, n_estimators=1500, n_jobs=-1)
model.fit(train_features, train_target)
pred = model.predict(test_features)
np.sqrt(mean_squared_error(test_target, pred))

35.24558107109316

In [109]:
train_features

Unnamed: 0_level_0,year,month,day,dayofweek,lag_1,lag_2,lag_3,lag_4,lag_5,rolling_mean,...,lag_491,lag_492,lag_493,lag_494,lag_495,lag_496,lag_497,lag_498,lag_499,lag_500
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-05-11 01:00:00,2018,5,11,4,107.0,71.0,53.0,60.0,75.0,67.605,...,47.0,67.0,39.0,87.0,96.0,47.0,74.0,26.0,7.0,17.0
2018-05-11 02:00:00,2018,5,11,4,76.0,107.0,71.0,53.0,60.0,67.650,...,93.0,47.0,67.0,39.0,87.0,96.0,47.0,74.0,26.0,7.0
2018-05-11 03:00:00,2018,5,11,4,93.0,76.0,107.0,71.0,53.0,67.920,...,96.0,93.0,47.0,67.0,39.0,87.0,96.0,47.0,74.0,26.0
2018-05-11 04:00:00,2018,5,11,4,69.0,93.0,76.0,107.0,71.0,67.880,...,60.0,96.0,93.0,47.0,67.0,39.0,87.0,96.0,47.0,74.0
2018-05-11 05:00:00,2018,5,11,4,24.0,69.0,93.0,76.0,107.0,67.540,...,62.0,60.0,96.0,93.0,47.0,67.0,39.0,87.0,96.0,47.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-08-20 11:00:00,2018,8,20,0,146.0,122.0,94.0,44.0,47.0,127.660,...,179.0,101.0,111.0,122.0,76.0,125.0,78.0,156.0,127.0,91.0
2018-08-20 12:00:00,2018,8,20,0,152.0,146.0,122.0,94.0,44.0,127.740,...,156.0,179.0,101.0,111.0,122.0,76.0,125.0,78.0,156.0,127.0
2018-08-20 13:00:00,2018,8,20,0,91.0,152.0,146.0,122.0,94.0,127.640,...,135.0,156.0,179.0,101.0,111.0,122.0,76.0,125.0,78.0,156.0
2018-08-20 14:00:00,2018,8,20,0,122.0,91.0,152.0,146.0,122.0,127.995,...,186.0,135.0,156.0,179.0,101.0,111.0,122.0,76.0,125.0,78.0


In [103]:
px.scatter(model.feature_importances_).show()