### Предобработка данных и создание новых признаков

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including library deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [3]:
# Load the raw candle data (timestamp, open, high, low, close, volume)
# and preview the first rows.
DATA_PATH = 'bitcoin_historical_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,timestamp,open,high,low,close,volume
0,2017-08-17 04:00:00,4261.48,4280.56,4261.48,4261.48,2.189061
1,2017-08-17 04:15:00,4261.48,4270.41,4261.32,4261.45,9.119865
2,2017-08-17 04:30:00,4280.0,4310.07,4267.99,4310.07,21.923552
3,2017-08-17 04:45:00,4310.07,4313.62,4291.37,4308.83,13.948531
4,2017-08-17 05:00:00,4308.83,4328.69,4304.31,4304.31,5.101153


In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(263384, 6)

In [5]:
# Data-quality report: missing values per column and fully duplicated rows.
missing_per_column = df.isnull().sum()
print(f'{missing_per_column}')

# Derive the summary line from the data instead of hard-coding it, so the
# printed claim can never contradict the per-column counts shown above.
total_missing = missing_per_column.sum()
if total_missing == 0:
    print('пропусков в df нет')
else:
    print(f'пропусков в df  - {total_missing}')

print(f'дубликатов в df  - {df.duplicated().sum()}')

timestamp    0
open         0
high         0
low          0
close        0
volume       0
dtype: int64
пропусков в df нет
дубликатов в df  - 0


В данных нет пропусков и дубликатов, начнем с ними работать. Создадим новые признаки.

Посчитаем скользящие средние для разных временных окон: простую (SMA) — за час, 4 часа и 1 день, экспоненциальную (EMA) — за час и 4 часа.

In [7]:
# Look-back windows expressed in 15-minute bars; one day is 24 * 4 = 96 bars.
WINDOW_BARS = {'1h': 4, '4h': 16, '1d': 96}

# Simple moving averages of the close price for every window.
for label, bars in WINDOW_BARS.items():
    df[f'SMA_{label}'] = df['close'].rolling(window=bars).mean()

# Exponential moving averages (recursive form, adjust=False) for the two
# intraday windows only; no daily EMA is produced.
for label in ('1h', '4h'):
    df[f'EMA_{label}'] = df['close'].ewm(span=WINDOW_BARS[label], adjust=False).mean()

Посчитаем RSI.

In [8]:
# RSI over a 16-bar window, built from simple rolling means of gains and losses.
RSI_WINDOW = 16

delta = df['close'].diff()

# Split bar-to-bar changes into upward and downward moves; anything that is not
# strictly positive (resp. negative), including the leading NaN, counts as 0.
upward_moves = delta.where(delta > 0, 0)
downward_moves = -delta.where(delta < 0, 0)

gain = upward_moves.rolling(window=RSI_WINDOW).mean()
loss = downward_moves.rolling(window=RSI_WINDOW).mean()

# Relative strength; a window without losses divides by zero here.
rs = gain / loss
df['RSI'] = 100 - 100 / (1 + rs)

Посчитаем ATR.

In [9]:
# True range: the largest of the bar's own high-low span and the distances
# from the previous close to the current high and to the current low.
previous_close = df['close'].shift()
bar_span = df['high'] - df['low']
high_to_prev_close = (df['high'] - previous_close).abs()
low_to_prev_close = (df['low'] - previous_close).abs()
df['TR'] = np.maximum(bar_span, np.maximum(high_to_prev_close, low_to_prev_close))

# Average true range as a simple rolling mean over 4 and 16 bars.
for label, bars in (('1h', 4), ('4h', 16)):
    df[f'ATR_{label}'] = df['TR'].rolling(window=bars).mean()

Добавим также признак на основе времени — день недели (вдруг пригодится).

In [10]:
# Parse the timestamp strings and store the weekday number of each bar.
parsed_timestamps = pd.to_datetime(df['timestamp'])
df['day_of_week'] = parsed_timestamps.dt.dayofweek

Посмотрим на изменения цен за промежутки времени (15 минут, час, 4 часа), волатильность, а также диапазон цен за период.

In [12]:
close_prices = df['close']

# Relative close-to-close change over 1, 4 and 16 bars.
for label, bars in (('15m', 1), ('1h', 4), ('4h', 16)):
    df[f'price_change_{label}'] = close_prices.pct_change(periods=bars)

# Rolling standard deviation of the close price itself (price units, not returns).
for label, bars in (('1h', 4), ('4h', 16)):
    df[f'volatility_{label}'] = close_prices.rolling(window=bars).std()

# High-low range of the current bar, and of the last 4 bars taken together.
df['high_low_diff_15m'] = df['high'] - df['low']
rolling_high_1h = df['high'].rolling(window=4).max()
rolling_low_1h = df['low'].rolling(window=4).min()
df['high_low_diff_1h'] = rolling_high_1h - rolling_low_1h

Добавим также целевые признаки, которые, скорее всего, будем прогнозировать, — цену через 4 и 16 интервалов (час и 4 часа) и относительное изменение цены за те же горизонты.

In [13]:
# Forecast horizons in bars. Negative shifts pull future values back onto the
# current row, so the last `bars` rows of each target column are NaN.
target_horizons = (('1h', 4), ('4h', 16))

# Future close price at each horizon.
for label, bars in target_horizons:
    df[f'target_price_{label}'] = df['close'].shift(-bars)

# Relative change from the current close to the close `bars` steps ahead.
for label, bars in target_horizons:
    df[f'target_change_{label}'] = df['close'].pct_change(bars).shift(-bars)

In [14]:
# Preview the first rows again, now with the engineered feature and target columns.
df.head()

Unnamed: 0,timestamp,open,high,low,close,volume,SMA_1h,SMA_4h,SMA_1d,EMA_1h,...,price_change_1h,price_change_4h,volatility_1h,volatility_4h,high_low_diff_15m,high_low_diff_1h,target_price_1h,target_price_4h,target_change_1h,target_change_4h
0,2017-08-17 04:00:00,4261.48,4280.56,4261.48,4261.48,2.189061,,,,4261.48,...,,,,,19.08,,4304.31,4360.71,0.01005,0.023285
1,2017-08-17 04:15:00,4261.48,4270.41,4261.32,4261.45,9.119865,,,,4261.468,...,,,,,9.09,,4320.0,4360.7,0.013739,0.02329
2,2017-08-17 04:30:00,4280.0,4310.07,4267.99,4310.07,21.923552,,,,4280.9088,...,,,,,42.08,,4291.37,4360.69,-0.004339,0.011745
3,2017-08-17 04:45:00,4310.07,4313.62,4291.37,4308.83,13.948531,4285.4575,,,4292.07728,...,,,27.70878,,22.25,52.3,4315.32,4360.69,0.001506,0.012036
4,2017-08-17 05:00:00,4308.83,4328.69,4304.31,4304.31,5.101153,4296.165,,,4296.970368,...,0.01005,,23.275334,,24.38,67.37,4330.0,4360.0,0.005968,0.012938
