### Предобработка данных и создание новых признаков

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Install a global filter that ignores every warning raised by later cells.
# NOTE(review): this also hides pandas deprecation / SettingWithCopy warnings.
warnings.filterwarnings("ignore")

In [3]:
# Read the raw candle data from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='bitcoin_historical_data.csv')
df.head(n=5)

Unnamed: 0,timestamp,open,high,low,close,volume
0,2017-08-17 04:00:00,4261.48,4280.56,4261.48,4261.48,2.189061
1,2017-08-17 04:15:00,4261.48,4270.41,4261.32,4261.45,9.119865
2,2017-08-17 04:30:00,4280.0,4310.07,4267.99,4310.07,21.923552
3,2017-08-17 04:45:00,4310.07,4313.62,4291.37,4308.83,13.948531
4,2017-08-17 05:00:00,4308.83,4328.69,4304.31,4304.31,5.101153


In [4]:
# Display the (rows, columns) size of the loaded frame.
df.shape

(263384, 6)

In [5]:
# Report missing values per column and the number of fully duplicated rows.
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Only claim "no missing values" when the count really is zero; the message
# used to be printed unconditionally, regardless of the actual data.
total_missing = missing_per_column.sum()
if total_missing == 0:
    print('пропусков в df нет')
else:
    print(f'пропусков в df  - {total_missing}')

print(f'дубликатов в df  - {df.duplicated().sum()}')

timestamp    0
open         0
high         0
low          0
close        0
volume       0
dtype: int64
пропусков в df нет
дубликатов в df  - 0


В данных нет пропусков и дубликатов, начнем с ними работать. Создадим новые признаки.

### Добавление новых признаков

##### Посчитаем скользящие средние для разных временных окон: простую (SMA) — за 1 час, 5 часов и 1 день, экспоненциальную (EMA) — за 1 час и 4 часа.

In [6]:
# Rows are 15-minute candles: 4 rows = 1 hour, 20 rows = 5 hours,
# 24*4 = 96 rows = 1 day.
close_prices = df['close']

# Simple moving averages of the close price.
for column_name, n_bars in (('SMA_1h', 4), ('SMA_5h', 20), ('SMA_1d', 96)):
    df[column_name] = close_prices.rolling(window=n_bars).mean()

# Exponential moving averages of the close price (recursive form, adjust=False).
for column_name, ema_span in (('EMA_1h', 4), ('EMA_4h', 16)):
    df[column_name] = close_prices.ewm(span=ema_span, adjust=False).mean()

Посчитаем стохастик

In [7]:
n = 14  # look-back period (in rows) for %K

# Lowest low and highest high over the last n rows.
df['Low_n'] = df['low'].rolling(window=n).min()
df['High_n'] = df['high'].rolling(window=n).max()

# %K: position of the close inside the n-row high/low range, in percent.
close_above_low = df['close'] - df['Low_n']
window_range = df['High_n'] - df['Low_n']
df['%K'] = close_above_low / window_range * 100

# %D: 3-row simple moving average of %K.
df['%D'] = df['%K'].rolling(window=3).mean()

Посчитаем macd

In [8]:
# EMA spans (in rows) for the fast line, the slow line and the signal line.
short_window, long_window, signal_window = 12, 26, 9

# Fast and slow exponential moving averages of the close price.
fast_ewm = df['close'].ewm(span=short_window, adjust=False)
slow_ewm = df['close'].ewm(span=long_window, adjust=False)
df['EMA_short'] = fast_ewm.mean()
df['EMA_long'] = slow_ewm.mean()

# MACD line = fast EMA minus slow EMA.
df['MACD'] = df['EMA_short'].sub(df['EMA_long'])

# Signal line = EMA of the MACD line; histogram = MACD minus signal.
df['Signal'] = df['MACD'].ewm(span=signal_window, adjust=False).mean()
df['MACD_Histogram'] = df['MACD'].sub(df['Signal'])
df

Unnamed: 0,timestamp,open,high,low,close,volume,SMA_1h,SMA_5h,SMA_1d,EMA_1h,EMA_4h,Low_n,High_n,%K,%D,EMA_short,EMA_long,MACD,Signal,MACD_Histogram
0,2017-08-17 04:00:00,4261.48,4280.56,4261.48,4261.48,2.189061,,,,4261.480000,4261.480000,,,,,4261.480000,4261.480000,0.000000,0.000000,0.000000
1,2017-08-17 04:15:00,4261.48,4270.41,4261.32,4261.45,9.119865,,,,4261.468000,4261.476471,,,,,4261.475385,4261.477778,-0.002393,-0.000479,-0.001915
2,2017-08-17 04:30:00,4280.00,4310.07,4267.99,4310.07,21.923552,,,,4280.908800,4267.193356,,,,,4268.951479,4265.077202,3.874278,0.774473,3.099805
3,2017-08-17 04:45:00,4310.07,4313.62,4291.37,4308.83,13.948531,4285.4575,,,4292.077280,4272.091785,,,,,4275.086636,4268.318150,6.768487,1.973275,4.795211
4,2017-08-17 05:00:00,4308.83,4328.69,4304.31,4304.31,5.101153,4296.1650,,,4296.970368,4275.882163,,,,,4279.582538,4270.984213,8.598326,3.298285,5.300040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263379,2025-02-25 14:00:00,88826.11,89139.99,88656.37,89118.92,733.718220,89011.1425,88701.7185,91897.935104,89051.245785,89037.990797,87618.9,89544.4,77.902882,73.971013,89012.640568,89270.944399,-258.303831,-412.426357,154.122526
263380,2025-02-25 14:15:00,89118.92,89384.65,88578.67,89165.44,714.142890,89039.6300,88727.4905,91833.106250,89096.923471,89052.984821,87618.9,89544.4,80.318878,73.639228,89036.148173,89263.129258,-226.981085,-375.337302,148.356217
263381,2025-02-25 14:30:00,89165.45,89179.43,87860.23,88022.00,1694.966330,88783.1175,88680.1315,91758.955208,88666.954083,88931.692489,87618.9,89544.4,20.934822,59.718861,88880.125377,89171.193757,-291.068380,-358.483518,67.415138
263382,2025-02-25 14:45:00,88022.01,88120.00,87077.00,87157.25,2070.201630,88365.9025,88622.8075,91680.651146,88063.072450,88722.934549,87077.0,89544.4,3.252411,34.835371,88615.067627,89022.012738,-406.945112,-368.175837,-38.769275


Посчитаем ценовой осциллятор

In [9]:
# Percentage price oscillator: the fast/slow EMA spread expressed as a
# percentage of the slow EMA. The plain difference EMA_short - EMA_long is
# already stored in the 'MACD' column, so repeating it here would only add an
# exact duplicate (perfectly collinear) feature.
df['Price_oscillator'] = (df['EMA_short'] - df['EMA_long']) / df['EMA_long'] * 100

Посчитаем RSI.

In [10]:
# Row-to-row change of the close price.
delta = df['close'].diff()

# Split the changes into upward and downward moves; rows that do not match
# (including the leading NaN) are replaced by 0.
upward_moves = delta.where(delta > 0, 0)
downward_moves = -delta.where(delta < 0, 0)

# Simple 16-row averages of the gains and of the losses.
gain = upward_moves.rolling(window=16).mean()
loss = downward_moves.rolling(window=16).mean()

# Relative strength and the resulting index on a 0..100 scale.
rs = gain / loss
df['RSI'] = 100 - (100 / (1 + rs))

Для полос Боллинджера тоже потребуются новые признаки.

In [11]:
# Rolling standard deviation of the close over 20 rows (same window as SMA_5h).
df['Std_5h'] = df['close'].rolling(window=20).std()

# Bands sit two standard deviations above and below the 5-hour SMA.
band_offset = 2 * df['Std_5h']
df['UpperBand_5h'] = df['SMA_5h'] + band_offset
df['LowerBand_5h'] = df['SMA_5h'] - band_offset

# Band width as a percentage of the 5-hour SMA.
band_spread = df['UpperBand_5h'] - df['LowerBand_5h']
df['Bandwidth_5h'] = band_spread / df['SMA_5h'] * 100

##### Посчитаем ATR.

In [12]:
# True range: the largest of (high - low), |high - previous close| and
# |low - previous close|. np.maximum propagates NaN, so the first row
# (which has no previous close) stays NaN.
previous_close = df['close'].shift()
candle_range = df['high'] - df['low']
gap_from_high = (df['high'] - previous_close).abs()
gap_from_low = (df['low'] - previous_close).abs()
df['TR'] = np.maximum(candle_range, np.maximum(gap_from_high, gap_from_low))

# Average true range over 4 rows (1 hour) and 16 rows (4 hours).
for column_name, n_bars in (('ATR_1h', 4), ('ATR_4h', 16)):
    df[column_name] = df['TR'].rolling(window=n_bars).mean()

Добавим также признаки на основе времени.

In [13]:
# Calendar date of every candle, stored as a datetime column (time part dropped).
calendar_days = pd.to_datetime(df['timestamp']).dt.date
df['date'] = pd.to_datetime(calendar_days)

In [14]:
# Day of the week as an integer, Monday = 0 ... Sunday = 6.
candle_times = pd.to_datetime(df['timestamp'])
df['day_of_week'] = candle_times.dt.weekday

Посмотрим на изменения цен за промежутки времени (15 минут, час, 4 часа), волатильность, а также диапазон цен за период.

In [15]:
close_prices = df['close']

# Relative close-price change over 1, 4 and 16 rows (15 minutes, 1 hour, 4 hours).
for column_name, n_bars in (
    ('price_change_15m', 1),
    ('price_change_1h', 4),
    ('price_change_4h', 16),
):
    df[column_name] = close_prices.pct_change(periods=n_bars)

# Volatility: rolling standard deviation of the close price.
for column_name, n_bars in (('volatility_1h', 4), ('volatility_4h', 16)):
    df[column_name] = close_prices.rolling(window=n_bars).std()

# High/low range of a single candle and of the last 4 candles.
df['high_low_diff_15m'] = df['high'] - df['low']
hourly_high = df['high'].rolling(window=4).max()
hourly_low = df['low'].rolling(window=4).min()
df['high_low_diff_1h'] = hourly_high - hourly_low

Добавим ставку ФРС в момент сделки.

In [16]:
# Load the Fed rate history, parse its date column and give the rate column
# a short ASCII name.
fed_rates = pd.read_excel('сша-учётная-ставка-фрс.xlsx')
fed_rates['date'] = pd.to_datetime(fed_rates['Дата'])
fed_rates = fed_rates.drop(columns=['Дата'])
fed_rates = fed_rates.sort_values('date')
fed_rates = fed_rates.rename(columns={'США учётная ставка ФРС': 'FRS_rate'})

# Make sure the candle timestamps are real datetimes as well.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Integer copies of both datetime columns are used as the merge keys.
df['timestamp_ns'] = df['timestamp'].astype('int64')
fed_rates['date_ns'] = fed_rates['date'].astype('int64')

# merge_asof requires both frames to be sorted by their key.
df = df.sort_values('timestamp_ns')
fed_rates = fed_rates.sort_values('date_ns')

# For every candle take the latest rate row dated at or before it.
# Both frames carry a 'date' column, so the result holds 'date_x' / 'date_y'.
df = pd.merge_asof(
    left=df,
    right=fed_rates,
    left_on='timestamp_ns',
    right_on='date_ns',
    direction='backward',
)

Добавим также целевые признаки, которые будем прогнозировать: цену через 15 минут и изменение цены через 15 минут (в процентах, в абсолютном выражении и знак).

In [17]:
# Targets describe the NEXT 15-minute candle relative to the current one.

# Close price of the next row.
df['target_price_15m'] = df['close'].shift(-1)

# Relative change from the current close to the next close. A one-row
# pct_change is required here: pct_change(4) would measure a 1-hour change
# (from t-3 to t+1), which does not match the 15-minute target.
df['target_change_15m_pct'] = df['close'].pct_change().shift(-1)

# Absolute change from the current close to the next close.
df['target_change_15m_abs'] = df['close'].shift(-1) - df['close']

# Direction of the change: -1, 0 or 1. np.sign returns 0 when the price does
# not move, whereas x / |x| gives 0/0 = NaN and the later dropna() would
# silently remove those rows. The last row has no next close and stays NaN.
df['target_change_ind_15m'] = np.sign(df['target_change_15m_abs'])

Удалим строки, в которых есть NaN: первые строки (скользящие окна ещё не заполнены) и последнюю (для неё нет значения цены через 15 минут).

In [18]:
# Keep only the rows where every column has a value.
df = df.dropna(axis=0, how='any')

In [19]:
# Remove the merge helper columns and restore the original name of the
# candle date column, which the merge renamed to 'date_x'.
merge_helper_columns = ['timestamp_ns', 'date_ns', 'date_y']
df = df.drop(columns=merge_helper_columns)
df = df.rename(columns={'date_x': 'date'})

In [20]:
# Display the final list of column names.
df.columns

Index(['timestamp', 'open', 'high', 'low', 'close', 'volume', 'SMA_1h',
       'SMA_5h', 'SMA_1d', 'EMA_1h', 'EMA_4h', 'Low_n', 'High_n', '%K', '%D',
       'EMA_short', 'EMA_long', 'MACD', 'Signal', 'MACD_Histogram',
       'Price_oscillator', 'RSI', 'Std_5h', 'UpperBand_5h', 'LowerBand_5h',
       'Bandwidth_5h', 'TR', 'ATR_1h', 'ATR_4h', 'date', 'day_of_week',
       'price_change_15m', 'price_change_1h', 'price_change_4h',
       'volatility_1h', 'volatility_4h', 'high_low_diff_15m',
       'high_low_diff_1h', 'FRS_rate', 'target_price_15m',
       'target_change_15m_pct', 'target_change_15m_abs',
       'target_change_ind_15m'],
      dtype='object')

Мы получили таблицу с нужными нам признаками, с ней в дальнейшем будем работать.

In [21]:
# Write the prepared feature table to disk without the row index.
df.to_csv("data.csv", encoding="utf-8", index=False)