# Idea

Predict returns from prices and volume traded at previous periods using various models.

# Imports

In [9]:
import pandas as pd
import numpy as np

from ipynb.fs.full.utility_functions import (
    get_price_data,
    get_binance_px,
    get_train_test_data,
)
from datetime import datetime

# Get Historical Data For Cryptocurrencies

In [10]:
# The universe is based on this snapshot from December 20, 2020: https://coinmarketcap.com/historical/20201220/

univ = [
    "BTCUSDT", "ETHUSDT", "ADAUSDT", "BNBUSDT", "XRPUSDT", "DOTUSDT", "MATICUSDT", "LTCUSDT", "BCHUSDT",
    "LINKUSDT", "XLMUSDT", "USDCUSDT", "EOSUSDT", "TRXUSDT", "XTZUSDT", "FILUSDT", "NEOUSDT", "DAIUSDT",
    "DASHUSDT", "VETUSDT", "ATOMUSDT", "AAVEUSDT", "UNIUSDT", "GRTUSDT", "THETAUSDT", "IOTAUSDT", "BUSDUSDT",
    "ZECUSDT", "YFIUSDT", "ETCUSDT", "WAVESUSDT", "COMPUSDT", "SNXUSDT", "DOGEUSDT", "MKRUSDT", "ZILUSDT",
    "SUSHIUSDT", "KSMUSDT", "OMGUSDT", "ONTUSDT", "ALGOUSDT", "EGLDUSDT", "BATUSDT", "DGBUSDT", "ZRXUSDT",
    "TUSDUSDT", "QTUMUSDT", "ICXUSDT", "AVAXUSDT", "RENUSDT", "HBARUSDT", "NEARUSDT", "LRCUSDT", "CELOUSDT",
    "KNCUSDT", "LSKUSDT", "OCEANUSDT", "QNTUSDT", "USTUSDT", "BANDUSDT", "MANAUSDT", "ENJUSDT", "ANTUSDT",
    "BNTUSDT", "ZENUSDT", "NMRUSDT", "RVNUSDT", "IOSTUSDT", "OXTUSDT", "CRVUSDT", "MATICUSDT", "HNTUSDT",
    "BALUSDT", "CHZUSDT"
]

# px = get_price_data(univ, '4h', False, './class_project_input_prices.csv')
# px

In [11]:
# popular_univ = univ[["BTCUSDT", "ETHUSDT", "ADAUSDT", "BNBUSDT", "XRPUSDT"]]

In [12]:
from binance.client import Client as bnb_client

client = bnb_client(tld='US')
px = {}
freq = '4h'

univ = ["BTCUSDT"]

for coin in univ:
    print(f"Downloading data for symbol {coin}")
    data = get_binance_px(client, coin, freq)
    px[coin] = data.set_index('open_time')['close']

px = pd.DataFrame(px).astype(float)

Downloading data for symbol BTCUSDT


In [13]:
px

Unnamed: 0_level_0,BTCUSDT
open_time,Unnamed: 1_level_1
2020-12-20 00:00:00,23353.97
2020-12-20 04:00:00,23604.24
2020-12-20 08:00:00,23549.50
2020-12-20 12:00:00,23880.85
2020-12-20 16:00:00,23932.71
...,...
2025-11-04 08:00:00,104402.52
2025-11-04 12:00:00,103256.40
2025-11-04 16:00:00,100426.87
2025-11-04 20:00:00,101752.18


predict close price for row (t, t+1) from:

- (t-1, t): open, high, low, close, volume, quote_volume, num_trades, taker_base_volume, taker_quote_volume, trade hour
- (t-2, t-1): open, high, low, close, volume, quote_volume, num_trades, taker_base_volume, taker_quote_volume, trade hour
- (t-3, t-2): ...
- (t-7, t-6): ... (for seasonality of a day)

In [14]:
def construct_look_back_data(input_data):
    input_data = input_data.drop(columns=["ignore"])
    input_data.set_index('open_time', inplace=True)
    input_data["period_return"] = input_data["close"].astype(float) / input_data["close"].astype(float).shift() - 1
    
    raw_input_data_back_1 = input_data.rename(columns=lambda x: x + "_back_1").shift()
    raw_input_data_back_2 = input_data.rename(columns=lambda x: x + "_back_2").shift(2)
    raw_input_data_back_3 = input_data.rename(columns=lambda x: x + "_back_3").shift(3)
    raw_input_data_back_6 = input_data.rename(columns=lambda x: x + "_back_6").shift(6)
    
    merged_input_data = pd.concat(
        [raw_input_data_back_1, raw_input_data_back_2, raw_input_data_back_3, raw_input_data_back_6], axis=1
    )
    
    original_input_columns = [
        "open", "high", "low", "close", "volume", "quote_volume", "num_trades", "taker_base_volume",
        "taker_quote_volume", "period_return",
    ]

    new_input_columns = [
        *map(lambda x: x + "_back_1", original_input_columns),
        *map(lambda x: x + "_back_2", original_input_columns),
        *map(lambda x: x + "_back_3", original_input_columns),
        *map(lambda x: x + "_back_6", original_input_columns),
    ]
    
    input_x = merged_input_data[new_input_columns]
    input_x["open_hour"] = input_x.index.hour
    input_x = input_x.astype(float)
    input_x = input_x.loc[~np.isnan(input_x).any(axis=1)]
    
    y = input_data["period_return"].loc[input_x.index]
    
    return input_x, y

In [15]:
all_x, all_y = construct_look_back_data(data)

t = datetime(2024, 1, 1, 0, 0)

train_x, test_x = get_train_test_data(all_x, t)
train_y, test_y = get_train_test_data(all_y, t)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_x["open_hour"] = input_x.index.hour


In [16]:
import statsmodels.api as sm
import numpy as np

train_x = sm.add_constant(train_x)
model = sm.OLS(train_y, train_x)
model

<statsmodels.regression.linear_model.OLS at 0x14dc97e10>

In [17]:
results = model.fit()
results.params



const                        1.258418e-03
open_back_1                  1.159774e-05
high_back_1                  9.436773e-08
low_back_1                   5.459281e-07
close_back_1                -2.112234e-06
volume_back_1                2.185433e-05
quote_volume_back_1         -4.668048e-10
num_trades_back_1            1.828596e-08
taker_base_volume_back_1    -4.251484e-05
taker_quote_volume_back_1    9.032085e-10
period_return_back_1         2.878830e-02
open_back_2                  3.914917e-06
high_back_2                  5.995974e-08
low_back_2                   3.620014e-07
close_back_2                -9.174358e-06
volume_back_2                1.461919e-05
quote_volume_back_2          2.849348e-10
num_trades_back_2           -6.508250e-08
taker_base_volume_back_2    -3.109425e-05
taker_quote_volume_back_2   -5.359334e-10
period_return_back_2        -2.424560e-02
open_back_3                  2.182827e-06
high_back_3                  2.368924e-07
low_back_3                  -5.320

In [18]:
results.summary()

0,1,2,3
Dep. Variable:,period_return,R-squared:,0.014
Model:,OLS,Adj. R-squared:,0.008
Method:,Least Squares,F-statistic:,2.321
Date:,"Tue, 04 Nov 2025",Prob (F-statistic):,3.82e-06
Time:,19:22:10,Log-Likelihood:,18876.0
No. Observations:,6633,AIC:,-37670.0
Df Residuals:,6591,BIC:,-37380.0
Df Model:,41,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0013,0.001,1.462,0.144,-0.000,0.003
open_back_1,1.16e-05,8.15e-06,1.423,0.155,-4.37e-06,2.76e-05
high_back_1,9.437e-08,1.29e-07,0.734,0.463,-1.58e-07,3.46e-07
low_back_1,5.459e-07,6.68e-07,0.817,0.414,-7.63e-07,1.86e-06
close_back_1,-2.112e-06,1.19e-06,-1.779,0.075,-4.44e-06,2.15e-07
volume_back_1,2.185e-05,3.7e-05,0.591,0.554,-5.06e-05,9.43e-05
quote_volume_back_1,-4.668e-10,1.08e-09,-0.431,0.667,-2.59e-09,1.66e-09
num_trades_back_1,1.829e-08,1.28e-07,0.143,0.887,-2.33e-07,2.69e-07
taker_base_volume_back_1,-4.251e-05,7.47e-05,-0.569,0.569,-0.000,0.000

0,1,2,3
Omnibus:,1023.073,Durbin-Watson:,2.002
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17766.991
Skew:,0.055,Prob(JB):,0.0
Kurtosis:,11.017,Cond. No.,3520000000.0


In [22]:
test_x = sm.add_constant(test_x)
predicted_test_y = model.predict(results.params, test_x)
predicted_test_y

array([-0.00090643, -0.00110783, -0.00094586, ...,  0.00660139,
        0.00558108, -0.00434695])

In [23]:
predicted_test_y.plot()

AttributeError: 'numpy.ndarray' object has no attribute 'plot'

In [88]:
train_x.shape

(6633, 42)

In [90]:
test_x.shape

(4042, 41)

In [91]:
train_x.columns

Index(['const', 'open_back_1', 'high_back_1', 'low_back_1', 'close_back_1',
       'volume_back_1', 'quote_volume_back_1', 'num_trades_back_1',
       'taker_base_volume_back_1', 'taker_quote_volume_back_1',
       'period_return_back_1', 'open_back_2', 'high_back_2', 'low_back_2',
       'close_back_2', 'volume_back_2', 'quote_volume_back_2',
       'num_trades_back_2', 'taker_base_volume_back_2',
       'taker_quote_volume_back_2', 'period_return_back_2', 'open_back_3',
       'high_back_3', 'low_back_3', 'close_back_3', 'volume_back_3',
       'quote_volume_back_3', 'num_trades_back_3', 'taker_base_volume_back_3',
       'taker_quote_volume_back_3', 'period_return_back_3', 'open_back_6',
       'high_back_6', 'low_back_6', 'close_back_6', 'volume_back_6',
       'quote_volume_back_6', 'num_trades_back_6', 'taker_base_volume_back_6',
       'taker_quote_volume_back_6', 'period_return_back_6', 'open_hour'],
      dtype='object')

In [92]:
test_x.columns

Index(['open_back_1', 'high_back_1', 'low_back_1', 'close_back_1',
       'volume_back_1', 'quote_volume_back_1', 'num_trades_back_1',
       'taker_base_volume_back_1', 'taker_quote_volume_back_1',
       'period_return_back_1', 'open_back_2', 'high_back_2', 'low_back_2',
       'close_back_2', 'volume_back_2', 'quote_volume_back_2',
       'num_trades_back_2', 'taker_base_volume_back_2',
       'taker_quote_volume_back_2', 'period_return_back_2', 'open_back_3',
       'high_back_3', 'low_back_3', 'close_back_3', 'volume_back_3',
       'quote_volume_back_3', 'num_trades_back_3', 'taker_base_volume_back_3',
       'taker_quote_volume_back_3', 'period_return_back_3', 'open_back_6',
       'high_back_6', 'low_back_6', 'close_back_6', 'volume_back_6',
       'quote_volume_back_6', 'num_trades_back_6', 'taker_base_volume_back_6',
       'taker_quote_volume_back_6', 'period_return_back_6', 'open_hour'],
      dtype='object')

In [96]:
set(train_x.columns).difference(set(test_x.columns))

{'const'}