In [35]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from statsmodels.graphics.tsaplots import plot_acf
import  matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [36]:
# Load the BTC candle export and key every row by the bar's close timestamp.
csv_path = "../BTC_1_year_data.csv"
df = pd.read_csv(csv_path)
df = df.set_index("close_time")
# Trailing expression: render the (truncated) frame as the cell output.
df

Unnamed: 0_level_0,open,high,low,close,volume
close_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-11-30 15:29:59.999000+00:00,96602.51,96659.50,96532.00,96659.50,133.45453
2024-11-30 15:44:59.999000+00:00,96659.49,96691.69,96602.01,96634.28,117.43398
2024-11-30 15:59:59.999000+00:00,96634.28,96732.15,96602.67,96645.41,85.14427
2024-11-30 16:14:59.999000+00:00,96645.42,96757.06,96615.52,96652.01,106.03529
2024-11-30 16:29:59.999000+00:00,96652.00,96652.01,96542.70,96555.42,241.85001
...,...,...,...,...,...
2025-11-30 14:14:59.999000+00:00,91770.00,91779.94,91641.05,91682.73,57.57016
2025-11-30 14:29:59.999000+00:00,91682.74,91707.30,91483.61,91554.20,117.52310
2025-11-30 14:44:59.999000+00:00,91554.19,91583.61,91256.88,91359.92,236.73655
2025-11-30 14:59:59.999000+00:00,91359.93,91499.99,91336.28,91499.99,67.62531


#### Feature Engineering

In [37]:
# Log return of each bar's close relative to the previous bar's close.
# The first row has no predecessor, so its return is NaN.
previous_close = df["close"].shift(1)
df["close_log_return"] = np.log(df["close"].div(previous_close))

    Create lagged features

In [38]:
# Rebind df to an independent deep copy of itself before adding more columns.
# NOTE(review): presumably here to avoid SettingWithCopy/fragmentation warnings -- confirm.
df = df.copy(deep=True)

In [39]:
# Add the log returns of the previous one, two and three bars as columns
# close_log_returns_lag_1 .. close_log_returns_lag_3 (created in that order).
for lag in (1, 2, 3):
    df[f"close_log_returns_lag_{lag}"] = df["close_log_return"].shift(lag)

In [40]:
# Drop every row that contains a missing value -- in particular the leading
# rows whose return or lagged returns are undefined after shifting.
df = df.dropna(axis=0, how="any")
# NOTE: the raw open/high/low/close/volume columns are still in the frame.
# They describe the same bar as the return being predicted, so they must not
# be passed to a model as features.

    Create binary classification target
        - 1 => Long => Price moves up
        - 0 => Short => Price moves down or is unchanged

In [41]:
# Label each bar 1 (long) when its log return is strictly positive, else 0
# (short); a flat bar with a return of exactly 0 therefore lands in class 0.
is_up = df["close_log_return"].gt(0)
df["close_log_return_dir"] = is_up.astype("int64")
# The raw return determines the label exactly, so it is removed from the frame.
df = df.drop(columns=["close_log_return"])

In [42]:
# Inspect the engineered frame: OHLCV columns, the three lagged log returns,
# and the binary direction label as the last column.
df

Unnamed: 0_level_0,open,high,low,close,volume,close_log_returns_lag_1,close_log_returns_lag_2,close_log_returns_lag_3,close_log_return_dir
close_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-11-30 16:29:59.999000+00:00,96652.00,96652.01,96542.70,96555.42,241.85001,0.000068,0.000115,-0.000261,0
2024-11-30 16:44:59.999000+00:00,96555.42,96635.13,96531.35,96600.00,173.06312,-0.001000,0.000068,0.000115,1
2024-11-30 16:59:59.999000+00:00,96599.99,96960.00,96599.99,96871.31,203.06441,0.000462,-0.001000,0.000068,1
2024-11-30 17:14:59.999000+00:00,96871.31,96964.00,96825.02,96831.92,135.92532,0.002805,0.000462,-0.001000,0
2024-11-30 17:29:59.999000+00:00,96831.92,96935.95,96803.94,96869.71,113.49308,-0.000407,0.002805,0.000462,1
...,...,...,...,...,...,...,...,...,...
2025-11-30 14:14:59.999000+00:00,91770.00,91779.94,91641.05,91682.73,57.57016,0.000886,-0.001771,0.004175,0
2025-11-30 14:29:59.999000+00:00,91682.74,91707.30,91483.61,91554.20,117.52310,-0.000951,0.000886,-0.001771,0
2025-11-30 14:44:59.999000+00:00,91554.19,91583.61,91256.88,91359.92,236.73655,-0.001403,-0.000951,0.000886,0
2025-11-30 14:59:59.999000+00:00,91359.93,91499.99,91336.28,91499.99,67.62531,-0.002124,-0.001403,-0.000951,1


    Check class imbalance

In [43]:
# Count how many bars fall into each direction class (0 = short, 1 = long).
class_counts = df["close_log_return_dir"].value_counts()
print(class_counts)

close_log_return_dir
0    17559
1    17477
Name: count, dtype: int64


#### Split data into training and testing sets in temporal order (no shuffling)

In [45]:
# Model inputs are restricted to information available before the bar being
# predicted has closed: the three lagged log returns.
#
# The same-bar open/high/low/close/volume columns are deliberately excluded.
# The label is the sign of this bar's close-to-close return, and this bar's
# open is essentially the previous close, so feeding in this bar's prices
# would leak the answer into the features (look-ahead bias).
FEATURE_COLUMNS = [
    "close_log_returns_lag_1",
    "close_log_returns_lag_2",
    "close_log_returns_lag_3",
]
TARGET_COLUMN = "close_log_return_dir"

# Select by name rather than by position so that adding or reordering columns
# upstream cannot silently change what the model sees.
X = df[FEATURE_COLUMNS]
y = df[TARGET_COLUMN]

In [48]:
# Chronological hold-out: with shuffle=False the first 80% of rows become the
# training set and the final 20% the test set, preserving row order.
test_fraction = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_fraction,
    shuffle=False,
)