In [3]:
# Load the SOL funding history (one row per hour in the preview below),
# parse `time` as datetimes, and order the rows chronologically so the
# shift/rolling steps in later cells walk forward in time.
import pandas as pd

df = (
    pd.read_csv("../historic_data/SOL_funding_data.csv", parse_dates=["time"])
    .sort_values("time")
)
df.head()

Unnamed: 0,time,coin,funding,open_interest,prev_day_px,day_ntl_vlm,premium,oracle_px,mark_px,mid_px,impact_bid_px,impact_ask_px
0,2025-03-01 00:00:00+00:00,SOL,1.3e-05,2365018.64,137.75,1124200000.0,-0.00036,148.14,148.05,148.045,148.0343,148.0866
1,2025-03-01 01:00:00+00:00,SOL,1.3e-05,2214896.94,136.02,1161692000.0,-0.000343,145.96,145.91,145.905,145.9,145.91
2,2025-03-01 02:00:00+00:00,SOL,-1.2e-05,2245580.74,131.5,1105550000.0,-0.000599,150.22,150.11,150.125,150.12,150.13
3,2025-03-01 03:00:00+00:00,SOL,1.3e-05,2177874.8,129.15,1079674000.0,-0.000136,147.23,147.18,147.205,147.2,147.21
4,2025-03-01 04:00:00+00:00,SOL,1.1e-05,2160393.2,127.64,1048013000.0,-0.000412,145.74,145.68,145.675,145.67,145.68


In [4]:
# Regression target: mean funding rate over the NEXT n rows (t+1 .. t+n).
n = 24 * 10  # 10 days ahead, assuming one row per hour

# Take the trailing n-row mean first, then pull it back by n rows, so row t
# holds mean(funding[t+1 .. t+n]). Doing it in the opposite order
# (shift, then rolling) yields the same values but leaves the first n-1 rows
# NaN, because the rolling window is not yet full there even though the
# future data exists -- which silently throws away ~10 days of history.
df['future_10d_mean'] = df['funding'].rolling(n).mean().shift(-n)

# Removes the last n rows (no complete future window) and any row that has a
# missing value in another column.
df = df.dropna()

In [5]:
# Trailing-mean features: for every raw signal, the rolling mean over the
# last 1, 7 and 21 days. Windows are expressed in rows (24 rows per day,
# assuming one row per hour).
ROLLING_WINDOWS = {'1d': 24, '7d': 24 * 7, '21d': 24 * 21}

# feature-name prefix -> source column in df
FEATURE_SOURCES = {
    'funding': 'funding',
    'premium': 'premium',
    'oi': 'open_interest',
    'px': 'mark_px',
    'vol': 'day_ntl_vlm',
}

# Driving every feature from the same window table keeps the horizons
# consistent: the hand-written version used 508 rows instead of 504 for the
# 21-day price and volume averages.
for prefix, source_col in FEATURE_SOURCES.items():
    for suffix, window in ROLLING_WINDOWS.items():
        df[f'{prefix}_avg_{suffix}'] = df[source_col].rolling(window).mean()

# The longest window is 504 rows, so the first 503 remaining rows have
# incomplete features and are dropped here.
df = df.dropna()

In [6]:
# Assemble the feature matrix and target, split chronologically, and fit a
# gradient-boosting regressor. Requires scikit-learn to be installed in the
# kernel's environment.
features = [
    'funding_avg_1d', 'funding_avg_7d', 'funding_avg_21d',
    'premium_avg_1d', 'premium_avg_7d', 'premium_avg_21d',
    'oi_avg_1d', 'oi_avg_7d', 'oi_avg_21d',
    'px_avg_1d', 'px_avg_7d', 'px_avg_21d',
    'vol_avg_1d', 'vol_avg_7d', 'vol_avg_21d',
]
# Drop any missing values
df = df.dropna(subset=features + ['future_10d_mean'])

TEST_ROWS = 24 * 90     # last ~3 months held out (assuming hourly rows)
EMBARGO_ROWS = 24 * 10  # must equal the look-ahead of 'future_10d_mean'

# Fail early with a clear message instead of fitting on an empty frame.
if len(df) <= TEST_ROWS + EMBARGO_ROWS:
    raise ValueError(
        f"Need more than {TEST_ROWS + EMBARGO_ROWS} usable rows for the "
        f"train/test split, got {len(df)}"
    )

# Chronological split with an embargo gap. The target of a row averages the
# funding of the following EMBARGO_ROWS rows, so the training rows directly
# before the test period would carry labels computed from test-period data.
# Leaving those rows out of training removes that look-ahead leak.
test = df.iloc[-TEST_ROWS:]
train = df.iloc[:-(TEST_ROWS + EMBARGO_ROWS)]

X_train = train[features]
y_train = train['future_10d_mean']
X_test = test[features]
y_test = test['future_10d_mean']

# Train model
from sklearn.ensemble import GradientBoostingRegressor

# Fixed seed so a Restart & Run All reproduces the same fitted model.
model = GradientBoostingRegressor(random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

ModuleNotFoundError: No module named 'sklearn'

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Score the predictions against the actual forward means over the whole
# held-out test set: mean squared error and mean absolute error.
abs_err = np.abs(y_test - y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = abs_err.mean()
print(f"Test MSE: {mse:.8f}")
print(f"Test MAE: {mae:.8f}")

In [None]:
import matplotlib.pyplot as plt

# Overlay the actual and predicted 10-day mean funding rate across the test
# period on a single time axis.
fig, ax = plt.subplots(figsize=(12, 6))

for values, label, color in (
    (y_test, 'Actual 10-day mean', 'blue'),
    (y_pred, 'Predicted 10-day mean', 'orange'),
):
    ax.plot(test['time'], values, label=label, color=color)

ax.legend()
ax.set_title('10-Day Mean Funding Rate Prediction vs Actual (Test Set)')
ax.set_xlabel('Time')
ax.set_ylabel('10-Day Mean Funding Rate')
plt.show()
