In [5]:
import requests
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
from lightgbm import LGBMRegressor

# ------------------------
# 1. Data Fetching Function
# ------------------------
def get_historical_data(symbol, interval='1d', start_date=None, end_date=None, limit=1000):
    url = 'https://api.binance.com/api/v3/klines'
    params = {
        'symbol': symbol,
        'interval': interval,
        'limit': limit
    }
    
    if start_date:
        params['startTime'] = int(pd.Timestamp(start_date).timestamp() * 1000)
    if end_date:
        params['endTime'] = int(pd.Timestamp(end_date).timestamp() * 1000)
    
    response = requests.get(url, params=params)
    if response.status_code != 200:
        raise Exception(f'Error fetching data: {response.status_code} - {response.text}')
    
    data = response.json()
    df = pd.DataFrame(data, columns=[
        'timestamp', 'open', 'high', 'low', 'close', 'volume',
        'close_time', 'quote_asset_volume', 'number_of_trades',
        'taker_buy_base_asset_volume', 'taker_buy_quote_asset_volume', 'ignore'
    ])
    
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
    df.set_index('timestamp', inplace=True)
    df.drop(columns=['ignore'], inplace=True)
    df = df.apply(pd.to_numeric, errors='coerce')
    
    return df

# ------------------------
# 2. Data Preparation
# ------------------------
# Parameters for data fetching
symbol = 'ETHUSDT'
interval = '1d'
start_date = '2022-06-01'
end_date = '2025-02-06'

# Fetch and prepare data
df = get_historical_data(symbol, interval, start_date, end_date)
df['target'] = df['close'].shift(-1)
df.dropna(inplace=True)



In [6]:
# Define feature columns and target
x_cols = ['open', 'high', 'low', 'volume', 
          'quote_asset_volume', 'number_of_trades', 
          'taker_buy_base_asset_volume', 'taker_buy_quote_asset_volume']
y_col = 'target'

X = df[x_cols]
y = df[y_col]

# Split data into training and testing sets (80% training, 20% testing)
train_size = int(len(X) * 0.8)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# Scale the feature data using MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [7]:

# ------------------------
# 3. Model Training and Evaluation using LightGBM's Native API
# ------------------------
train_data = lgb.Dataset(X_train_scaled, label=y_train)
test_data = lgb.Dataset(X_test_scaled, label=y_test, reference=train_data)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
}


In [8]:
# Set early stopping callback
early_stopping_callback = lgb.early_stopping(stopping_rounds=10)

num_round = 200

print("Training using lgb.train...")

bst = lgb.train(params, train_data, num_round, valid_sets=[test_data], callbacks=[early_stopping_callback])

# ------------------------
# 4. Model Training and Evaluation using LGBMRegressor (scikit-learn API)
# ------------------------
model = LGBMRegressor(metric='rmse', 
                      n_estimators=num_round, 
                      learning_rate=0.05, 
                      num_leaves=31, 
                      colsample_bytree=0.9)

print("\nTraining using LGBMRegressor...")
model.fit(X_train_scaled, y_train)

test_preds = model.predict(X_test_scaled)

rmse = np.sqrt(mean_squared_error(y_test, test_preds))
mae = mean_absolute_error(y_test, test_preds)

print("\nEvaluation using LGBMRegressor:")
print("RMSE: ", rmse)
print("MAE: ", mae)


Training using lgb.train...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000520 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 784, number of used features: 8
[LightGBM] [Info] Start training from score 2047.107753
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[78]	valid_0's rmse: 139.465

Training using LGBMRegressor...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 784, number of used features: 8
[LightGBM] [Info] Start training from score 2047.107753

Evaluation using LGBMRegressor:
RMSE:  144.70395154539855
MAE:  113.08163933261503
