In [28]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

In [1]:
import sys
import os

sys.path.append(os.path.abspath("../")) 

In [None]:
import pandas as pd
import numpy as np
import json
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from src.metric_logging import log_experiment

In [27]:
# Load merged dataset
file_path = "../datasets/final_merged_dataset.parquet"
merged_data = pd.read_parquet(file_path)

# Ensure 'date' column is in datetime format
merged_data['date'] = pd.to_datetime(merged_data['date'])
merged_data.set_index('date', inplace=True)

# Select structured features: Bitcoin Price, Sentiment, and Volume
feature_columns = ['Close', 'Open', 'High', 'Low', 'Volume', 'sentiment_score', 'bert_sentiment']
data = merged_data[feature_columns]

# Drop NaN values
data.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(inplace=True)


In [32]:
data.head()
data.shape

Unnamed: 0_level_0,Close,Open,High,Low,Volume,sentiment_score,bert_sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-01,0.262398,0.255213,0.254063,0.260019,0.096184,0.0,0.0
2021-01-02,0.315568,0.262601,0.32172,0.265682,0.175237,0.0,0.0
2021-01-03,0.328213,0.31578,0.349385,0.323992,0.2067,0.0,0.0
2021-01-04,0.312568,0.328945,0.327147,0.258427,0.213978,0.0,0.0
2021-01-05,0.35159,0.312836,0.346131,0.287934,0.17431,0.0,0.0


(761, 7)

In [45]:
# Create lag features (past N days as features)
N = 5  # Lookback window
for i in range(1, N+1):
    data[f'Close_lag_{i}'] = data['Close'].shift(i)

# Create rolling averages
data['rolling_avg_7'] = data['Close'].rolling(window=7).mean()
data['rolling_avg_14'] = data['Close'].rolling(window=14).mean()
data['rolling_avg_30'] = data['Close'].rolling(window=30).mean()

# Bollinger Bands (Volatility Measure)
data['bollinger_high'] = data['rolling_avg_14'] + (2 * data['Close'].rolling(window=14).std())
data['bollinger_low'] = data['rolling_avg_14'] - (2 * data['Close'].rolling(window=14).std())

# Drop NaN values after feature creation
data.dropna(inplace=True)

In [36]:
data.head()
data.shape

Unnamed: 0_level_0,Close,Open,High,Low,Volume,sentiment_score,bert_sentiment,Close_lag_1,Close_lag_2,Close_lag_3,Close_lag_4,Close_lag_5,rolling_avg_7,rolling_avg_14,rolling_avg_30,bollinger_high,bollinger_low
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-01-30,0.356941,0.357631,0.35369,0.341475,0.167302,0.0,0.0,0.357846,0.341425,0.282839,0.324116,0.320186,0.328865,0.342974,0.367506,0.41838,0.267567
2021-01-31,0.334632,0.357147,0.34329,0.328282,0.131214,0.0,0.0,0.356941,0.357846,0.341425,0.282839,0.324116,0.331141,0.339281,0.369913,0.410488,0.268074
2021-02-01,0.342797,0.33481,0.34995,0.330527,0.156402,0.0,0.0,0.334632,0.356941,0.357846,0.341425,0.282839,0.334371,0.335014,0.370821,0.396373,0.273655
2021-02-02,0.380903,0.342897,0.373908,0.352287,0.161321,0.0,0.0,0.342797,0.334632,0.356941,0.357846,0.341425,0.342483,0.334242,0.372577,0.392734,0.275751
2021-02-03,0.418791,0.381099,0.404045,0.390779,0.155722,0.0,0.0,0.380903,0.342797,0.334632,0.356941,0.357846,0.361905,0.336897,0.376118,0.406894,0.2669


(732, 17)

In [46]:
# Define input features (X) and target variable (y)
X = data.drop(columns=['Close'])  # Features
y = data['Close'].shift(-1)  # Shifted to predict next day's close price

# Drop last row from both X and y to maintain equal length
X = X.iloc[:-1]  # Remove last row from X
y = y.dropna()  # Drop NaN row from y

# Print new shapes
print(f"Fixed X shape: {X.shape}, Fixed y shape: {y.shape}")

Fixed X shape: (702, 16), Fixed y shape: (702,)


In [47]:
X.head()
X.shape
y.head()
y.shape

Unnamed: 0_level_0,Open,High,Low,Volume,sentiment_score,bert_sentiment,Close_lag_1,Close_lag_2,Close_lag_3,Close_lag_4,Close_lag_5,rolling_avg_7,rolling_avg_14,rolling_avg_30,bollinger_high,bollinger_low
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2021-02-28,0.587468,0.579851,0.544327,0.133223,0.133245,0.607697,0.587127,0.590049,0.604613,0.655047,0.638035,0.626243,0.671771,0.573171,0.829085,0.514458
2021-03-01,0.567484,0.63824,0.581219,0.134526,0.0,0.0,0.566836,0.587127,0.590049,0.604613,0.655047,0.613618,0.674097,0.58306,0.829127,0.519068
2021-03-02,0.653496,0.644779,0.622842,0.115996,0.0,0.0,0.653616,0.566836,0.587127,0.590049,0.604613,0.612389,0.672965,0.592887,0.829128,0.516802
2021-03-03,0.630387,0.690606,0.643429,0.132573,0.0,0.0,0.629432,0.653616,0.566836,0.587127,0.590049,0.614687,0.670743,0.603831,0.825995,0.515492
2021-03-04,0.671078,0.675378,0.631272,0.130018,0.0,0.0,0.671133,0.629432,0.653616,0.566836,0.587127,0.618735,0.666441,0.612233,0.822351,0.510531


(702, 16)

date
2021-02-28    0.653616
2021-03-01    0.629432
2021-03-02    0.671133
2021-03-03    0.632950
2021-03-04    0.640021
Name: Close, dtype: float64

(702,)

In [48]:
# Split 80% Train, 20% Test
train_size = int(len(data) * 0.8)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# Print dataset sizes
print(f"Train Size: {len(X_train)}, Test Size: {len(X_test)}")

Train Size: 562, Test Size: 140


In [49]:
print(f"X_test index range: {X_test.index.min()} to {X_test.index.max()}")
print(f"y_test index range: {y_test.index.min()} to {y_test.index.max()}")

X_test index range: 2022-09-13 00:00:00 to 2023-01-30 00:00:00
y_test index range: 2022-09-13 00:00:00 to 2023-01-30 00:00:00


In [50]:
X_train_np, X_test_np = X_train.to_numpy(), X_test.to_numpy()
y_train_np, y_test_np = y_train.to_numpy(), y_test.to_numpy()
print(f"X_train shape: {X_train_np.shape}, y_train shape: {y_train_np.shape}")
print(f"X_test shape: {X_test_np.shape}, y_test shape: {y_test_np.shape}")

X_train shape: (562, 16), y_train shape: (562,)
X_test shape: (140, 16), y_test shape: (140,)


In [51]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

# Ensure input data is in NumPy format
X_train_np, X_test_np = X_train.to_numpy(), X_test.to_numpy()
y_train_np, y_test_np = y_train.to_numpy(), y_test.to_numpy()

# Define XGBoost model
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror', 
    n_estimators=500,  
    learning_rate=0.05,  
    max_depth=6,  
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Train XGBoost with manual early stopping
min_rmse = float("inf")
best_iteration = 0
tolerance = 50  # Same as early_stopping_rounds=50
rmse_history = []

for n in range(1, 501):  # Loop through each boosting round
    xgb_model.set_params(n_estimators=n)  # Update model with current round
    xgb_model.fit(X_train_np, y_train_np)  # Train model
    
    # Predict on test set
    y_pred = xgb_model.predict(X_test_np)
    
    # Compute RMSE
    rmse = np.sqrt(mean_squared_error(y_test_np, y_pred))
    rmse_history.append(rmse)

    print(f"Iteration {n}, RMSE: {rmse:.4f}")

    # Early stopping condition
    if rmse < min_rmse:
        min_rmse = rmse
        best_iteration = n  # Store best iteration
    elif n - best_iteration >= tolerance:
        print(f"Stopping early at iteration {n}. Best RMSE: {min_rmse:.4f}")
        break  # Stop training if no improvement for 50 rounds

Iteration 1, RMSE: 0.4202


Iteration 2, RMSE: 0.4024


Iteration 3, RMSE: 0.3854


Iteration 4, RMSE: 0.3693


Iteration 5, RMSE: 0.3539


Iteration 6, RMSE: 0.3394


Iteration 7, RMSE: 0.3256


Iteration 8, RMSE: 0.3123


Iteration 9, RMSE: 0.3000


Iteration 10, RMSE: 0.2882


Iteration 11, RMSE: 0.2768


Iteration 12, RMSE: 0.2662


Iteration 13, RMSE: 0.2555


Iteration 14, RMSE: 0.2452


Iteration 15, RMSE: 0.2355


Iteration 16, RMSE: 0.2261


Iteration 17, RMSE: 0.2175


Iteration 18, RMSE: 0.2090


Iteration 19, RMSE: 0.2010


Iteration 20, RMSE: 0.1938


Iteration 21, RMSE: 0.1865


Iteration 22, RMSE: 0.1794


Iteration 23, RMSE: 0.1732


Iteration 24, RMSE: 0.1670


Iteration 25, RMSE: 0.1611


Iteration 26, RMSE: 0.1556


Iteration 27, RMSE: 0.1503


Iteration 28, RMSE: 0.1455


Iteration 29, RMSE: 0.1404


Iteration 30, RMSE: 0.1356


Iteration 31, RMSE: 0.1310


Iteration 32, RMSE: 0.1266


Iteration 33, RMSE: 0.1226


Iteration 34, RMSE: 0.1186


Iteration 35, RMSE: 0.1148


Iteration 36, RMSE: 0.1111


Iteration 37, RMSE: 0.1090


Iteration 38, RMSE: 0.1061


Iteration 39, RMSE: 0.1030


Iteration 40, RMSE: 0.1003


Iteration 41, RMSE: 0.0987


Iteration 42, RMSE: 0.0969


Iteration 43, RMSE: 0.0954


Iteration 44, RMSE: 0.0930


Iteration 45, RMSE: 0.0922


Iteration 46, RMSE: 0.0909


Iteration 47, RMSE: 0.0901


Iteration 48, RMSE: 0.0885


Iteration 49, RMSE: 0.0879


Iteration 50, RMSE: 0.0870


Iteration 51, RMSE: 0.0863


Iteration 52, RMSE: 0.0857


Iteration 53, RMSE: 0.0849


Iteration 54, RMSE: 0.0840


Iteration 55, RMSE: 0.0831


Iteration 56, RMSE: 0.0825


Iteration 57, RMSE: 0.0820


Iteration 58, RMSE: 0.0814


Iteration 59, RMSE: 0.0803


Iteration 60, RMSE: 0.0799


Iteration 61, RMSE: 0.0795


Iteration 62, RMSE: 0.0789


Iteration 63, RMSE: 0.0778


Iteration 64, RMSE: 0.0775


Iteration 65, RMSE: 0.0770


Iteration 66, RMSE: 0.0768


Iteration 67, RMSE: 0.0759


Iteration 68, RMSE: 0.0756


Iteration 69, RMSE: 0.0754


Iteration 70, RMSE: 0.0752


Iteration 71, RMSE: 0.0750


Iteration 72, RMSE: 0.0748


Iteration 73, RMSE: 0.0744


Iteration 74, RMSE: 0.0743


Iteration 75, RMSE: 0.0741


Iteration 76, RMSE: 0.0739


Iteration 77, RMSE: 0.0738


Iteration 78, RMSE: 0.0738


Iteration 79, RMSE: 0.0732


Iteration 80, RMSE: 0.0729


Iteration 81, RMSE: 0.0729


Iteration 82, RMSE: 0.0724


Iteration 83, RMSE: 0.0722


Iteration 84, RMSE: 0.0720


Iteration 85, RMSE: 0.0715


Iteration 86, RMSE: 0.0712


Iteration 87, RMSE: 0.0711


Iteration 88, RMSE: 0.0709


Iteration 89, RMSE: 0.0709


Iteration 90, RMSE: 0.0707


Iteration 91, RMSE: 0.0707


Iteration 92, RMSE: 0.0707


Iteration 93, RMSE: 0.0706


Iteration 94, RMSE: 0.0705


Iteration 95, RMSE: 0.0704


Iteration 96, RMSE: 0.0704


Iteration 97, RMSE: 0.0703


Iteration 98, RMSE: 0.0700


Iteration 99, RMSE: 0.0700


Iteration 100, RMSE: 0.0699


Iteration 101, RMSE: 0.0698


Iteration 102, RMSE: 0.0699


Iteration 103, RMSE: 0.0698


Iteration 104, RMSE: 0.0697


Iteration 105, RMSE: 0.0696


Iteration 106, RMSE: 0.0696


Iteration 107, RMSE: 0.0695


Iteration 108, RMSE: 0.0693


Iteration 109, RMSE: 0.0692


Iteration 110, RMSE: 0.0688


Iteration 111, RMSE: 0.0689


Iteration 112, RMSE: 0.0688


Iteration 113, RMSE: 0.0688


Iteration 114, RMSE: 0.0687


Iteration 115, RMSE: 0.0687


Iteration 116, RMSE: 0.0687


Iteration 117, RMSE: 0.0687


Iteration 118, RMSE: 0.0687


Iteration 119, RMSE: 0.0685


Iteration 120, RMSE: 0.0685


Iteration 121, RMSE: 0.0685


Iteration 122, RMSE: 0.0685


Iteration 123, RMSE: 0.0685


Iteration 124, RMSE: 0.0684


Iteration 125, RMSE: 0.0684


Iteration 126, RMSE: 0.0684


Iteration 127, RMSE: 0.0683


Iteration 128, RMSE: 0.0681


Iteration 129, RMSE: 0.0681


Iteration 130, RMSE: 0.0681


Iteration 131, RMSE: 0.0680


Iteration 132, RMSE: 0.0681


Iteration 133, RMSE: 0.0682


Iteration 134, RMSE: 0.0681


Iteration 135, RMSE: 0.0681


Iteration 136, RMSE: 0.0680


Iteration 137, RMSE: 0.0680


Iteration 138, RMSE: 0.0679


Iteration 139, RMSE: 0.0679


Iteration 140, RMSE: 0.0678


Iteration 141, RMSE: 0.0676


Iteration 142, RMSE: 0.0676


Iteration 143, RMSE: 0.0676


Iteration 144, RMSE: 0.0676


Iteration 145, RMSE: 0.0676


Iteration 146, RMSE: 0.0677


Iteration 147, RMSE: 0.0677


Iteration 148, RMSE: 0.0677


Iteration 149, RMSE: 0.0677


Iteration 150, RMSE: 0.0677


Iteration 151, RMSE: 0.0677


Iteration 152, RMSE: 0.0677


Iteration 153, RMSE: 0.0678


Iteration 154, RMSE: 0.0678


Iteration 155, RMSE: 0.0678


Iteration 156, RMSE: 0.0678


Iteration 157, RMSE: 0.0679


Iteration 158, RMSE: 0.0679


Iteration 159, RMSE: 0.0679


Iteration 160, RMSE: 0.0679


Iteration 161, RMSE: 0.0679


Iteration 162, RMSE: 0.0680


Iteration 163, RMSE: 0.0680


Iteration 164, RMSE: 0.0680


Iteration 165, RMSE: 0.0679


Iteration 166, RMSE: 0.0679


Iteration 167, RMSE: 0.0679


Iteration 168, RMSE: 0.0679


Iteration 169, RMSE: 0.0680


Iteration 170, RMSE: 0.0678


Iteration 171, RMSE: 0.0678


Iteration 172, RMSE: 0.0678


Iteration 173, RMSE: 0.0678


Iteration 174, RMSE: 0.0680


Iteration 175, RMSE: 0.0680


Iteration 176, RMSE: 0.0680


Iteration 177, RMSE: 0.0680


Iteration 178, RMSE: 0.0678


Iteration 179, RMSE: 0.0678


Iteration 180, RMSE: 0.0678


Iteration 181, RMSE: 0.0678


Iteration 182, RMSE: 0.0678


Iteration 183, RMSE: 0.0678


Iteration 184, RMSE: 0.0676


Iteration 185, RMSE: 0.0676


Iteration 186, RMSE: 0.0676


Iteration 187, RMSE: 0.0676


Iteration 188, RMSE: 0.0676


Iteration 189, RMSE: 0.0676


Iteration 190, RMSE: 0.0676


Iteration 191, RMSE: 0.0676


Iteration 192, RMSE: 0.0676


Iteration 193, RMSE: 0.0676


Iteration 194, RMSE: 0.0676


Iteration 195, RMSE: 0.0677


Iteration 196, RMSE: 0.0677


Iteration 197, RMSE: 0.0677


Iteration 198, RMSE: 0.0677


Iteration 199, RMSE: 0.0677


Iteration 200, RMSE: 0.0678


Iteration 201, RMSE: 0.0678


Iteration 202, RMSE: 0.0679


Iteration 203, RMSE: 0.0680


Iteration 204, RMSE: 0.0680


Iteration 205, RMSE: 0.0679


Iteration 206, RMSE: 0.0679


Iteration 207, RMSE: 0.0677


Iteration 208, RMSE: 0.0677


Iteration 209, RMSE: 0.0677


Iteration 210, RMSE: 0.0678


Iteration 211, RMSE: 0.0679


Iteration 212, RMSE: 0.0679


Iteration 213, RMSE: 0.0678


Iteration 214, RMSE: 0.0679


Iteration 215, RMSE: 0.0679


Iteration 216, RMSE: 0.0679


Iteration 217, RMSE: 0.0678


Iteration 218, RMSE: 0.0678


Iteration 219, RMSE: 0.0678


Iteration 220, RMSE: 0.0678


Iteration 221, RMSE: 0.0678


Iteration 222, RMSE: 0.0678


Iteration 223, RMSE: 0.0678


Iteration 224, RMSE: 0.0678


Iteration 225, RMSE: 0.0678


Iteration 226, RMSE: 0.0678


Iteration 227, RMSE: 0.0678


Iteration 228, RMSE: 0.0678


Iteration 229, RMSE: 0.0678


Iteration 230, RMSE: 0.0678


Iteration 231, RMSE: 0.0677


Iteration 232, RMSE: 0.0677


Iteration 233, RMSE: 0.0677


Iteration 234, RMSE: 0.0677


Iteration 235, RMSE: 0.0677


Iteration 236, RMSE: 0.0677


Iteration 237, RMSE: 0.0676


Iteration 238, RMSE: 0.0676
Stopping early at iteration 238. Best RMSE: 0.0676


In [52]:
# Predict closing prices
y_pred = xgb_model.predict(X_test)

# Create DataFrame for evaluation
predictions_df = pd.DataFrame({
    "Actual_Close": y_test.values,
    "Predicted_Close": y_pred
}, index=y_test.index)

# Forecast for 7-day and 30-day using rolling predictions
predictions_df['Predicted_7d'] = predictions_df['Predicted_Close'].shift(-7)
predictions_df['Predicted_30d'] = predictions_df['Predicted_Close'].shift(-30)

print("✅ XGBoost Predictions completed successfully!")

✅ XGBoost Predictions completed successfully!


In [53]:
# Compute RMSE & MAPE for different forecasting horizons
rmse_1d = np.sqrt(mean_squared_error(predictions_df['Actual_Close'], predictions_df['Predicted_Close']))
rmse_7d = np.sqrt(mean_squared_error(predictions_df['Actual_Close'].iloc[:-7], predictions_df['Predicted_7d'].dropna()))
rmse_30d = np.sqrt(mean_squared_error(predictions_df['Actual_Close'].iloc[:-30], predictions_df['Predicted_30d'].dropna()))

# Compute MAPE
mape_1d = np.mean(np.abs((predictions_df['Actual_Close'] - predictions_df['Predicted_Close']) / predictions_df['Actual_Close'])) * 100
mape_7d = np.mean(np.abs((predictions_df['Actual_Close'].iloc[:-7] - predictions_df['Predicted_7d'].dropna()) / predictions_df['Actual_Close'].iloc[:-7])) * 100
mape_30d = np.mean(np.abs((predictions_df['Actual_Close'].iloc[:-30] - predictions_df['Predicted_30d'].dropna()) / predictions_df['Actual_Close'].iloc[:-30])) * 100

# Print Evaluation Results
print(f"✅ XGBoost Model Evaluation:")
print(f"RMSE (1-day): {rmse_1d:.4f}, MAPE (1-day): {mape_1d:.2f}%")
print(f"RMSE (7-day): {rmse_7d:.4f}, MAPE (7-day): {mape_7d:.2f}%")
print(f"RMSE (30-day): {rmse_30d:.4f}, MAPE (30-day): {mape_30d:.2f}%")

✅ XGBoost Model Evaluation:
RMSE (1-day): 0.0676, MAPE (1-day): inf%
RMSE (7-day): 0.0706, MAPE (7-day): inf%
RMSE (30-day): 0.0759, MAPE (30-day): inf%


In [54]:
# Prepare metrics for logging
xgb_metrics = {
    "RMSE": rmse_1d,
    "RMSE_7d": rmse_7d,
    "RMSE_30d": rmse_30d
}

# Log results to MLflow
log_experiment("XGBoost", {}, xgb_metrics)

print("✅ XGBoost RMSE logged successfully to DagsHub MLflow!")

jai_balayya
✅ XGBoost metrics logged successfully to DagsHub MLflow.
🏃 View run XGBoost-Baseline at: https://dagshub.com/vamsisaigarapati/bitcoin_price_pred_CSE574.mlflow/#/experiments/0/runs/500c75cca2f1444ca1c09606fc2968f4
🧪 View experiment at: https://dagshub.com/vamsisaigarapati/bitcoin_price_pred_CSE574.mlflow/#/experiments/0
✅ XGBoost RMSE logged successfully to DagsHub MLflow!
