In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all" 

In [3]:
import pandas as pd

# Read the merged hourly dataset from its parquet file
merged_data = pd.read_parquet('../datasets/final_merged_dataset_hourly.parquet')

# Sanity check: first rows, column index, then the row count
print(merged_data.head(), merged_data.columns, sep="\n")
print(f"Total data points (hours): {len(merged_data)}")

             datetime      Open      High       Low     Close    Volume  \
0 2021-01-01 00:00:00  0.250730  0.251106  0.250628  0.250493  0.076344   
1 2021-01-01 01:00:00  0.251433  0.251695  0.251353  0.251520  0.015258   
2 2021-01-01 02:00:00  0.256154  0.256011  0.255263  0.255877  0.034476   
3 2021-01-01 03:00:00  0.252147  0.252928  0.252351  0.253123  0.010795   
4 2021-01-01 04:00:00  0.254932  0.255284  0.254180  0.254066  0.023016   

   sentiment_score  bert_sentiment  prev_close  prev_vader_sentiment  \
0              0.0             0.0    0.250493                   0.0   
1              0.0             0.0    0.250493                   0.0   
2              0.0             0.0    0.251520                   0.0   
3              0.0             0.0    0.255877                   0.0   
4              0.0             0.0    0.253123                   0.0   

   prev_bert_sentiment  volatility_7d  volatility_14d  volatility_30d  \
0                  0.0       0.050683      

In [5]:
import xgboost as xgb

# Step 1.1: Select Features
# Columns fed to the model; 'Close' itself is excluded because the target is
# derived from it.
feature_cols = [
    'Open', 'High', 'Low', 'Volume',
    'sentiment_score', 'bert_sentiment',
    'prev_close', 'prev_vader_sentiment', 'prev_bert_sentiment',
    'volatility_7d', 'volatility_14d', 'volatility_30d',
    'closing_7d_avg', 'closing_30d_avg'
]

# Step 1.2: Target column = the next row's Close (shift(-1) pulls it up one row).
# Step 1.3: Drop rows whose target is NaN (the final row has no "next" Close).
#
# The guard keeps this cell idempotent: without it, every re-run would shift
# the already-truncated frame again and silently drop one more trailing row.
# NOTE(review): assumes the loaded dataset has no pre-existing 'target'
# column - confirm against the parquet schema.
if 'target' not in merged_data.columns:
    merged_data = (
        merged_data
        .assign(target=merged_data['Close'].shift(-1))
        .dropna(subset=['target'])
    )

# Step 1.4: Create X and y
X = merged_data[feature_cols]
y = merged_data['target']

# Confirm the shapes
print("Feature matrix shape:", X.shape)
print("Target vector shape:", y.shape)


Feature matrix shape: (18264, 14)
Target vector shape: (18264,)


In [6]:
from sklearn.model_selection import train_test_split

# Step 2.1: Chronological hold-out. shuffle=False preserves row order, so the
# trailing 20% of rows become the test partition (no future rows in training).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Report how many rows landed in each partition
print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")


Training samples: 14611
Testing samples: 3653


In [7]:
# Step 3.1: Configure the gradient-boosted regressor
xgb_model = xgb.XGBRegressor(
    # learning task and reproducibility
    objective='reg:squarederror',
    random_state=42,
    # boosting schedule
    n_estimators=100,
    learning_rate=0.1,
    # tree size and per-tree row/column sampling
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
)

# Step 3.2: Fit on the training partition
xgb_model.fit(X_train, y_train)

print("✅ XGBoost Model Trained Successfully.")


✅ XGBoost Model Trained Successfully.


In [8]:
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import numpy as np

# Step 4.1: Predict on the held-out rows
y_pred = xgb_model.predict(X_test)

# Step 4.2: Error metrics - RMSE, and MAPE expressed as a percentage.
# NOTE: the MAPE recorded for this cell is astronomically large; the next cell
# recomputes it on targets above a magnitude threshold.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
mape = 100 * mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)

# Step 4.3: Show both metrics
print(f"RMSE: {rmse:.6f}", f"MAPE: {mape:.2f}%", sep="\n")


RMSE: 0.029161
MAPE: 7998402013154.49%


In [10]:
from sklearn.metrics import mean_absolute_percentage_error

# Targets at or below this magnitude are left out of the percentage error
threshold = 0.01

# Boolean selector: test rows whose |target| exceeds the threshold
mask = y_test.abs() > threshold

# "Safe" MAPE, computed on the selected rows only and scaled to a percentage
mape = 100 * mean_absolute_percentage_error(y_test[mask], y_pred[mask])

# rmse is reused from the earlier evaluation cell, not recomputed
print(f"✅ RMSE: {rmse:.6f}", f"✅ Safe MAPE: {mape:.2f}%", sep="\n")


✅ RMSE: 0.029161
✅ Safe MAPE: 88.28%


In [12]:
import sys
# Add ../src to the module search path; the entry is relative, so it resolves
# against the kernel's current working directory.
sys.path.append("../src")

# Experiment-logging helper (presumably provided by a module in ../src - confirm).
from metric_logging import log_experiment

In [14]:
# Step 2: Prepare parameters and metrics
# Read the hyperparameters back from the trained estimator instead of
# re-typing them, so the logged run cannot drift from the model that was
# actually fitted. The previous hand-written dict omitted subsample,
# colsample_bytree and random_state.
model_params = xgb_model.get_params()
params = {
    name: model_params[name]
    for name in (
        "learning_rate",
        "max_depth",
        "n_estimators",
        "objective",
        "subsample",
        "colsample_bytree",
        "random_state",
    )
}

# booster / scale_pos_weight are not set when the model is constructed; if the
# estimator reports them as unset (None), keep logging the values this cell
# has always recorded so existing runs stay comparable.
for name, fallback in (("booster", "gbtree"), ("scale_pos_weight", 1)):
    value = model_params.get(name)
    params[name] = fallback if value is None else value

metrics = {
    "RMSE": rmse,
    "MAPE": mape  # Note: this is the thresholded ("safe") MAPE, not the raw one
}

# Step 3: Log to DagsHub MLflow
log_experiment(
    model_name="XGBoost-hourly-data",
    model_object=xgb_model,  # the trained XGBoost model
    params=params,
    metrics=metrics
)


Successfully registered model 'XGBoost-hourly-data_Model'.
2025/04/18 19:11:24 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoost-hourly-data_Model, version 1
Created version '1' of model 'XGBoost-hourly-data_Model'.


✅ XGBoost-hourly-data logged successfully to DagsHub MLflow.
🏃 View run XGBoost-hourly-data-Baseline at: https://dagshub.com/vamsisaigarapati/bitcoin_price_pred_CSE574.mlflow/#/experiments/0/runs/9e9580e3da844b3ea0318ff8d9d2224c
🧪 View experiment at: https://dagshub.com/vamsisaigarapati/bitcoin_price_pred_CSE574.mlflow/#/experiments/0
