In [2]:
import pandas as pd

# Read the hourly merged dataset from its parquet file
merged_data = pd.read_parquet('../datasets/final_merged_dataset_hourly.parquet')

# Sanity check: preview the first rows, the column index, and the row count
print(
    merged_data.head(),
    merged_data.columns,
    f"Total data points (hours): {len(merged_data)}",
    sep="\n",
)

             datetime      Open      High       Low     Close    Volume  \
0 2021-01-01 00:00:00  0.250730  0.251106  0.250628  0.250493  0.076344   
1 2021-01-01 01:00:00  0.251433  0.251695  0.251353  0.251520  0.015258   
2 2021-01-01 02:00:00  0.256154  0.256011  0.255263  0.255877  0.034476   
3 2021-01-01 03:00:00  0.252147  0.252928  0.252351  0.253123  0.010795   
4 2021-01-01 04:00:00  0.254932  0.255284  0.254180  0.254066  0.023016   

   sentiment_score  bert_sentiment  prev_close  prev_vader_sentiment  \
0              0.0             0.0    0.250493                   0.0   
1              0.0             0.0    0.250493                   0.0   
2              0.0             0.0    0.251520                   0.0   
3              0.0             0.0    0.255877                   0.0   
4              0.0             0.0    0.253123                   0.0   

   prev_bert_sentiment  volatility_7d  volatility_14d  volatility_30d  \
0                  0.0       0.050683      

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import numpy as np

# Expects the hourly merged dataset to be loaded as `merged_data` by an earlier cell.

# Step 1.1: Put the rows in chronological order.
# The timestamps live in the `datetime` column (the frame has a plain integer
# index), so the rows must be sorted on that column; sorting the index would
# not order them by time.
merged_data = merged_data.sort_values('datetime')

# Step 1.2: Select Features and Target
feature_cols = [
    'Open', 'High', 'Low', 'Volume',
    'sentiment_score', 'bert_sentiment',
    'prev_close', 'prev_vader_sentiment', 'prev_bert_sentiment',
    'volatility_7d', 'volatility_14d', 'volatility_30d',
    'closing_7d_avg', 'closing_30d_avg'
]
target_col = 'Close'  # We are predicting the next hour's Close price

# Step 1.3: Shift the target for next-hour prediction
# (row t receives the Close of row t+1, so the last row has no target)
merged_data['target'] = merged_data[target_col].shift(-1)

# Drop every row containing a NaN; this includes the final row, whose target
# is undefined after the shift.
# NOTE: this reassigns `merged_data`, so re-running the cell trims one more
# trailing row each time. Reload the dataset before re-running.
merged_data = merged_data.dropna()

# Step 1.4: Prepare X and y
X = merged_data[feature_cols]
y = merged_data['target']

print(X.shape, y.shape)


(18264, 14) (18264,)


## Splitting the data

In [None]:
# Step 2.1: Hold out the last 20% of rows (no shuffling, so row order is kept)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

# Step 2.2: Fit the linear regression baseline (fit() returns the estimator itself)
lr_model = LinearRegression().fit(X_train, y_train)

# Step 2.3: Predict the held-out targets
y_pred = lr_model.predict(X_test)

# Step 2.4: Score the predictions
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
mape = 100 * mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)  # in percentage

# Step 2.5: Report both metrics
print(f"RMSE: {rmse:.6f}", f"MAPE: {mape:.2f}%", sep="\n")



RMSE: 0.001976
MAPE: 396081300931.24%


In [7]:
# MAPE divides by the true value, so near-zero targets blow the metric up.
# Score only the rows whose true value exceeds the cutoff in magnitude.
threshold = 0.01  # anything below 0.01 is too small
mask = y_test.abs() > threshold

MAPE = 100 * mean_absolute_percentage_error(y_test.loc[mask], y_pred[mask.to_numpy()])

print(f"Safe MAPE: {MAPE:.2f}%")


Safe MAPE: 2.46%


In [16]:
import sys
# Add the sibling ../src directory to the module search path so that
# project modules can be imported from this notebook.
sys.path.append("../src")

# NOTE(review): a later cell in this notebook defines its own `log_experiment`,
# which rebinds this imported name. Confirm which implementation is intended.
from metric_logging import log_experiment

In [10]:
# Scratch inspection of the metrics. Only the value of the final expression
# (`rmse`) is rendered as the cell output; the first two bare expressions are
# evaluated and discarded, so MAPE is not shown here.
rmse
MAPE
rmse

0.0019759597333663024

In [13]:
# Descriptive run parameters recorded alongside the experiment
params = dict(
    model_type="Linear Regression",
    features_used="Lag features, Moving Averages, Volatility Indicators",
    test_size="20%",
    shuffle="False",
)

# Evaluation metrics computed in the earlier cells
metrics = dict(RMSE=rmse, MAPE=MAPE)


In [18]:
import os
import mlflow
import mlflow.sklearn

# Set up MLflow tracking URI and authentication for DagsHub
MLFLOW_TRACKING_URI = "https://dagshub.com/vamsisaigarapati/bitcoin_price_pred_CSE574.mlflow"

# Keep a username that is already exported in the environment; otherwise fall
# back to the project owner's account name (not a secret).
os.environ.setdefault('MLFLOW_TRACKING_USERNAME', 'vamsisaigarapati')

# SECURITY: the access token must never be stored in the notebook. It is read
# from the MLFLOW_TRACKING_PASSWORD environment variable, and only if that is
# missing is the user prompted (the input is not echoed and not saved to disk).
# Any token that was previously committed in this file should be revoked.
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    from getpass import getpass
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass("DagsHub access token: ")

# Configure MLflow
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
mlflow.set_experiment("Bitcoin_Price_Prediction_CSE574")

def log_experiment(model_name, params, metrics, model_object=None):
    """
    Log one run's parameters, metrics and (optionally) its model to MLflow (DagsHub).

    A new run named "<model_name>-Baseline" is started in the currently active
    MLflow experiment and closed again when logging finishes.

    :param model_name: (str) Name of the model (e.g., "ARIMA", "XGBoost", "LSTM").
        Used for the run name and, when a model is saved, for the registered
        model name "<model_name>_Model".
    :param params: (dict) Hyperparameters used for the model (name -> value).
    :param metrics: (dict) Performance metrics (e.g., RMSE, MAPE), name -> number.
    :param model_object: (optional) Trained model object to be saved. It is
        stored through ``mlflow.sklearn.log_model``, so it is expected to be a
        scikit-learn compatible estimator; pass None to skip model logging.
    :return: None. A confirmation line is printed once everything is logged.
    """
    # Passing run_name sets the run's display name (the "mlflow.runName" tag).
    with mlflow.start_run(run_name=f"{model_name}-Baseline"):
        # 1. Save the model only if provided
        if model_object is not None:
            mlflow.sklearn.log_model(
                sk_model=model_object,
                artifact_path="model",
                registered_model_name=f"{model_name}_Model"
            )

        # 2. Log all parameters in a single batched call
        mlflow.log_params(params)

        # 3. Log all metrics in a single batched call
        mlflow.log_metrics(metrics)

        print(f"✅ {model_name} logged successfully to DagsHub MLflow.")


In [19]:
# Record the linear-regression baseline run (no model artifact is uploaded)
log_experiment("Linear_Regression", params, metrics)


✅ Linear_Regression logged successfully to DagsHub MLflow.
🏃 View run Linear_Regression-Baseline at: https://dagshub.com/vamsisaigarapati/bitcoin_price_pred_CSE574.mlflow/#/experiments/0/runs/037e6b8b0aef452aaa3aa036abb087c9
🧪 View experiment at: https://dagshub.com/vamsisaigarapati/bitcoin_price_pred_CSE574.mlflow/#/experiments/0
