In [2]:
import sys
import os

sys.path.append(os.path.abspath("../")) 

In [16]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from src.metric_logging import log_experiment

In [4]:
# Load merged dataset
file_path = "../datasets/final_merged_dataset.parquet"
merged_data = pd.read_parquet(file_path)

# Ensure 'date' column is in datetime format
merged_data['date'] = pd.to_datetime(merged_data['date'])
merged_data.set_index('date', inplace=True)

# Select relevant features: Price, Sentiment, and Technical Indicators
feature_columns = ['Close', 'Open', 'High', 'Low', 'Volume', 'sentiment_score', 'bert_sentiment', 'volatility_7d', 'volatility_30d']
data = merged_data[feature_columns]

# Drop NaN values
data.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(inplace=True)


In [5]:
# Create lag features (previous day's close)
data['prev_close'] = data['Close'].shift(1)

# Create rolling averages
data['rolling_avg_7'] = data['Close'].rolling(window=7).mean()
data['rolling_avg_14'] = data['Close'].rolling(window=14).mean()
data['rolling_avg_30'] = data['Close'].rolling(window=30).mean()

# Drop NaN values after creating lag & rolling features
data.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['prev_close'] = data['Close'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['rolling_avg_7'] = data['Close'].rolling(window=7).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['rolling_avg_14'] = data['Close'].rolling(window=14).mean()
A value is trying to be set on a 

In [6]:
# Define input features (X) and target variable (y)
X = data.drop(columns=['Close'])  # Features
y = data['Close']  # Target (Next Day's Closing Price)

# Split 80% Train, 20% Test
train_size = int(len(data) * 0.8)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# Print dataset sizes
print(f"Train Size: {len(X_train)}, Test Size: {len(X_test)}")

Train Size: 585, Test Size: 147


In [8]:
# Initialize and train Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Print coefficients (feature importance)
feature_importance = dict(zip(X_train.columns, model.coef_))
print("Feature Importance:", feature_importance)

Feature Importance: {'Open': -0.7970500113226934, 'High': 0.9378389265903682, 'Low': 0.6256260702710894, 'Volume': 0.0009978699453272756, 'sentiment_score': -0.006632962504089177, 'bert_sentiment': 0.0031352034957625756, 'volatility_7d': -0.009925537276190992, 'volatility_30d': 0.006822292457935214, 'prev_close': 0.18937918451281235, 'rolling_avg_7': 0.08018583088890796, 'rolling_avg_14': -0.045740217524099645, 'rolling_avg_30': 0.0062873833703330755}


## Make Predictions for Multiple Time Horizons

In [9]:
# Predict closing prices
y_pred = model.predict(X_test)

# Create DataFrame for evaluation
predictions_df = pd.DataFrame({
    "Actual_Close": y_test.values,
    "Predicted_Close": y_pred
}, index=y_test.index)

# Forecast for 7-day and 30-day using rolling predictions
predictions_df['Predicted_7d'] = predictions_df['Predicted_Close'].shift(-7)
predictions_df['Predicted_30d'] = predictions_df['Predicted_Close'].shift(-30)

print("✅ Predictions completed successfully!")

✅ Predictions completed successfully!


In [10]:
# Compute RMSE & MAPE for different forecasting horizons
rmse_1d = np.sqrt(mean_squared_error(predictions_df['Actual_Close'], predictions_df['Predicted_Close']))
rmse_7d = np.sqrt(mean_squared_error(predictions_df['Actual_Close'].iloc[:-7], predictions_df['Predicted_7d'].dropna()))
rmse_30d = np.sqrt(mean_squared_error(predictions_df['Actual_Close'].iloc[:-30], predictions_df['Predicted_30d'].dropna()))

# Compute MAPE
mape_1d = np.mean(np.abs((predictions_df['Actual_Close'] - predictions_df['Predicted_Close']) / predictions_df['Actual_Close'])) * 100
mape_7d = np.mean(np.abs((predictions_df['Actual_Close'].iloc[:-7] - predictions_df['Predicted_7d'].dropna()) / predictions_df['Actual_Close'].iloc[:-7])) * 100
mape_30d = np.mean(np.abs((predictions_df['Actual_Close'].iloc[:-30] - predictions_df['Predicted_30d'].dropna()) / predictions_df['Actual_Close'].iloc[:-30])) * 100

# Print Evaluation Results
print(f"✅ Linear Regression Model Evaluation:")
print(f"RMSE (1-day): {rmse_1d:.4f}, MAPE (1-day): {mape_1d:.2f}%")
print(f"RMSE (7-day): {rmse_7d:.4f}, MAPE (7-day): {mape_7d:.2f}%")
print(f"RMSE (30-day): {rmse_30d:.4f}, MAPE (30-day): {mape_30d:.2f}%")

✅ Linear Regression Model Evaluation:
RMSE (1-day): 0.0040, MAPE (1-day): inf%
RMSE (7-day): 0.0271, MAPE (7-day): inf%
RMSE (30-day): 0.0554, MAPE (30-day): inf%


In [15]:
# Prepare metrics for logging
lr_metrics = {
    "RMSE": rmse_1d,
    "RMSE_7d": rmse_7d,
    "RMSE_30d": rmse_30d
}

# Log results to MLflow
log_experiment("Linear_Regression", {}, lr_metrics)

print("✅ Linear Regression RMSE logged successfully to DagsHub MLflow!")

✅ Linear_Regression metrics logged successfully to DagsHub MLflow.
🏃 View run Linear_Regression-Baseline at: https://dagshub.com/vamsisaigarapati/bitcoin_price_pred_CSE574.mlflow/#/experiments/0/runs/2632b98b37d946c9801136d8375eed9f
🧪 View experiment at: https://dagshub.com/vamsisaigarapati/bitcoin_price_pred_CSE574.mlflow/#/experiments/0
✅ Linear Regression RMSE logged successfully to DagsHub MLflow!
