In [None]:
import pandas as pd

# Relative locations of the two input tables
btc_price_path = "../datasets/normalised_bitcoin_price.parquet"
sentiment_path = "../datasets/daily_sentiment.parquet"

# Read each table and coerce its 'date' column to datetime in the same step,
# so both frames share a comparable key type for the merge that follows.
btc_data = pd.read_parquet(btc_price_path).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
sentiment_data = pd.read_parquet(sentiment_path).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Preview the first rows of each input
print("Bitcoin Price Data:", btc_data.head(), sep="\n")
print("\nSentiment Data:", sentiment_data.head(), sep="\n")

In [None]:
# Ensure both VADER ('sentiment_score') and BERT ('bert_sentiment') columns
# are present in sentiment_data before merging; each missing column raises a
# ValueError with its own message.
required_sentiment_columns = {
    'sentiment_score': "VADER sentiment score column is missing in sentiment dataset!",
    'bert_sentiment': "BERT sentiment score column is missing in sentiment dataset!",
}
for required_column, error_message in required_sentiment_columns.items():
    if required_column not in sentiment_data.columns:
        raise ValueError(error_message)

# Merge on 'date' column (inner join to keep common dates), then put the rows
# in chronological order. The lag (shift) and rolling-window features computed
# later are positional, so they are only meaningful on a date-sorted frame.
# A stable sort keeps the merge order for any rows that share a date.
merged_data = (
    pd.merge(btc_data, sentiment_data, on='date', how='inner')
    .sort_values('date', kind='stable')
    .reset_index(drop=True)
)

# Display merged dataset
print("Merged Dataset with VADER & BERT Sentiments:")
print(merged_data[['date', 'Close', 'sentiment_score', 'bert_sentiment']].head())

In [None]:
# Lag features: each new column holds the value from the preceding row
# (price, VADER sentiment, BERT sentiment). The first row has no predecessor
# and therefore becomes NaN.
merged_data = merged_data.assign(
    prev_close=lambda frame: frame['Close'].shift(1),
    prev_vader_sentiment=lambda frame: frame['sentiment_score'].shift(1),
    prev_bert_sentiment=lambda frame: frame['bert_sentiment'].shift(1),
)

# Preview the original columns side by side with their lagged counterparts
print("Dataset with Lag Features:")
print(
    merged_data[
        [
            'date',
            'Close',
            'prev_close',
            'sentiment_score',
            'bert_sentiment',
            'prev_vader_sentiment',
            'prev_bert_sentiment',
        ]
    ].head()
)

In [None]:
# Build one-row lag features from the price and the two sentiment columns.
# NOTE(review): this cell recomputes the same three lag columns as the cell
# directly above it; re-running is idempotent, so the duplicate is harmless
# but could be removed.
lag_sources = {
    'prev_close': 'Close',
    'prev_vader_sentiment': 'sentiment_score',
    'prev_bert_sentiment': 'bert_sentiment',
}
for lag_column, source_column in lag_sources.items():
    # shift(1) moves every value down one row, leaving NaN in the first row
    merged_data[lag_column] = merged_data[source_column].shift(1)

# Show the source columns next to their lagged versions
lag_preview_columns = [
    'date', 'Close', 'prev_close',
    'sentiment_score', 'bert_sentiment',
    'prev_vader_sentiment', 'prev_bert_sentiment',
]
print("Dataset with Lag Features:")
print(merged_data[lag_preview_columns].head())

In [None]:
# Volatility indicators: rolling standard deviation of 'Close' over trailing
# windows of 7, 14 and 30 rows. Rows that do not yet have a full window
# are NaN.
volatility_windows = {
    'volatility_7d': 7,
    'volatility_14d': 14,
    'volatility_30d': 30,
}
close_prices = merged_data['Close']
for volatility_column, window_size in volatility_windows.items():
    merged_data[volatility_column] = close_prices.rolling(window=window_size).std()

# Show enough rows (15) to see the 7- and 14-row windows fill in
print("Dataset with Volatility Indicators:")
volatility_preview = merged_data[['date', 'Close', 'volatility_7d', 'volatility_14d', 'volatility_30d']]
print(volatility_preview.head(15))


In [None]:
# Handling missing values: forward fill first, then backward fill whatever is
# still missing (e.g. leading NaNs, which have no earlier value to copy).
#
# DataFrame.ffill()/bfill() replace the deprecated fillna(method=...) form and
# give the same result. The frame is rebound instead of mutated in place so
# the cell stays a plain "input -> output" step.
#
# NOTE(review): backward fill copies later values into earlier rows. If this
# dataset feeds a forecasting model, that is look-ahead information for those
# rows -- consider dropping them instead; confirm against downstream usage.
merged_data = merged_data.ffill().bfill()

# Display final dataset
print("Final Dataset After Handling Missing Values:")
print(merged_data.head(15))

In [None]:
# Persist the processed frame as Parquet; the row index is not written.
final_dataset_path = "../datasets/final_merged_dataset.parquet"
parquet_options = {"index": False}
merged_data.to_parquet(final_dataset_path, **parquet_options)

# Confirm where the output was written
print(f"Processed dataset saved at: {final_dataset_path}")