In [6]:
from IPython.core.interactiveshell import InteractiveShell
# Make IPython display the value of every top-level expression statement in a
# cell, not only the last one.
# NOTE(review): this cell carries execution count 6 while the cells below start
# at 1, so it was run out of order; the setting only affects cells executed
# after it. Re-run with Restart & Run All to confirm the notebook is reproducible.
InteractiveShell.ast_node_interactivity = "all" 

In [1]:
import pandas as pd

# Input locations (relative to the notebook's working directory)
btc_price_path = "../datasets/normalised_bitcoin_price.parquet"
sentiment_path = "../datasets/daily_sentiment.parquet"

# Read each parquet table and coerce its 'date' column to datetime in one
# chained step, so both frames share a join key of the same dtype.
btc_data = pd.read_parquet(btc_price_path).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
sentiment_data = pd.read_parquet(sentiment_path).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Preview the first rows of each table
print("Bitcoin Price Data:", btc_data.head(), sep="\n")
print("\nSentiment Data:", sentiment_data.head(), sep="\n")

Bitcoin Price Data:
        date      Open      High       Low     Close    Volume
0 2021-01-01  0.255213  0.254063  0.260019  0.262398  0.096184
1 2021-01-02  0.262601  0.321720  0.265682  0.315568  0.175237
2 2021-01-03  0.315780  0.349385  0.323992  0.328213  0.206700
3 2021-01-04  0.328945  0.327147  0.258427  0.312568  0.213978
4 2021-01-05  0.312836  0.346131  0.287934  0.351590  0.174310

Sentiment Data:
        date  sentiment_score  bert_sentiment
0 2021-02-05         0.137524        0.607382
1 2021-02-06         0.145508        0.608538
2 2021-02-07         0.160835        0.611493
3 2021-02-08         0.166964        0.607657
4 2021-02-09         0.160324        0.609625


In [2]:
# Fail fast if either sentiment column is absent. 'sentiment_score' is the
# VADER score and 'bert_sentiment' the BERT score; both are selected below, so
# both must be validated (previously only 'bert_sentiment' was checked).
required_sentiment_cols = ['sentiment_score', 'bert_sentiment']
missing_sentiment_cols = [
    col for col in required_sentiment_cols if col not in sentiment_data.columns
]
if missing_sentiment_cols:
    raise ValueError(
        f"Sentiment dataset is missing required column(s): {missing_sentiment_cols}"
    )

# Inner join on 'date': only dates present in BOTH tables are kept.
# NOTE(review): merged_data is reassigned by the left-join cell that follows,
# so this inner-join result is only used for the preview printed here.
merged_data = pd.merge(btc_data, sentiment_data, on='date', how='inner')

# Display merged dataset
print("Merged Dataset with VADER & BERT Sentiments:")
print(merged_data[['date', 'Close', 'sentiment_score', 'bert_sentiment']].head())

Merged Dataset with VADER & BERT Sentiments:
        date     Close  sentiment_score  bert_sentiment
0 2021-02-05  0.431773         0.137524        0.607382
1 2021-02-06  0.453436         0.145508        0.608538
2 2021-02-07  0.446434         0.160835        0.611493
3 2021-02-08  0.587282         0.166964        0.607657
4 2021-02-09  0.592779         0.160324        0.609625


In [17]:
# Left join on 'date': keep every row of btc_data, including dates that have no
# sentiment record (those rows come back with NaN in the sentiment columns).
# pd.merge already returns a new DataFrame, so no extra .copy() is needed.
merged_data = pd.merge(btc_data, sentiment_data, on='date', how='left')

# Replace the NaNs introduced by the left join with 0 in both sentiment columns.
# The filled columns are assigned back rather than calling
# `merged_data[col].fillna(0, inplace=True)`: that chained in-place form acts on
# an intermediate Series, raises a FutureWarning, and no longer updates the
# frame under pandas 3.0.
# NOTE(review): 0 is the "no data" placeholder for both scores; confirm it is a
# sensible neutral value for bert_sentiment, whose observed values sit near 0.6.
sentiment_cols = ['sentiment_score', 'bert_sentiment']
merged_data[sentiment_cols] = merged_data[sentiment_cols].fillna(0)

# Display merged dataset
print("Merged Dataset with Left Join and Filled Missing Values:")
print(merged_data[['date', 'Close', 'sentiment_score', 'bert_sentiment']].head())

Merged Dataset with Left Join and Filled Missing Values:
        date     Close  sentiment_score  bert_sentiment
0 2021-01-01  0.262398              0.0             0.0
1 2021-01-02  0.315568              0.0             0.0
2 2021-01-03  0.328213              0.0             0.0
3 2021-01-04  0.312568              0.0             0.0
4 2021-01-05  0.351590              0.0             0.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_data['bert_sentiment'].fillna(0, inplace=True)


In [18]:
# Lag features: each new column holds the value of its source column from the
# previous row (the first row has no predecessor and is therefore NaN).
# NOTE(review): "previous row" equals "previous day" only if merged_data is
# sorted by date with no gaps — confirm against the input data.
lag_sources = {
    'prev_close': 'Close',
    'prev_vader_sentiment': 'sentiment_score',
    'prev_bert_sentiment': 'bert_sentiment',
}
for lag_col, source_col in lag_sources.items():
    merged_data[lag_col] = merged_data[source_col].shift(1)

# Preview the originals next to their lagged counterparts
lag_preview_cols = [
    'date', 'Close', 'prev_close', 'sentiment_score', 'bert_sentiment',
    'prev_vader_sentiment', 'prev_bert_sentiment',
]
print("Dataset with Lag Features:")
print(merged_data[lag_preview_cols].head())

Dataset with Lag Features:
        date     Close  prev_close  sentiment_score  bert_sentiment  \
0 2021-01-01  0.262398         NaN              0.0             0.0   
1 2021-01-02  0.315568    0.262398              0.0             0.0   
2 2021-01-03  0.328213    0.315568              0.0             0.0   
3 2021-01-04  0.312568    0.328213              0.0             0.0   
4 2021-01-05  0.351590    0.312568              0.0             0.0   

   prev_vader_sentiment  prev_bert_sentiment  
0                   NaN                  NaN  
1                   0.0                  0.0  
2                   0.0                  0.0  
3                   0.0                  0.0  
4                   0.0                  0.0  


In [19]:
# NOTE(review): this cell repeats the lag-feature cell directly above it. It
# recomputes the same three columns from the same sources, so re-running it is
# harmless but redundant — consider deleting one of the two cells.
# Shift the three source columns down by one row in a single operation.
shifted = merged_data[['Close', 'sentiment_score', 'bert_sentiment']].shift(1)
merged_data['prev_close'] = shifted['Close']
merged_data['prev_vader_sentiment'] = shifted['sentiment_score']
merged_data['prev_bert_sentiment'] = shifted['bert_sentiment']

# Show current values beside their one-row-lagged versions
print("Dataset with Lag Features:")
print(
    merged_data[
        ['date', 'Close', 'prev_close', 'sentiment_score', 'bert_sentiment',
         'prev_vader_sentiment', 'prev_bert_sentiment']
    ].head()
)

Dataset with Lag Features:
        date     Close  prev_close  sentiment_score  bert_sentiment  \
0 2021-01-01  0.262398         NaN              0.0             0.0   
1 2021-01-02  0.315568    0.262398              0.0             0.0   
2 2021-01-03  0.328213    0.315568              0.0             0.0   
3 2021-01-04  0.312568    0.328213              0.0             0.0   
4 2021-01-05  0.351590    0.312568              0.0             0.0   

   prev_vader_sentiment  prev_bert_sentiment  
0                   NaN                  NaN  
1                   0.0                  0.0  
2                   0.0                  0.0  
3                   0.0                  0.0  
4                   0.0                  0.0  


In [20]:
# Rolling standard deviation of 'Close' over trailing 7-, 14- and 30-row windows.
# Each column is NaN until its window is full (the first window - 1 rows).
# NOTE(review): the std is taken over the Close values as loaded, not over
# returns — confirm this is the intended definition of volatility.
for window in (7, 14, 30):
    merged_data[f'volatility_{window}d'] = (
        merged_data['Close'].rolling(window=window).std()
    )

# Preview enough rows to see where the 7- and 14-row windows start producing values
volatility_cols = ['date', 'Close', 'volatility_7d', 'volatility_14d', 'volatility_30d']
print("Dataset with Volatility Indicators:")
print(merged_data[volatility_cols].head(15))


Dataset with Volatility Indicators:
         date     Close  volatility_7d  volatility_14d  volatility_30d
0  2021-01-01  0.262398            NaN             NaN             NaN
1  2021-01-02  0.315568            NaN             NaN             NaN
2  2021-01-03  0.328213            NaN             NaN             NaN
3  2021-01-04  0.312568            NaN             NaN             NaN
4  2021-01-05  0.351590            NaN             NaN             NaN
5  2021-01-06  0.406282            NaN             NaN             NaN
6  2021-01-07  0.455465       0.064527             NaN             NaN
7  2021-01-08  0.483016       0.069735             NaN             NaN
8  2021-01-09  0.472528       0.071175             NaN             NaN
9  2021-01-10  0.435870       0.064000             NaN             NaN
10 2021-01-11  0.381992       0.048718             NaN             NaN
11 2021-01-12  0.350248       0.049064             NaN             NaN
12 2021-01-13  0.415783       0.048540   

In [21]:
# Trailing moving averages of the closing price over 7- and 30-row windows.
# Rows before a window is full are NaN.
close_prices = merged_data['Close']
merged_data['closing_7d_avg'] = close_prices.rolling(7).mean()
merged_data['closing_30d_avg'] = close_prices.rolling(30).mean()

# Preview the closing price next to both averages
ma_preview = merged_data[['date', 'Close', 'closing_7d_avg', 'closing_30d_avg']]
print("Dataset with 7-day and 30-day Moving Averages:", ma_preview.head(15), sep="\n")

Dataset with 7-day and 30-day Moving Averages:
         date     Close  closing_7d_avg  closing_30d_avg
0  2021-01-01  0.262398             NaN              NaN
1  2021-01-02  0.315568             NaN              NaN
2  2021-01-03  0.328213             NaN              NaN
3  2021-01-04  0.312568             NaN              NaN
4  2021-01-05  0.351590             NaN              NaN
5  2021-01-06  0.406282             NaN              NaN
6  2021-01-07  0.455465        0.347441              NaN
7  2021-01-08  0.483016        0.378957              NaN
8  2021-01-09  0.472528        0.401380              NaN
9  2021-01-10  0.435870        0.416760              NaN
10 2021-01-11  0.381992        0.426677              NaN
11 2021-01-12  0.350248        0.426486              NaN
12 2021-01-13  0.415783        0.427843              NaN
13 2021-01-14  0.451917        0.427336              NaN
14 2021-01-15  0.406301        0.416377              NaN


In [22]:
# Fill the remaining gaps: forward-fill first, then back-fill whatever is still
# missing at the top of the frame (the leading NaNs left by shift/rolling).
# DataFrame.ffill()/bfill() replace the deprecated `fillna(method=...)` form,
# which raises a FutureWarning on current pandas and is slated for removal.
# NOTE(review): the back-fill copies the first available lag/rolling value into
# the earlier rows, so those rows carry values computed from later dates —
# confirm this is acceptable for downstream modelling (dropping the warm-up
# rows is the usual alternative).
merged_data = merged_data.ffill().bfill()

# Display final dataset
print("Final Dataset After Handling Missing Values:")
merged_data.head(15)

Final Dataset After Handling Missing Values:


  merged_data.fillna(method='ffill', inplace=True)
  merged_data.fillna(method='bfill', inplace=True)


Unnamed: 0,date,Open,High,Low,Close,Volume,sentiment_score,bert_sentiment,prev_close,prev_vader_sentiment,prev_bert_sentiment,volatility_7d,volatility_14d,volatility_30d,closing_7d_avg,closing_30d_avg
0,2021-01-01,0.255213,0.254063,0.260019,0.262398,0.096184,0.0,0.0,0.262398,0.0,0.0,0.064527,0.068655,0.057509,0.347441,0.367506
1,2021-01-02,0.262601,0.32172,0.265682,0.315568,0.175237,0.0,0.0,0.262398,0.0,0.0,0.064527,0.068655,0.057509,0.347441,0.367506
2,2021-01-03,0.31578,0.349385,0.323992,0.328213,0.2067,0.0,0.0,0.315568,0.0,0.0,0.064527,0.068655,0.057509,0.347441,0.367506
3,2021-01-04,0.328945,0.327147,0.258427,0.312568,0.213978,0.0,0.0,0.328213,0.0,0.0,0.064527,0.068655,0.057509,0.347441,0.367506
4,2021-01-05,0.312836,0.346131,0.287934,0.35159,0.17431,0.0,0.0,0.312568,0.0,0.0,0.064527,0.068655,0.057509,0.347441,0.367506
5,2021-01-06,0.352177,0.392615,0.352775,0.406282,0.196865,0.0,0.0,0.35159,0.0,0.0,0.064527,0.068655,0.057509,0.347441,0.367506
6,2021-01-07,0.406657,0.455441,0.4114,0.455465,0.224462,0.0,0.0,0.406282,0.0,0.0,0.064527,0.068655,0.057509,0.347441,0.367506
7,2021-01-08,0.455875,0.489063,0.418242,0.483016,0.234208,0.0,0.0,0.455465,0.0,0.0,0.069735,0.068655,0.057509,0.378957,0.367506
8,2021-01-09,0.483052,0.479348,0.460426,0.472528,0.158103,0.0,0.0,0.483016,0.0,0.0,0.071175,0.068655,0.057509,0.40138,0.367506
9,2021-01-10,0.472728,0.47904,0.401425,0.43587,0.210533,0.0,0.0,0.472528,0.0,0.0,0.064,0.068655,0.057509,0.41676,0.367506


In [23]:
# Print the per-column dtype and non-null count of the processed frame
# (a quick check on the result of the missing-value fill in the preceding cell).
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 761 entries, 0 to 760
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   date                  761 non-null    datetime64[ns]
 1   Open                  761 non-null    float64       
 2   High                  761 non-null    float64       
 3   Low                   761 non-null    float64       
 4   Close                 761 non-null    float64       
 5   Volume                761 non-null    float64       
 6   sentiment_score       761 non-null    float64       
 7   bert_sentiment        761 non-null    float64       
 8   prev_close            761 non-null    float64       
 9   prev_vader_sentiment  761 non-null    float64       
 10  prev_bert_sentiment   761 non-null    float64       
 11  volatility_7d         761 non-null    float64       
 12  volatility_14d        761 non-null    float64       
 13  volatility_30d      

In [24]:
# Persist the engineered feature table as parquet; the RangeIndex is dropped
# (index=False) because it carries no information beyond row position.
final_dataset_path = "../datasets/final_merged_dataset.parquet"
merged_data.to_parquet(path=final_dataset_path, index=False)

# Confirm where the file was written
print(f"Processed dataset saved at: {final_dataset_path}")

Processed dataset saved at: ../datasets/final_merged_dataset.parquet
