In [1]:
import pandas as pd
import numpy as np

# Cleaning cell: load the raw bitcoin table, build a forward-looking price
# target, drop redundant columns, add log-scaled copies of heavy-tailed
# fields, write the result to disk and preview the first rows.

# --- configuration (everything a reader might tune) ---
RAW_PATH = "../data/bitcoin.csv"
CLEAN_PATH = "../data/bitcoin_clean.csv"
HORIZON_DAYS = 7                          # forecast horizon, in calendar days
PRICE_COL = "btc_market_price"
TARGET_COL = f"target_t{HORIZON_DAYS}"    # "target_t7" for the default horizon

# load and put rows in chronological order
df = pd.read_csv(RAW_PATH, parse_dates=["Date"])
df = df.sort_values("Date").reset_index(drop=True)

# trim early zero-price period
# (the comparison keeps only strictly positive prices, so NaN prices go too;
#  the original row labels are kept, which is why the preview does not start at 0)
df = df[df[PRICE_COL] > 0].copy()

# construct t+7 target
# Align by calendar date instead of row position: a positional shift pairs each
# row with "7 rows later", which is only 7 days later when no day is missing.
# With a date lookup, a missing future day yields NaN and the row is dropped
# below rather than silently receiving the wrong future price.
assert df["Date"].is_unique, "duplicate Date values: cannot align the target by date"
price_by_date = df.set_index("Date")[PRICE_COL]
future_dates = df["Date"] + pd.Timedelta(days=HORIZON_DAYS)
df[TARGET_COL] = future_dates.map(price_by_date)   # NaN where the future date is absent
df = df.dropna(subset=[TARGET_COL])                # also removes the last HORIZON_DAYS days

# drop redundant / irrelevant fields
drop_cols = [
    "btc_market_cap",
    "btc_estimated_transaction_volume_usd",
    "btc_cost_per_transaction_percent",
    "btc_n_transactions_total",
    "btc_n_orphaned_blocks",
]

# collapse correlated groups
correlated_cols = [
    "btc_blocks_size",  # keep avg_block_size
    "btc_difficulty",   # keep hash_rate
    "btc_miners_revenue",  # keep transaction_fees
    "btc_n_transactions_excluding_popular",
    "btc_n_transactions_excluding_chains_longer_than_100",
]

# errors="ignore": columns that are not present in the file are skipped silently
df = df.drop(columns=drop_cols + correlated_cols, errors="ignore")

# log transforms for heavy-tailed fields
# log1p(x) = log(1 + x), so zero values map to 0 instead of -inf;
# the raw columns are kept and a "<name>_log" column is added next to them
log_cols = [
    "btc_trade_volume",
    "btc_transaction_fees",
    "btc_output_volume",
    "btc_estimated_transaction_volume",
]
for col in log_cols:
    if col in df.columns:
        df[col + "_log"] = np.log1p(df[col])

# save cleaned dataset (row labels are not written)
df.to_csv(CLEAN_PATH, index=False)

df.head(10)

Unnamed: 0,Date,btc_market_price,btc_total_bitcoins,btc_trade_volume,btc_avg_block_size,btc_n_transactions_per_block,btc_median_confirmation_time,btc_hash_rate,btc_transaction_fees,btc_cost_per_transaction,btc_n_unique_addresses,btc_n_transactions,btc_output_volume,btc_estimated_transaction_volume,target_t7,btc_trade_volume_log,btc_transaction_fees_log,btc_output_volume_log,btc_estimated_transaction_volume_log
175,2010-08-17,0.0769,3744250.0,923.0018,0.000959,1.0,0.0,0.003333,0.67,1.430952,393.0,352.0,72855.15,48276.0,0.066889,6.828714,0.512824,11.196242,10.784711
176,2010-08-18,0.074,3750900.0,206.7786,0.001973,1.0,0.0,0.003384,1.56,1.305501,449.0,377.0,52829.65,35791.0,0.0665,5.336473,0.940007,10.874847,10.48548
177,2010-08-19,0.0688,3757900.0,51.8784,0.000715,1.0,0.0,0.003562,0.0,1.308696,395.0,368.0,32027.42,13756.0,0.066499,3.967995,0.0,10.374379,9.529303
178,2010-08-20,0.0667,3766250.0,293.9825,0.000649,1.0,0.0,0.004249,0.0,1.399359,433.0,398.0,36647.52,21143.0,0.065,5.686916,0.0,10.509128,9.959111
179,2010-08-21,0.066899,3775450.0,731.0702,0.000528,1.0,0.0,0.004681,0.0,1.768594,396.0,348.0,33790.41,17456.0,0.065,6.595876,0.0,10.427962,9.767496
180,2010-08-22,0.0664,3785400.0,1118.9382,0.000491,1.0,0.0,0.005063,0.0,1.887657,358.0,350.0,45810.46,29971.0,0.0648,7.021029,0.0,10.73229,10.308019
181,2010-08-23,0.066,3796250.0,300.8243,0.000495,1.0,0.0,0.005521,0.0,1.840874,431.0,389.0,29274.46,11325.0,0.069,5.709845,0.0,10.284505,9.334856
182,2010-08-24,0.066889,3806500.0,469.8232,0.000434,1.0,0.0,0.005215,0.0,2.034458,393.0,337.0,28624.03,11639.0,0.06497,6.154483,0.0,10.262037,9.362203
183,2010-08-25,0.0665,3817850.0,296.0216,0.000772,1.0,0.0,0.005775,0.8,1.905997,414.0,396.0,24234.76,7927.0,0.0649,5.693805,0.587787,10.095585,8.978156
184,2010-08-26,0.066499,3828250.0,270.9007,0.000499,2.0,0.0,0.005292,0.0,1.84917,391.0,374.0,30534.45,18406.0,0.0629,5.605437,0.0,10.326644,9.820486
