# Feature Engineering Notebook
- Loads the fully cleaned data, in which the target has already been constructed
- Engineers features and writes out the final dataset used for model training, selection, and evaluation

In [2]:
import pandas as pd
import numpy as np

# Show every column when a wide frame is displayed (None removes the column cap)
pd.options.display.max_columns = None

In [3]:
# Read the cleaned dataset, parse "Date" as datetimes, and order rows chronologically
# with a fresh 0..n-1 index so positional shifts follow date order.
df = (
    pd.read_csv("../data/bitcoin_clean.csv", parse_dates=["Date"])
    .sort_values("Date")
    .reset_index(drop=True)
)

In [4]:
def make_lags(df_in, col, lags):
    """Return a copy of ``df_in`` with lagged versions of ``col`` appended.

    For every value ``k`` in ``lags`` a column named ``{col}_lag{k}`` is
    added, holding ``col`` shifted down by ``k`` rows (the first ``k`` rows
    are therefore NaN). The input frame is left unmodified.
    """
    shifted = {
        f"{col}_lag{step}": df_in[col].shift(step)
        for step in lags
    }
    # assign() works on a copy and appends the new columns in dict order
    return df_in.assign(**shifted)

def make_rolls(df_in, col, windows_mean=(3,7,14,30), windows_std=(7,14,30)):
    """Return a copy of ``df_in`` with rolling statistics of ``col`` appended.

    Adds ``{col}_ma{w}`` (rolling mean) for each ``w`` in ``windows_mean``,
    followed by ``{col}_std{w}`` (rolling standard deviation) for each ``w``
    in ``windows_std``. A full window is required (``min_periods=w``), so the
    first ``w - 1`` rows of each new column are NaN. The input frame is left
    unmodified.
    """
    def _full_window(span):
        # Only emit a value once `span` observations are available
        return df_in[col].rolling(window=span, min_periods=span)

    averages = {f"{col}_ma{span}": _full_window(span).mean() for span in windows_mean}
    spreads = {f"{col}_std{span}": _full_window(span).std() for span in windows_std}
    # Means first, then standard deviations, matching the column order callers expect
    return df_in.assign(**{**averages, **spreads})

In [5]:
# Price history features: lagged prices, then rolling means and rolling std devs
df = (
    df
    .pipe(make_lags, "btc_market_price", lags=[1,7,14,30])
    .pipe(make_rolls, "btc_market_price", windows_mean=(7,14,30), windows_std=(7,14,30))
)

In [6]:
# Calendar features: derive day-of-week and month from "Date", then one-hot
# encode both, dropping the first level of each to avoid a redundant column.
df = pd.get_dummies(
    df.assign(
        dow=df["Date"].dt.dayofweek,
        month=df["Date"].dt.month,
    ),
    columns=["dow", "month"],
    drop_first=True,
)

In [7]:
# Heavy-tailed fields get a log1p-transformed companion column ("<name>_log");
# any listed field that is absent from the frame is skipped.
log_cols = [
    "btc_trade_volume",
    "btc_transaction_fees",
    "btc_output_volume",
    "btc_estimated_transaction_volume"
]
df = df.assign(
    **{
        f"{name}_log": np.log1p(df[name])
        for name in log_cols
        if name in df.columns
    }
)

# Discard every row that still has a missing value in any column (this includes
# the leading rows where lag / rolling windows are incomplete) and renumber.
df = df.dropna().reset_index(drop=True)

In [8]:
# Write the engineered dataset to disk; index=False keeps the row index out of the file
df.to_csv("../data/bitcoin_features.csv", index=False)

# Preview the first ten rows of the engineered frame
df.head(10)

Unnamed: 0,Date,btc_market_price,btc_total_bitcoins,btc_trade_volume,btc_avg_block_size,btc_median_confirmation_time,btc_hash_rate,btc_transaction_fees,btc_cost_per_transaction,btc_n_unique_addresses,btc_n_transactions,btc_output_volume,btc_estimated_transaction_volume,target_t7,btc_market_price_lag1,btc_market_price_lag7,btc_market_price_lag14,btc_market_price_lag30,btc_market_price_ma7,btc_market_price_ma14,btc_market_price_ma30,btc_market_price_std7,btc_market_price_std14,btc_market_price_std30,dow_1,dow_2,dow_3,dow_4,dow_5,dow_6,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,btc_trade_volume_log,btc_transaction_fees_log,btc_output_volume_log,btc_estimated_transaction_volume_log
0,2010-09-16,0.0619,4005400.0,43.8,0.000549,0.0,0.006733,0.0,1.5475,385.0,380.0,39173.55,20542.0,0.063,0.175,0.0624,0.0629,0.0769,0.07886,0.070706,0.068505,0.042412,0.030037,0.020299,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,3.802208,0.0,10.575783,9.930276
1,2010-09-17,0.0609,4016200.0,435.72,0.000439,0.0,0.007655,0.0,1.735409,382.0,379.0,44589.06,29419.0,0.06281,0.0619,0.06201,0.0634,0.074,0.078701,0.070527,0.068068,0.042487,0.030091,0.020318,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,6.079292,0.0,10.705266,10.28943
2,2010-09-18,0.0609,4028050.0,425.5566,0.00074,0.0,0.008399,0.18,1.674397,445.0,431.0,81783.0,64511.0,0.0624,0.0609,0.062,0.0613,0.0688,0.078544,0.070499,0.067805,0.042561,0.030101,0.020359,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,6.055745,0.1655144,11.311837,11.074607
3,2010-09-19,0.062599,4038550.0,771.1302,0.000484,0.0,0.009581,0.0,1.938907,379.0,339.0,63683.909603,47333.0,0.062279,0.0609,0.064999,0.0629,0.0667,0.078201,0.070477,0.067668,0.042698,0.030107,0.02038,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,6.649153,0.0,11.061703,10.764984
4,2010-09-20,0.0634,4049100.0,858.4764,0.000583,0.0,0.009627,0.0,1.585,469.0,422.0,71345.62,37079.0,0.062206,0.062599,0.06201,0.064,0.066899,0.0784,0.070434,0.067551,0.042614,0.030117,0.020395,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,6.756323,0.0,11.175305,10.520833
5,2010-09-21,0.0633,4057950.0,344.157,0.00113,0.0,0.008076,1e-07,0.880825,743.0,636.0,39944.08,19532.0,0.06271,0.0634,0.0641,0.06185,0.0664,0.078286,0.070538,0.067448,0.042659,0.030088,0.020409,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,5.843999,1e-07,10.595261,9.879861
6,2010-09-22,0.0628,4066600.0,693.624,0.001543,0.0,0.007893,0.0,0.679875,999.0,799.0,51528.04,29221.0,0.06219,0.0633,0.175,0.06201,0.066,0.062257,0.070594,0.067341,0.00105,0.030071,0.020425,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,6.543371,0.0,10.849901,10.282677
7,2010-09-23,0.063,4077200.0,930.3198,0.001009,0.0,0.009673,0.0,0.956734,801.0,698.0,123082.78,52468.0,0.06192,0.0628,0.0619,0.0624,0.066889,0.062414,0.070637,0.067212,0.00107,0.030059,0.02044,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,6.836603,0.0,11.720621,10.867978
8,2010-09-24,0.06281,4089500.0,41.0628,0.000651,0.0,0.011224,0.0,1.384522,631.0,558.0,48867.6,33287.0,0.061999,0.063,0.0609,0.06201,0.0665,0.062687,0.070694,0.067089,0.000838,0.030042,0.020456,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,3.739164,0.0,10.79689,10.412952
9,2010-09-25,0.0624,4099600.0,129.15,0.000635,0.0,0.009216,0.0,1.397428,499.0,451.0,32462.93,17081.0,0.061999,0.06281,0.0609,0.062,0.066499,0.062901,0.070723,0.066952,0.00036,0.030033,0.020474,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,4.868688,0.0,10.387885,9.745781


In [9]:
# Print the final schema: per-column dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2708 entries, 0 to 2707
Data columns (total 45 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   Date                                  2708 non-null   datetime64[ns]
 1   btc_market_price                      2708 non-null   float64       
 2   btc_total_bitcoins                    2708 non-null   float64       
 3   btc_trade_volume                      2708 non-null   float64       
 4   btc_avg_block_size                    2708 non-null   float64       
 5   btc_median_confirmation_time          2708 non-null   float64       
 6   btc_hash_rate                         2708 non-null   float64       
 7   btc_transaction_fees                  2708 non-null   float64       
 8   btc_cost_per_transaction              2708 non-null   float64       
 9   btc_n_unique_addresses                2708 non-null   float64       
 10  