In [None]:
import os
import pandas as pd
import numpy as np

# Relative directory ("data/processed-data") that the notebook reads its
# processed CSVs from and writes its feature files to.
PROCESSED_DATA = os.path.join("data", "processed-data")

In [9]:


# Notebook inputs: gzip-compressed CSV files under PROCESSED_DATA.
# Each path is declared directly above the read that consumes it.

# BTC file
btc_processed_file = os.path.join(PROCESSED_DATA, "btc_1min_processed.csv.gz")
btc_df = pd.read_csv(btc_processed_file, compression="gzip")

# ETH file
eth_processed_file = os.path.join(PROCESSED_DATA, "eth_1min_processed.csv.gz")
eth_df = pd.read_csv(eth_processed_file, compression="gzip")

# Combined crypto file
combined_file = os.path.join(PROCESSED_DATA, "crypto_1min_combined.csv.gz")
crypto_df = pd.read_csv(combined_file, compression="gzip")

# Filtered news file
NEWS_FILTERED_CSV = os.path.join(PROCESSED_DATA, "aggregated-news_filtered.csv.gz")
df_filtered_news = pd.read_csv(NEWS_FILTERED_CSV, compression="gzip")


  crypto_df = pd.read_csv(combined_file, compression="gzip")


In [None]:
def prepare_df(df):
    """Return a copy of ``df`` indexed by a parsed, sorted, de-duplicated ``Date``.

    Steps:
      1. Parse the ``Date`` column with ``pd.to_datetime(errors='coerce')``.
      2. Drop rows whose date could not be parsed (``NaT``).
      3. Move ``Date`` into the index and sort ascending.
      4. For repeated timestamps keep only the first row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Date`` column. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame whose index is the parsed ``Date`` values, ascending and
        without duplicates.

    Raises
    ------
    KeyError
        If ``df`` has no ``Date`` column.
    """
    parsed = df.copy()
    parsed['Date'] = pd.to_datetime(parsed['Date'], errors='coerce')

    # Rows with unparseable dates cannot be placed on the time axis.
    parsed = parsed.dropna(subset=['Date']).set_index('Date')

    # A stable sort keeps rows with equal timestamps in their input order, so
    # keep='first' below really keeps the first occurrence from the input.
    # The default sort kind gives no such guarantee.
    parsed = parsed.sort_index(ascending=True, kind='mergesort')

    return parsed[~parsed.index.duplicated(keep='first')]


In [None]:
def add_time_based_price_changes_merge_asof(df, offsets, price_col='Close'):
    """Add look-back prices and percentage changes for several time offsets.

    For every ``(label, offset_str)`` pair two columns are appended:

    * ``{price_col}_{label}_ago`` - the price of the most recent row whose
      timestamp is at or before ``time - offset`` (``merge_asof`` with
      ``direction="backward"``). If the data has gaps, that row can be older
      than the requested offset.
    * ``PctChange_{label}`` - ``(price - past_price) / past_price``.

    Rows whose look-back target lies before the earliest timestamp get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose timestamps are available as ``Date`` after
        ``reset_index()`` (e.g. an index named ``Date``). Not modified.
    offsets : iterable of (str, str)
        ``(label, offset_str)`` pairs; ``offset_str`` is parsed with
        ``pd.Timedelta``.
    price_col : str, default 'Close'
        Column holding the price.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by time and indexed by ``time``, with the new columns
        appended.
    """
    # reset_index(drop=True) after sorting matters: without it the frame keeps
    # its pre-sort labels and the merge result (fresh 0..n-1 labels) would be
    # written to the wrong rows whenever the input was not already sorted.
    frame = (
        df.reset_index()
        .rename(columns={'Date': 'time'})
        .sort_values('time')
        .reset_index(drop=True)
    )
    earliest_time = frame['time'].min()

    # Lookup table for the as-of join; built once because it never changes.
    history = frame[['time', price_col]].rename(
        columns={'time': '_lookup_time', price_col: '_past_price'}
    )

    for label, offset_str in offsets:
        ago_col = f"{price_col}_{label}_ago"
        target_time = frame['time'] - pd.Timedelta(offset_str)

        # Only the key column is merged; carrying the whole (growing) frame
        # through every join is unnecessary.
        matched = pd.merge_asof(
            pd.DataFrame({'_target_time': target_time}),
            history,
            left_on='_target_time',
            right_on='_lookup_time',
            direction='backward',
        )

        # Positional assignment: row i of `matched` belongs to row i of `frame`.
        frame[ago_col] = matched['_past_price'].to_numpy()
        frame.loc[target_time < earliest_time, ago_col] = np.nan
        frame[f"PctChange_{label}"] = (frame[price_col] - frame[ago_col]) / frame[ago_col]

    return frame.set_index('time')

In [None]:
def add_moving_averages(df, windows=[20, 50, 200], price_col='Close'):
    """Append simple and exponential moving averages of ``price_col``.

    For each window ``w`` two columns are added, in this order:
    ``SMA_{w}`` (rolling mean over ``w`` rows) and ``EMA_{w}``
    (``ewm(span=w, adjust=False)`` mean).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; a copy is returned, the input is left untouched.
    windows : iterable of int
        Window lengths in rows. The default list is only read, never mutated.
    price_col : str, default 'Close'
        Column to average.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the SMA/EMA columns appended.
    """
    result = df.copy()
    prices = result[price_col]
    for span in windows:
        result[f'SMA_{span}'] = prices.rolling(window=span).mean()
        result[f'EMA_{span}'] = prices.ewm(span=span, adjust=False).mean()
    return result

In [None]:
def add_rsi(df, period=14, price_col='Close', rsi_col='RSI'):
    """Append a Relative Strength Index column computed from ``price_col``.

    Row-to-row price changes are split into gains and losses, each is
    averaged with ``ewm(com=period - 1, min_periods=period)``, and the result
    is ``100 - 100 / (1 + avg_gain / avg_loss)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; not modified.
    period : int, default 14
        Smoothing period; also the number of observations required before a
        value is produced.
    price_col : str, default 'Close'
        Column holding the price.
    rsi_col : str, default 'RSI'
        Name of the output column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``rsi_col`` appended.
    """
    out = df.copy()
    change = out[price_col].diff()

    # Positive moves count as gains, negative moves (sign flipped) as losses.
    ups = change.clip(lower=0)
    downs = -change.clip(upper=0)

    smoothing = dict(com=period - 1, min_periods=period)
    mean_up = ups.ewm(**smoothing).mean()
    mean_down = downs.ewm(**smoothing).mean()

    strength = mean_up / mean_down
    out[rsi_col] = 100 - (100 / (1 + strength))
    return out

In [None]:
def add_macd(df, short=12, long=26, signal=9, price_col='Close'):
    """Append MACD, its signal line and the histogram.

    Columns added:
      * ``MACD``        - EMA(``short``) minus EMA(``long``) of ``price_col``
      * ``MACD_Signal`` - EMA(``signal``) of ``MACD``
      * ``MACD_Hist``   - ``MACD`` minus ``MACD_Signal``

    All EMAs use ``ewm(span=..., adjust=False)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; not modified.
    short, long, signal : int
        Spans of the fast, slow and signal EMAs.
    price_col : str, default 'Close'
        Column holding the price.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the three columns appended.
    """
    out = df.copy()
    close = out[price_col]

    fast, slow = (close.ewm(span=n, adjust=False).mean() for n in (short, long))
    macd_line = fast - slow
    signal_line = macd_line.ewm(span=signal, adjust=False).mean()

    out['MACD'] = macd_line
    out['MACD_Signal'] = signal_line
    out['MACD_Hist'] = macd_line - signal_line
    return out


In [None]:
def add_bollinger_bands(df, window=20, n_std=2, price_col='Close'):
    """Append Bollinger Bands built from a rolling mean and standard deviation.

    Columns added:
      * ``BB_Middle`` - rolling mean of ``price_col`` over ``window`` rows
      * ``BB_Upper``  - middle plus ``n_std`` rolling standard deviations
      * ``BB_Lower``  - middle minus ``n_std`` rolling standard deviations

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; not modified.
    window : int, default 20
        Rolling window length in rows.
    n_std : float, default 2
        Band width in standard deviations.
    price_col : str, default 'Close'
        Column holding the price.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the three band columns appended.
    """
    out = df.copy()
    windowed = out[price_col].rolling(window=window)

    center = windowed.mean()
    half_width = n_std * windowed.std()

    out['BB_Middle'] = center
    out['BB_Upper'] = center + half_width
    out['BB_Lower'] = center - half_width
    return out

In [None]:
def add_volume_oscillator(df, short=5, long=20, volume_col='Volume'):
    """Append short/long volume moving averages and their percentage gap.

    Columns added:
      * ``VolMA_short``      - rolling mean of ``volume_col`` over ``short`` rows
      * ``VolMA_long``       - rolling mean of ``volume_col`` over ``long`` rows
      * ``VolumeOscillator`` - ``(VolMA_short - VolMA_long) / VolMA_long * 100``

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; not modified.
    short, long : int
        Window lengths (rows) of the two moving averages.
    volume_col : str, default 'Volume'
        Column holding the traded volume.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the three columns appended.
    """
    out = df.copy()
    volume = out[volume_col]

    fast_ma = volume.rolling(window=short).mean()
    slow_ma = volume.rolling(window=long).mean()

    out['VolMA_short'] = fast_ma
    out['VolMA_long'] = slow_ma
    out['VolumeOscillator'] = ((fast_ma - slow_ma) / slow_ma) * 100
    return out

In [None]:
def add_obv(df, price_col='Close', volume_col='Volume'):
    """Append an On-Balance Volume (``OBV``) column.

    Starting from 0 on the first row, each row adds its volume when the price
    is above the previous row's price, subtracts it when the price is below,
    and leaves the running total unchanged otherwise (including when either
    price is NaN).

    The running total is computed with vectorised numpy operations rather
    than a per-row Python loop, so it stays fast on large minute-level
    frames, works on an empty frame, and does not touch any existing
    ``prev_close`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; not modified.
    price_col : str, default 'Close'
        Column holding the price.
    volume_col : str, default 'Volume'
        Column holding the traded volume.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``OBV`` appended.
    """
    out = df.copy()
    close = out[price_col]
    prev_close = close.shift(1)

    # Comparisons involving NaN (first row, missing prices) are False in both
    # masks, so such rows contribute 0.
    rising = (close > prev_close).to_numpy()
    falling = (close < prev_close).to_numpy()

    volume = out[volume_col].to_numpy()
    signed_volume = np.where(rising, volume, np.where(falling, -volume, 0))

    # np.cumsum propagates a NaN volume forward, as a running Python sum would.
    out['OBV'] = np.cumsum(signed_volume)
    return out

In [None]:
def add_candlestick_patterns(df, open_col='Open', high_col='High', low_col='Low', close_col='Close'):
    """Append candle geometry columns and a Doji flag.

    Columns added:
      * ``candle_range`` - high minus low
      * ``body_size``    - absolute difference between close and open
      * ``Doji``         - 1 when the body is at most 10% of the range, else 0

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame with OHLC columns; not modified.
    open_col, high_col, low_col, close_col : str
        Names of the OHLC columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the three columns appended.
    """
    out = df.copy()

    spread = out[high_col] - out[low_col]
    body = (out[close_col] - out[open_col]).abs()

    out['candle_range'] = spread
    out['body_size'] = body
    # 0.1 is the body-to-range ratio below which a candle counts as a Doji.
    out['Doji'] = (body <= 0.1 * spread).astype(int)
    return out

In [None]:
def add_atr(df, period=14, high_col='High', low_col='Low', close_col='Close'):
    """Append an Average True Range (``ATR``) column.

    The true range of a row is the largest of ``high - low``,
    ``|high - previous close|`` and ``|low - previous close|``; missing terms
    (e.g. no previous close on the first row) are ignored. ATR is the
    exponential mean of the true range with ``alpha = 1 / period`` and
    ``adjust=False``.

    Intermediates are kept in local variables, so existing columns named
    ``prev_close``, ``tr1``, ``tr2``, ``tr3`` or ``TR`` are neither
    overwritten nor removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame with high/low/close columns; not modified.
    period : int, default 14
        Smoothing period.
    high_col, low_col, close_col : str
        Names of the price columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``ATR`` appended.
    """
    out = df.copy()
    high, low = out[high_col], out[low_col]
    prev_close = out[close_col].shift(1)

    # np.fmax returns the non-NaN operand when only one side is NaN, which
    # matches a row-wise max that skips missing values.
    true_range = np.fmax(
        high - low,
        np.fmax((high - prev_close).abs(), (low - prev_close).abs()),
    )

    out['ATR'] = true_range.ewm(alpha=1 / period, adjust=False).mean()
    return out

In [None]:
def add_adx(df, period=14, high_col='High', low_col='Low', close_col='Close'):
    """Append directional indicators (``+DI``, ``-DI``) and ``ADX``.

    Computation:
      * True range: largest of ``high - low``, ``|high - previous close|``
        and ``|low - previous close|`` (missing terms ignored).
      * ``+DM``: the rise in the high when it exceeds the drop in the low,
        floored at 0, else 0. ``-DM`` is the mirror image.
      * True range, ``+DM`` and ``-DM`` are each smoothed with
        ``ewm(alpha=1 / period, adjust=False)``.
      * ``+DI`` / ``-DI``: 100 * smoothed DM / smoothed true range.
      * ``ADX``: the same smoothing applied to
        ``100 * |+DI - -DI| / (+DI + -DI)``.

    Intermediates are kept in local variables, so existing columns with the
    scratch names used previously (``prev_high``, ``prev_low``,
    ``prev_close``, ``tr1``-``tr3``, ``TR``, ``+DM``, ``-DM``, ``TR_ema``,
    ``+DM_ema``, ``-DM_ema``, ``DX``) are neither overwritten nor removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame with high/low/close columns; not modified.
    period : int, default 14
        Smoothing period.
    high_col, low_col, close_col : str
        Names of the price columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``+DI``, ``-DI`` and ``ADX`` appended.
    """
    out = df.copy()
    high, low = out[high_col], out[low_col]
    prev_close = out[close_col].shift(1)

    def _smooth(series):
        # Same exponential smoothing for every intermediate series.
        return series.ewm(alpha=1 / period, adjust=False).mean()

    # np.fmax ignores a NaN operand, like a row-wise max that skips NaN.
    true_range = np.fmax(
        high - low,
        np.fmax((high - prev_close).abs(), (low - prev_close).abs()),
    )

    up_move = high - high.shift(1)
    down_move = low.shift(1) - low

    # Comparisons with NaN (first row) are False, so those rows get 0.
    plus_dm = pd.Series(
        np.where(up_move > down_move, up_move.clip(lower=0), 0),
        index=out.index,
    )
    minus_dm = pd.Series(
        np.where(down_move > up_move, down_move.clip(lower=0), 0),
        index=out.index,
    )

    smoothed_tr = _smooth(true_range)
    plus_di = 100 * (_smooth(plus_dm) / smoothed_tr)
    minus_di = 100 * (_smooth(minus_dm) / smoothed_tr)

    # 0 / 0 (both DI values zero) yields NaN here; ewm skips it when averaging.
    dx = 100 * ((plus_di - minus_di).abs() / (plus_di + minus_di))

    out['+DI'] = plus_di
    out['-DI'] = minus_di
    out['ADX'] = _smooth(dx)
    return out

In [None]:
def add_stochastic(df, k_window=14, d_window=3, high_col='High', low_col='Low', close_col='Close'):
    """Append the stochastic oscillator columns ``%K`` and ``%D``.

    ``%K`` is ``100 * (close - lowest low) / (highest high - lowest low)``
    over the last ``k_window`` rows; ``%D`` is the rolling mean of ``%K``
    over ``d_window`` rows. When the highest high equals the lowest low the
    division has a zero denominator and the result is not a finite number.

    The rolling extremes are kept in local variables, so existing columns
    named ``roll_low`` or ``roll_high`` are neither overwritten nor removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame with high/low/close columns; not modified.
    k_window : int, default 14
        Look-back window (rows) for the high/low extremes.
    d_window : int, default 3
        Window (rows) of the ``%D`` average.
    high_col, low_col, close_col : str
        Names of the price columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``%K`` and ``%D`` appended.
    """
    out = df.copy()
    lowest_low = out[low_col].rolling(k_window).min()
    highest_high = out[high_col].rolling(k_window).max()

    out['%K'] = 100 * (out[close_col] - lowest_low) / (highest_high - lowest_low)
    out['%D'] = out['%K'].rolling(d_window).mean()
    return out

In [None]:
def add_williams_r(df, period=14, high_col='High', low_col='Low', close_col='Close'):
    """Append the ``Williams_%R`` column.

    The value is ``-100 * (highest high - close) / (highest high - lowest low)``
    over the last ``period`` rows. When the highest high equals the lowest low
    the division has a zero denominator and the result is not a finite number.

    The rolling extremes are kept in local variables, so existing columns
    named ``roll_high`` or ``roll_low`` are neither overwritten nor removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame with high/low/close columns; not modified.
    period : int, default 14
        Look-back window in rows.
    high_col, low_col, close_col : str
        Names of the price columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``Williams_%R`` appended.
    """
    out = df.copy()
    highest_high = out[high_col].rolling(period).max()
    lowest_low = out[low_col].rolling(period).min()

    out['Williams_%R'] = -100 * (highest_high - out[close_col]) / (highest_high - lowest_low)
    return out

In [None]:
def add_all_features(df):
    """Run every feature builder in sequence and drop incomplete rows.

    Applied in order: time-based price changes (1m, 5m, 1h, 1d, 1w, 1mo),
    moving averages (20/50/200), RSI (written to ``RSI_14``), MACD, Bollinger
    Bands, volume oscillator, OBV, candlestick patterns, ATR, ADX, stochastic
    oscillator and Williams %R. Finally every row that contains a NaN in any
    column is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by ``add_time_based_price_changes_merge_asof`` with
        ``Open``, ``High``, ``Low``, ``Close`` and ``Volume`` columns. The
        input is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with all feature columns and no NaN-containing rows.
    """
    lookbacks = [('1m', '1min'), ('5m', '5min'), ('1h', '1h'), ('1d', '1d'), ('1w', '7d'), ('1mo', '30d')]
    hlc = dict(high_col='High', low_col='Low', close_col='Close')

    enriched = (
        df.copy()
        .pipe(add_time_based_price_changes_merge_asof, offsets=lookbacks, price_col='Close')
        .pipe(add_moving_averages, windows=[20, 50, 200], price_col='Close')
        .pipe(add_rsi, period=14, price_col='Close', rsi_col='RSI_14')
        .pipe(add_macd, short=12, long=26, signal=9, price_col='Close')
        .pipe(add_bollinger_bands, window=20, n_std=2, price_col='Close')
        .pipe(add_volume_oscillator, short=5, long=20, volume_col='Volume')
        .pipe(add_obv, price_col='Close', volume_col='Volume')
        .pipe(add_candlestick_patterns, open_col='Open', **hlc)
        .pipe(add_atr, period=14, **hlc)
        .pipe(add_adx, period=14, **hlc)
        .pipe(add_stochastic, k_window=14, d_window=3, **hlc)
        .pipe(add_williams_r, period=14, **hlc)
    )

    # Any NaN in any column removes the row.
    return enriched.dropna()


In [None]:
# Build the BTC feature table: first index the raw frame by its parsed Date
# column, then derive every indicator from the prepared frame.
btc_df_prepared = prepare_df(btc_df)
btc_df_features = add_all_features(btc_df_prepared)


In [8]:
# Persist the BTC feature table as a gzip-compressed CSV.
btc_output_file = os.path.join(PROCESSED_DATA, "btc_1min_with_features.csv.gz")
# The row timestamps live in the index ("time") of the frame returned by
# add_all_features, so the index must be written; index=False would silently
# drop them and leave the saved features without a time column.
btc_df_features.to_csv(btc_output_file, index=True, compression="gzip")


In [None]:
# Re-read the processed ETH file so this cell can run on its own.
# NOTE(review): the same path and frame are already created in the loading
# cell near the top of the notebook; this read looks redundant on a linear run.
eth_processed_file = os.path.join(PROCESSED_DATA, "eth_1min_processed.csv.gz")
eth_df = pd.read_csv(eth_processed_file, compression="gzip")

# Build the ETH feature table: index by parsed Date, then derive indicators.
eth_df_prepared = prepare_df(eth_df)
eth_df_features = add_all_features(eth_df_prepared)

In [30]:
# Persist the ETH feature table as a gzip-compressed CSV.
eth_output_file = os.path.join(PROCESSED_DATA, "eth_1min_with_features.csv.gz")
# The row timestamps live in the index ("time") of the frame returned by
# add_all_features, so the index must be written; index=False would silently
# drop them and leave the saved features without a time column.
eth_df_features.to_csv(eth_output_file, index=True, compression="gzip")

### Code to load the saved BTC and ETH feature files:

In [None]:
import os
import pandas as pd

# Directory that holds the processed data files
PROCESSED_DATA = os.path.join("data", "processed-data")

# BTC: path of the feature file, then load it
btc_features_file = os.path.join(PROCESSED_DATA, "btc_1min_with_features.csv.gz")
btc_df_features = pd.read_csv(btc_features_file, compression="gzip")

# ETH: path of the feature file, then load it
eth_features_file = os.path.join(PROCESSED_DATA, "eth_1min_with_features.csv.gz")
eth_df_features = pd.read_csv(eth_features_file, compression="gzip")
