## Get Data

In [12]:
from pathlib import Path

import pandas as pd


# Raw daily OHLCV exports are read from the sibling "data_fetch" directory.
# The paths are assembled with pathlib so the separator is correct on every
# operating system (a literal backslash is only a separator on Windows).
DATA_FETCH_DIR = Path("..") / "data_fetch"
btc_path = DATA_FETCH_DIR / "BTC_3years.csv"
eth_path = DATA_FETCH_DIR / "ETH_3years.csv"

btc = pd.read_csv(btc_path)
eth = pd.read_csv(eth_path)

print("BTC head:")

# Last expression of the cell: rendered as a table of the first five rows
btc.head()



BTC head:


Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume
0,2022-11-01,20485.273438,20485.273438,20647.289062,20359.845703,20494.898438,39819303159
1,2022-11-02,20159.503906,20159.503906,20742.810547,20087.134766,20482.958984,55552169483
2,2022-11-03,20209.988281,20209.988281,20382.095703,20086.240234,20162.689453,43228750179
3,2022-11-04,21147.230469,21147.230469,21209.560547,20188.019531,20208.769531,64072727950
4,2022-11-05,21282.691406,21282.691406,21446.886719,21097.634766,21144.832031,37846047609


In [13]:
# Label the output, then display the first five rows of the ETH frame
print("ETH head:")

eth.head()

ETH head:


Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume
0,2022-11-01,1579.70459,1579.70459,1606.605713,1568.098511,1572.645386,14507311208
1,2022-11-02,1519.711792,1519.711792,1613.410645,1507.244751,1579.497925,23254218281
2,2022-11-03,1531.541748,1531.541748,1556.759644,1517.101685,1519.724854,14248351007
3,2022-11-04,1645.093384,1645.093384,1661.334717,1529.268433,1531.397583,20806964347
4,2022-11-05,1627.968018,1627.968018,1660.48645,1625.964233,1645.156494,11006973190


In [14]:
# Print column names, non-null counts and dtypes of the BTC frame
btc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1096 entries, 0 to 1095
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       1096 non-null   object 
 1   Adj Close  1096 non-null   float64
 2   Close      1096 non-null   float64
 3   High       1096 non-null   float64
 4   Low        1096 non-null   float64
 5   Open       1096 non-null   float64
 6   Volume     1096 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 60.1+ KB


### Change datatype

In [15]:
# Parse the "Date" column of each frame into a new "date" column;
# the original "Date" column is kept as-is.
for frame in (btc, eth):
    frame["date"] = pd.to_datetime(frame["Date"])

# Confirm the new column's dtype on the BTC frame
btc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1096 entries, 0 to 1095
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       1096 non-null   object        
 1   Adj Close  1096 non-null   float64       
 2   Close      1096 non-null   float64       
 3   High       1096 non-null   float64       
 4   Low        1096 non-null   float64       
 5   Open       1096 non-null   float64       
 6   Volume     1096 non-null   int64         
 7   date       1096 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(5), int64(1), object(1)
memory usage: 68.6+ KB


### Rename columns



In [4]:
# A helper function to rename OHLCV columns for any crypto dataset
def standardize_columns(df, prefix):
    """
    Return a renamed copy of ``df`` whose price/volume columns use the
    lowercase names open, high, low, close, volume and adjusted.

    Two source layouts are recognised:

    * prefixed columns, i.e. ``prefix`` followed by Open / High / Low /
      Close / Volume / Adjusted (for example ``BTC.USD.Open``);
    * bare columns Open / High / Low / Close / Volume / Adjusted /
      ``Adj Close`` (used as a fallback).

    Prefixed columns are always renamed when present. A bare column is only
    renamed when its target name is not already a column and has not been
    claimed by a prefixed column, so the fallback never introduces a
    duplicate column name. All other columns are left untouched.

    df: BTC or ETH dataframe (not modified; ``DataFrame.rename`` returns a new frame)
    prefix: 'BTC.USD.' or 'ETH.USD.'
    """
    prefixed_sources = {
        f"{prefix}Open": "open",
        f"{prefix}High": "high",
        f"{prefix}Low": "low",
        f"{prefix}Close": "close",
        f"{prefix}Volume": "volume",
        f"{prefix}Adjusted": "adjusted",
    }
    bare_sources = {
        "Open": "open",
        "High": "high",
        "Low": "low",
        "Close": "close",
        "Volume": "volume",
        "Adjusted": "adjusted",
        "Adj Close": "adjusted",
    }

    # Keep only renames that apply to this frame, so the collision checks
    # below reason about columns that really exist.
    rename_map = {
        source: target
        for source, target in prefixed_sources.items()
        if source in df.columns
    }

    for source, target in bare_sources.items():
        # Skip columns that are absent or already handled (e.g. prefix == "")
        if source not in df.columns or source in rename_map:
            continue
        # Skip when the target name is taken; renaming would duplicate it
        if target in df.columns or target in rename_map.values():
            continue
        rename_map[source] = target

    return df.rename(columns=rename_map)

# Apply to BTC and ETH data, each with its own ticker prefix.
# The names are rebound to the renamed frames returned by the helper.
btc = standardize_columns(btc, prefix="BTC.USD.")
eth = standardize_columns(eth, prefix="ETH.USD.")
btc.head()

Unnamed: 0,open,high,low,close,volume,adjusted,date,day_of_week,weekend
0,70216.89844,71559.01563,68779.70313,69482.46875,49989800000.0,69482.46875,2024-11-01,Fri,0
1,69486.02344,69867.35156,69033.71875,69289.27344,18184610000.0,69289.27344,2024-11-02,Sat,1
2,69296.38281,69361.65625,67482.52344,68741.11719,34868310000.0,68741.11719,2024-11-03,Sun,1
3,68742.13281,69433.17969,66803.64844,67811.50781,41184820000.0,67811.50781,2024-11-04,Mon,0
4,67811.17188,70522.78906,67458.86719,69359.5625,46046890000.0,69359.5625,2024-11-05,Tue,0


In [5]:
# Check column names and dtypes of the BTC frame after renaming
btc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   open         366 non-null    float64       
 1   high         366 non-null    float64       
 2   low          366 non-null    float64       
 3   close        366 non-null    float64       
 4   volume       366 non-null    float64       
 5   adjusted     366 non-null    float64       
 6   date         366 non-null    datetime64[ns]
 7   day_of_week  366 non-null    object        
 8   weekend      366 non-null    int64         
dtypes: datetime64[ns](1), float64(6), int64(1), object(1)
memory usage: 25.9+ KB


## Technical Indicators for BTC/ETH

**This project constructs a comprehensive set of numerical features for BTC and ETH to support short-term price movement prediction. The features are grouped into:**
1. Price & Momentum 
2. Technical Indicators
3. Volatility & Risk
4. Volume-Based Features
5. Time-Based Features

# Price & Momentum Features

These features capture short-term market movement and momentum strength.

daily_return
Daily percentage change in the closing price.

log_return
Logarithmic daily return, commonly used in financial time series.

SMA(5), SMA(10), SMA(20)
Simple Moving Averages over 5, 10, and 20 days to represent short- and medium-term trends.

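EMA(12)
Exponential Moving Average, which responds faster to new price changes than a simple moving average. (EMA(26) is computed only inside the MACD calculation and is not stored as a separate feature.)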

In [6]:
import numpy as np

def add_price_momentum_features(df, close_col="close"):
    """
    Append return and moving-average features derived from a price column.

    The frame is sorted by its "date" column and copied first, so the
    caller's frame is not modified.

    df: dataframe containing a "date" column and ``close_col``
    close_col: name of the price column the features are computed from

    Returns the sorted copy with these columns added:
    daily_return, log_return, SMA_5, SMA_10, SMA_20, EMA_12.
    """
    out = df.sort_values("date").copy()
    price = out[close_col]

    # One-day simple return and its logarithmic counterpart
    out["daily_return"] = price.pct_change()
    out["log_return"] = np.log(price / price.shift(1))

    # Rolling means; each stays NaN until its window is full
    for span in (5, 10, 20):
        out[f"SMA_{span}"] = price.rolling(window=span).mean()

    # Recursive (adjust=False) exponential average with a 12-period span
    out["EMA_12"] = price.ewm(span=12, adjust=False).mean()

    return out




In [7]:
# Add return / moving-average columns to both frames (the helper returns a
# sorted copy, so the names are rebound), then show the first 50 BTC rows.
btc = add_price_momentum_features(btc, close_col="close")
eth = add_price_momentum_features(eth, close_col="close") 
btc.head(50)

Unnamed: 0,open,high,low,close,volume,adjusted,date,day_of_week,weekend,daily_return,log_return,SMA_5,SMA_10,SMA_20,EMA_12
0,70216.89844,71559.01563,68779.70313,69482.46875,49989800000.0,69482.46875,2024-11-01,Fri,0,,,,,,69482.46875
1,69486.02344,69867.35156,69033.71875,69289.27344,18184610000.0,69289.27344,2024-11-02,Sat,1,-0.00278,-0.002784,,,,69452.746395
2,69296.38281,69361.65625,67482.52344,68741.11719,34868310000.0,68741.11719,2024-11-03,Sun,1,-0.007911,-0.007943,,,,69343.264979
3,68742.13281,69433.17969,66803.64844,67811.50781,41184820000.0,67811.50781,2024-11-04,Mon,0,-0.013523,-0.013616,,,,69107.61003
4,67811.17188,70522.78906,67458.86719,69359.5625,46046890000.0,69359.5625,2024-11-05,Tue,0,0.022829,0.022572,68936.785938,,,69146.371948
5,69358.5,76460.15625,69322.03125,75639.07813,118593000000.0,75639.07813,2024-11-06,Wed,0,0.090536,0.086669,70168.107814,,,70145.249822
6,75637.08594,76943.11719,74480.42188,75904.85938,63467650000.0,75904.85938,2024-11-07,Thu,0,0.003514,0.003508,71491.225002,,,71031.3436
7,75902.83594,77252.75,75648.74219,76545.47656,55176860000.0,76545.47656,2024-11-08,Fri,0,0.00844,0.008404,73052.096876,,,71879.671748
8,76556.1875,76932.76563,75773.78906,76778.86719,29009480000.0,76778.86719,2024-11-09,Sat,1,0.003049,0.003044,74845.568752,,,72633.394124
9,76775.54688,81474.42188,76565.42969,80474.1875,82570590000.0,80474.1875,2024-11-10,Sun,1,0.048129,0.047007,77068.493752,73002.639845,,73839.670028


# Technical Indicators

Indicators widely used in financial modeling to measure trend strength and potential reversal points.

RSI(14)
The 14-day Relative Strength Index, reflecting market momentum and overbought/oversold conditions.

MACD_line
The difference between the 12-day and 26-day EMAs.

MACD_signal
The 9-day EMA of the MACD line.

MACD_hist
MACD histogram, defined as MACD_line minus MACD_signal, highlighting trend acceleration.

In [8]:
import numpy as np
import pandas as pd

def add_technical_indicators(df: pd.DataFrame, close_col: str) -> pd.DataFrame:
    """
    Append RSI(14) and MACD(12, 26, 9) columns computed from ``close_col``.

    Rows are used in their existing order, so the frame should already be
    sorted chronologically. The input frame is not modified; a copy with the
    new columns is returned.

    df: dataframe containing ``close_col``
    close_col: name of the price column the indicators are computed from

    Columns added:
    - RSI_14: NaN until 14 price changes are available. It is 100 when the
      smoothed loss is zero but the smoothed gain is positive (prices only
      rose), and NaN when both are zero (flat prices, RSI undefined).
    - MACD_line: EMA(12) minus EMA(26) of the price.
    - MACD_signal: 9-period EMA of MACD_line.
    - MACD_hist: MACD_line minus MACD_signal.
    """
    # Copy first so the caller's frame is left untouched, like the other
    # feature helpers in this notebook.
    df = df.copy()

    # Use the specified close price column
    close = df[close_col]

    # ===== 1) RSI(14) =====
    # Price change between consecutive rows
    delta = close.diff()

    # Positive changes (gains) and negative changes (losses, as positive numbers)
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)

    # Exponential smoothing with alpha = 1/14 (Wilder-style) of gains and losses
    window = 14
    avg_gain = gain.ewm(alpha=1/window, min_periods=window, adjust=False).mean()
    avg_loss = loss.ewm(alpha=1/window, min_periods=window, adjust=False).mean()

    # Relative strength; a zero average loss is masked to NaN here and the
    # all-gain case is resolved explicitly below.
    rs = avg_gain / avg_loss.where(avg_loss != 0)

    # RSI scaled to 0–100
    rsi = 100 - (100 / (1 + rs))

    # No losses but some gains: RS is unbounded, so RSI saturates at 100
    # rather than being reported as missing.
    only_gains = (avg_loss == 0) & (avg_gain > 0)
    df["RSI_14"] = rsi.mask(only_gains, 100.0)

    # ===== 2) MACD (12, 26, 9) =====
    # Fast and slow EMAs
    ema_12 = close.ewm(span=12, adjust=False).mean()
    ema_26 = close.ewm(span=26, adjust=False).mean()

    # MACD line: difference between short-term and long-term EMAs
    df["MACD_line"] = ema_12 - ema_26

    # Signal line: 9-day EMA of the MACD line
    df["MACD_signal"] = df["MACD_line"].ewm(span=9, adjust=False).mean()

    # MACD histogram: MACD line minus signal line
    df["MACD_hist"] = df["MACD_line"] - df["MACD_signal"]

    return df


In [9]:
# Add the RSI_14 and MACD_* columns to both frames; the names are rebound to
# the frames returned by the helper.
btc = add_technical_indicators(btc, close_col="close")
eth = add_technical_indicators(eth, close_col="close")


# Display the BTC frame with the new indicator columns
btc


Unnamed: 0,open,high,low,close,volume,adjusted,date,day_of_week,weekend,daily_return,log_return,SMA_5,SMA_10,SMA_20,EMA_12,RSI_14,MACD_line,MACD_signal,MACD_hist
0,70216.89844,71559.01563,68779.70313,69482.46875,4.998980e+10,69482.46875,2024-11-01,Fri,0,,,,,,69482.468750,,0.000000,0.000000,0.000000
1,69486.02344,69867.35156,69033.71875,69289.27344,1.818461e+10,69289.27344,2024-11-02,Sat,1,-0.002780,-0.002784,,,,69452.746395,,-15.411592,-3.082318,-12.329273
2,69296.38281,69361.65625,67482.52344,68741.11719,3.486831e+10,68741.11719,2024-11-03,Sun,1,-0.007911,-0.007943,,,,69343.264979,,-71.038134,-16.673481,-54.364653
3,68742.13281,69433.17969,66803.64844,67811.50781,4.118482e+10,67811.50781,2024-11-04,Mon,0,-0.013523,-0.013616,,,,69107.610030,,-187.967505,-50.932286,-137.035219
4,67811.17188,70522.78906,67458.86719,69359.56250,4.604689e+10,69359.56250,2024-11-05,Tue,0,0.022829,0.022572,68936.785938,,,69146.371948,,-153.945214,-71.534872,-82.410342
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,114129.08590,116078.98440,112291.67970,112956.16410,6.452807e+10,112956.16410,2025-10-28,Tue,0,-0.010193,-0.010245,112844.717200,110971.44297,111581.851565,111934.790383,50.501681,-748.503255,-1268.393634,519.890379
362,112921.32810,113642.72660,109368.71880,110055.30470,6.219204e+10,110055.30470,2025-10-29,Wed,0,-0.025681,-0.026017,112648.993760,111110.30235,110999.337505,111645.638740,44.649810,-842.989051,-1183.312717,340.323666
363,110059.19530,111612.35160,106376.68750,108305.54690,6.967396e+10,108305.54690,2025-10-30,Thu,0,-0.015899,-0.016027,111981.757820,110881.96407,110753.896490,111131.778457,41.524258,-1046.991490,-1156.048472,109.056982
364,108304.41410,111031.82030,108288.27340,109556.16410,6.009036e+10,109556.16410,2025-10-31,Fri,0,0.011547,0.011481,110998.501580,110989.89142,110691.310555,110889.376248,44.513921,-1095.126599,-1143.864097,48.737498


# Volatility & Risk Features

These features measure the magnitude of market fluctuations.

volatility_7d
7-day rolling standard deviation of daily returns.

volatility_21d
21-day rolling standard deviation of daily returns.

In [10]:
import pandas as pd

# Volatility & Risk Features
# Rolling standard deviation of daily returns over trailing 7-day and 21-day
# windows. A value is produced only once a full window of non-missing returns
# is available; earlier rows are NaN.
for frame in (btc, eth):
    returns = frame["daily_return"]
    frame["volatility_7d"] = returns.rolling(window=7).std()
    frame["volatility_21d"] = returns.rolling(window=21).std()

btc.head()

Unnamed: 0,open,high,low,close,volume,adjusted,date,day_of_week,weekend,daily_return,...,SMA_5,SMA_10,SMA_20,EMA_12,RSI_14,MACD_line,MACD_signal,MACD_hist,volatility_7d,volatility_21d
0,70216.89844,71559.01563,68779.70313,69482.46875,49989800000.0,69482.46875,2024-11-01,Fri,0,,...,,,,69482.46875,,0.0,0.0,0.0,,
1,69486.02344,69867.35156,69033.71875,69289.27344,18184610000.0,69289.27344,2024-11-02,Sat,1,-0.00278,...,,,,69452.746395,,-15.411592,-3.082318,-12.329273,,
2,69296.38281,69361.65625,67482.52344,68741.11719,34868310000.0,68741.11719,2024-11-03,Sun,1,-0.007911,...,,,,69343.264979,,-71.038134,-16.673481,-54.364653,,
3,68742.13281,69433.17969,66803.64844,67811.50781,41184820000.0,67811.50781,2024-11-04,Mon,0,-0.013523,...,,,,69107.61003,,-187.967505,-50.932286,-137.035219,,
4,67811.17188,70522.78906,67458.86719,69359.5625,46046890000.0,69359.5625,2024-11-05,Tue,0,0.022829,...,68936.785938,,,69146.371948,,-153.945214,-71.534872,-82.410342,,


# Volume-Based Features

These features capture changes in trading activity and market participation.

volume_zscore
Standardized trading volume to detect abnormal spikes.

volume_change
Day-to-day percentage change in volume.

In [11]:
# Volume-Based Features

# 21-day rolling mean and std of volume for z-score
# This standardizes today's volume relative to the past month.
def add_volume_features(df, volume_col="volume", window_vol=21):
    """
    Append volume-based features to a date-sorted copy of ``df``.

    df: dataframe containing a "date" column and ``volume_col``
    volume_col: name of the traded-volume column
    window_vol: length of the trailing window used for the z-score

    Columns added:
    - volume_zscore: (volume - rolling mean) / rolling std over ``window_vol``
      rows; NaN until the window is full or when the rolling std is zero.
    - volume_change: row-to-row percentage change in volume.
    """
    out = df.sort_values("date").copy()
    vol = out[volume_col]

    # One rolling view feeds both statistics; a full window is required
    trailing = vol.rolling(window=window_vol, min_periods=window_vol)
    mean_vol = trailing.mean()
    # A zero spread would divide by zero, so it is turned into NaN
    std_vol = trailing.std().replace(0, np.nan)

    out["volume_zscore"] = (vol - mean_vol) / std_vol
    out["volume_change"] = vol.pct_change()

    return out


In [12]:
# Add volume_zscore (21-row window) and volume_change to both frames,
# then show the first 50 BTC rows.
btc = add_volume_features(btc, volume_col="volume", window_vol=21)
eth = add_volume_features(eth, volume_col="volume", window_vol=21)
btc.head(50)


Unnamed: 0,open,high,low,close,volume,adjusted,date,day_of_week,weekend,daily_return,...,SMA_20,EMA_12,RSI_14,MACD_line,MACD_signal,MACD_hist,volatility_7d,volatility_21d,volume_zscore,volume_change
0,70216.89844,71559.01563,68779.70313,69482.46875,49989800000.0,69482.46875,2024-11-01,Fri,0,,...,,69482.46875,,0.0,0.0,0.0,,,,
1,69486.02344,69867.35156,69033.71875,69289.27344,18184610000.0,69289.27344,2024-11-02,Sat,1,-0.00278,...,,69452.746395,,-15.411592,-3.082318,-12.329273,,,,-0.636234
2,69296.38281,69361.65625,67482.52344,68741.11719,34868310000.0,68741.11719,2024-11-03,Sun,1,-0.007911,...,,69343.264979,,-71.038134,-16.673481,-54.364653,,,,0.917462
3,68742.13281,69433.17969,66803.64844,67811.50781,41184820000.0,67811.50781,2024-11-04,Mon,0,-0.013523,...,,69107.61003,,-187.967505,-50.932286,-137.035219,,,,0.181153
4,67811.17188,70522.78906,67458.86719,69359.5625,46046890000.0,69359.5625,2024-11-05,Tue,0,0.022829,...,,69146.371948,,-153.945214,-71.534872,-82.410342,,,,0.118055
5,69358.5,76460.15625,69322.03125,75639.07813,118593000000.0,75639.07813,2024-11-06,Wed,0,0.090536,...,,70145.249822,,375.394811,17.851065,357.543746,,,,1.575483
6,75637.08594,76943.11719,74480.42188,75904.85938,63467650000.0,75904.85938,2024-11-07,Thu,0,0.003514,...,,71031.3436,,807.043821,175.689616,631.354205,,,,-0.464828
7,75902.83594,77252.75,75648.74219,76545.47656,55176860000.0,76545.47656,2024-11-08,Fri,0,0.00844,...,,71879.671748,,1187.136652,377.979023,809.157628,0.035577,,,-0.13063
8,76556.1875,76932.76563,75773.78906,76778.86719,29009480000.0,76778.86719,2024-11-09,Sat,1,0.003049,...,,72633.394124,,1490.019613,600.387141,889.632472,0.035173,,,-0.474246
9,76775.54688,81474.42188,76565.42969,80474.1875,82570590000.0,80474.1875,2024-11-10,Sun,1,0.048129,...,,73839.670028,,2005.124185,881.33455,1123.789635,0.035392,,,1.846331


# Time-Based Features

Since cryptocurrency markets exhibit weekly behavioral patterns, time features are added.

day_of_week
Encodes the weekday (Monday–Sunday) to capture cyclical effects.

weekend
Binary indicator (1 = weekend, 0 = weekday), as crypto markets behave differently on weekends.

In [13]:
# Time-Based Features

def add_time_features(df, date_col="date"):
    """
    Derive calendar features from a date column, on a copy of ``df``.

    Columns written:
    - ``date_col``: converted to datetime
    - day_of_week_num: Monday=0 ... Sunday=6
    - day_of_week: weekday name (e.g., 'Monday')
    - weekend: 1 for Saturday/Sunday, otherwise 0

    Required columns:
    - date_col: date as datetime or string
    """
    out = df.copy()

    # Parse once and reuse the parsed series for every derived column
    stamps = pd.to_datetime(out[date_col])
    weekday_idx = stamps.dt.dayofweek

    out[date_col] = stamps
    out["day_of_week_num"] = weekday_idx
    out["day_of_week"] = stamps.dt.day_name()
    # Weekday numbers 5 and 6 are Saturday and Sunday
    out["weekend"] = weekday_idx.isin([5, 6]).astype(int)

    return out


In [14]:
# Add day_of_week_num / day_of_week / weekend to both frames (existing
# columns with those names are overwritten), then show the first 50 BTC rows.
btc = add_time_features(btc, date_col="date")
eth = add_time_features(eth, date_col="date")
btc.head(50)

Unnamed: 0,open,high,low,close,volume,adjusted,date,day_of_week,weekend,daily_return,...,EMA_12,RSI_14,MACD_line,MACD_signal,MACD_hist,volatility_7d,volatility_21d,volume_zscore,volume_change,day_of_week_num
0,70216.89844,71559.01563,68779.70313,69482.46875,49989800000.0,69482.46875,2024-11-01,Friday,0,,...,69482.46875,,0.0,0.0,0.0,,,,,4
1,69486.02344,69867.35156,69033.71875,69289.27344,18184610000.0,69289.27344,2024-11-02,Saturday,1,-0.00278,...,69452.746395,,-15.411592,-3.082318,-12.329273,,,,-0.636234,5
2,69296.38281,69361.65625,67482.52344,68741.11719,34868310000.0,68741.11719,2024-11-03,Sunday,1,-0.007911,...,69343.264979,,-71.038134,-16.673481,-54.364653,,,,0.917462,6
3,68742.13281,69433.17969,66803.64844,67811.50781,41184820000.0,67811.50781,2024-11-04,Monday,0,-0.013523,...,69107.61003,,-187.967505,-50.932286,-137.035219,,,,0.181153,0
4,67811.17188,70522.78906,67458.86719,69359.5625,46046890000.0,69359.5625,2024-11-05,Tuesday,0,0.022829,...,69146.371948,,-153.945214,-71.534872,-82.410342,,,,0.118055,1
5,69358.5,76460.15625,69322.03125,75639.07813,118593000000.0,75639.07813,2024-11-06,Wednesday,0,0.090536,...,70145.249822,,375.394811,17.851065,357.543746,,,,1.575483,2
6,75637.08594,76943.11719,74480.42188,75904.85938,63467650000.0,75904.85938,2024-11-07,Thursday,0,0.003514,...,71031.3436,,807.043821,175.689616,631.354205,,,,-0.464828,3
7,75902.83594,77252.75,75648.74219,76545.47656,55176860000.0,76545.47656,2024-11-08,Friday,0,0.00844,...,71879.671748,,1187.136652,377.979023,809.157628,0.035577,,,-0.13063,4
8,76556.1875,76932.76563,75773.78906,76778.86719,29009480000.0,76778.86719,2024-11-09,Saturday,1,0.003049,...,72633.394124,,1490.019613,600.387141,889.632472,0.035173,,,-0.474246,5
9,76775.54688,81474.42188,76565.42969,80474.1875,82570590000.0,80474.1875,2024-11-10,Sunday,1,0.048129,...,73839.670028,,2005.124185,881.33455,1123.789635,0.035392,,,1.846331,6


In [15]:
# Final check of column names, non-null counts and dtypes before cleaning
btc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   open             366 non-null    float64       
 1   high             366 non-null    float64       
 2   low              366 non-null    float64       
 3   close            366 non-null    float64       
 4   volume           366 non-null    float64       
 5   adjusted         366 non-null    float64       
 6   date             366 non-null    datetime64[ns]
 7   day_of_week      366 non-null    object        
 8   weekend          366 non-null    int32         
 9   daily_return     365 non-null    float64       
 10  log_return       365 non-null    float64       
 11  SMA_5            362 non-null    float64       
 12  SMA_10           357 non-null    float64       
 13  SMA_20           347 non-null    float64       
 14  EMA_12           366 non-null    float64  

In [16]:
# Numeric feature columns that must be present (non-NaN) in every exported row.
# The rolling/EWM features are NaN during their warm-up period, so those early
# rows are removed by the dropna calls below.
feature_cols = [
    "daily_return",
    "log_return",
    "SMA_5", "SMA_10", "SMA_20",
    "EMA_12", 
    "RSI_14",
    "MACD_line", "MACD_signal", "MACD_hist",
    "volatility_7d", "volatility_21d",
    "volume_zscore", "volume_change",
    # If sentiment features are added later, they can be listed here as well
    # "sentiment_mean", "news_count", "sentiment_z"
]
# Drop rows where any of the feature columns are NaN; .copy() gives the cleaned
# frames their own data, independent of btc / eth
btc_clean = btc.dropna(subset=feature_cols).copy()
eth_clean = eth.dropna(subset=feature_cols).copy()


In [17]:
from pathlib import Path

# Feature CSVs go into a "data" directory relative to the working directory.
# pathlib keeps the separator portable (a backslash inside a string is part of
# the file name on macOS/Linux), and the directory is created if it is missing
# so the export does not fail on a fresh checkout.
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)

# Export BTC features to CSV
btc_clean.to_csv(
    output_dir / "BTC_features.csv",
    index=False  # do not save the DataFrame index as a column
)

# Export ETH features to CSV
eth_clean.to_csv(
    output_dir / "ETH_features.csv",
    index=False
)
