In [None]:
import pandas as pd
import numpy as np
from ta.momentum import RSIIndicator
from ta.trend import MACD, SMAIndicator, EMAIndicator
from ta.volatility import BollingerBands

def hurst_exponent(ts):
    if len(ts) < 20:
        return np.nan
    lags = range(2, 20)
    tau = [np.std(ts[lag:] - ts[:-lag]) for lag in lags]
    
    if np.any(np.isnan(tau)) or np.any(np.isinf(tau)):
        return np.nan
    poly = np.polyfit(np.log(lags), np.log(tau), 1)
    return poly[0]


In [33]:
# Cargar datos
df = pd.read_csv("/Users/melaniealvarez/Documents/Octavo semestres/Data Mining/trading/project_trading_DT/data/raw/TLT.csv", parse_dates=["Date"])

# ------------------ Variables temporales ------------------
df["day_of_week"] = df["Date"].dt.dayofweek
df["is_month_end"] = df["Date"].dt.is_month_end.astype(int)
df["month"] = df["Date"].dt.month

# ------------------ Precio y retornos ------------------
df["price_diff"] = df["Close"] - df["Open"]
df["pct_diff"] = df["price_diff"] / df["Open"]
df["log_vol"] = np.log1p(df["Volume"])  # Log volumen

df["return_daily"] = df["Close"].pct_change().shift(1)
for lag in range(1, 6):
    df[f"return_lag_{lag}"] = df["return_daily"].shift(lag)

# ------------------ Media móvil y volatilidad ------------------
df["sma_5"] = SMAIndicator(close=df["Close"], window=5).sma_indicator().shift(1)
df["ema_5"] = EMAIndicator(close=df["Close"], window=5).ema_indicator().shift(1)
df["rolling_std_return_5"] = df["return_daily"].rolling(window=5).std().shift(1)

# ------------------ Indicadores técnicos (con shift) ------------------
rsi = RSIIndicator(close=df["Close"], window=5)
df["RSI_5_lag1"] = rsi.rsi().shift(1)

macd = MACD(close=df["Close"], window_slow=26, window_fast=12, window_sign=9)
df["MACD_lag1"] = macd.macd().shift(1)
df["MACD_signal_lag1"] = macd.macd_signal().shift(1)

bb = BollingerBands(close=df["Close"], window=20, window_dev=2)
df["bb_middle_lag1"] = bb.bollinger_mavg().shift(1)
df["bb_upper_lag1"] = bb.bollinger_hband().shift(1)
df["bb_lower_lag1"] = bb.bollinger_lband().shift(1)

# ------------------ Condiciones binarias ------------------
sma_50 = SMAIndicator(close=df["Close"], window=50).sma_indicator()
df["price_above_SMA50"] = (df["Close"] > sma_50).shift(1).fillna(0).astype(int)
df["RSI_overbought"] = (rsi.rsi() > 70).shift(1).fillna(0).astype(int)
df["MACD_above_signal"] = (macd.macd() > macd.macd_signal()).shift(1).fillna(0).astype(int)
df["volume_spike"] = (df["Volume"] > 1.5 * df["Volume"].rolling(window=5).mean()).shift(1).fillna(0).astype(int)


# ------------------ Hurst Exponent ------------------
df["hurst_20"] = df["Close"].rolling(window=20).apply(hurst_exponent).shift(1)

df.tail(40)



Unnamed: 0,Date,Close,High,Low,Open,Volume,day_of_week,is_month_end,month,price_diff,...,MACD_lag1,MACD_signal_lag1,bb_middle_lag1,bb_upper_lag1,bb_lower_lag1,price_above_SMA50,RSI_overbought,MACD_above_signal,volume_spike,hurst_20
1037,2025-02-12,86.644905,87.012418,86.277384,86.7045,48536200,2,0,2,-0.059595,...,0.340192,0.101465,87.1788,89.521591,84.83601,0,0,1,0,
1038,2025-02-13,88.085175,88.293765,87.479266,87.499128,42718100,3,0,2,0.586046,...,0.215598,0.124292,87.290209,89.282751,85.297666,0,0,1,1,
1039,2025-02-14,88.552025,89.078469,88.522227,88.780478,27460700,4,0,2,-0.228453,...,0.230418,0.145517,87.400884,89.310118,85.49165,1,0,1,0,
1040,2025-02-18,87.509071,88.244106,87.449476,87.946122,29429200,1,0,2,-0.437051,...,0.276645,0.171743,87.521044,89.402244,85.639844,1,0,1,0,
1041,2025-02-19,87.618324,87.856712,87.300471,87.360066,20584300,2,0,2,0.258258,...,0.226512,0.182696,87.581634,89.377389,85.785878,0,0,1,0,
1042,2025-02-20,87.946121,88.164647,87.856721,87.856721,27913400,3,0,2,0.0894,...,0.193367,0.184831,87.609085,89.389393,85.828778,0,0,1,0,
1043,2025-02-21,89.008942,89.326795,88.22424,88.273907,46912300,4,0,2,0.735034,...,0.191345,0.186133,87.675692,89.400924,85.950459,1,0,1,0,
1044,2025-02-24,89.267204,89.455925,88.661295,88.730825,26218000,0,0,2,0.536379,...,0.272363,0.203379,87.82909,89.452654,86.205527,1,0,1,1,
1045,2025-02-25,90.806793,90.886258,90.190957,90.359815,50125100,1,0,2,0.446979,...,0.353337,0.233371,87.976102,89.56099,86.391214,1,0,1,0,
1046,2025-02-26,91.343178,91.442506,90.588273,90.806799,32220200,2,0,2,0.536379,...,0.535568,0.29381,88.149121,90.127855,86.170386,1,1,1,0,


In [None]:
# ------------------ VWAP y binaria ------------------
df["typical_price"] = (df["High"] + df["Low"] + df["Close"]) / 3
df["vwap"] = df["typical_price"]  # En datos diarios, VWAP = typical price
df["vwap_lag1"] = df["vwap"].shift(1)
df["close_above_vwap"] = (df["Close"] > df["vwap_lag1"]).astype(int)

# ------------------ Crear target ------------------
df["target"] = (df["Close"].shift(-1) > df["Close"]).astype(int)

# ------------------ Eliminar columnas auxiliares y filas con NaN ------------------
print(df.isna().sum().sort_values(ascending=False))
df.drop(columns=["typical_price", "vwap"], inplace=True)
df.dropna(inplace=True)

# ------------------ Selección final de features ------------------
selected_features = [
    "day_of_week", "is_month_end", "month",
    "price_diff", "pct_diff", "log_vol",
    "return_lag_1", "return_lag_2", "return_lag_3", "return_lag_4", "return_lag_5",
    "sma_5", "ema_5", "rolling_std_return_5",
    "RSI_5_lag1", "MACD_lag1", "MACD_signal_lag1",
    "bb_middle_lag1", "bb_upper_lag1", "bb_lower_lag1",
    "volume_spike", "price_above_SMA50", "RSI_overbought", "MACD_above_signal",
    "hurst_20", "vwap_lag1", "close_above_vwap"
]

X = df[selected_features]
y = df["target"]

In [23]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   day_of_week           0 non-null      int32  
 1   is_month_end          0 non-null      int64  
 2   month                 0 non-null      int32  
 3   price_diff            0 non-null      float64
 4   pct_diff              0 non-null      float64
 5   log_vol               0 non-null      float64
 6   return_lag_1          0 non-null      float64
 7   return_lag_2          0 non-null      float64
 8   return_lag_3          0 non-null      float64
 9   return_lag_4          0 non-null      float64
 10  return_lag_5          0 non-null      float64
 11  sma_5                 0 non-null      float64
 12  ema_5                 0 non-null      float64
 13  rolling_std_return_5  0 non-null      float64
 14  RSI_5_lag1            0 non-null      float64
 15  MACD_lag1             0 non-null      fl

In [24]:
y

Series([], Name: target, dtype: int64)

In [25]:
hurst_exponent(df["Close"].iloc[0:25])


nan