In [16]:
import pandas as pd
import numpy as np
from ta.momentum import RSIIndicator
from ta.trend import MACD, SMAIndicator
from ta.volatility import BollingerBands

In [17]:
# Cargar datos
df = pd.read_csv("../../data/raw/SPLV_raw.csv", parse_dates=["Date"])

In [18]:
# Variables de calendario
df["day_of_week"] = df["Date"].dt.dayofweek
df["is_month_end"] = df["Date"].dt.is_month_end.astype(int)
df["month"] = df["Date"].dt.month  

In [19]:
# Variables de calendario
df["day_of_week"] = df["Date"].dt.dayofweek
df["is_month_end"] = df["Date"].dt.is_month_end.astype(int)
df["month"] = df["Date"].dt.month  

In [20]:
# Diferencias de precio y retornos
df["price_diff"] = df["Close"] - df["Open"]
df["pct_diff"] = df["price_diff"] / df["Open"]
df["return_daily"] = df["Close"].pct_change()
for lag in range(1, 6):
    df[f"return_lag_{lag}"] = df["return_daily"].shift(lag)

In [21]:
# Medias móviles y desviación
df["sma_5"] = SMAIndicator(close=df["Close"], window=5).sma_indicator().shift(1)
df["sma_10"] = SMAIndicator(close=df["Close"], window=10).sma_indicator().shift(1)  
df["rolling_std_return_5"] = df["return_daily"].rolling(window=5).std().shift(1)

In [22]:
# Indicadores técnicos
df["RSI_5"] = RSIIndicator(close=df["Close"], window=5).rsi().shift(1)

macd = MACD(close=df["Close"])
df["MACD"] = macd.macd().shift(1)
df["MACD_signal"] = macd.macd_signal().shift(1)

bb = BollingerBands(close=df["Close"], window=5, window_dev=2)
df["bb_middle"] = bb.bollinger_mavg().shift(1)
df["bb_upper"] = bb.bollinger_hband().shift(1)
df["bb_lower"] = bb.bollinger_lband().shift(1)

In [23]:
# Bandera de volumen alto (outlier)
df["rolling_mean_vol"] = df["Volume"].rolling(window=5).mean().shift(1)
df["rolling_std_vol"] = df["Volume"].rolling(window=5).std().shift(1)
df["volume_outlier"] = (
    df["Volume"] > df["rolling_mean_vol"] + 2 * df["rolling_std_vol"]
).astype(int)

In [24]:
# Flags binarias técnicas
sma50 = SMAIndicator(close=df["Close"], window=50).sma_indicator()
df["price_above_SMA50"] = (df["Close"] > sma50).shift(1).fillna(0).astype(int)
df["RSI_overbought"] = (df["RSI_5"] > 70).astype(int)
df["MACD_above_signal"] = (df["MACD"] > df["MACD_signal"]).astype(int)

In [25]:
# Nuevas señales
df["return_3d"] = df["Close"].pct_change(3).shift(1)
df["vol_month"] = df["Close"].rolling(21).std().shift(1)
df["price_vs_sma10"] = (df["Close"] - df["sma_10"]) / df["sma_10"]

In [26]:
# Momentum y señales adicionales
df["momentum_3d"] = df["Close"] - df["Close"].shift(3)
df["vol_change"] = (df["Volume"] - df["rolling_mean_vol"]) / df["rolling_mean_vol"]
df["sma_cross_up"] = (
    (df["sma_5"] > df["sma_10"]) & (df["sma_5"].shift(1) <= df["sma_10"].shift(1))
).astype(int)

In [27]:
df["volatility_ratio"] = df["rolling_std_return_5"] / (df["vol_month"] + 1e-6)
df["gap_up"] = (df["Open"] - df["Close"].shift(1)) / (df["Close"].shift(1) + 1e-6)
df["lower_shadow"] = (df["Close"] - df["Low"]) / ((df["High"] - df["Low"]) + 1e-6)

In [28]:
# Target: sube el cierre de mañana respecto a hoy
df["target"] = (df["Close"].shift(-1) > df["Close"]).astype(int)

In [29]:
# Verificar desbalance
print("Distribución del target:")
print(df["target"].value_counts(normalize=True))

Distribución del target:
target
1    0.531105
0    0.468895
Name: proportion, dtype: float64


In [None]:
# Selección de columnas finales (actualizadas)
feature_cols = [
    "day_of_week", "month", "is_month_end",
    "price_diff", "pct_diff", "return_daily",
    "return_lag_1", "return_lag_2", "return_lag_3", "return_lag_4", "return_lag_5",
    "sma_5", "sma_10", "rolling_std_return_5",
    "RSI_5", "MACD", "MACD_signal",
    "bb_middle", "bb_upper", "bb_lower",
    "volume_outlier", "price_above_SMA50",
    "RSI_overbought", "MACD_above_signal",
    "return_3d", "vol_month", "price_vs_sma10", 
    "momentum_3d", "vol_change", "sma_cross_up", 
    "volatility_ratio", "gap_up", "lower_shadow",
    "target"
]

# %%
# Limpiar y exportar
df_features = df[["Date", "Open", "High", "Low", "Close"] + feature_cols].dropna().copy()
df_features.to_csv("../../data/processed/SPLV_clean.csv", index=False)
print("✅ Features listas y guardadas en data/processed/SPLV_clean.csv")


✅ Features listas y guardadas en data/processed/SPLV_clean.csv
