In [71]:
#Importaciones
import pandas as pd
import numpy as np
import os
from datetime import datetime
from ta.momentum import RSIIndicator
from ta.trend import MACD, SMAIndicator
from ta.volatility import BollingerBands

In [72]:
# Cargar datos
csv_path = "../../data/raw/JNJ.csv"
df = pd.read_csv(csv_path, parse_dates=["Date"])

In [73]:
# 3) Log-transform de Volume
df["log_vol"] = np.log1p(df["Volume"])

df = df.drop(columns=['Volume'])

In [74]:
# 4) Features de calendario
df['year'] = df['Date'].dt.year
df["month"] = df["Date"].dt.month
df['day'] = df['Date'].dt.day
df["day_of_week"] = df["Date"].dt.dayofweek  
df["is_month_end"] = df["Date"].dt.is_month_end.astype(int)

In [75]:
# 5) Price differences y returns
df["price_diff"] = df["Close"] - df["Open"]
df["pct_diff"] = df["price_diff"] / df["Open"]
df["return_daily"] = df["Close"].pct_change()
df["return_lag_1"] = df["return_daily"].shift(1)

In [76]:
# 6) Rolling statistics
df["sma_5"] = SMAIndicator(close=df["Close"], window=5).sma_indicator()
df["rolling_std_return_5"] = df["return_daily"].rolling(window=5).std()

In [77]:
# 7) Technical indicators
# RSI(5)
df["RSI_5"] = RSIIndicator(close=df["Close"], window=5).rsi()

In [78]:
# MACD & signal
macd = MACD(close=df["Close"])
df["MACD"] = macd.macd()
df["MACD_signal"] = macd.macd_signal()

In [79]:
# Bollinger Bands (20, 2)
bb = BollingerBands(close=df["Close"], window=5, window_dev=2)
df["bb_middle"] = bb.bollinger_mavg()
df["bb_upper"] = bb.bollinger_hband()
df["bb_lower"] = bb.bollinger_lband()

In [80]:
# 8) Flags y binary features
df["volume_spike"] = (
    df["log_vol"] > df["log_vol"].mean() + 2 * df["log_vol"].std()
).astype(int)
df["price_above_SMA50"] = (
    df["Close"] > SMAIndicator(close=df["Close"], window=50).sma_indicator()
).astype(int)
df["RSI_overbought"] = (df["RSI_5"] > 70).astype(int)
df["MACD_above_signal"] = (df["MACD"] > df["MACD_signal"]).astype(int)

In [81]:
# 9) Limpiar filas con NaN generadas por cálculos
feature_cols = [
    "log_vol", "day_of_week", "is_month_end", "month",
    "price_diff", "pct_diff", "return_daily", "return_lag_1",
    "sma_5", "rolling_std_return_5",
    "RSI_5", "MACD", "MACD_signal",
    "bb_middle", "bb_upper", "bb_lower",
    "volume_spike", "price_above_SMA50", "RSI_overbought", "MACD_above_signal"
]
df_features = df.dropna(subset=feature_cols).copy()

In [82]:
# Crear variable objetivo 'target' (1 si Close > Open, 0 si Close <= Open)
#df_features['target'] = np.where(df_features['Close'] > df_features['Open'], 1, 0)
# Definir el target: Predecir si el precio de SPLV sube o baja (1: sube, 0: baja)
df_features["target"] = (df_features["Close"].shift(-1) > df_features["Close"]).astype(int)

In [83]:
# Información general del DataFrame
print("\nInformación del DataFrame:")
print(df_features.info())


Información del DataFrame:
<class 'pandas.core.frame.DataFrame'>
Index: 1042 entries, 33 to 1074
Data columns (total 28 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Date                  1042 non-null   datetime64[ns]
 1   Close                 1042 non-null   float64       
 2   High                  1042 non-null   float64       
 3   Low                   1042 non-null   float64       
 4   Open                  1042 non-null   float64       
 5   log_vol               1042 non-null   float64       
 6   year                  1042 non-null   int32         
 7   month                 1042 non-null   int32         
 8   day                   1042 non-null   int32         
 9   day_of_week           1042 non-null   int32         
 10  is_month_end          1042 non-null   int64         
 11  price_diff            1042 non-null   float64       
 12  pct_diff              1042 non-null   float64       

In [84]:
# 10) Guardar CSV con todas las features
output_path = "../../data/processed/JNJ_clean.csv"
df_features.to_csv(output_path, index=False)
print(f"Features saved to {output_path}")

Features saved to ../../data/processed/JNJ_clean.csv
