In [15]:
#Importaciones
import pandas as pd
import numpy as np
import os
from datetime import datetime
from ta.momentum import RSIIndicator
from ta.trend import MACD, SMAIndicator
from ta.volatility import BollingerBands

In [16]:
# Cargar datos
csv_path = "../../data/raw/JNJ.csv"
df = pd.read_csv(csv_path, parse_dates=["Date"])

In [17]:
# 3) Log-transform de Volume
df["log_vol"] = np.log1p(df["Volume"])

df = df.drop(columns=['Volume'])

In [18]:
# 4) Features de calendario
df['year'] = df['Date'].dt.year
df["month"] = df["Date"].dt.month
df['day'] = df['Date'].dt.day
df["day_of_week"] = df["Date"].dt.dayofweek  
df["is_month_end"] = df["Date"].dt.is_month_end.astype(int)

In [19]:
# 5) Price differences y returns
df["price_diff"] = df["Close"].shift(1) - df["Open"].shift(1)
df["pct_diff"] = df["price_diff"] / df["Open"].shift(1)
df["return_daily"] = df["Close"].pct_change().shift(1)
df["return_lag_1"] = df["return_daily"].shift(1)
df["return_lag_2"] = df["return_daily"].shift(2)
df["return_lag_3"] = df["return_daily"].shift(3)
df["return_lag_4"] = df["return_daily"].shift(4)
df["return_lag_5"] = df["return_daily"].shift(5)


In [20]:
# 6) Rolling statistics
df["sma_5"] = SMAIndicator(close=df["Close"], window=5).sma_indicator().shift(1)
df["rolling_std_return_5"] = df["return_daily"].rolling(window=5).std().shift(1)

In [21]:
# 7) Technical indicators
# RSI(5)
df["RSI_5"] = RSIIndicator(close=df["Close"], window=5).rsi().shift(1)

In [22]:
# MACD & signal
macd = MACD(close=df["Close"])
df["MACD"] = macd.macd().shift(1)
df["MACD_signal"] = macd.macd_signal().shift(1)

In [23]:
# Bollinger Bands (20, 2)
bb = BollingerBands(close=df["Close"], window=5, window_dev=2)
df["bb_middle"] = bb.bollinger_mavg().shift(1)
df["bb_upper"] = bb.bollinger_hband().shift(1)
df["bb_lower"] = bb.bollinger_lband().shift(1)

In [24]:
# 8) Flags y binary features
# Calcular la media y desviación estándar con una ventana deslizante y luego hacer un shift(1) para no usar datos de hoy
df["rolling_mean_vol"] = df["log_vol"].rolling(window=5).mean().shift(1)  # Media de los últimos 5 días, desplazada
df["rolling_std_vol"] = df["log_vol"].rolling(window=5).std().shift(1)    # Desviación estándar de los últimos 5 días, desplazada

# Crear la variable volume_spike
df["volume_spike"] = (
    df["log_vol"] > df["rolling_mean_vol"] + 2 * df["rolling_std_vol"]
).astype(int)

# Create SMA50 indicator
sma_indicator = SMAIndicator(close=df["Close"], window=50)
df["price_above_SMA50"] = (df["Close"] > sma_indicator.sma_indicator()).shift(1).fillna(0).astype(int)

# RSI overbought condition
df["RSI_overbought"] = (df["RSI_5"] > 70).shift(1).fillna(0).astype(int)

# MACD above signal line
df["MACD_above_signal"] = (df["MACD"] > df["MACD_signal"]).shift(1).fillna(0).astype(int)

In [25]:
# 9) Limpiar filas con NaN generadas por cálculos
feature_cols = [
    "log_vol", "day_of_week", "is_month_end", "month",
    "price_diff", "pct_diff", "return_daily", "return_lag_1","return_lag_2","return_lag_3","return_lag_4","return_lag_5",
    "sma_5", "rolling_std_return_5",
    "RSI_5", "MACD", "MACD_signal",
    "bb_middle", "bb_upper", "bb_lower",
    "volume_spike", "price_above_SMA50", "RSI_overbought", "MACD_above_signal"
]
df_features = df.dropna(subset=feature_cols).copy()

In [26]:
# Definir el target: Predecir si el precio de JNJ sube o baja (1: sube, 0: baja)
#Predecir hoy con ayer
#df_features["target"] = (df_features["Close"] > df_features["Close"].shift(1)).astype(int)

# Predecir mañana con hoy
df_features["target"] = (df_features["Close"].shift(-1) > df_features["Close"]).astype(int)

# (1 si Close > Open, 0 si Close <= Open)
#df_features['target'] = np.where(df_features['Close'] > df_features['Open'], 1, 0)

In [27]:
# Información general del DataFrame
print("\nInformación del DataFrame:")
print(df_features.info())


Información del DataFrame:
<class 'pandas.core.frame.DataFrame'>
Index: 3559 entries, 34 to 3592
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Date                  3559 non-null   datetime64[ns]
 1   Close                 3559 non-null   float64       
 2   High                  3559 non-null   float64       
 3   Low                   3559 non-null   float64       
 4   Open                  3559 non-null   float64       
 5   log_vol               3559 non-null   float64       
 6   year                  3559 non-null   int32         
 7   month                 3559 non-null   int32         
 8   day                   3559 non-null   int32         
 9   day_of_week           3559 non-null   int32         
 10  is_month_end          3559 non-null   int64         
 11  price_diff            3559 non-null   float64       
 12  pct_diff              3559 non-null   float64       

In [28]:
# 10) Guardar CSV con todas las features
output_path = "../../data/processed/JNJ_clean.csv"
df_features.to_csv(output_path, index=False)
print(f"Features saved to {output_path}")

Features saved to ../../data/processed/JNJ_clean.csv
