# Configuración del Entorno

Configura el entorno de ejecución con todas las dependencias necesarias para la implementación.

- Instala PyTorch y librerías auxiliares
- Configura el dispositivo (GPU/CPU) para acelerar el entrenamiento
- Importa todos los módulos que se usarán en el proyecto
- `torch.__version__`: Versión de PyTorch (puedes cambiar a una específica)
- `device`: Cambia entre "cuda" (GPU) o "cpu" según disponibilidad
- Prepara el contexto para las celdas siguientes


In [1]:
# BITCOIN TRADING CON M-DQN (PyTorch)
## Implementación basada en el paper de Scientific Reports
#### !python -m venv .venv
# Entorno ya creado
# !source BTCenv/bin/activate
# %pip install gym-trading-env

import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import random
from collections import deque
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Report the installed PyTorch build and whether a CUDA device is visible.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Every model and tensor created later is placed on this device:
# the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

PyTorch version: 2.10.0+cu128
CUDA available: True


# Preprocesamiento de Datos

Prepara los datos históricos de Bitcoin para el entrenamiento del modelo.

- Carga datos CSV de precios de Bitcoin
- Realiza limpieza y transformación de datos
- Genera características técnicas (returns, volatilidad)

### Parámetros:

- `window_size=24`: Tamaño de ventana para características históricas
- `noise_level=0.1`: Nivel de ruido en sentimientos simulados
- Columnas del CSV: Ajustar según formato de tus datos

**_🛑Notas y Apuntes a resolver_**:

- Las características generadas deben tener sentido financiero


In [2]:
class BitcoinPricePreprocessor:
    """Load hourly BTC candles from a CSV and derive technical features.

    ``window_size`` is stored on the instance; the rolling windows used by
    ``create_features`` are fixed (6/12/24 periods) and do not read it.
    """

    def __init__(self, window_size=24):
        self.window_size = window_size

    def load_data(
        self, csv_path: str = "./btc_hourly.csv", skip_first_line: bool = True
    ):
        """Load a CryptoDataDownload-style CSV as a chronological DataFrame.

        Args:
            csv_path: Path of the CSV. It must contain a ``date`` column plus
                eight further columns, which are renamed positionally to
                unix, symbol, open, high, low, close, volume_btc, volume_usd.
            skip_first_line: When True (default) the first physical line is
                skipped before the header row is read. Pass False for files
                that start directly with the header.

        Returns:
            DataFrame indexed by ``date`` in ascending order, restricted to
            rows whose ``close`` is a number greater than zero.
        """
        print(f"Cargando: {csv_path}")

        # Honour the flag; the previous code always skipped one line.
        df = pd.read_csv(
            csv_path,
            skiprows=1 if skip_first_line else 0,
            parse_dates=["date"],
            index_col="date",
        )

        # Positional rename: assumes exactly these eight non-index columns.
        df.columns = [
            "unix",
            "symbol",
            "open",
            "high",
            "low",
            "close",
            "volume_btc",
            "volume_usd",
        ]

        # Unparseable values become NaN and are removed by the filter below.
        numeric_cols = ["open", "high", "low", "close", "volume_btc", "volume_usd"]
        for col in numeric_cols:
            df[col] = pd.to_numeric(df[col], errors="coerce")

        # Drop invalid prices (NaN compares False, so it is dropped too).
        df = df[df["close"] > 0]

        # The source file may be newest-first. pct_change/rolling/shift in
        # create_features are only meaningful on oldest-first data, so the
        # frame is always returned in chronological order.
        df = df.sort_index(kind="stable")

        if df.empty:
            # index[0] would raise on an empty frame.
            print("0 filas cargadas")
        else:
            print(
                f"{len(df)} filas cargadas | {df.index[0].date()} a {df.index[-1].date()}"
            )
        return df

    def create_features(self, df, feature_level="medium"):
        """Return a copy of ``df`` with technical features added.

        The input is expected to be in chronological order with a
        DatetimeIndex (``hour_of_day`` reads ``df.index.hour``).

        Args:
            df: Frame with at least ``close``; ``high``, ``low`` and
                ``volume_usd`` are also read for "medium" and "high".
            feature_level: "medium" or "high" add the extra feature groups;
                any other value yields only the basic features.

        Returns:
            A new DataFrame with the feature columns appended and every row
            containing a NaN removed. The caller's frame is left untouched.
        """
        print(f"Creando caracter√≠sticas ({feature_level} level)...")

        # Work on a copy so the caller's frame is not mutated as a side effect.
        df = df.copy()

        # Basic features (always computed).
        df["returns"] = df["close"].pct_change()
        df["volatility_6h"] = df["returns"].rolling(6).std()
        df["volatility_24h"] = df["returns"].rolling(24).std()
        df["momentum_6h"] = df["close"] / df["close"].shift(6) - 1

        if feature_level in ["medium", "high"]:
            # Medium: range, relative volume, hour of day.
            df["price_range"] = (df["high"] - df["low"]) / df["close"]
            df["volume_ratio"] = df["volume_usd"] / df["volume_usd"].rolling(24).mean()
            df["hour_of_day"] = df.index.hour

            # Simple moving averages and their spread.
            df["sma_12"] = df["close"].rolling(12).mean()
            df["sma_24"] = df["close"].rolling(24).mean()
            df["sma_cross"] = df["sma_12"] - df["sma_24"]

        if feature_level == "high":
            # High: RSI, Bollinger bands, ATR.
            df["rsi"] = self._calculate_rsi(df["close"], period=14)
            df["bb_upper"], df["bb_lower"] = self._bollinger_bands(df["close"])
            df["atr"] = self._average_true_range(df)

        # Rolling windows leave NaNs at the start; remove every incomplete row.
        df.dropna(inplace=True)
        print(f"{len(df.columns)} caracter√≠sticas creadas")
        return df

    def _calculate_rsi(self, prices, period=14):
        """Return the RSI of ``prices`` using simple rolling means of gains and losses."""
        delta = prices.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
        # loss == 0 gives rs == inf (RSI 100); gain == loss == 0 gives NaN.
        rs = gain / loss
        return 100 - (100 / (1 + rs))

    def _bollinger_bands(self, prices, window=20, num_std=2):
        """Return (upper, lower) bands: rolling mean +/- ``num_std`` rolling std."""
        sma = prices.rolling(window).mean()
        std = prices.rolling(window).std()
        upper = sma + (std * num_std)
        lower = sma - (std * num_std)
        return upper, lower

    def _average_true_range(self, df, period=14):
        """Return the rolling mean over ``period`` rows of the true range."""
        high_low = df["high"] - df["low"]
        high_close = np.abs(df["high"] - df["close"].shift())
        low_close = np.abs(df["low"] - df["close"].shift())
        # True range = largest of the three candidate ranges per row.
        ranges = pd.concat([high_low, high_close, low_close], axis=1)
        true_range = ranges.max(axis=1)
        return true_range.rolling(period).mean()

class TwitterSentimentProcessor:
    """Turn a CSV of tweets into an hourly mean VADER sentiment series."""

    def __init__(self):
        self.analyzer = SentimentIntensityAnalyzer()
        # Only these columns are read from the tweet CSV.
        self.relevant_columns = [
            'origen', 'date', 'username', 'user_fullname', 
            'n_replies', 'n_likes', 'n_retweets', 'url', 'text'
        ]

    def clean_text(self, text):
        """Lower-case ``text`` and strip URLs, @mentions and non-letters.

        Returns an empty string for non-string input (e.g. NaN).
        """
        if not isinstance(text, str):
            return ""
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'@\w+', '', text)
        # Keeps only a-z and whitespace: digits, punctuation and emoji are removed.
        text = re.sub(r'[^a-z\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def process_sentiment(self, csv_path):
        """Load tweets, score them with VADER and average the score per hour.

        Pipeline: selective load -> timestamps normalised to naive UTC ->
        text cleaning -> VADER compound score -> hourly mean, rounded to 2 dp.

        Args:
            csv_path: Path of a ``;``-separated CSV containing the columns
                listed in ``self.relevant_columns``.

        Returns:
            Series named ``vader_score`` indexed by hour. Hours without
            tweets are NaN.
        """
        print(f"Leyendo dataset de Twitter: {csv_path}")

        df_sent = pd.read_csv(csv_path, sep=';', usecols=self.relevant_columns, low_memory=False)

        # Convert to UTC first and only then drop the timezone. Stripping the
        # offset directly (tz_localize(None)) keeps the local wall-clock time,
        # which puts offset-bearing tweets in the wrong hourly bucket.
        # utc=True also copes with mixed offsets; naive values are read as UTC.
        # NOTE(review): assumes the price candles are indexed in UTC - confirm.
        df_sent['date'] = pd.to_datetime(
            df_sent['date'], errors='coerce', utc=True
        ).dt.tz_convert(None)

        # Drop rows without a usable date or text.
        df_sent.dropna(subset=['date', 'text'], inplace=True)

        print("Limpiando data")
        df_sent['clean_text'] = df_sent['text'].apply(self.clean_text)

        print("Estimando sentimiento con VADER")
        # Text that is empty after cleaning gets a neutral score of 0.
        df_sent['vader_score'] = df_sent['clean_text'].apply(
            lambda x: self.analyzer.polarity_scores(x)['compound'] if len(x) > 0 else 0
        )

        # Aggregate per hour and round to 2 decimals.
        df_sent.set_index('date', inplace=True)
        hourly_sentiment = df_sent['vader_score'].resample('h').mean().round(2)

        return hourly_sentiment

def load_btc_and_twitter_data(price_file, twitter_file):
    """Build the price + sentiment table and split it chronologically 80/20.

    Args:
        price_file: CSV path passed to ``BitcoinPricePreprocessor.load_data``.
        twitter_file: CSV path passed to
            ``TwitterSentimentProcessor.process_sentiment``.

    Returns:
        ``(train_df, test_df)``: the first 80% and last 20% of the rows,
        oldest first, each with a ``sentiment`` column.
    """
    price_proc = BitcoinPricePreprocessor()
    sent_proc = TwitterSentimentProcessor()

    # Prices: sort oldest-first BEFORE deriving features. On a newest-first
    # file, pct_change/rolling would look into the future and the split
    # below would train on recent data and test on old data.
    df_price = price_proc.load_data(price_file)
    df_price = df_price.sort_index(ascending=True)
    df_price = price_proc.create_features(df_price, feature_level='medium')

    # Hourly sentiment from real tweets.
    hourly_sent = sent_proc.process_sentiment(twitter_file)

    # Align on the price index (left join keeps every price row).
    df = df_price.join(hourly_sent, how='left')
    df.rename(columns={'vader_score': 'sentiment'}, inplace=True)

    # Fill short sentiment gaps linearly (at most 3 consecutive hours).
    df['sentiment'] = df['sentiment'].interpolate(method='linear', limit=3)

    # Drop rows that still miss any value (e.g. no sentiment after interpolation).
    df.dropna(inplace=True)

    # Chronological train/test split.
    train_size = int(len(df) * 0.8)
    train_df = df.iloc[:train_size]
    test_df = df.iloc[train_size:]

    print(f"\nSincronizaci√≥n finalizada:")
    print(f"- Registros totales: {len(df)}")
    print(f"- Sentimiento promedio: {df['sentiment'].mean():.4f}")

    return train_df, test_df

# Funci√≥n de carga de datos
# def load_btc_data_for_training(file_path="btc_hourly.csv", feature_level="medium"):
#     """Carga y prepara datos para entrenamiento"""
#     preprocessor = BitcoinPricePreprocessor()

#     # Cargar
#     df = preprocessor.load_data(file_path)

#     # Crear caracter√≠sticas
#     df = preprocessor.create_features(df, feature_level=feature_level)

#     # Estad√≠sticas
#     print("\tResumen:")
#     print(f"\tFilas: {len(df)}")
#     print(f"\tColumnas: {len(df.columns)}")
#     print(f"\tPrecio inicial: ${df['close'].iloc[0]:.2f}")
#     print(f"\tPrecio final: ${df['close'].iloc[-1]:.2f}")
#     print(f"\tReturns promedio: {df['returns'].mean():.6f}")
#     print(f"\tVolatilidad promedio: {df['volatility_24h'].mean():.6f}")

#     return df, preprocessor


In [3]:
# Step-by-step run of the data pipeline: prices -> features -> sentiment -> join.
price_proc = BitcoinPricePreprocessor()
sent_proc = TwitterSentimentProcessor()

path_btc = 'data/raw/bitcoin_price/btc_hourly.csv'
path_twitter = 'data/raw/twitter/engtweetsbtc_clean.csv'

# Load the candles, put them in chronological order, then derive the
# technical features (create_features also drops the rolling-window NaNs).
df_btc = price_proc.create_features(
    price_proc.load_data(path_btc).sort_index(ascending=True),
    feature_level='medium',
)

# Hourly mean VADER score from the tweet dataset (timezone-naive index).
hourly_sentiment = sent_proc.process_sentiment(path_twitter)

# Left-join on the price index and expose the score as 'sentiment'.
df_final = df_btc.join(hourly_sentiment, how='left').rename(
    columns={'vader_score': 'sentiment'}
)

# Bridge sentiment gaps of up to 3 hours; rows that still have a NaN
# afterwards (hours with no nearby tweets) are removed.
df_final['sentiment'] = df_final['sentiment'].interpolate(method='linear', limit=3)
df_final = df_final.dropna()

# Show the result.
cols_to_show = ['close', 'returns', 'volatility_24h', 'sma_cross', 'sentiment']
print("\n--- Vista previa del Dataset Sincronizado (Primeras filas) ---")
display(df_final[cols_to_show].head(10))

print("\n--- Estad√≠sticas del Sentimiento ---")
print(df_final['sentiment'].describe())

Cargando: data/raw/bitcoin_price/btc_hourly.csv
90608 filas cargadas | 2026-02-07 a 2015-10-08
Creando caracter√≠sticas (medium level)...
18 caracter√≠sticas creadas
Leyendo dataset de Twitter: data/raw/twitter/engtweetsbtc_clean.csv
Limpiando data
Estimando sentimiento con VADER

--- Vista previa del Dataset Sincronizado (Primeras filas) ---


Unnamed: 0_level_0,close,returns,volatility_24h,sma_cross,sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-10-09 13:00:00,244.0,0.001519,0.001265,-0.272917,0.06
2015-10-09 14:00:00,244.0,0.0,0.001265,-0.197917,0.0
2015-10-09 15:00:00,244.0,0.0,0.001265,-0.12625,0.13
2015-10-09 16:00:00,244.0,0.0,0.00114,-0.0825,0.22
2015-10-09 17:00:00,243.7,-0.00123,0.000958,-0.020417,0.0
2015-10-09 18:00:00,243.7,0.0,0.000505,0.000417,0.0
2015-10-09 19:00:00,244.06,0.001477,0.000594,0.03625,0.26
2015-10-09 20:00:00,249.97,0.024215,0.004976,0.318333,0.02
2015-10-09 21:00:00,245.39,-0.018322,0.006354,0.4075,0.0
2015-10-09 22:00:00,245.39,0.0,0.006354,0.496667,0.13



--- Estad√≠sticas del Sentimiento ---
count    49230.000000
mean         0.068449
std          0.061968
min         -0.540000
25%          0.020000
50%          0.070000
75%          0.110000
max          0.670000
Name: sentiment, dtype: float64


# Entorno de Trading

Crea un simulador de mercado de Bitcoin donde el agente puede operar.

- Simula un broker con balance, posiciones y comisiones
- Define estados observables (precio, posición, balance)
- Implementa acciones (comprar, vender, mantener)
- Calcula recompensas basadas en profit y penalizaciones
- Entorno controlado para entrenamiento RL
- Permite probar estrategias sin riesgo real
- Implementa restricciones del mundo real (comisiones)

### Parámetros:

- `initial_balance=10000`: Capital inicial para trading
- `fee=0.0015`: Comisión por transacción (0.15% como paper)
- Acciones disponibles: Puedes añadir más (ej: short selling)
- Las penalizaciones por inactividad ayudan a evitar overfitting. Penaliza con -0.001 por inactividad o acción "HOLD"

**_🛑Notas y Apuntes a resolver_**:

- El entorno sigue la interfaz estándar de Gymnasium


In [7]:
# class BitcoinTradingEnv:
#     """Entorno personalizado para trading de Bitcoin"""

#     def __init__(self, price_data, sentiment_data=None, initial_balance=10_000):
#         self.price_data = price_data
#         self.sentiment_data = sentiment_data
#         self.initial_balance = initial_balance
#         self.reset()

#     def reset(self):
#         self.current_step = 0
#         self.balance = self.initial_balance
#         self.bitcoin_held = 0
#         self.total_profit = 0
#         self.trades = []
#         self.current_price = self.price_data["close"].iloc[self.current_step]

#         return self._get_state()

#     def _get_state(self):
#         # Estado: [balance_ratio, bitcoin_ratio, returns, sentiment]
#         balance_ratio = self.balance / (self.initial_balance + 1e-8)
#         bitcoin_value = self.bitcoin_held * self.current_price
#         total = self.balance + bitcoin_value
#         bitcoin_ratio = bitcoin_value / (total + 1e-8)

#         state = [balance_ratio, bitcoin_ratio]

#         # A√±adir datos hist√≥ricos si est√°n disponibles
#         if self.current_step > 0:
#             prev_price = self.price_data["close"].iloc[self.current_step - 1]
#             state.append((self.current_price - prev_price) / prev_price)
#         else:
#             state.append(0)

#         # A√±adir sentimiento si existe
#         if self.sentiment_data is not None:
#             state.append(self.sentiment_data.iloc[self.current_step])
#         else:
#             state.append(0)

#         return np.array(state, dtype=np.float32)

#     def step(self, action: int) -> tuple[np.ndarray, float, bool, dict]:
#         # action: 0=hold, 1=buy, 2=sell
#         self.current_price = self.price_data["close"].iloc[self.current_step]

#         reward = 0
#         fee = 0.0015  # 0.15% como en el paper

#         if action == 1:  # BUY
#             if self.balance > 0:
#                 cost = self.current_price * (1 + fee)
#                 max_bitcoin = self.balance / cost
#                 self.bitcoin_held += max_bitcoin
#                 self.balance = 0
#                 self.trades.append(("buy", self.current_step, self.current_price))

#         elif action == 2:  # SELL
#             if self.bitcoin_held > 0:
#                 revenue = self.bitcoin_held * self.current_price * (1 - fee)
#                 self.balance += revenue
#                 profit = revenue - (self.bitcoin_held * self.current_price)
#                 reward += profit
#                 self.total_profit += profit
#                 self.bitcoin_held = 0
#                 self.trades.append(("sell", self.current_step, self.current_price))

#         # Penalizaci√≥n por inactividad (como en el paper)
#         if action == 0:
#             if len(self.trades) > 0 and self.trades[-1][0] == "hold":
#                 reward -= 0.001
#             self.trades.append(("hold", self.current_step, self.current_price))

#         # Siguiente paso
#         self.current_step += 1
#         next_state = self._get_state()
#         done = self.current_step >= len(self.price_data) - 1

#         return next_state, reward, done, {}

class BitcoinTradingEnv:
    """All-in / all-out Bitcoin trading simulator driven by a feature table.

    Actions: 0 = hold, 1 = buy with the whole cash balance, 2 = sell the
    whole position. ``reset`` returns the observation only and ``step``
    returns ``(observation, reward, done, info)``.
    """

    def __init__(self, df, initial_balance=10_000, fee=0.0015):
        """
        Args:
            df: Table with ``close``, ``returns`` and ``sentiment`` columns,
                one row per time step, oldest first.
            initial_balance: Starting cash.
            fee: Proportional commission charged on every buy and sell
                (default 0.0015 = 0.15%).
        """
        self.df = df
        self.initial_balance = initial_balance
        self.fee = fee
        self.reset()

    @property
    def price_data(self):
        """Alias of ``df`` for callers that read ``env.price_data``."""
        return self.df

    def reset(self):
        """Restore the initial portfolio, rewind to row 0 and return the state."""
        self.current_step = 0
        self.balance = self.initial_balance
        self.bitcoin_held = 0
        self.total_profit = 0
        self.trades = []
        self.current_price = self.df["close"].iloc[self.current_step]

        # Quoted price of the open position (without fee).
        self.average_buy_price = 0
        # Cash actually spent to open the position (fee included).
        self.position_cost = 0

        return self._get_state()

    def _get_state(self):
        """Return the observation for ``current_step`` as a float32 vector.

        Layout: [balance_ratio, bitcoin_ratio, returns, sentiment], where
        balance_ratio is cash / initial balance and bitcoin_ratio is the
        share of the portfolio value currently held in Bitcoin.
        """
        bitcoin_value = self.bitcoin_held * self.current_price
        total_portfolio = self.balance + bitcoin_value
        # The 1e-8 terms guard against division by zero.
        balance_ratio = self.balance / (self.initial_balance + 1e-8)
        bitcoin_ratio = bitcoin_value / (total_portfolio + 1e-8)

        # Market features are read straight from the table.
        current_returns = self.df["returns"].iloc[self.current_step]
        current_sentiment = self.df["sentiment"].iloc[self.current_step]

        state = [balance_ratio, bitcoin_ratio, current_returns, current_sentiment]

        return np.array(state, dtype=np.float32)

    def step(self, action: int):
        """Execute ``action`` at the current row's close and advance one row.

        Returns:
            ``(next_state, reward, done, info)``. The reward is the realised
            profit of a sell (proceeds after fee minus the cash spent on the
            buy), -0.001 for a hold that directly follows another hold, and
            0 otherwise. ``done`` becomes True on reaching the last row.
        """
        # action: 0=hold, 1=buy, 2=sell
        self.current_price = self.df["close"].iloc[self.current_step]

        reward = 0
        fee = self.fee

        if action == 1:  # BUY
            if self.balance > 0:
                cost = self.current_price * (1 + fee)
                max_bitcoin = self.balance / cost

                self.bitcoin_held += max_bitcoin
                self.average_buy_price = self.current_price
                # The whole balance is spent, fee included. Using price * qty
                # as the cost basis would overstate every realised profit by
                # the buy-side fee.
                self.position_cost = self.balance
                self.balance = 0

                self.trades.append(("buy", self.current_step, self.current_price))

        elif action == 2:  # SELL
            if self.bitcoin_held > 0:
                revenue = self.bitcoin_held * self.current_price * (1 - fee)

                # Realised profit = cash received - cash originally spent.
                profit = revenue - self.position_cost

                self.balance += revenue
                reward += profit
                self.total_profit += profit
                self.bitcoin_held = 0
                self.position_cost = 0

                self.trades.append(("sell", self.current_step, self.current_price))

        # Inactivity penalty: applies only to consecutive holds.
        if action == 0:
            if len(self.trades) > 0 and self.trades[-1][0] == "hold":
                reward -= 0.001
            self.trades.append(("hold", self.current_step, self.current_price))

        # Advance time.
        self.current_step += 1

        # The episode ends on reaching the last row of the table.
        done = self.current_step >= len(self.df) - 1

        # Refresh the price whenever the new row exists, including the final
        # one, so the terminal state and the final portfolio valuation use
        # the same row as the returned returns/sentiment.
        if self.current_step < len(self.df):
            self.current_price = self.df["close"].iloc[self.current_step]

        next_state = self._get_state()

        return next_state, reward, done, {}

# Arquitectura de Redes Neuronales

Define las arquitecturas de las redes DQN para cada módulo del sistema.

- Trade-DQN: Toma precios → recomienda acciones (buy/hold/sell)
- Predictive-DQN: Toma precio+sentimiento → predice cambio porcentual
- Main-DQN: Combina outputs anteriores → decisión final

### Parámetros:

- Dimensiones de capas (64, 32, etc.)
- Funciones de activación (ReLU, LeakyReLU)
- Número de neuronas por capa
- Dropout rates para regularización

**_🛑Notas y Apuntes a resolver_**:

- Las arquitecturas siguen las descritas en el paper
- Predictive-DQN tiene 20001 salidas para -100 a +100 con 2 decimales


In [8]:
# 4.1 DQN base
class TradeDQN(nn.Module):
    """MLP mapping one input feature to 3 action values (1 -> 64 -> 32 -> 8 -> 3)."""

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(1, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 8),
            nn.ReLU(),
            nn.Linear(8, 3),
        )

    def forward(self, x):
        """Return action values of shape (batch, 3), or (3,) for a 1-D input.

        ``nn.Flatten`` flattens from dim 1 and raises on a 1-D tensor, so an
        unbatched input is given a temporary batch dimension.
        """
        unbatched = x.dim() == 1
        if unbatched:
            x = x.unsqueeze(0)
        out = self.linear_relu_stack(self.flatten(x))
        return out.squeeze(0) if unbatched else out


# 4.3 Predictive-DQN
class PredictiveDQN(nn.Module):
    """MLP mapping 2 input features to 20001 outputs (2 -> 64 -> 64 -> 64 -> 20001)."""

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(2, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 20001),  # -100 to +100 with 2 decimals
        )

    def forward(self, x):
        """Return values of shape (batch, 20001), or (20001,) for a 1-D input.

        ``nn.Flatten`` flattens from dim 1 and raises on a 1-D tensor, so an
        unbatched input is given a temporary batch dimension.
        """
        unbatched = x.dim() == 1
        if unbatched:
            x = x.unsqueeze(0)
        out = self.linear_relu_stack(self.flatten(x))
        return out.squeeze(0) if unbatched else out


class MainDQN(nn.Module):
    """MLP mapping 2 input features to 3 action values (2 -> 64 -> 64 -> 64 -> 3)."""

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(2, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 3),
        )

    def forward(self, x):
        """Return action values of shape (batch, 3), or (3,) for a 1-D input.

        ``nn.Flatten`` flattens from dim 1 and raises on a 1-D tensor, so an
        unbatched input is given a temporary batch dimension.
        """
        unbatched = x.dim() == 1
        if unbatched:
            x = x.unsqueeze(0)
        out = self.linear_relu_stack(self.flatten(x))
        return out.squeeze(0) if unbatched else out

# Agente DQN

Implementa el algoritmo de aprendizaje por refuerzo con técnicas de estabilización.

- Experience Replay: Almacena y muestrea experiencias pasadas
- Target Network: Red separada para calcular targets estables
- ε-greedy con decaimiento
- Optimización con Adam y MSE loss
- Evita correlaciones en datos secuenciales

### Parámetros

- `lr=0.001`: Tasa de aprendizaje
- `gamma=0.95`: Factor de descuento de recompensas futuras
- `epsilon_decay=0.995`: Velocidad de reducción de exploración
- `memory_size=10000`: Tamaño del buffer de experiencias

**_🛑Notas y Apuntes a resolver_**:

- Experience replay es clave para convergencia estable


In [9]:
class DQNAgent:
    """Epsilon-greedy agent that chains the Trade, Predictive and Main DQNs.

    The Trade and Predictive networks read the observation
    ``(price_feature, sentiment)``. Their greedy outputs form ``main_state``,
    the input of the Main network, which picks the final action.

    Replay-memory contract:
        state:      (price_feature, sentiment, trade_action, predictive_action)
                    A 2-tuple is also accepted; the two sub-actions are then
                    recomputed from the current networks.
        next_state: (price_feature, sentiment); extra trailing columns are
                    ignored for the Trade and Predictive inputs.

    NOTE(review): all three networks are trained with the single environment
    reward. The Trade network is updated on the executed action and the
    Predictive network on the bin it selected - confirm this matches the
    intended per-module rewards of the paper.
    """

    def __init__(self, lr: float = 0.001, gamma: float = 0.95):
        self.gamma = gamma
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995

        self.trade_model = TradeDQN().to(device)
        self.predictive_model = PredictiveDQN().to(device)
        self.main_model = MainDQN().to(device)

        # Separate copy of the main network used only for bootstrap targets;
        # it changes exclusively through update_target().
        self.target_model = MainDQN().to(device)
        self.target_model.load_state_dict(self.main_model.state_dict())
        self.target_model.eval()

        self.trade_optimizer = optim.Adam(self.trade_model.parameters(), lr=lr)
        self.predictive_optimizer = optim.Adam(
            self.predictive_model.parameters(), lr=lr
        )
        self.main_optimizer = optim.Adam(self.main_model.parameters(), lr=lr)

        # (trade_action, predictive_action) for the last observation given to act().
        self.main_state: tuple[int, int] = (0, 0)
        self.memory = deque(maxlen=10_000)
        self.batch_size = 64

    def remember(
        self,
        state: tuple[float, float, int, int],
        action: int,
        reward: float,
        next_state: tuple[float, float],
        done: bool,
    ) -> None:
        """Append one transition to the replay memory (oldest entries are evicted)."""
        self.memory.append((state, action, reward, next_state, done))

    def _sub_actions(self, observations: torch.Tensor) -> torch.Tensor:
        """Return the greedy (trade, predictive) outputs per row as float32 (batch, 2).

        Computed without gradients: argmax is not differentiable, so these
        values are only ever used as inputs of the Main network.
        """
        with torch.no_grad():
            trade = self.trade_model(observations[:, 0:1]).argmax(dim=1)
            predictive = self.predictive_model(observations[:, 0:2]).argmax(dim=1)
        return torch.stack((trade, predictive), dim=1).to(torch.float32)

    def act(self, state: tuple[float, float]) -> int:
        """Choose an action (0, 1 or 2) for ``state`` = (price_feature, sentiment).

        ``main_state`` is refreshed on every call, including exploratory
        ones, so it always describes the observation that was just acted on.
        """
        # Batched (1, 2) input: the networks flatten from dim 1.
        observation = torch.tensor(
            [[float(state[0]), float(state[1])]], dtype=torch.float32, device=device
        )
        main_input = self._sub_actions(observation)
        self.main_state = (
            int(main_input[0, 0].item()),
            int(main_input[0, 1].item()),
        )

        if np.random.rand() <= self.epsilon:
            return random.randrange(3)  # buy, sell, hold

        with torch.no_grad():
            q_values: torch.Tensor = self.main_model(main_input)
        return int(q_values.argmax(dim=1).item())

    def replay(self) -> None:
        """Run one TD(0) update of the three networks on a random minibatch.

        Does nothing until the memory holds at least ``batch_size``
        transitions. Epsilon is decayed once per performed update.
        """
        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = torch.tensor(np.array(states, dtype=np.float32), device=device)
        next_states = torch.tensor(
            np.array(next_states, dtype=np.float32), device=device
        )
        # gather() requires an int64 index.
        actions = torch.tensor(actions, dtype=torch.int64, device=device).unsqueeze(1)
        rewards = torch.tensor(rewards, dtype=torch.float32, device=device)
        # Float mask: arithmetic such as 1 - x is not defined for bool tensors.
        not_done = 1.0 - torch.tensor(dones, dtype=torch.float32, device=device)

        # Main-network input: stored sub-actions when present, else recomputed.
        if states.shape[1] >= 4:
            main_input = states[:, 2:4]
        else:
            main_input = self._sub_actions(states)
        predictive_actions = main_input[:, 1].to(torch.int64).unsqueeze(1)

        # Current Q-values of the actions actually taken (differentiable).
        self.trade_model.train()
        self.predictive_model.train()
        self.main_model.train()
        trade_q_values = self.trade_model(states[:, 0:1]).gather(1, actions).squeeze(1)
        predictive_q_values = (
            self.predictive_model(states[:, 0:2])
            .gather(1, predictive_actions)
            .squeeze(1)
        )
        main_q_values = self.main_model(main_input).gather(1, actions).squeeze(1)

        # Bootstrap targets, using each sample's own reward.
        with torch.no_grad():
            trade_next_q_values = self.trade_model(next_states[:, 0:1]).max(1)[0]
            predictive_next_q_values = self.predictive_model(next_states[:, 0:2]).max(
                1
            )[0]
            main_next_q_values = self.target_model(
                self._sub_actions(next_states)
            ).max(1)[0]
            trade_target_q = rewards + not_done * self.gamma * trade_next_q_values
            predictive_target_q = (
                rewards + not_done * self.gamma * predictive_next_q_values
            )
            main_target_q = rewards + not_done * self.gamma * main_next_q_values

        # Losses
        main_loss = F.mse_loss(main_q_values, main_target_q)
        trade_loss = F.mse_loss(trade_q_values, trade_target_q)
        predictive_loss = F.mse_loss(predictive_q_values, predictive_target_q)

        # Optimisation: clear stale gradients, backpropagate, step.
        for optimizer, loss in (
            (self.main_optimizer, main_loss),
            (self.trade_optimizer, trade_loss),
            (self.predictive_optimizer, predictive_loss),
        ):
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Epsilon decay
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def update_target(self):
        """Copy the main network's weights into the target network."""
        self.target_model.load_state_dict(self.main_model.state_dict())

# Entrenamiento del sistema MDQN

Orquesta el ciclo completo de entrenamiento del modelo.

- Loop sobre episodios de entrenamiento
- Interacción agente-entorno paso a paso
- Registro de métricas de rendimiento (Contra overfitting)

### Parámetros

- `episodes=100`: Número de episodios de entrenamiento
- `target_update_freq=400`: Frecuencia (en episodios) de actualización de target network
- Batch size: Tamaño del minibatch para training


In [10]:
def train_mdqn(env, agent, episodes=100, target_update_freq=400):
    """Train ``agent`` on ``env`` for ``episodes`` passes over the data.

    Args:
        env: Environment exposing ``reset()``, ``step(action)``,
            ``total_profit`` and its table as ``df`` (or ``price_data``).
        agent: Agent exposing ``act``, ``remember``, ``replay``,
            ``update_target``, ``main_state`` and ``epsilon``.
        episodes: Number of episodes to run.
        target_update_freq: ``update_target()`` is called after every
            episode whose index is a multiple of this value.
            NOTE(review): with the defaults (100 episodes, 400) that is
            only episode 0 - confirm the intended unit (episodes vs. steps).

    Returns:
        ``(rewards_history, profit_history)``: total reward and the env's
        ``total_profit`` per episode.
    """
    rewards_history = []
    profit_history = []

    # The environment stores its table as `df`; `price_data` is the older name.
    table = getattr(env, "df", None)
    if table is None:
        table = env.price_data
    max_steps = len(table) - 1

    for episode in range(episodes):
        state = env.reset()
        # The agent observes the first and last state entries.
        # NOTE(review): with BitcoinTradingEnv these are balance_ratio and
        # sentiment, while the agent names the first one `bitcoin_price` -
        # confirm which entries the networks are meant to see.
        model_state = (state[0], state[-1])
        total_reward = 0

        for _ in range(max_steps):
            action = agent.act(model_state)
            next_state, reward, done, _ = env.step(action)
            next_model_state = (next_state[0], next_state[-1])

            # Store the observation the action was taken on, plus the two
            # sub-model outputs the agent recorded for it (main_state has
            # exactly two entries).
            state_for_memory = (
                model_state[0],
                model_state[1],
                agent.main_state[0],
                agent.main_state[1],
            )
            agent.remember(state_for_memory, action, reward, next_model_state, done)
            agent.replay()

            model_state = next_model_state
            total_reward += reward

            if done:
                break

        # Sync the target network.
        if episode % target_update_freq == 0:
            agent.update_target()

        rewards_history.append(total_reward)
        profit_history.append(env.total_profit)

        print(
            f"Episode {episode + 1}/{episodes} | "
            f"Reward: {total_reward:.2f} | "
            f"Profit: {env.total_profit:.2f} | "
            f"Epsilon: {agent.epsilon:.3f}"
        )

    return rewards_history, profit_history

# Evaluación y Métricas Financieras

Evalúa el rendimiento del modelo entrenado con métricas financieras estándar.

- ROI (Return on Investment): Ganancia porcentual sobre capital
- Sharpe Ratio: Retorno ajustado por riesgo
- Número de trades: Frecuencia de operaciones
- Balance final: Capital total al final del periodo
- Cuantifica el éxito de la estrategia
- Permite comparar con segunda versión del modelo

### Parámetros

- Período de evaluación (train/test split)
- Métricas adicionales (Sortino Ratio, Max Drawdown)
- Umbrales de riesgo para Sharpe Ratio

**_🛑Notas y Apuntes a resolver_**:

- ROI alto ≠ buena estrategia (puede ser muy riesgosa)
- Sharpe Ratio > 1 generalmente considerado bueno
- Las métricas deben evaluarse en conjunto


In [11]:
def calculate_metrics(env, agent, test_data):
    """Summarise the environment's current portfolio as financial metrics.

    Args:
        env: Environment whose ``initial_balance``, ``balance``,
            ``bitcoin_held``, ``current_price`` and ``trades`` are read.
        agent: Not used by this function.
        test_data: Table with a ``close`` column.

    Returns:
        Dict with "ROI" (percent), "Sharpe Ratio", "Final Balance" and
        "Number of Trades" (buy and sell entries only).

    NOTE(review): the Sharpe ratio is derived from the close-to-close
    returns of ``test_data`` itself, not from the portfolio held by ``env``.
    """
    # Portfolio value: cash plus the open position marked at the current price.
    start_capital = env.initial_balance
    end_capital = env.balance + env.bitcoin_held * env.current_price
    roi_percent = ((end_capital - start_capital) / start_capital) * 100

    # Simple period-over-period returns of the close series.
    closes = test_data["close"].to_numpy()
    previous, current = closes[:-1], closes[1:]
    period_returns = (current - previous) / previous

    # Simplified Sharpe ratio, annualised for hourly data (365 * 24 periods).
    annualisation = np.sqrt(365 * 24)
    sharpe = np.mean(period_returns) / (np.std(period_returns) + 1e-8) * annualisation

    # Holds are logged in env.trades as well; only real orders are counted.
    executed_orders = sum(entry[0] in ("buy", "sell") for entry in env.trades)

    return {
        "ROI": roi_percent,
        "Sharpe Ratio": sharpe,
        "Final Balance": end_capital,
        "Number of Trades": executed_orders,
    }

# Ejecución Principal

Flujo completo del sistema con datos simulados/reales.

1. Carga y preprocesa datos de Bitcoin
2. Entrena Trade-DQN
3. Evalúa en conjunto de test
4. Visualiza resultados

### Parámetros

- Ruta del archivo CSV de datos
- Capital inicial para simulación
- División train/test (80/20 por defecto)
- Hiperparámetros de entrenamiento

**_🛑Notas y Apuntes a resolver_**:

- Ajustar parámetros según tus datos disponibles: btc_hourly.csv
- Guardar modelos entrenados para uso en entorno real


In [None]:
# Load prices and real Twitter sentiment, already joined and split 80/20.
# The environment reads a `sentiment` column, so the price-only table is not
# enough, and no TwitterSentimentSimulator class exists in this notebook.
# NOTE(review): paths follow the data-preparation cell - adjust if needed.
train_data, test_data = load_btc_and_twitter_data(
    "data/raw/bitcoin_price/btc_hourly.csv",
    "data/raw/twitter/engtweetsbtc_clean.csv",
)

# Train the agent.
print("Entrenando TradeDQN")
env = BitcoinTradingEnv(train_data, initial_balance=10_000)
agent = DQNAgent()  # adjust to the state layout if it changes
rewards, profits = train_mdqn(env, agent, episodes=50)

# Plot the training curves.
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(rewards)
plt.title("Recompensas por episodio")
plt.subplot(1, 2, 2)
plt.plot(profits)
plt.title("Profit acumulado")
plt.show()

# Evaluate: run the trained policy greedily over the test period first.
# Metrics taken from a freshly created environment would only describe the
# untouched initial balance (ROI 0, no trades).
test_env = BitcoinTradingEnv(test_data, initial_balance=10000)
saved_epsilon = agent.epsilon
agent.epsilon = 0.0  # act() explores when rand() <= epsilon; 0 disables it
state = test_env.reset()
done = False
while not done:
    # Same observation layout as in training: first and last state entries.
    action = agent.act((state[0], state[-1]))
    state, _, done, _ = test_env.step(action)
agent.epsilon = saved_epsilon

metrics = calculate_metrics(test_env, agent, test_data)
print("M√©tricas finales:")
for key, value in metrics.items():
    print(f"{key}: {value:.4f}")