# Entrenamiento Mejorado de Modelos Caria

Este notebook entrena los modelos mejorados con:
- **Quality Model**: Percentiles por fecha (identifica outliers incluso en mala economía)
- **Valuation Model**: DCF/Múltiplos; el objetivo previsto es el precio futuro a 5 años (por ahora el entrenamiento usa la columna `target` como proxy de retorno futuro)
- **Momentum Model**: Features mejoradas (volumen, SMAs 200/50, RSI)

Incluye descarga de datos FRED (macro, commodities, currencies) y feature engineering completo.


## 1. Instalar Dependencias


In [None]:
# Install the required dependencies (IPython shell escape; -q keeps pip quiet)
!pip install -q pandas numpy scikit-learn xgboost lightgbm joblib pyarrow pyyaml fredapi python-dotenv


## 2. Montar Google Drive y Configurar Rutas


In [None]:
from google.colab import drive
import os
from pathlib import Path

# Mount Google Drive so the notebook can read its inputs and persist outputs.
drive.mount('/content/drive')

# Folder inside Drive that holds the project data.
# ADJUST THIS PATH to wherever the files were uploaded.
DRIVE_BASE_PATH = '/content/drive/MyDrive/caria_data'  # Change this to your own path

# Local working directory on the Colab VM.
BASE_DIR = Path('/content/caria_workspace')
BASE_DIR.mkdir(exist_ok=True)

# Sub-directories that the later cells write into.
for relative_parts in (
    ('data', 'gold'),
    ('data', 'silver', 'macro'),
    ('models',),
    ('artifacts', 'models'),
):
    BASE_DIR.joinpath(*relative_parts).mkdir(parents=True, exist_ok=True)

print(f"✓ Estructura creada en: {BASE_DIR}")

# Report whether the configured Drive folder is actually present.
if not os.path.exists(DRIVE_BASE_PATH):
    print(f"⚠ Ruta Drive no encontrada: {DRIVE_BASE_PATH}")
    print("Por favor ajusta DRIVE_BASE_PATH a la ubicación de tus datos")
else:
    print(f"✓ Ruta Drive encontrada: {DRIVE_BASE_PATH}")


## 3. Configurar FRED API Key

Necesitas una API key gratuita de FRED: https://fred.stlouisfed.org/docs/api/api_key.html


In [None]:
# Configure the FRED API key here.
# Get a free one at: https://fred.stlouisfed.org/docs/api/api_key.html
# Prefer exporting FRED_API_KEY in the environment so the key is never saved
# inside the notebook; the literal below is only needed when it is not exported.
FRED_API_KEY = ""  # Paste your API key here (optional if the env var is set)

# Fall back to a key that is already present in the environment.
if not FRED_API_KEY:
    FRED_API_KEY = os.environ.get("FRED_API_KEY", "")

if not FRED_API_KEY:
    # The key lives in THIS cell, so point the user here.
    print("⚠ Por favor configura FRED_API_KEY en esta celda")
else:
    print("✓ FRED API Key configurada")
    os.environ["FRED_API_KEY"] = FRED_API_KEY


## 4. Descargar Datos FRED (Macro, Commodities, Currencies)


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from fredapi import Fred
import warnings
warnings.filterwarnings('ignore')

# FRED series to download (macro set expanded with commodities and currencies).
# Maps FRED series id -> human-readable label; the label is only used in the
# progress messages printed while downloading.
# NOTE(review): some ids (e.g. MANPMI, GOLDAMGBD228NLBM) may no longer be served
# by FRED — confirm. A series that fails to download is reported and skipped.
FRED_SERIES = {
    # Macro
    "GDPC1": "GDP Real",
    "CPIAUCSL": "CPI",
    "UNRATE": "Unemployment Rate",
    "FEDFUNDS": "Fed Funds Rate",
    "DGS10": "10Y Treasury",
    "DGS2": "2Y Treasury",
    "MANPMI": "PMI Manufacturing",
    "UMCSENT": "Consumer Sentiment",
    # Commodities - Metals
    "GOLDAMGBD228NLBM": "Gold",
    "PSLVAMUSD": "Silver",
    "PCOPPUSDM": "Copper",
    "PNICKUSDM": "Nickel",
    "PALUMUSDM": "Aluminum",
    # Commodities - Energy
    "DCOILWTICO": "Crude Oil WTI",
    # NOTE(review): PNRGINDEXM looks like FRED's global *energy price index*,
    # not a natural-gas price — confirm the id or the label.
    "PNRGINDEXM": "Natural Gas",
    "DHOILNYH": "Heating Oil",
    # Commodities - Agricultural
    "PWHEAMTUSDM": "Wheat",
    "PSOYABUSDM": "Soybeans",
    "PCOFFOTMUSDM": "Coffee",
    "PSUGAISAUSDM": "Sugar",
    # Currencies
    # NOTE(review): quote direction differs between DEXUS** and DEX**US ids;
    # verify the labels before interpreting the sign of currency features.
    "DEXUSEU": "EUR/USD",
    "DEXCHUS": "CNY/USD",
    "DEXJPUS": "JPY/USD",
    "DEXUSUK": "GBP/USD",
    "DEXCAUS": "CAD/USD",
    "DEXMXUS": "MXN/USD",
    "DTWEXBGS": "Dollar Index Broad",
    "DTWEXEMEGS": "Dollar Index EM",
    # Credit spreads
    "BAA10Y": "BAA-10Y Spread",
    "AAA": "AAA Corporate Yield",
    "BAA": "BAA Corporate Yield",
}

def download_fred_series(fred, series_id, start_date="1900-01-01"):
    """Fetch one FRED series as a date-sorted two-column DataFrame.

    Args:
        fred: Client exposing ``get_series(series_id, observation_start=...)``.
        series_id: FRED series identifier; also used as the value column name.
        start_date: First observation date to request.

    Returns:
        DataFrame with a ``date`` column and a ``<series_id>`` column, sorted by
        date with a fresh 0..n-1 index. An empty DataFrame is returned when the
        series has no observations or when the download fails (the error is
        printed, not raised).
    """
    try:
        observations = fred.get_series(series_id, observation_start=start_date)
        if observations.empty:
            return pd.DataFrame()
        frame = pd.DataFrame({"date": observations.index, series_id: observations.values})
        frame = frame.assign(date=pd.to_datetime(frame["date"]))
        frame = frame.sort_values("date")
        return frame.reset_index(drop=True)
    except Exception as e:
        # Best effort: one bad series must not abort the whole download loop.
        print(f"  [ERROR] {series_id}: {e}")
        return pd.DataFrame()

# Download every configured FRED series.
print("Descargando datos desde FRED...")
fred = Fred(api_key=FRED_API_KEY)

all_dataframes = []
for series_id, description in FRED_SERIES.items():
    print(f"  Descargando {series_id} ({description})...")
    df = download_fred_series(fred, series_id)
    if df.empty:
        print(f"    ⚠ Sin datos disponibles")
        continue
    all_dataframes.append(df)
    print(f"    ✓ {len(df)} observaciones desde {df['date'].min()} hasta {df['date'].max()}")

if not all_dataframes:
    print("⚠ No se descargaron datos FRED")
else:
    # Outer-join all series on their observation date.
    merged, *remaining = all_dataframes
    for df in remaining:
        merged = merged.merge(df, on="date", how="outer")

    merged = merged.sort_values("date").reset_index(drop=True)

    # Expand to one row per calendar day and carry the last observation forward.
    date_range = pd.date_range(start=merged['date'].min(), end=merged['date'].max(), freq='D')
    merged_daily = (
        pd.DataFrame(index=date_range)
        .join(merged.set_index('date'), how='left')
        .ffill()  # .ffill() rather than the deprecated fillna(method='ffill')
        .reset_index()
        .rename(columns={'index': 'date'})
    )

    # Persist in the local workspace.
    output_path = BASE_DIR / 'data' / 'silver' / 'macro' / 'fred_data.parquet'
    merged_daily.to_parquet(output_path, index=False)
    print(f"\n✓ Datos FRED guardados: {output_path}")
    print(f"  Columnas: {len(merged_daily.columns)}, Filas: {len(merged_daily)}")

    # Keep a second copy on Drive.
    drive_macro_path = Path(DRIVE_BASE_PATH) / 'data' / 'silver' / 'macro'
    drive_macro_path.mkdir(parents=True, exist_ok=True)
    merged_daily.to_parquet(drive_macro_path / 'fred_data.parquet', index=False)
    print(f"✓ Copiado a Drive: {drive_macro_path / 'fred_data.parquet'}")


In [None]:
# Helper for encoding categorical variables (must be available globally).
def encode_categorical_features(df_train, df_val, df_test, features):
    """Label-encode the categorical columns among ``features``.

    Each split is first reduced to ``features`` and copied, so the inputs are
    never mutated. A column is treated as categorical when its dtype in the
    training split is object, string or category. Codes are the positions of
    the sorted distinct string values seen across train + val + test, so the
    same label always receives the same code in every split.

    Missing values (and any label outside the fitted vocabulary) are encoded
    as -1 so they cannot be confused with the first real category, which
    owns code 0.

    Args:
        df_train: Training frame containing every name in ``features``.
        df_val: Validation frame containing every name in ``features``.
        df_test: Test frame containing every name in ``features``.
        features: Column names to keep (and encode where categorical).

    Returns:
        Tuple ``(train, val, test)`` of new DataFrames restricted to
        ``features`` with categorical columns replaced by int64 codes.
    """
    df_train = df_train[features].copy()
    df_val = df_val[features].copy()
    df_test = df_test[features].copy()
    splits = (df_train, df_val, df_test)

    def _is_categorical(dtype):
        return (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )

    categorical_cols = [col for col in features if _is_categorical(df_train[col].dtype)]

    for col in categorical_cols:
        # Vocabulary over all splits; sorted order matches what a fitted
        # LabelEncoder would assign, without a per-row transform() call.
        observed = pd.concat([part[col] for part in splits], axis=0).dropna().astype(str)
        codes = {label: code for code, label in enumerate(sorted(observed.unique()))}

        for part in splits:
            raw = part[col]
            mapped = raw.astype(str).map(codes)
            # Missing inputs must never inherit a real category's code.
            mapped = mapped.mask(raw.isna().to_numpy(), -1).fillna(-1)
            part[col] = mapped.astype('int64')

    if categorical_cols:
        print(f"  Variables categóricas codificadas: {categorical_cols}")

    return df_train, df_val, df_test

print("✓ Función encode_categorical_features definida")


## 5. Feature Engineering Macro


In [None]:
# Load the FRED panel written by the download step.
macro_df = pd.read_parquet(BASE_DIR / 'data' / 'silver' / 'macro' / 'fred_data.parquet')

# The panel was expanded to one row per CALENDAR day (freq='D') and
# forward-filled, so row-count look-backs must be expressed in calendar days.
# Using the 63/252 trading-day convention here would shrink the "3m"/"12m"
# windows to roughly two and eight months.
DAYS_3M = 91
DAYS_12M = 365

# Compute cyclical macro features.
print("Calculando features macro...")

# Yield curve slope (10Y minus 2Y) and inversion flag.
if "DGS10" in macro_df.columns and "DGS2" in macro_df.columns:
    macro_df["yield_curve_slope"] = macro_df["DGS10"] - macro_df["DGS2"]
    macro_df["yield_curve_inverted"] = (macro_df["yield_curve_slope"] < 0).astype(int)

# Credit spread (BAA minus AAA); crude DGS10-based stand-in when unavailable.
if "BAA" in macro_df.columns and "AAA" in macro_df.columns:
    macro_df["credit_spread"] = macro_df["BAA"] - macro_df["AAA"]
elif "DGS10" in macro_df.columns:
    macro_df["credit_spread"] = macro_df["DGS10"] * 0.02

# Recession probability: weighted sum of three binary warning signals.
macro_df["recession_probability"] = 0.0
if "MANPMI" in macro_df.columns:
    macro_df["pmi_below_50"] = (macro_df["MANPMI"] < 50).astype(int)
    macro_df["recession_probability"] += macro_df["pmi_below_50"] * 0.3
if "UNRATE" in macro_df.columns:
    # On a forward-filled daily panel a 1-row diff is zero on every day except
    # the one where a new print lands, so the change is measured over ~3 months.
    macro_df["unemployment_change"] = macro_df["UNRATE"].diff(DAYS_3M)
    macro_df["unemployment_rising"] = (macro_df["unemployment_change"] > 0.5).astype(int)
    macro_df["recession_probability"] += macro_df["unemployment_rising"] * 0.3
if "yield_curve_inverted" in macro_df.columns:
    macro_df["recession_probability"] += macro_df["yield_curve_inverted"] * 0.4
macro_df["recession_probability"] = np.clip(macro_df["recession_probability"], 0, 1)

# Macro regime derived from the recession probability.
macro_df["macro_regime"] = "expansion"
macro_df.loc[macro_df["recession_probability"] > 0.5, "macro_regime"] = "recession"
macro_df.loc[(macro_df["recession_probability"] > 0.3) & (macro_df["recession_probability"] <= 0.5), "macro_regime"] = "slowdown"

# Commodity momentum over ~3 and ~12 months.
commodity_cols = ["GOLDAMGBD228NLBM", "PSLVAMUSD", "DCOILWTICO", "PCOPPUSDM"]
for col in commodity_cols:
    if col in macro_df.columns:
        macro_df[f"{col}_momentum_3m"] = macro_df[col].pct_change(periods=DAYS_3M)
        macro_df[f"{col}_momentum_12m"] = macro_df[col].pct_change(periods=DAYS_12M)

# Commodity ratios compared against their own trailing one-year mean.
# The flags are 0 while the trailing mean is still undefined (warm-up period).
if "GOLDAMGBD228NLBM" in macro_df.columns and "DCOILWTICO" in macro_df.columns:
    macro_df["gold_oil_ratio"] = macro_df["GOLDAMGBD228NLBM"] / (macro_df["DCOILWTICO"] + 1e-6)
    macro_df["risk_aversion_indicator"] = (macro_df["gold_oil_ratio"] > macro_df["gold_oil_ratio"].rolling(DAYS_12M).mean()).astype(int)

if "PCOPPUSDM" in macro_df.columns and "GOLDAMGBD228NLBM" in macro_df.columns:
    macro_df["copper_gold_ratio"] = macro_df["PCOPPUSDM"] / (macro_df["GOLDAMGBD228NLBM"] + 1e-6)
    macro_df["growth_indicator"] = (macro_df["copper_gold_ratio"] > macro_df["copper_gold_ratio"].rolling(DAYS_12M).mean()).astype(int)

# Currency momentum and strength relative to the trailing one-year mean.
currency_cols = ["DEXUSEU", "DEXCHUS", "DEXJPUS", "DEXUSUK", "DEXCAUS", "DEXMXUS", "DTWEXBGS"]
for col in currency_cols:
    if col in macro_df.columns:
        macro_df[f"{col}_momentum_3m"] = macro_df[col].pct_change(periods=DAYS_3M)
        macro_df[f"{col}_strength"] = (macro_df[col] / macro_df[col].rolling(DAYS_12M).mean() - 1)

# Save the processed macro features.
macro_features_path = BASE_DIR / 'data' / 'silver' / 'macro' / 'macro_features.parquet'
macro_df.to_parquet(macro_features_path, index=False)
print(f"✓ Features macro guardadas: {macro_features_path}")
# 'date' is the join key, not an engineered feature, so it is not counted.
print(f"  Features creadas: {sum(1 for c in macro_df.columns if c != 'date' and c not in FRED_SERIES)} nuevas columnas")


In [None]:
import shutil

# Copy the gold parquet splits from Drive into the local workspace.
drive_data_path = Path(DRIVE_BASE_PATH)

for split in ['train', 'val', 'test']:
    src = drive_data_path / 'data' / 'gold' / f'{split}.parquet'
    dst = BASE_DIR / 'data' / 'gold' / f'{split}.parquet'
    if src.exists():
        shutil.copy2(src, dst)
        print(f"✓ Copiado: {split}.parquet ({dst.stat().st_size / 1024 / 1024:.2f} MB)")
    else:
        print(f"⚠ No encontrado: {src}")

# Load the gold splits.
print("\nCargando datos gold...")
train_df = pd.read_parquet(BASE_DIR / 'data' / 'gold' / 'train.parquet')
val_df = pd.read_parquet(BASE_DIR / 'data' / 'gold' / 'val.parquet')
test_df = pd.read_parquet(BASE_DIR / 'data' / 'gold' / 'test.parquet')

print(f"✓ train: {len(train_df)} filas")
print(f"✓ val: {len(val_df)} filas")
print(f"✓ test: {len(test_df)} filas")

# Attach macro features with an as-of (backward) join on date.
print("\nCombinando con datos macro...")
train_df['date'] = pd.to_datetime(train_df['date'])
val_df['date'] = pd.to_datetime(val_df['date'])
test_df['date'] = pd.to_datetime(test_df['date'])
macro_df['date'] = pd.to_datetime(macro_df['date'])

# Keep only the macro columns the models consume (and that actually exist).
macro_cols_to_merge = [
    'yield_curve_slope', 'credit_spread', 'recession_probability', 'macro_regime',
    'gold_oil_ratio', 'copper_gold_ratio', 'risk_aversion_indicator', 'growth_indicator'
]
macro_cols_to_merge = [c for c in macro_cols_to_merge if c in macro_df.columns]
macro_subset = macro_df[['date'] + macro_cols_to_merge].sort_values('date')

# Merge with train/val/test.
if macro_subset.empty:
    print('⚠ macro_subset está vacío, saltando merge con macro')
    # Create empty macro columns so every split keeps the same schema.
    for col in macro_cols_to_merge:
        train_df[col] = np.nan
        val_df[col] = np.nan
        test_df[col] = np.nan
else:
    print(f'  Macro features disponibles: {len(macro_cols_to_merge)} columnas')
    merged_splits = {}
    for df_name, df in [('train', train_df), ('val', val_df), ('test', test_df)]:
        # merge_asof requires the left frame to be sorted by the `on` key
        # alone; a (ticker, date) ordering is rejected as soon as there is
        # more than one ticker.
        df_by_date = df.sort_values('date', kind='stable')
        df_merged = pd.merge_asof(df_by_date, macro_subset, on='date', direction='backward')
        # Restore the per-ticker chronological order used by later cells.
        if 'ticker' in df_merged.columns:
            df_merged = df_merged.sort_values(['ticker', 'date'], kind='stable')
        merged_splits[df_name] = df_merged.reset_index(drop=True)

    train_df = merged_splits['train']
    val_df = merged_splits['val']
    test_df = merged_splits['test']

    # Report once, after every split has been merged.
    print(f"✓ Datos combinados con macro features")
    print(f"  Train columns: {len(train_df.columns)}")
    print(f"  Val columns: {len(val_df.columns)}")
    print(f"  Test columns: {len(test_df.columns)}")


## 7. Feature Engineering para Stocks (Percentiles Históricos, Lags)


In [None]:
# Compute history-relative features (trailing percentiles, averages, lags).
print("Calculando features históricas relativas...")


def _trailing_percentile(window):
    """Return the share of prior non-missing values strictly below the latest one.

    ``window`` is a float ndarray whose last element is the current
    observation. NaN is returned when the current value itself is missing.
    """
    current = window[-1]
    if np.isnan(current):
        return np.nan
    history = window[:-1]
    history = history[~np.isnan(history)]
    if history.size == 0:
        return 0.5
    return float(np.count_nonzero(current > history)) / history.size


def add_historical_features(df):
    """Add trailing percentile, rolling-average and lag features.

    The frame is copied and sorted by (ticker, date) when a ``ticker`` column
    exists, otherwise by date, and all windows are computed per ticker in that
    case. Window lengths are row counts, so they correspond to the commented
    horizons only when each ticker has one row per trading day.

    Added columns (only for source columns that are present):
      * ``<col>_percentile_5y`` for priceToBookRatio, priceToSalesRatio,
        enterpriseValue and freeCashFlowYield: fraction of the previous
        observations in a 1260-row window (min. 252) that lie strictly below
        the current value.
      * ``<col>_3y_avg``, ``<col>_lag_1q``, ``<col>_lag_2q`` for roic,
        returnOnEquity and returnOnAssets: 756-row rolling mean (min. 252,
        current row included) and the value 63 / 126 rows earlier.

    Args:
        df: Input frame with a ``date`` column and optionally ``ticker``.

    Returns:
        A new, re-indexed DataFrame; the input is left untouched.
    """
    df = df.copy()
    has_ticker = 'ticker' in df.columns

    # Rolling windows and shifts below rely on chronological order per ticker.
    df = df.sort_values(['ticker', 'date'] if has_ticker else 'date').reset_index(drop=True)

    def per_series(col, fn):
        # Apply `fn` ticker by ticker when possible, else to the whole column.
        if has_ticker:
            return df.groupby('ticker')[col].transform(fn)
        return fn(df[col])

    # Trailing percentiles of valuation multiples (5 years = ~1260 trading days).
    valuation_cols = ['priceToBookRatio', 'priceToSalesRatio', 'enterpriseValue', 'freeCashFlowYield']
    for col in valuation_cols:
        if col in df.columns:
            # raw=True hands plain ndarrays to the helper, avoiding the cost
            # of building a Series for every window.
            df[f'{col}_percentile_5y'] = per_series(
                col,
                lambda s: s.rolling(window=1260, min_periods=252).apply(_trailing_percentile, raw=True),
            )

    # Profitability history: 3-year average plus one- and two-quarter lags.
    quality_cols = ['roic', 'returnOnEquity', 'returnOnAssets']
    for col in quality_cols:
        if col not in df.columns:
            continue
        df[f'{col}_3y_avg'] = per_series(col, lambda s: s.rolling(window=756, min_periods=252).mean())
        df[f'{col}_lag_1q'] = per_series(col, lambda s: s.shift(63))
        df[f'{col}_lag_2q'] = per_series(col, lambda s: s.shift(126))

    return df

# Enrich each split on its own; windows therefore restart at the beginning
# of every split.
train_df, val_df, test_df = [
    add_historical_features(split_frame) for split_frame in (train_df, val_df, test_df)
]

print("✓ Features históricas calculadas")
historical_markers = ('percentile', 'lag', '_3y_avg')
historical_column_count = sum(
    1 for c in train_df.columns if any(marker in c for marker in historical_markers)
)
print(f"  Nuevas columnas agregadas: {historical_column_count}")


In [None]:
# Helper for encoding categorical variables (must be available globally).
def encode_categorical_features(df_train, df_val, df_test, features):
    """Label-encode the categorical columns among ``features``.

    Each split is first reduced to ``features`` and copied, so the inputs are
    never mutated. A column is treated as categorical when its dtype in the
    training split is object, string or category. Codes are the positions of
    the sorted distinct string values seen across train + val + test, so the
    same label always receives the same code in every split.

    Missing values (and any label outside the fitted vocabulary) are encoded
    as -1 so they cannot be confused with the first real category, which
    owns code 0.

    Args:
        df_train: Training frame containing every name in ``features``.
        df_val: Validation frame containing every name in ``features``.
        df_test: Test frame containing every name in ``features``.
        features: Column names to keep (and encode where categorical).

    Returns:
        Tuple ``(train, val, test)`` of new DataFrames restricted to
        ``features`` with categorical columns replaced by int64 codes.
    """
    df_train = df_train[features].copy()
    df_val = df_val[features].copy()
    df_test = df_test[features].copy()
    splits = (df_train, df_val, df_test)

    def _is_categorical(dtype):
        return (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )

    categorical_cols = [col for col in features if _is_categorical(df_train[col].dtype)]

    for col in categorical_cols:
        # Vocabulary over all splits; sorted order matches what a fitted
        # LabelEncoder would assign, without a per-row transform() call.
        observed = pd.concat([part[col] for part in splits], axis=0).dropna().astype(str)
        codes = {label: code for code, label in enumerate(sorted(observed.unique()))}

        for part in splits:
            raw = part[col]
            mapped = raw.astype(str).map(codes)
            # Missing inputs must never inherit a real category's code.
            mapped = mapped.mask(raw.isna().to_numpy(), -1).fillna(-1)
            part[col] = mapped.astype('int64')

    if categorical_cols:
        print(f"  Variables categóricas codificadas: {categorical_cols}")

    return df_train, df_val, df_test

print("✓ Función encode_categorical_features definida")


## 8. Entrenar Quality Model (Percentiles por Fecha)


In [None]:
import xgboost as xgb
import joblib
from sklearn.metrics import accuracy_score, roc_auc_score

print("=" * 60)
print("ENTRENANDO QUALITY MODEL")
print("=" * 60)

# Candidate features for the Quality Model. The current `roic` column is left
# out on purpose because the label below is derived from it.
# NOTE(review): `roic_3y_avg` is a rolling mean whose window includes the
# current row, and current ROE/ROA are also used — confirm this is acceptable
# with respect to label leakage.
quality_features = [
    'roic_lag_1q', 'roic_lag_2q', 'roic_3y_avg',
    'returnOnEquity', 'returnOnAssets',
    'grossProfitMargin', 'netProfitMargin',
    'freeCashFlowYield', 'freeCashFlowPerShare',
    'revenueGrowth', 'netIncomeGrowth',
]

# Add macro features when they are available
macro_quality_features = ['recession_probability', 'macro_regime', 'credit_spread']
quality_features.extend([f for f in macro_quality_features if f in train_df.columns])

# Keep only the features that exist in the training frame
quality_features = [f for f in quality_features if f in train_df.columns]
print(f"\nFeatures usadas ({len(quality_features)}): {quality_features[:5]}...")

# (encode_categorical_features is defined in an earlier cell)

# Prepare the data - encode categorical variables first
X_train_quality, X_val_quality, X_test_quality = encode_categorical_features(
    train_df, val_df, test_df, quality_features
)

# Fill NaN with 0 and cast to float32
X_train_quality = X_train_quality.fillna(0).astype('float32')
X_val_quality = X_val_quality.fillna(0).astype('float32')
X_test_quality = X_test_quality.fillna(0).astype('float32')

# Build labels: top 20% of ROIC WITHIN EACH DATE, so the threshold adapts to
# the cross-section observed on that date.
# NOTE(review): rows whose `roic` is missing get a NaN rank and therefore the
# label 0 — confirm that treating them as "not quality" is intended.
print("\nCreando labels por fecha (percentiles por fecha)...")
train_df['roic_percentile_by_date'] = train_df.groupby('date')['roic'].rank(pct=True)
train_df['is_quality'] = (train_df['roic_percentile_by_date'] > 0.80).astype(int)

val_df['roic_percentile_by_date'] = val_df.groupby('date')['roic'].rank(pct=True)
val_df['is_quality'] = (val_df['roic_percentile_by_date'] > 0.80).astype(int)

test_df['roic_percentile_by_date'] = test_df.groupby('date')['roic'].rank(pct=True)
test_df['is_quality'] = (test_df['roic_percentile_by_date'] > 0.80).astype(int)

y_train_quality = train_df['is_quality']
y_val_quality = val_df['is_quality']
y_test_quality = test_df['is_quality']

print(f"  Train: {y_train_quality.sum()} / {len(y_train_quality)} positivos ({y_train_quality.mean():.2%})")
print(f"  Val: {y_val_quality.sum()} / {len(y_val_quality)} positivos ({y_val_quality.mean():.2%})")
print(f"  Test: {y_test_quality.sum()} / {len(y_test_quality)} positivos ({y_test_quality.mean():.2%})")

# Negative/positive ratio used to re-weight the minority (positive) class;
# max(..., 1) avoids a division by zero when there are no positives.
scale_pos_weight = (len(y_train_quality) - y_train_quality.sum()) / max(y_train_quality.sum(), 1)

# Train the model with conservative, regularised hyper-parameters
print("\nEntrenando modelo...")
quality_model = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=3,  # Kept shallow to limit overfitting
    learning_rate=0.01,
    reg_alpha=2.0,
    reg_lambda=3.0,
    subsample=0.75,
    colsample_bytree=0.75,
    min_child_weight=10,
    random_state=42,
    scale_pos_weight=scale_pos_weight,
    eval_metric='auc',
    early_stopping_rounds=50,
)

# The validation split drives early stopping.
quality_model.fit(
    X_train_quality,
    y_train_quality,
    eval_set=[(X_val_quality, y_val_quality)],
    verbose=False,
)

# Evaluate on every split (hard predictions for accuracy, probabilities for AUC)
y_pred_train = quality_model.predict(X_train_quality)
y_pred_val = quality_model.predict(X_val_quality)
y_pred_test = quality_model.predict(X_test_quality)

y_pred_proba_train = quality_model.predict_proba(X_train_quality)[:, 1]
y_pred_proba_val = quality_model.predict_proba(X_val_quality)[:, 1]
y_pred_proba_test = quality_model.predict_proba(X_test_quality)[:, 1]

print("\nResultados Quality Model:")
print(f"  Train - Accuracy: {accuracy_score(y_train_quality, y_pred_train):.4f}, AUC: {roc_auc_score(y_train_quality, y_pred_proba_train):.4f}")
print(f"  Val   - Accuracy: {accuracy_score(y_val_quality, y_pred_val):.4f}, AUC: {roc_auc_score(y_val_quality, y_pred_proba_val):.4f}")
print(f"  Test  - Accuracy: {accuracy_score(y_test_quality, y_pred_test):.4f}, AUC: {roc_auc_score(y_test_quality, y_pred_proba_test):.4f}")

# Save to the local workspace
output_path = BASE_DIR / 'models' / 'improved_quality_model.pkl'
joblib.dump(quality_model, output_path)
print(f"\n✓ Modelo guardado: {output_path}")

# Copy to Drive (drive_models_path is reused by the later model cells)
drive_models_path = Path(DRIVE_BASE_PATH) / 'models'
drive_models_path.mkdir(parents=True, exist_ok=True)
joblib.dump(quality_model, drive_models_path / 'improved_quality_model.pkl')
print(f"✓ Copiado a Drive: {drive_models_path / 'improved_quality_model.pkl'}")


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

print("=" * 60)
print("ENTRENANDO VALUATION MODEL (DCF/Múltiplos)")
print("=" * 60)

# Candidate features for the Valuation Model, grouped by origin.
valuation_candidates = [
    # History-relative multiples
    'priceToBookRatio_percentile_5y',
    'priceToSalesRatio_percentile_5y',
    'freeCashFlowYield_percentile_5y',
    # Fundamentals
    'priceToBookRatio', 'priceToSalesRatio', 'enterpriseValue',
    'returnOnEquity', 'roic', 'grossProfitMargin', 'netProfitMargin',
    'freeCashFlowYield', 'revenueGrowth', 'netIncomeGrowth',
    # Macro context
    'yield_curve_slope', 'credit_spread', 'recession_probability',
    'gold_oil_ratio', 'copper_gold_ratio',
]

# Only columns present in the training frame can be used.
valuation_features = [name for name in valuation_candidates if name in train_df.columns]
print(f"\nFeatures usadas ({len(valuation_features)}): {valuation_features[:5]}...")

# Encode categoricals, then fill gaps with 0 and cast to float32.
X_train_val, X_val_val, X_test_val = [
    encoded.fillna(0).astype('float32')
    for encoded in encode_categorical_features(train_df, val_df, test_df, valuation_features)
]

# Intended target: price 5 years ahead (1260 trading days) vs. current price,
# which requires looking forward from each row's date.
print("\nCalculando target: precio futuro 5 años...")

def calculate_future_price_target(df, future_days=1260):
    """Compound each ticker's returns over the NEXT ``future_days`` rows.

    For every row, the growth factor ``prod(1 + target)`` is taken over that
    row and the following ``future_days - 1`` rows of the same ticker, in date
    order. Missing returns are skipped. Near the end of a ticker's history the
    window is shorter than ``future_days`` (at least one row is required), so
    those values cover a truncated horizon.

    Assumes ``target`` holds per-row forward simple returns — TODO confirm
    against the gold dataset definition.

    Args:
        df: Frame with ``ticker``, ``date`` and ``target`` columns.
        future_days: Number of rows in the forward window (1260 ≈ 5 years of
            trading days).

    Returns:
        Float Series named ``future_price_target`` aligned row by row with
        ``df`` (same index, same order). When a required column is missing, a
        warning is printed and an all-NaN Series on ``df.index`` is returned.
    """
    if 'ticker' not in df.columns or 'date' not in df.columns:
        print("  ⚠ No se puede calcular target sin ticker y date")
        return pd.Series(index=df.index, dtype='float64')
    if 'target' not in df.columns:
        print("  ⚠ No hay columna 'target' para calcular precio futuro")
        return pd.Series(index=df.index, dtype='float64')

    def _forward_growth(returns):
        # A trailing window over the reversed series is a forward window
        # over the original one.
        reversed_growth = (
            (1.0 + returns.iloc[::-1])
            .rolling(window=future_days, min_periods=1)
            .apply(np.nanprod, raw=True)
        )
        return reversed_growth.iloc[::-1]

    # Positional index so the result can be mapped back to the caller's rows
    # even if df.index is unordered or contains duplicates.
    ordered = (
        df[['ticker', 'date', 'target']]
        .reset_index(drop=True)
        .sort_values(['ticker', 'date'], kind='mergesort')
    )
    future = ordered.groupby('ticker')['target'].transform(_forward_growth)
    future = future.reindex(pd.RangeIndex(len(df)))
    future.index = df.index
    return future.rename('future_price_target')

# For simplicity the `target` column is used as a proxy for the future return.
# In production this should be the real future price vs. the current price.
#
# Rows without a target are dropped rather than filled with 0: a filled value
# would be learned (and scored) as a genuine "zero return" observation.
train_has_target = train_df['target'].notna().to_numpy()
val_has_target = val_df['target'].notna().to_numpy()
test_has_target = test_df['target'].notna().to_numpy()
print(
    f"  Filas sin target descartadas: train={(~train_has_target).sum()}, "
    f"val={(~val_has_target).sum()}, test={(~test_has_target).sum()}"
)

# Feature matrices share their row order with the source frames, so the same
# positional mask applies to both.
X_train_val = X_train_val[train_has_target]
X_val_val = X_val_val[val_has_target]
X_test_val = X_test_val[test_has_target]

y_train_val = train_df.loc[train_has_target, 'target']  # Proxy: forward returns
y_val_val = val_df.loc[val_has_target, 'target']
y_test_val = test_df.loc[test_has_target, 'target']

print(f"  Train target stats: mean={y_train_val.mean():.4f}, std={y_train_val.std():.4f}")
print(f"  Val target stats: mean={y_val_val.mean():.4f}, std={y_val_val.std():.4f}")

# Train the model (a regressor on the target proxy)
print("\nEntrenando modelo...")
valuation_model = xgb.XGBRegressor(
    n_estimators=400,
    max_depth=4,
    learning_rate=0.01,
    reg_alpha=1.5,
    reg_lambda=2.5,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    random_state=42,
    eval_metric='rmse',
    early_stopping_rounds=50,
)

# The validation split drives early stopping.
valuation_model.fit(
    X_train_val,
    y_train_val,
    eval_set=[(X_val_val, y_val_val)],
    verbose=False,
)

# Evaluate on every split
y_pred_train = valuation_model.predict(X_train_val)
y_pred_val = valuation_model.predict(X_val_val)
y_pred_test = valuation_model.predict(X_test_val)

print("\nResultados Valuation Model:")
print(f"  Train - RMSE: {np.sqrt(mean_squared_error(y_train_val, y_pred_train)):.4f}, R²: {r2_score(y_train_val, y_pred_train):.4f}")
print(f"  Val   - RMSE: {np.sqrt(mean_squared_error(y_val_val, y_pred_val)):.4f}, R²: {r2_score(y_val_val, y_pred_val):.4f}")
print(f"  Test  - RMSE: {np.sqrt(mean_squared_error(y_test_val, y_pred_test)):.4f}, R²: {r2_score(y_test_val, y_pred_test):.4f}")

# Save to the local workspace
output_path = BASE_DIR / 'models' / 'improved_valuation_model.pkl'
joblib.dump(valuation_model, output_path)
print(f"\n✓ Modelo guardado: {output_path}")

# Copy to Drive
joblib.dump(valuation_model, drive_models_path / 'improved_valuation_model.pkl')
print(f"✓ Copiado a Drive: {drive_models_path / 'improved_valuation_model.pkl'}")


In [None]:
print("=" * 60)
print("ENTRENANDO MOMENTUM MODEL")
print("=" * 60)

# Candidate features for the Momentum Model (volume, SMAs 200/50, RSI).
momentum_features = [
    # Volume
    'volume', 'volume_sma_20',
    # Moving averages
    'sma_200', 'sma_50', 'sma_20',
    'ema_20', 'ema_50',
    # RSI
    'rsi_14',
    # Other technical indicators
    'macd', 'macd_signal',
    'atr_14', 'volatility_30d',
]

# Engineered features are computed with the same code on every split so the
# model never sees a feature defined one way in training and another way at
# evaluation time.
momentum_splits = (train_df, val_df, test_df)


def _price_proxy(frame):
    """Return the best available per-row price stand-in for ``frame``.

    Preference order: ``price``; ``priceToBookRatio * bookValue``;
    ``enterpriseValue``; a constant 1.0 when none of them is available.
    """
    if 'price' in frame.columns:
        return frame['price']
    if 'priceToBookRatio' in frame.columns and 'bookValue' in frame.columns:
        return frame['priceToBookRatio'] * frame['bookValue']
    if 'enterpriseValue' in frame.columns:
        return frame['enterpriseValue']
    return pd.Series(1.0, index=frame.index)


# Price-above-SMA flags; each flag is added only if its SMA column exists in
# every split.
for sma_col, flag_col in (('sma_200', 'price_above_sma200'), ('sma_50', 'price_above_sma50')):
    if not all(sma_col in frame.columns for frame in momentum_splits):
        continue
    for frame in momentum_splits:
        frame[flag_col] = (_price_proxy(frame) > frame[sma_col]).astype(int)
    momentum_features.append(flag_col)

# Volume ratio (current volume / 20-day average volume); the epsilon guards
# against a zero average.
if all('volume' in frame.columns and 'volume_sma_20' in frame.columns for frame in momentum_splits):
    for frame in momentum_splits:
        frame['volume_ratio_20d'] = frame['volume'] / (frame['volume_sma_20'] + 1e-6)
    momentum_features.append('volume_ratio_20d')

# Slope of the 200-day SMA (20-row change, per ticker when available).
if all('sma_200' in frame.columns for frame in momentum_splits):
    for frame in momentum_splits:
        if 'ticker' in frame.columns:
            frame['sma200_slope'] = frame.groupby('ticker')['sma_200'].transform(lambda x: x.diff(20))
        else:
            frame['sma200_slope'] = frame['sma_200'].diff(20)
    momentum_features.append('sma200_slope')

# Keep only the features that exist in the training frame
momentum_features = [f for f in momentum_features if f in train_df.columns]
print(f"\nFeatures usadas ({len(momentum_features)}): {momentum_features}")

# Prepare the data - encode categorical variables first
X_train_momentum, X_val_momentum, X_test_momentum = encode_categorical_features(
    train_df, val_df, test_df, momentum_features
)

# Fill NaN with 0 and cast to float32
X_train_momentum = X_train_momentum.fillna(0).astype('float32')
X_val_momentum = X_val_momentum.fillna(0).astype('float32')
X_test_momentum = X_test_momentum.fillna(0).astype('float32')

# Target: direction of the return (positive vs. not positive).
# Rows with no target are excluded; comparing NaN with 0 would silently label
# them as the negative class.
train_has_target = train_df['target'].notna().to_numpy()
val_has_target = val_df['target'].notna().to_numpy()
test_has_target = test_df['target'].notna().to_numpy()

X_train_momentum = X_train_momentum[train_has_target]
X_val_momentum = X_val_momentum[val_has_target]
X_test_momentum = X_test_momentum[test_has_target]

y_train_momentum = (train_df.loc[train_has_target, 'target'] > 0).astype(int)
y_val_momentum = (val_df.loc[val_has_target, 'target'] > 0).astype(int)
y_test_momentum = (test_df.loc[test_has_target, 'target'] > 0).astype(int)

print(f"\nTarget distribución:")
print(f"  Train: {y_train_momentum.sum()} / {len(y_train_momentum)} positivos ({y_train_momentum.mean():.2%})")
print(f"  Val: {y_val_momentum.sum()} / {len(y_val_momentum)} positivos ({y_val_momentum.mean():.2%})")
print(f"  Test: {y_test_momentum.sum()} / {len(y_test_momentum)} positivos ({y_test_momentum.mean():.2%})")

# Train the model
print("\nEntrenando modelo...")
momentum_model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=5,  # Deeper trees to capture feature interactions
    learning_rate=0.01,
    reg_alpha=1.0,
    reg_lambda=2.0,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    random_state=42,
    eval_metric='auc',
    early_stopping_rounds=50,
)

# The validation split drives early stopping.
momentum_model.fit(
    X_train_momentum,
    y_train_momentum,
    eval_set=[(X_val_momentum, y_val_momentum)],
    verbose=False,
)

# Evaluate on every split (hard predictions for accuracy, probabilities for AUC)
y_pred_train = momentum_model.predict(X_train_momentum)
y_pred_val = momentum_model.predict(X_val_momentum)
y_pred_test = momentum_model.predict(X_test_momentum)

y_pred_proba_train = momentum_model.predict_proba(X_train_momentum)[:, 1]
y_pred_proba_val = momentum_model.predict_proba(X_val_momentum)[:, 1]
y_pred_proba_test = momentum_model.predict_proba(X_test_momentum)[:, 1]

print("\nResultados Momentum Model:")
print(f"  Train - Accuracy: {accuracy_score(y_train_momentum, y_pred_train):.4f}, AUC: {roc_auc_score(y_train_momentum, y_pred_proba_train):.4f}")
print(f"  Val   - Accuracy: {accuracy_score(y_val_momentum, y_pred_val):.4f}, AUC: {roc_auc_score(y_val_momentum, y_pred_proba_val):.4f}")
print(f"  Test  - Accuracy: {accuracy_score(y_test_momentum, y_pred_test):.4f}, AUC: {roc_auc_score(y_test_momentum, y_pred_proba_test):.4f}")

# Save to the local workspace
output_path = BASE_DIR / 'models' / 'improved_momentum_model.pkl'
joblib.dump(momentum_model, output_path)
print(f"\n✓ Modelo guardado: {output_path}")

# Copy to Drive
joblib.dump(momentum_model, drive_models_path / 'improved_momentum_model.pkl')
print(f"✓ Copiado a Drive: {drive_models_path / 'improved_momentum_model.pkl'}")


## 11. Guardar Feature Config y Resumen Final


In [None]:
# Persist the feature lists each model was trained with, for later reuse.
feature_config = dict(
    quality_features=quality_features,
    valuation_features=valuation_features,
    momentum_features=momentum_features,
)

feature_config_path = BASE_DIR / 'models' / 'improved_feature_config.pkl'
joblib.dump(feature_config, feature_config_path)
print(f"✓ Feature config guardado: {feature_config_path}")

# Mirror the config on Drive.
drive_feature_config_path = drive_models_path / 'improved_feature_config.pkl'
joblib.dump(feature_config, drive_feature_config_path)
print(f"✓ Copiado a Drive: {drive_feature_config_path}")

# Final summary.
summary_rule = "=" * 60
local_models_dir = BASE_DIR / 'models'

print("\n" + summary_rule)
print("RESUMEN FINAL - MODELOS MEJORADOS")
print(summary_rule)

print("\n✓ Modelos entrenados y guardados:")
saved_models = (
    ("Quality", 'improved_quality_model.pkl'),
    ("Valuation", 'improved_valuation_model.pkl'),
    ("Momentum", 'improved_momentum_model.pkl'),
)
for position, (model_label, model_file) in enumerate(saved_models, start=1):
    print(f"  {position}. {model_label} Model: {local_models_dir / model_file}")

print("\n✓ Características principales:")
for highlight in (
    "  - Quality: Percentiles por fecha (identifica outliers en cualquier régimen)",
    "  - Valuation: DCF/Múltiplos con contexto macro (target futuro 5 años)",
    "  - Momentum: Features mejoradas (volumen, SMAs 200/50, RSI)",
):
    print(highlight)

print("\n✓ Datos procesados:")
for processed_item in (
    "  - Datos FRED descargados y procesados",
    "  - Features macro cíclicas calculadas",
    "  - Features históricas relativas agregadas (percentiles, lags)",
):
    print(processed_item)

print("\n✓ Archivos en Drive:")
print(f"  - Modelos: {drive_models_path}")
print(f"  - Datos macro: {Path(DRIVE_BASE_PATH) / 'data' / 'silver' / 'macro'}")

print("\n" + summary_rule)
print("ENTRENAMIENTO COMPLETADO")
print(summary_rule)
