In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Para escalar el target
from sklearn.preprocessing import StandardScaler as TargetScaler

# Display configuration: never truncate the column list and allow 120
# characters per line when DataFrames are rendered as text.
pd.options.display.max_columns = None
pd.options.display.width = 120

In [3]:
# Load the candle data and derive percentage-change columns plus the target.
CSV_PATH = "BTCUSDT_1d_last_year.csv"

df = pd.read_csv(CSV_PATH)
# Normalise header names to lower case so the lookups below are case-insensitive.
df.columns = list(map(str.lower, df.columns))

# The four price columns are mandatory; fail early when any is absent.
ohlc = ["open","high","low","close"]
missing = [name for name in ohlc if name not in df.columns]
if missing:
    raise ValueError(f"Faltan columnas OHLC: {missing}")

# Row-over-row percentage change (in %) of every price column.
for price_col in ohlc:
    pct_series = df[price_col].pct_change() * 100.0
    df[f"{price_col}_pct"] = pct_series

# Volume is optional and may appear under either spelling; use the first found.
vol_cols = [name for name in ("volume", "volumen") if name in df.columns]
vcol = next(iter(vol_cols), None)
if vcol is not None:
    df[f"{vcol}_pct"] = df[vcol].pct_change() * 100.0

# Target: the close percentage change of the following row.
df["close_pct_next"] = df["close_pct"].shift(periods=-1)

df.head()

Unnamed: 0,symbol,interval,open_time,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_volume,taker_buy_quote_volume,ignore,open_pct,high_pct,low_pct,close_pct,volume_pct,close_pct_next
0,BTCUSDT,1d,1727222400000,64262.7,64817.99,62947.08,63152.01,17813.11168,1727308799999,1135251000.0,3355531,8384.67983,534590400.0,0,,,,,,3.201767
1,BTCUSDT,1d,1727308800000,63152.01,65839.0,62670.0,65173.99,28373.30593,1727395199999,1831205000.0,4361333,15041.9886,971176200.0,0,-1.728359,1.575195,-0.440179,3.201767,59.283265,0.914414
2,BTCUSDT,1d,1727395200000,65173.99,66498.0,64819.9,65769.95,22048.80487,1727481599999,1448852000.0,3498529,11092.85716,729156100.0,0,3.201767,1.000927,3.430509,0.914414,-22.290321,0.133876
3,BTCUSDT,1d,1727481600000,65769.95,66260.0,65422.23,65858.0,9127.23316,1727567999999,600118500.0,1341703,4501.22534,296026700.0,0,0.914414,-0.357906,0.929236,0.133876,-58.604409,-0.3887
4,BTCUSDT,1d,1727568000000,65858.0,66076.12,65432.0,65602.01,8337.74111,1727654399999,547980000.0,1413449,4132.9378,271622300.0,0,0.133876,-0.277513,0.014934,-0.3887,-8.649851,-3.466997


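## 2) *Feature engineering* (lags, medias móviles, volatilidad)
- Lags de `close_pct` (1..5)
- `ma3`, `ma7`, `ma14`: medias móviles de `close_pct`
- `volatilidad_7`, `volatilidad_14`: desvío estándar móvil (7 y 14)
- `momentum_3`, `momentum_7`: diferencia de `close_pct` respecto a 3 y 7 períodos atrás
- `rsi_14`: RSI simplificado calculado sobre `close_pct` (ventana de 14)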


In [None]:
# Derive the model inputs from `close_pct`: lagged values, rolling means,
# rolling standard deviations, momentum differences and a simplified RSI.
N_LAGS = 5
close_pct = df["close_pct"]

# Lagged copies of close_pct (1 .. N_LAGS rows back).
for lag in range(1, N_LAGS + 1):
    df[f"close_pct_lag{lag}"] = close_pct.shift(lag)

# Rolling means over 3, 7 and 14 rows.
for window in (3, 7, 14):
    df[f"ma{window}"] = close_pct.rolling(window).mean()

# Rolling standard deviations over 7 and 14 rows.
for window in (7, 14):
    df[f"volatilidad_{window}"] = close_pct.rolling(window).std()

# Momentum: current value minus the value 3 / 7 rows earlier.
for span in (3, 7):
    df[f"momentum_{span}"] = close_pct - close_pct.shift(span)

# Simplified RSI computed on close_pct: 14-row average of the positive values
# versus the 14-row average of the (sign-flipped) negative values.
delta = close_pct
up_moves = delta.where(delta > 0, 0)
down_moves = -delta.where(delta < 0, 0)
gain = up_moves.rolling(window=14).mean()
loss = down_moves.rolling(window=14).mean()
rs = gain / (loss + 1e-10)  # small epsilon avoids division by zero
df["rsi_14"] = 100 - (100 / (1 + rs))

# Ordered feature list: base columns, optional volume change, then the lags.
feature_cols = [
    "open_pct", "high_pct", "low_pct", "close_pct",
    "ma3", "ma7", "ma14",
    "volatilidad_7", "volatilidad_14",
    "momentum_3", "momentum_7", "rsi_14",
]
if vcol:
    feature_cols.append(f"{vcol}_pct")
feature_cols.extend(f"close_pct_lag{lag}" for lag in range(1, N_LAGS + 1))

# Keep only rows where every feature and the target are present
# (pct_change / shift / rolling leave NaN at the edges).
required_cols = [*feature_cols, "close_pct_next"]
df_model = df.dropna(subset=required_cols).copy()

X, y = df_model[feature_cols], df_model["close_pct_next"]

print(f"Features totales: {len(feature_cols)}")
print(f"Datos disponibles: {len(X)} filas")
X.shape, y.shape, X.head()

((358, 13),
 (358,),
     open_pct  high_pct   low_pct  close_pct       ma3       ma7  volatilidad_7  volume_pct  close_pct_lag1  \
 7  -3.983540 -2.713711 -0.272588  -0.257377 -2.568847 -0.549312       2.486236  -27.791088       -3.982166   
 8  -0.255983 -1.463561 -0.286483   0.170538 -1.356335 -0.982345       1.924543  -16.848951       -0.257377   
 9   0.170571  1.639079  1.056009   2.194618  0.702593 -0.799458       2.178856  -18.789112        0.170538   
 10  2.194601 -0.182908  2.033348  -0.045099  0.773352 -0.825026       2.167096  -63.336060        2.194618   
 11 -0.045083  0.969111  0.177843   1.227739  1.125753 -0.594106       2.303187   14.081406       -0.045099   
 
     close_pct_lag2  close_pct_lag3  close_pct_lag4  close_pct_lag5  
 7        -3.466997       -0.388700        0.133876        0.914414  
 8        -3.982166       -3.466997       -0.388700        0.133876  
 9        -0.257377       -3.982166       -3.466997       -0.388700  
 10        0.170538       -0.25

## 3) Split temporal y **escalado del *target***
Usamos el 20% final como *test* y escalamos **solo con *train***.  
Luego **desescalamos** las predicciones para evaluar en % real.


In [5]:
# Chronological split: the first 80% of rows train, the remainder tests.
n = len(df_model)
cut = int(0.8 * n)

X_train, y_train = X.iloc[:cut], y.iloc[:cut]
X_test, y_test = X.iloc[cut:], y.iloc[cut:]

# Standardise the target; the scaler is fitted on the training rows only and
# then applied unchanged to the test rows.
y_train_col = y_train.values.reshape(-1, 1)
y_test_col = y_test.values.reshape(-1, 1)

scaler_y = TargetScaler()
scaler_y.fit(y_train_col)
y_train_scaled = scaler_y.transform(y_train_col).ravel()
y_test_scaled = scaler_y.transform(y_test_col).ravel()

(len(X_train), len(X_test))

(286, 72)

## 4) Modelos y *grid search* con **TimeSeriesSplit**
Por ahora evaluamos solo un **DecisionTreeRegressor**; el diccionario `models` admite más candidatos (p. ej. un **RandomForestRegressor**).


In [None]:
# Tune every candidate model with time-ordered cross-validation on the training
# split, then score the tuned estimator on the hold-out split in real % units.
tscv = TimeSeriesSplit(n_splits=5)

# Each entry pairs a pipeline with the hyper-parameter grid to explore.
models = {
    "DecisionTreeRegressor": {
        "pipe": Pipeline([("model", DecisionTreeRegressor(random_state=42))]),
        "param_grid": {
            "model__max_depth": [30, 40, None],  # much deeper trees
            "model__min_samples_leaf": [1],      # leaves may hold a single sample (maximum granularity)
            "model__min_samples_split": [2],     # split nodes with as few as 2 samples
            "model__min_impurity_decrease": [0.0],  # no impurity restriction
            "model__max_features": [None, "sqrt"]  # use all features or sqrt
        }
    }
}

results = []
best_estimators = {}

# The search is scored on the standardized target, so its MAE is expressed in
# standard deviations of the training target. StandardScaler is an affine map,
# hence multiplying by its fitted scale converts that MAE into the same % units
# used by the test metrics below.
target_scale = float(scaler_y.scale_[0])

for name, cfg in models.items():
    print(f"\nüîÑ Entrenando {name}...")
    pipe = cfg["pipe"]
    grid = cfg["param_grid"]
    if grid:
        search = GridSearchCV(
            estimator=pipe,
            param_grid=grid,
            scoring="neg_mean_absolute_error",
            cv=tscv,
            n_jobs=-1,
            verbose=1
        )
        search.fit(X_train, y_train_scaled)  # fitted on the scaled target
        best = search.best_estimator_
        best_estimators[name] = best
        # best_score_ is a negated MAE in scaled units; report it in %.
        cv_mae = -search.best_score_ * target_scale
        cv_params = search.best_params_
        print(f"‚úÖ {name} - Mejores par√°metros: {cv_params}")
    else:
        # No grid supplied: fit the pipeline as-is, so there is no CV score.
        pipe.fit(X_train, y_train_scaled)
        best = pipe
        best_estimators[name] = best
        cv_mae = None
        cv_params = {}

    # Predict on the hold-out rows (still in the scaled target space) ...
    preds_scaled = best.predict(X_test)
    # ... and map the predictions back to percentage units.
    preds = scaler_y.inverse_transform(np.array(preds_scaled).reshape(-1,1)).ravel()

    mae = mean_absolute_error(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)

    results.append({
        "modelo": name,
        "cv_mae": cv_mae,
        "mejores_params": cv_params,
        "test_mae": mae,
        "test_rmse": rmse,
        "test_r2": r2
    })
    print(f"   Test MAE: {mae:.4f}%, RMSE: {rmse:.4f}%, R¬≤: {r2:.4f}")

# NOTE(review): rows are ranked by hold-out MAE, so with more than one
# candidate the test split would also be driving model selection.
res_df = pd.DataFrame(results).sort_values("test_mae")
print("\nüìä RESULTADOS FINALES:")
res_df

ValueError: Invalid parameter 'n_estimators' for estimator DecisionTreeRegressor(max_depth=10, max_features='sqrt', random_state=42). Valid parameters are: ['ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'monotonic_cst', 'random_state', 'splitter'].

## 5) Curva Real vs Predicción del mejor modelo


In [None]:
# Compare hold-out actuals with the predictions of the top-ranked model.
best_name = res_df["modelo"].iloc[0]
best_model = best_estimators[best_name]

# Predictions come out in the scaled target space; convert them back to %.
preds_scaled = best_model.predict(X_test)
preds_column = np.array(preds_scaled).reshape(-1, 1)
preds = scaler_y.inverse_transform(preds_column).ravel()

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(y_test.values, label="Real")
ax.plot(preds, label=f"Predicci√≥n ({best_name})")
ax.set_title("Cambio % de close (siguiente per√≠odo)")
ax.legend()
ax.set_xlabel("Observaci√≥n (orden temporal)")
ax.set_ylabel("%")
fig.tight_layout()
plt.show()

res_df

In [None]:
# Report how varied the predictions are compared with the actual values:
# distinct-value counts, value ranges and the first 20 distinct predictions.
unique_preds = np.unique(preds)

report_lines = [
    f"\nüìä AN√ÅLISIS DE DIVERSIDAD DE PREDICCIONES:",
    f"   Valores √∫nicos predichos: {len(unique_preds)}",
    f"   Valores √∫nicos reales: {len(np.unique(y_test.values))}",
    f"   Rango de predicciones: [{preds.min():.4f}%, {preds.max():.4f}%]",
    f"   Rango de valores reales: [{y_test.min():.4f}%, {y_test.max():.4f}%]",
    f"\n   Primeras 20 predicciones √∫nicas:",
    f"   {unique_preds[:20]}",
]
print("\n".join(report_lines))

## 6) Exportar el mejor modelo con metadata completa

In [None]:
# Persist the top-ranked model together with the metadata needed to reuse it.

# Make sure the output folder exists (e.g. on a fresh Colab runtime).
import os
os.makedirs('models', exist_ok=True)

# First row of res_df is the best model (res_df is sorted by test MAE).
best_row = res_df.iloc[0]
best_name = best_row["modelo"]
best_model = best_estimators[best_name]

# Store the bare estimator: when the winner is a Pipeline, take its "model" step.
if hasattr(best_model, 'named_steps'):
    final_model = best_model.named_steps['model']
else:
    final_model = best_model

# Raw columns the features are derived from. The volume entry follows the
# column actually detected at load time (`vcol`), so it is omitted when the
# data has no volume and spelled "volumen" when that is the column in use.
base_features = ["close"] + ([vcol] if vcol else []) + ["high", "low", "open"]

# Complete artifact with everything required at inference time.
artifact = {
    "model": final_model,
    "model_name": best_name,
    "feature_names": list(feature_cols),  # ordered feature list (copied)
    "base_features": base_features,
    "n_lags": N_LAGS,
    "use_feedback": False,  # switch to True when feedback is used
    "scaler_y": scaler_y,   # scaler fitted on the training target
    "metrics": {
        # Plain floats keep the stored metrics independent of numpy scalar types.
        "test_mae": float(best_row["test_mae"]),
        "test_rmse": float(best_row["test_rmse"]),
        "test_r2": float(best_row["test_r2"])
    },
    "best_params": best_row["mejores_params"]
}

# Save to disk.
joblib.dump(artifact, "models/model_feedback.pkl")
print(f"‚úÖ Modelo {best_name} guardado en models/model_feedback.pkl")
print(f"   Features: {len(feature_cols)}")
print(f"   Test MAE: {artifact['metrics']['test_mae']:.4f}%")
print(f"   Test R¬≤: {artifact['metrics']['test_r2']:.4f}")

# Tree statistics: a single fitted tree exposes `tree_`, an ensemble exposes
# `estimators_` whose members each carry their own `tree_`.
if hasattr(final_model, 'tree_'):
    print(f"   √Årbol - Hojas: {final_model.tree_.n_leaves}, Profundidad: {final_model.tree_.max_depth}")
elif hasattr(final_model, 'estimators_'):
    if len(final_model.estimators_) > 0 and hasattr(final_model.estimators_[0], 'tree_'):
        avg_leaves = np.mean([tree.tree_.n_leaves for tree in final_model.estimators_])
        print(f"   Random Forest - √Årboles: {len(final_model.estimators_)}, Promedio hojas: {avg_leaves:.0f}")

## 7) Descargar el modelo (para Colab)

Si estás en Google Colab, ejecuta esta celda para descargar el modelo entrenado:

In [None]:
# Offer the saved model as a browser download when running inside Colab.
try:
    from google.colab import files
    files.download('models/model_feedback.pkl')
    print("‚úÖ Modelo descargado exitosamente")
except ImportError:
    # The google.colab package could not be imported: skip the download.
    print("‚ÑπÔ∏è No est√°s en Colab o el archivo ya fue descargado")
except Exception as exc:
    # Best effort: keep the notebook running, but show why the download
    # failed instead of hiding it. KeyboardInterrupt/SystemExit are no
    # longer swallowed because only Exception subclasses are caught.
    print(f"No se pudo descargar el modelo: {exc!r}")