In [25]:
# multi_crypto_forecast.py

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, LSTM, RepeatVector, TimeDistributed, Dense, Dropout, LayerNormalization, Bidirectional
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import mlflow.tensorflow
import yfinance as yf
import os
import matplotlib.pyplot as plt



In [26]:

mlflow.tensorflow.autolog()



In [27]:

# --- Parameters ---
WINDOW_SIZE = 30
FORECAST_HORIZON = 5
BATCH_SIZE = 64
EPOCHS = 100
CRYPTO_SYMBOLS = ["BTC-USD", "ETH-USD", "DOGE-USD", "LTC-USD"]
EXTERNAL_SYMBOLS = ["GLD", "CL=F", "^GSPC"]  # Gold, Oil, S&P 500
START_DATE = "2018-01-01"
END_DATE = "2024-01-01"
MODEL_DIR = "../models/model.keras"

# --- Download and prepare dataset ---
def fetch_market_data(symbols, start, end):
    dfs = []
    for sym in symbols:
        data = yf.download(sym, start=start, end=end)[['Close']]
        data.columns = [sym]
        dfs.append(data)
    df = pd.concat(dfs, axis=1).dropna()
    return df

def fetch_sentiment_data():
    # Placeholder for real sentiment ingestion (Twitter, Reddit, etc.)
    # In practice, replace this with actual sentiment feature generation
    # Here we simulate 3 sentiment scores for BTC, ETH, DOGE over time
    dates = pd.date_range(start=START_DATE, end=END_DATE, freq='D')
    np.random.seed(42)
    sentiment = pd.DataFrame({
        'Date': dates,
        'BTC_sentiment': np.random.uniform(-1, 1, len(dates)),
        'ETH_sentiment': np.random.uniform(-1, 1, len(dates)),
        'DOGE_sentiment': np.random.uniform(-1, 1, len(dates))
    })
    sentiment = sentiment.set_index('Date')
    return sentiment

def load_data():
    price_df = fetch_market_data(CRYPTO_SYMBOLS, START_DATE, END_DATE)
    external_df = fetch_market_data(EXTERNAL_SYMBOLS, START_DATE, END_DATE)
    sentiment_df = fetch_sentiment_data()

    df = price_df.join(external_df, how='inner')
    df = df.join(sentiment_df, how='inner')
    df = df.reset_index().sort_values('Date')
    df = df.drop(columns=['Date'])

    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(df)
    return scaled, scaler, df.columns.tolist()

def create_sequences(data, window_size, forecast_horizon):
    X, y = [], []
    for i in range(len(data) - window_size - forecast_horizon):
        X.append(data[i:i + window_size])
        y.append(data[i + window_size:i + window_size + forecast_horizon])
    return np.array(X), np.array(y)

# --- Build Conv-BiLSTM Encoder-Decoder Model ---
def build_model(input_shape, output_steps):
    inputs = Input(shape=input_shape)
    x = Conv1D(filters=64, kernel_size=3, activation='relu', padding='causal')(inputs)
    x = Dropout(0.2)(x)
    x = LayerNormalization()(x)
    x = Bidirectional(LSTM(100, return_sequences=False))(x)
    x = RepeatVector(output_steps)(x)
    x = LSTM(100, return_sequences=True)(x)
    outputs = TimeDistributed(Dense(input_shape[1]))(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# --- Plotting helper ---
def log_predictions(X_val, y_val, y_pred, feature_names):
    plt.figure(figsize=(10, 6))
    for i in range(min(len(feature_names), 6)):  # Plot top 6 features max
        plt.plot(y_val[0, :, i], label=f"True - {feature_names[i]}", linestyle='--')
        plt.plot(y_pred[0, :, i], label=f"Pred - {feature_names[i]}")
    plt.title("Sample Forecast vs Actual")
    plt.legend()
    plt.tight_layout()
    os.makedirs("plots", exist_ok=True)
    plt.savefig("plots/sample_forecast.png")
    mlflow.log_artifact("plots/sample_forecast.png")
    plt.close()

# --- Training Pipeline ---
def train_model():
    data, scaler, feature_names = load_data()
    NUM_FEATURES = data.shape[1]
    X, y = create_sequences(data, WINDOW_SIZE, FORECAST_HORIZON)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

    model = build_model((WINDOW_SIZE, NUM_FEATURES), FORECAST_HORIZON)

    with mlflow.start_run():
        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=EPOCHS,
            batch_size=BATCH_SIZE,
            callbacks=[
                tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
                tf.keras.callbacks.ReduceLROnPlateau(patience=5, factor=0.5)
            ]
        )

        model.save(MODEL_DIR, save_format='keras')
        mlflow.log_artifact(MODEL_DIR)

        # Evaluate and log predictions
        y_pred = model.predict(X_val[:1])
        print(X_val, y_val, y_pred, feature_names)
        log_predictions(X_val, y_val, y_pred, feature_names)

In [28]:
train_model()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Epoch 1/100
[1m18/19[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 21ms/step - loss: 0.1072 - mae: 0.2405



[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 49ms/step - loss: 0.1032 - mae: 0.2352 - val_loss: 0.0394 - val_mae: 0.1427 - learning_rate: 0.0010
Epoch 2/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 0.0368 - mae: 0.1349



[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - loss: 0.0367 - mae: 0.1346 - val_loss: 0.0289 - val_mae: 0.1144 - learning_rate: 0.0010
Epoch 3/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - loss: 0.0297 - mae: 0.1145 - val_loss: 0.0296 - val_mae: 0.1199 - learning_rate: 0.0010
Epoch 4/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - loss: 0.0282 - mae: 0.1081 - val_loss: 0.0297 - val_mae: 0.1186 - learning_rate: 0.0010
Epoch 5/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - loss: 0.0276 - mae: 0.1057 - val_loss: 0.0292 - val_mae: 0.1162 - learning_rate: 0.0010
Epoch 6/100
[1m18/19[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 23ms/step - loss: 0.0271 - mae: 0.1037



[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - loss: 0.0272 - mae: 0.1036 - val_loss: 0.0285 - val_mae: 0.1135 - learning_rate: 0.0010
Epoch 7/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - loss: 0.0273 - mae: 0.1020 - val_loss: 0.0294 - val_mae: 0.1188 - learning_rate: 0.0010
Epoch 8/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - loss: 0.0269 - mae: 0.1000 - val_loss: 0.0287 - val_mae: 0.1138 - learning_rate: 0.0010
Epoch 9/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 0.0268 - mae: 0.1000



[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - loss: 0.0268 - mae: 0.1000 - val_loss: 0.0284 - val_mae: 0.1137 - learning_rate: 0.0010
Epoch 10/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - loss: 0.0265 - mae: 0.0985 - val_loss: 0.0284 - val_mae: 0.1140 - learning_rate: 0.0010
Epoch 11/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - loss: 0.0265 - mae: 0.0984 - val_loss: 0.0294 - val_mae: 0.1187 - learning_rate: 0.0010
Epoch 12/100
[1m18/19[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 22ms/step - loss: 0.0263 - mae: 0.0973



[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - loss: 0.0263 - mae: 0.0973 - val_loss: 0.0282 - val_mae: 0.1123 - learning_rate: 0.0010
Epoch 13/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - loss: 0.0260 - mae: 0.0964 - val_loss: 0.0284 - val_mae: 0.1142 - learning_rate: 0.0010
Epoch 14/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - loss: 0.0259 - mae: 0.0956 - val_loss: 0.0288 - val_mae: 0.1158 - learning_rate: 0.0010
Epoch 15/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - loss: 0.0258 - mae: 0.0951 - val_loss: 0.0288 - val_mae: 0.1148 - learning_rate: 0.0010
Epoch 16/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - loss: 0.0256 - mae: 0.0949 - val_loss: 0.0289 - val_mae: 0.1158 - learning_rate: 0.0010
Epoch 17/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - loss: 0.0256 - mae: 0.0948 - val_loss: 



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 334ms/step
[[[0.25009737 0.32806934 0.08699725 ... 0.05835337 0.25127169 0.74224157]
  [0.2819876  0.3457812  0.0914481  ... 0.78051281 0.84458407 0.05174001]
  [0.29736743 0.3446559  0.09128271 ... 0.99543522 0.30859673 0.87479496]
  ...
  [0.2535319  0.26384594 0.08547216 ... 0.53412396 0.08415423 0.01745092]
  [0.2501686  0.25934772 0.08523945 ... 0.21279319 0.39327079 0.10185007]
  [0.24713894 0.25412269 0.08368362 ... 0.00893651 0.10027945 0.41036747]]

 [[0.2819876  0.3457812  0.0914481  ... 0.78051281 0.84458407 0.05174001]
  [0.29736743 0.3446559  0.09128271 ... 0.99543522 0.30859673 0.87479496]
  [0.26512859 0.3165291  0.0857561  ... 0.05477067 0.5732908  0.35233199]
  ...
  [0.2501686  0.25934772 0.08523945 ... 0.21279319 0.39327079 0.10185007]
  [0.24713894 0.25412269 0.08368362 ... 0.00893651 0.10027945 0.41036747]
  [0.24580516 0.25358474 0.08488818 ... 0.23889207 0.01658838 0.93365679]]

 [[0.29736743 0.3446559