# Phase 2: Machine Learning Pipeline

This notebook demonstrates the Phase 2 implementation of the FinBERT-SOL project. It traces the full pipeline:
1. Feature Engineering (`FeatureEngineer`)
2. XGBoost Baseline Model (`XGBoostTrader`)
3. FinBERT Sentiment Integration (`FinBERTSentiment`)
4. Model Evaluation Framework (`ModelEvaluator`)

## 1. Setup & Imports

In [None]:
import sys
import os
import pandas as pd
import numpy as np
from datetime import datetime, time

# Add root directory to path to import src modules
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.features.feature_engineer import FeatureEngineer
from src.models.xgboost_trader import XGBoostTrader
from src.models.finbert_sentiment import FinBERTSentiment
from src.evaluation.model_evaluator import ModelEvaluator

## 2. Generate Mock Data

Since we don't have the live database connected here, we'll generate realistic mock data that mimics the `sol_ret_1h`, `bid_volume_5`, and other features used by `FeatureEngineer`.

In [None]:
def generate_mock_data(rows=2000, seed=42):
    """Build a synthetic hourly market-data frame for exercising the pipeline.

    The frame is indexed by timezone-naive hourly timestamps starting at
    2024-01-01 and contains order-flow, volume, technical, microstructure,
    cross-asset and time-of-day columns. Prices are Gaussian random walks;
    most other columns are independent random draws, so the data carries no
    real signal.

    Args:
        rows: Number of hourly rows to generate.
        seed: Seed for the private random generator. The default of 42
            reproduces the same numbers the legacy global
            ``np.random.seed(42)`` produced.

    Returns:
        pandas.DataFrame with ``rows`` rows. NaNs introduced by the
        percentage-change columns (leading rows) are replaced with 0.
    """
    # A private RandomState yields the same stream as the legacy global
    # seeding, but without resetting NumPy's process-wide RNG as a side effect.
    rng = np.random.RandomState(seed)
    # Lowercase "h" is the supported hourly alias; uppercase "H" is deprecated.
    dates = pd.date_range(start="2024-01-01", periods=rows, freq="1h")

    df = pd.DataFrame(index=dates)

    # NOTE: the order of the rng draws below determines the generated values;
    # keep it stable so a given seed keeps producing the same data.

    # Order flow features
    df['best_bid'] = np.cumsum(rng.normal(0, 0.5, rows)) + 100
    df['best_ask'] = df['best_bid'] + rng.uniform(0.01, 0.1, rows)
    df['mid_price'] = (df['best_bid'] + df['best_ask']) / 2
    df['spread'] = df['best_ask'] - df['best_bid']
    df['spread_bps'] = (df['spread'] / df['mid_price']) * 10000
    df['depth_imbalance_5'] = rng.uniform(-1, 1, rows)
    df['depth_imbalance_10'] = rng.uniform(-1, 1, rows)
    df['ofi_ratio'] = rng.uniform(-1, 1, rows)

    # Volume
    df['bid_volume_5'] = rng.exponential(1000, rows)
    df['ask_volume_5'] = rng.exponential(1000, rows)

    # Technical indicators
    # sol_ret_1h and roc_1h are both the one-step percentage change of mid_price.
    df['sol_ret_1h'] = df['mid_price'].pct_change()
    df['roc_1h'] = df['mid_price'].pct_change(periods=1)
    df['roc_4h'] = df['mid_price'].pct_change(periods=4)
    df['sma_diff_1h'] = rng.normal(0, 0.01, rows)
    df['sma_diff_4h'] = rng.normal(0, 0.02, rows)
    df['golden_cross'] = rng.choice([0, 1], rows)
    df['bb_width_1h'] = rng.uniform(0.01, 0.05, rows)
    df['bb_position_1h'] = rng.uniform(-1, 1, rows)

    # Market microstructure
    df['toxicity_bid_proxy'] = rng.uniform(0, 1, rows)
    df['toxicity_ask_proxy'] = rng.uniform(0, 1, rows)
    df['toxicity_imbalance'] = df['toxicity_bid_proxy'] - df['toxicity_ask_proxy']
    # The VWAP columns simply mirror the quote columns in this mock.
    df['bid_vwap_5'] = df['best_bid']
    df['ask_vwap_5'] = df['best_ask']
    df['vwap_mid_5'] = df['mid_price']

    # Cross-asset signals
    df['btc_close'] = np.cumsum(rng.normal(0, 50, rows)) + 40000
    df['btc_ret_1h'] = df['btc_close'].pct_change()
    df['rel_strength_1h'] = df['sol_ret_1h'] - df['btc_ret_1h']

    # Time features
    df['hour'] = df.index.hour
    df['day_of_week'] = df.index.dayofweek
    # Flag is 1 for hours 9..15 of the timezone-naive index clock.
    df['is_us_open'] = ((df.index.hour >= 9) & (df.index.hour < 16)).astype(int)

    # pct_change leaves NaN in the leading rows; zero them so no NaN escapes.
    return df.fillna(0)

# Build the mock dataset with the function's default arguments; later cells
# read the resulting frame through the global name `raw_data`.
raw_data = generate_mock_data()
print(f"Generated {len(raw_data)} rows of market data.")
# Last expression of the cell, so the notebook renders the first rows.
raw_data.head()

## 3. Feature Engineering

In [None]:
# Project feature-engineering helper, constructed with its defaults.
fe = FeatureEngineer()

# Prepare features from the raw mock frame
features = fe.prepare_features(raw_data)

# Create labels (1h horizon) from the same raw frame
labels = fe.create_labels(raw_data, horizon='1h')

# Keep only the labels whose timestamps are present in the feature frame.
# NOTE(review): scaling has not run yet at this point, so any row dropping
# presumably happens inside prepare_features — confirm.
aligned_labels = labels.loc[features.index]

# Train/Test split (test_size=0.3)
# NOTE(review): for time-ordered data the split should be chronological, not
# shuffled — confirm create_train_test_split does this.
X_train, X_test, y_train, y_test = fe.create_train_test_split(features, aligned_labels, test_size=0.3)

# Scale features: fit=True on the training split, fit=False on the test split.
# Presumably the scaler statistics are learned from training data only and
# reused for the test data — confirm against FeatureEngineer.scale_features.
X_train_scaled = fe.scale_features(X_train, fit=True)
X_test_scaled = fe.scale_features(X_test, fit=False)

print(f"Training set shape: {X_train_scaled.shape}")
print(f"Test set shape: {X_test_scaled.shape}")

## 4. XGBoost Baseline Evaluation

In [None]:
# Train the baseline model on the scaled training split.
# NOTE(review): the scaled test split is also passed to train_model. If it is
# used as a validation / early-stopping set, the test metrics reported below
# are optimistic — confirm against XGBoostTrader.train_model.
trader = XGBoostTrader()
print("Training XGBoost model...")
xgb_model = trader.train_model(X_train_scaled, y_train, X_test_scaled, y_test)

# predict() returns a pair, unpacked here as predictions and probabilities.
predictions, probs = trader.predict(X_test_scaled)

# Register the baseline's test-set results with the evaluator under the name
# 'XGBoost Baseline', then print the evaluator's report.
evaluator = ModelEvaluator()
evaluator.evaluate_model_performance(
    model_name='XGBoost Baseline',
    y_true=y_test,
    y_pred=predictions,
    y_prob=probs
)

print(evaluator.generate_evaluation_report())

### Plot Feature Importance

In [None]:
# Feature-importance plot for the trader trained above, limited by top_n=10
# (presumably the ten most important features — confirm in XGBoostTrader).
trader.plot_feature_importance(top_n=10)

## 5. FinBERT Sentiment Filter Pipeline

In [None]:
sentiment_analyzer = FinBERTSentiment()

# Mock some daily news for the FIRST two calendar days covered by the test set.
# The set comprehension collapses the test timestamps to distinct days; ISO
# date strings (YYYY-MM-DD) sort chronologically, so sorted() is oldest-first.
test_dates = sorted({str(ts.date()) for ts in X_test.index})
# Fail with a clear message instead of an IndexError when building mock_news.
assert len(test_dates) >= 2, "Need at least two distinct test days to attach mock news"

# Keys are date strings; each value is a list of headline dicts carrying the
# headline text and a time-of-day.
mock_news = {
    test_dates[0]: [
        {'text': 'Solana network shows strong performance with new DeFi integrations', 'time': time(8, 30)},
        {'text': 'SOL price surges as institutional interest grows', 'time': time(9, 0)}
    ],
    test_dates[1]: [
        {'text': 'Technical issues reported on Solana blockchain', 'time': time(8, 45)},
        {'text': 'SOL experiences volatility amid market uncertainty', 'time': time(9, 15)}
    ]
}

print("Analyzing daily sentiment...")
daily_sentiment = sentiment_analyzer.aggregate_daily_sentiment(mock_news)

# One line per day with its aggregated score.
for date, score in daily_sentiment.items():
    print(f"{date}: Sentiment Score = {score:.3f}")

# Turn the daily scores into a per-day trade filter; the backtest cell looks
# days up in it by date string.
sentiment_filter = sentiment_analyzer.create_sentiment_filter(daily_sentiment)
print("\nTrading Filter (Trade Allowed?):")
print(sentiment_filter)

## 6. Backtesting with Sentiment-Filtered Predictions

In [None]:
# Returns for the test rows, looked up by timestamp in the raw frame.
# NOTE(review): `sol_ret_1h` is produced with pct_change(), i.e. the return over
# the hour ENDING at each timestamp. If the labels describe the NEXT hour, these
# returns should be shifted forward by one step before backtesting — confirm
# against FeatureEngineer.create_labels.
actual_returns = raw_data.loc[X_test_scaled.index, 'sol_ret_1h']

# Per-row trade permission: look up each timestamp's calendar day (as a
# YYYY-MM-DD string) in the daily sentiment filter. Days without a sentiment
# entry default to True, i.e. trading stays allowed.
index_sentiment_filter = np.array(
    [
        sentiment_filter[day] if day in sentiment_filter else True
        for day in (str(ts.date()) for ts in X_test_scaled.index)
    ],
    dtype=bool,
)

# The evaluator receives three row-aligned sequences; catch any misalignment
# here rather than deep inside the evaluation code.
assert len(index_sentiment_filter) == len(predictions) == len(actual_returns), (
    "predictions, returns and sentiment filter must have one entry per test row"
)

# Evaluate Trading Performance
metrics = evaluator.evaluate_trading_performance('XGBoost Baseline', predictions, actual_returns, index_sentiment_filter)

# Show the metrics as a one-column table (last expression, so it is rendered).
pd.DataFrame([metrics]).T.rename(columns={0: 'Metric Value'})