In [3]:
import pandas as pd

# Load datasets
crypto_df = pd.read_csv("sample_data/crypto_category_flows.csv")
mf_df = pd.read_csv("sample_data/mutual_fund_flows.csv")
metal_df = pd.read_csv("sample_data/metal_indicators.csv")
macro_df = pd.read_csv("sample_data/fred_macro_flows.csv")
stock_df = pd.read_csv("sample_data/stocks.csv")

# Clean % change columns
def clean_percentage_columns(df):
    percent_cols = [col for col in df.columns if 'Change %' in col]
    for col in percent_cols:
        df[col] = df[col].replace('%', '', regex=True)
        df[col] = pd.to_numeric(df[col], errors='coerce')
    return df.dropna(subset=percent_cols)

# Clean and tag each dataset
crypto_clean = clean_percentage_columns(crypto_df)
crypto_clean['Asset'] = crypto_clean['Symbol']  # or replace with actual crypto name column
crypto_clean['Type'] = 'Crypto'
print(crypto_clean.head(10))

mf_clean = clean_percentage_columns(mf_df)
mf_clean['Asset'] = mf_clean['Symbol']  # or actual column
mf_clean['Type'] = 'Mutual Fund'

stock_clean = clean_percentage_columns(stock_df)
stock_clean['Asset'] = stock_clean['Symbol']  # or 'Name'
stock_clean['Type'] = 'Stock'

metal_clean = clean_percentage_columns(metal_df)
metal_clean['Asset'] = metal_clean['Code']  # or set manually to 'GOLD', 'SILVER' etc.
metal_clean['Type'] = 'Metal'

macro_clean = clean_percentage_columns(macro_df)
macro_clean['Asset'] = macro_clean['Indicator']  # like 'CPI', '10Y Treasury'
macro_clean['Type'] = 'Bond'

# Combine asset-based datasets




     Symbol  1W Change %  1M Change %  3M Change %  6M Change %  1Y Change %  \
0   BTC-USD         1.38        23.65         8.84        13.90        58.69   
1   ETH-USD        -0.78        56.67        -6.95       -18.78       -17.74   
2  USDT-USD        -0.01         0.01         0.01        -0.13         0.01   
3   XRP-USD        -0.45        13.53       -13.54       115.04       364.94   
4   BNB-USD        -0.66         9.33        -1.00         5.03        12.59   
5   SOL-USD        -1.88        18.66        -1.88       -30.35        -2.50   
6  USDC-USD        -0.02        -0.03        -0.02        -0.03        -0.03   
7  DOGE-USD         2.28        42.49       -12.01       -42.72        50.35   
8   ADA-USD        -2.63        18.04        -4.10         0.21        58.51   
9   TRX-USD        -3.05         8.52         9.28        32.45       118.44   

   5Y Change %             Name    Category     Asset    Type  
0       981.08      Bitcoin USD  Crypto All   BTC-USD  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crypto_clean['Asset'] = crypto_clean['Symbol']  # or replace with actual crypto name column
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crypto_clean['Type'] = 'Crypto'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mf_clean['Asset'] = mf_clean['Symbol']  # or actual column
A value is trying to b

In [4]:
combined_df = pd.concat([crypto_clean, mf_clean, stock_clean, metal_clean, macro_clean], ignore_index=True)

change_cols = ['1W Change %', '1M Change %', '3M Change %', '6M Change %', '1Y Change %', '5Y Change %']

macro_mask = combined_df['Type'].isin(['Bond', 'Metal'])
combined_df.loc[macro_mask, change_cols] = combined_df.loc[macro_mask, change_cols].fillna(3.0)
combined_df.to_csv("combined_data.csv", index=False)


In [12]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader, TensorDataset
import cvxpy as cp

# -------------------------------
# Load Data (preprocessed + labeled)
# -------------------------------
combined_df = pd.read_csv("combined_data.csv")

# -------------------------------
# Clean and Prepare Input Features
# -------------------------------
change_cols = ['1W Change %', '1M Change %', '3M Change %', '6M Change %', '1Y Change %', '5Y Change %']
combined_df[change_cols] = combined_df[change_cols].apply(pd.to_numeric, errors='coerce')
combined_df.replace([np.inf, -np.inf], np.nan, inplace=True)
macro_mask = combined_df['Type'].isin(['Bond', 'Metal'])
combined_df.loc[macro_mask, change_cols] = combined_df.loc[macro_mask, change_cols].fillna(3.0)

X_raw = combined_df[change_cols]
valid_mask = X_raw.notna().all(axis=1)
X_raw = X_raw[valid_mask]
y_raw = combined_df.loc[valid_mask, '3M Change %'].astype(np.float32) / 100.0
asset_names_all = combined_df.loc[valid_mask, 'Asset'].values
asset_types_all = combined_df.loc[valid_mask, 'Type'].values

# -------------------------------
# Scale features and targets
# -------------------------------
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(X_raw)

target_scaler = StandardScaler()
y_scaled = target_scaler.fit_transform(y_raw.values.reshape(-1, 1)).flatten()

# -------------------------------
# Convert to Tensors
# -------------------------------
X_tensor = torch.tensor(X_scaled, dtype=torch.float32).unsqueeze(1)
y_tensor = torch.tensor(y_scaled, dtype=torch.float32)

# -------------------------------
# Train/Test Split
# -------------------------------
X_train, X_test, y_train, y_test, asset_train, asset_test, type_train, type_test = train_test_split(
    X_tensor, y_tensor, asset_names_all, asset_types_all, test_size=0.2, random_state=42
)

train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

# -------------------------------
# Define Attention LSTM Model
# -------------------------------
class AttentionLSTMRegressor(nn.Module):
    def __init__(self, input_dim=6, hidden_dim=64):
        super(AttentionLSTMRegressor, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.attn = nn.Linear(hidden_dim, 1)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        lstm_out = self.dropout(lstm_out)
        attn_weights = torch.softmax(self.attn(lstm_out).squeeze(-1), dim=1)
        context = torch.sum(lstm_out * attn_weights.unsqueeze(-1), dim=1)
        return self.fc(context).squeeze(-1)

# -------------------------------
# Train the Model
# -------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AttentionLSTMRegressor().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()

for epoch in range(20):
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        preds = model(xb)
        loss = criterion(preds, yb)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, RMSE: {np.sqrt(total_loss / len(train_loader)):.4f}")

# -------------------------------
# Evaluation
# -------------------------------
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in DataLoader(TensorDataset(X_test, y_test), batch_size=32):
        xb = xb.to(device)
        preds = model(xb).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(yb.numpy())

# Inverse transform predictions
y_pred_inv = target_scaler.inverse_transform(np.array(all_preds).reshape(-1, 1)).flatten()
y_true_inv = target_scaler.inverse_transform(np.array(all_labels).reshape(-1, 1)).flatten()

rmse = np.sqrt(mean_squared_error(y_true_inv, y_pred_inv))
direction_acc = np.mean((y_pred_inv > 0) == (y_true_inv > 0))
print(f"\nâœ… Final RMSE: {rmse:.4f}")
print(f"ðŸ“ˆ Directional Accuracy: {direction_acc:.4f}")

# -------------------------------
# Optimized Portfolio using Markowitz
# -------------------------------
pred_df = pd.DataFrame({
    'Asset': asset_test,
    'Type': type_test,
    'Predicted 3M Change %': y_pred_inv,
    'Actual 3M Change %': y_true_inv
}).sort_values(by='Predicted 3M Change %', ascending=False)

# Select top N assets
N = 12
top_assets = pred_df.head(N)
expected_returns = top_assets['Predicted 3M Change %'].values
print(expected_returns)
symbols = top_assets['Asset'].values

# Dummy covariance matrix (identity)
cov_matrix = np.identity(N)
risk_aversion = 0.5

w = cp.Variable(N)
objective = cp.Maximize(expected_returns @ w - risk_aversion * cp.quad_form(w, cov_matrix))
constraints = [cp.sum(w) == 1, w >= 0]
prob = cp.Problem(objective, constraints)
prob.solve()
weights = w.value

# Output optimized portfolio
opt_portfolio = pd.DataFrame({
    'Asset': symbols,
    'Type': top_assets['Type'].values,
    'Predicted Return (%)': np.round(expected_returns * 100, 2),
    'Optimized Weight %': np.round(weights * 100, 2)
})

print("\nðŸ“ˆ Optimized Portfolio (Markowitz):")
print(opt_portfolio)


Epoch 1, RMSE: 1.0745
Epoch 2, RMSE: 1.0746
Epoch 3, RMSE: 1.0726
Epoch 4, RMSE: 1.0698
Epoch 5, RMSE: 1.0785
Epoch 6, RMSE: 1.0706
Epoch 7, RMSE: 1.0796
Epoch 8, RMSE: 1.0770
Epoch 9, RMSE: 1.0625
Epoch 10, RMSE: 1.0745
Epoch 11, RMSE: 1.0610
Epoch 12, RMSE: 1.0655
Epoch 13, RMSE: 1.0652
Epoch 14, RMSE: 1.0674
Epoch 15, RMSE: 1.0665
Epoch 16, RMSE: 1.0571
Epoch 17, RMSE: 1.0666
Epoch 18, RMSE: 1.0643
Epoch 19, RMSE: 1.0544
Epoch 20, RMSE: 1.0618

âœ… Final RMSE: 51.0151
ðŸ“ˆ Directional Accuracy: 0.4333
[25.765356  23.942709  22.687004  16.031126  13.425082  12.926043
  8.881901   6.305207   5.9165864  4.253714   3.374177   3.2609093]

ðŸ“ˆ Optimized Portfolio (Markowitz):
               Asset    Type  Predicted Return (%)  Optimized Weight %
0      MSTR34245-USD  Crypto           2576.540039               100.0
1     TURBO33724-USD  Crypto           2394.270020                -0.0
2            TIA-USD  Crypto           2268.699951                -0.0
3     LIBRA36211-USD  Crypto     