In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

# Step 1: Mock Data
def generate_mock_data(num_currency_pairs=5, num_clients_per_pair=10, days=2):
    records = []
    for ccy in range(num_currency_pairs):
        for client in range(num_clients_per_pair):
            for hour in range(24 * days):
                timestamp = pd.Timestamp("2024-01-01") + pd.Timedelta(hours=hour)
                market_volume = np.random.uniform(1e5, 1e6)
                client_spread = np.random.uniform(0.1, 1.0)
                market_spread = np.random.uniform(0.05, 0.5)
                bid_edge = np.random.uniform(0.01, 0.05)
                ask_edge = np.random.uniform(0.01, 0.05)
                flow_volume = np.random.uniform(0, market_volume * 0.1)
                market_total_volume = market_volume + np.random.uniform(0, 5e5)

                records.append({
                    'TimeStamp': timestamp,
                    'client_id': client,
                    'segment_id': client % 3,
                    'currency_pair_id': ccy,
                    'market_volume': market_volume,
                    'client_spread': client_spread,
                    'market_spread': market_spread,
                    'bid_edge': bid_edge,
                    'ask_edge': ask_edge,
                    'flow_volume': flow_volume,
                    'market_total_volume': market_total_volume
                })
    df = pd.DataFrame(records)
    df['flow_ratio'] = df['flow_volume'] / df['market_total_volume']
    return df

df = generate_mock_data()



In [5]:
df.head()

Unnamed: 0,TimeStamp,client_id,segment_id,currency_pair_id,market_volume,client_spread,market_spread,bid_edge,ask_edge,flow_volume,market_total_volume,flow_ratio
0,2024-01-01 00:00:00,0,0,0,315560.204932,0.985904,0.297364,0.022234,0.028561,28527.604492,664617.0,0.042923
1,2024-01-01 01:00:00,0,0,0,643611.772371,0.37175,0.090416,0.03381,0.029707,2563.872592,1035847.0,0.002475
2,2024-01-01 02:00:00,0,0,0,945513.970382,0.606315,0.061555,0.012502,0.014629,25757.344567,1226319.0,0.021004
3,2024-01-01 03:00:00,0,0,0,475615.196919,0.270836,0.477591,0.042928,0.031481,39255.455122,673852.1,0.058255
4,2024-01-01 04:00:00,0,0,0,153958.435224,0.49852,0.302521,0.012464,0.013201,3529.798777,515716.8,0.006844


In [7]:
import numpy as np
import pandas as pd

def add_time_features(df, time_col='TimeStamp'):
    df = df.copy()
    
    # Ensure datetime
    df[time_col] = pd.to_datetime(df[time_col])

    # Extract time components
    df['hour'] = df[time_col].dt.hour
    df['day_of_week'] = df[time_col].dt.dayofweek
    df['day_of_month'] = df[time_col].dt.day
    df['week_of_year'] = df[time_col].dt.isocalendar().week.astype(int)

    # Normalize with sin/cos transformation to represent cycles
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

    df['dow_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['dow_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

    df['dom_sin'] = np.sin(2 * np.pi * df['day_of_month'] / 31)
    df['dom_cos'] = np.cos(2 * np.pi * df['day_of_month'] / 31)

    df['woy_sin'] = np.sin(2 * np.pi * df['week_of_year'] / 52)
    df['woy_cos'] = np.cos(2 * np.pi * df['week_of_year'] / 52)

    return df


In [9]:
df = add_time_features(df)

In [11]:
# ========================================
# 📦 1. Import dependencies
# ========================================
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ========================================
# 🕐 2. Time-based Feature Engineering
# ========================================
def add_time_features(df, time_col='TimeStamp'):
    df = df.copy()
    df[time_col] = pd.to_datetime(df[time_col])
    df['hour'] = df[time_col].dt.hour
    df['day_of_week'] = df[time_col].dt.dayofweek
    df['day_of_month'] = df[time_col].dt.day
    df['week_of_year'] = df[time_col].dt.isocalendar().week.astype(int)

    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['dow_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['dow_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    df['dom_sin'] = np.sin(2 * np.pi * df['day_of_month'] / 31)
    df['dom_cos'] = np.cos(2 * np.pi * df['day_of_month'] / 31)
    df['woy_sin'] = np.sin(2 * np.pi * df['week_of_year'] / 52)
    df['woy_cos'] = np.cos(2 * np.pi * df['week_of_year'] / 52)

    return df

# ========================================
# 🧪 3. Train Classical Models
# ========================================
def train_and_evaluate_models(X, y, test_size=0.2, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    models = {
        "LogisticRegression": LogisticRegression(max_iter=1000),
        "DecisionTree": DecisionTreeRegressor(max_depth=5),
        "RandomForest": RandomForestRegressor(n_estimators=100, max_depth=5),
        "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, objective='reg:squarederror')
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        mse = mean_squared_error(y_test, preds)
        mae = mean_absolute_error(y_test, preds)
        r2 = r2_score(y_test, preds)

        results[name] = {
            "model": model,
            "MSE": mse,
            "MAE": mae,
            "R2": r2
        }

        print(f"✅ {name}: MSE={mse:.4f}, MAE={mae:.4f}, R2={r2:.4f}")

    return results

# ========================================
# 🤖 4. PyTorch MLP Model
# ========================================
class FlowDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X.values, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32).view(-1, 1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.net(x)

def train_mlp_model(X, y, epochs=20, batch_size=64, lr=1e-3):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    train_loader = DataLoader(FlowDataset(pd.DataFrame(X_train_scaled), y_train), batch_size=batch_size, shuffle=True)
    test_dataset = FlowDataset(pd.DataFrame(X_test_scaled), y_test)

    model = MLP(input_dim=X.shape[1])
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        for xb, yb in train_loader:
            pred = model(xb)
            loss = criterion(pred, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

    # Evaluation
    model.eval()
    with torch.no_grad():
        X_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
        y_pred = model(X_tensor).numpy().flatten()
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f"✅ MLP: MSE={mse:.4f}, MAE={mae:.4f}, R2={r2:.4f}")

    return model, scaler


In [15]:
df.head()

Unnamed: 0,TimeStamp,client_id,segment_id,currency_pair_id,market_volume,client_spread,market_spread,bid_edge,ask_edge,flow_volume,...,day_of_month,week_of_year,hour_sin,hour_cos,dow_sin,dow_cos,dom_sin,dom_cos,woy_sin,woy_cos
0,2024-01-01 00:00:00,0,0,0,315560.204932,0.985904,0.297364,0.022234,0.028561,28527.604492,...,1,1,0.0,1.0,0.0,1.0,0.201299,0.97953,0.120537,0.992709
1,2024-01-01 01:00:00,0,0,0,643611.772371,0.37175,0.090416,0.03381,0.029707,2563.872592,...,1,1,0.258819,0.965926,0.0,1.0,0.201299,0.97953,0.120537,0.992709
2,2024-01-01 02:00:00,0,0,0,945513.970382,0.606315,0.061555,0.012502,0.014629,25757.344567,...,1,1,0.5,0.866025,0.0,1.0,0.201299,0.97953,0.120537,0.992709
3,2024-01-01 03:00:00,0,0,0,475615.196919,0.270836,0.477591,0.042928,0.031481,39255.455122,...,1,1,0.707107,0.707107,0.0,1.0,0.201299,0.97953,0.120537,0.992709
4,2024-01-01 04:00:00,0,0,0,153958.435224,0.49852,0.302521,0.012464,0.013201,3529.798777,...,1,1,0.866025,0.5,0.0,1.0,0.201299,0.97953,0.120537,0.992709


In [13]:
df = add_time_features(df)

feature_cols = [
    'market_volume', 'client_spread', 'market_spread',
    'hour_sin', 'hour_cos', 'dow_sin', 'dow_cos',
    'dom_sin', 'dom_cos', 'woy_sin', 'woy_cos',
    # 加上 one-hot encoded ClientSegment, ClientID 等
]

X = df[feature_cols]
y = df['flow_ratio']

results = train_and_evaluate_models(X, y)

mlp_model, mlp_scaler = train_mlp_model(X, y)

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [17]:

mlp_model, mlp_scaler = train_mlp_model(X, y)

Epoch 1/20, Loss: 0.0005
Epoch 2/20, Loss: 0.0007
Epoch 3/20, Loss: 0.0005
Epoch 4/20, Loss: 0.0006
Epoch 5/20, Loss: 0.0006
Epoch 6/20, Loss: 0.0006
Epoch 7/20, Loss: 0.0005
Epoch 8/20, Loss: 0.0005
Epoch 9/20, Loss: 0.0004
Epoch 10/20, Loss: 0.0006
Epoch 11/20, Loss: 0.0005
Epoch 12/20, Loss: 0.0005
Epoch 13/20, Loss: 0.0005
Epoch 14/20, Loss: 0.0005
Epoch 15/20, Loss: 0.0005
Epoch 16/20, Loss: 0.0005
Epoch 17/20, Loss: 0.0004
Epoch 18/20, Loss: 0.0005
Epoch 19/20, Loss: 0.0004
Epoch 20/20, Loss: 0.0005
✅ MLP: MSE=0.0005, MAE=0.0189, R2=-0.0351


In [19]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

# Make sure this matches your model definition
class FlowRatioModel(nn.Module):
    def __init__(self, num_numeric, cat_dims, emb_dims):
        super().__init__()
        self.embeddings = nn.ModuleList(
            [nn.Embedding(cat_dim, emb_dim) for cat_dim, emb_dim in zip(cat_dims, emb_dims)]
        )
        total_emb_dim = sum(emb_dims)
        self.mlp = nn.Sequential(
            nn.Linear(num_numeric + total_emb_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x_num, x_cat):
        embs = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        x = torch.cat([x_num] + embs, dim=1)
        return self.mlp(x)

class FlowRatioDataset(torch.utils.data.Dataset):
    def __init__(self, df, numeric_features, cat_features, y_column=None):
        self.x_num = torch.tensor(df[numeric_features].values, dtype=torch.float32)
        self.x_cat = torch.tensor(df[cat_features].values, dtype=torch.long)
        self.y = None if y_column is None else torch.tensor(df[y_column].values, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        return len(self.x_num)

    def __getitem__(self, idx):
        return self.x_num[idx], self.x_cat[idx], self.y[idx] if self.y is not None else 0.0

# ----------------------------
# TRAINING FUNCTION
# ----------------------------
def train_model(df_train, numeric_features, cat_features, y_column, device='cpu'):
    global client_id_map, currency_pair_map, scaler

    # Encode categorical vars
    client_id_map = {val: idx for idx, val in enumerate(df_train['client_id'].unique())}
    currency_pair_map = {val: idx for idx, val in enumerate(df_train['currency_pair'].unique())}

    df_train['client_id_enc'] = df_train['client_id'].map(client_id_map)
    df_train['currency_pair_enc'] = df_train['currency_pair'].map(currency_pair_map)

    # Normalize numeric vars
    scaler = StandardScaler()
    df_train[numeric_features] = scaler.fit_transform(df_train[numeric_features])

    # Dataset + Model
    train_dataset = FlowRatioDataset(df_train, numeric_features, ['client_id_enc', 'currency_pair_enc'], y_column)
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

    model = FlowRatioModel(
        num_numeric=len(numeric_features),
        cat_dims=[len(client_id_map), len(currency_pair_map)],
        emb_dims=[32, 16]  # Tuneable
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    # Train loop (5 epochs)
    for epoch in range(5):
        model.train()
        epoch_loss = 0
        for x_num, x_cat, y in train_loader:
            x_num, x_cat, y = x_num.to(device), x_cat.to(device), y.to(device)

            pred = model(x_num, x_cat)
            loss = loss_fn(pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        print(f"Epoch {epoch+1} loss: {epoch_loss:.4f}")

    return model


In [21]:
def predict_flow_ratio(model, df_test, numeric_features, device='cpu'):
    df_test = df_test.copy()

    df_test['client_id_enc'] = df_test['client_id'].map(client_id_map)
    df_test['currency_pair_enc'] = df_test['currency_pair'].map(currency_pair_map)

    # Unknown mappings -> fill with 0 or handle better
    df_test['client_id_enc'] = df_test['client_id_enc'].fillna(0).astype(int)
    df_test['currency_pair_enc'] = df_test['currency_pair_enc'].fillna(0).astype(int)

    df_test[numeric_features] = scaler.transform(df_test[numeric_features])

    test_dataset = FlowRatioDataset(df_test, numeric_features, ['client_id_enc', 'currency_pair_enc'])
    test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

    model.eval()
    preds = []

    with torch.no_grad():
        for x_num, x_cat, _ in test_loader:
            x_num, x_cat = x_num.to(device), x_cat.to(device)
            out = model(x_num, x_cat)
            preds.append(out.cpu().numpy())

    preds = np.concatenate(preds).flatten()
    df_test['predicted_flow_ratio'] = preds

    return df_test


In [31]:
df_train['currency_pair'] = df_train['currency_pair_id']
df_test = df.tail(100)

In [33]:
numeric_features = ['hour', 'market_volume', 'client_spread', 'market_spread']
y_column = 'flow_ratio'

df_train = df
model = train_model(df_train, numeric_features, cat_features=['client_id_enc', 'currency_pair_enc'], y_column=y_column)
df_with_preds = predict_flow_ratio(model, df_test, numeric_features)


Epoch 1 loss: 2.8012
Epoch 2 loss: 0.6212
Epoch 3 loss: 0.1470
Epoch 4 loss: 0.0536
Epoch 5 loss: 0.0296


In [35]:
df.head()

Unnamed: 0,TimeStamp,client_id,segment_id,currency_pair_id,market_volume,client_spread,market_spread,bid_edge,ask_edge,flow_volume,...,hour_cos,dow_sin,dow_cos,dom_sin,dom_cos,woy_sin,woy_cos,currency_pair,client_id_enc,currency_pair_enc
0,2024-01-01 00:00:00,0,0,0,-0.895407,1.685866,0.175139,0.022234,0.028561,28527.604492,...,1.0,0.0,1.0,0.201299,0.97953,0.120537,0.992709,0,0,0
1,2024-01-01 01:00:00,0,0,0,0.368501,-0.647487,-1.403594,0.03381,0.029707,2563.872592,...,0.965926,0.0,1.0,0.201299,0.97953,0.120537,0.992709,0,0,0
2,2024-01-01 02:00:00,0,0,0,1.531663,0.243694,-1.623761,0.012502,0.014629,25757.344567,...,0.866025,0.0,1.0,0.201299,0.97953,0.120537,0.992709,0,0,0
3,2024-01-01 03:00:00,0,0,0,-0.278752,-1.03089,1.550026,0.042928,0.031481,39255.455122,...,0.707107,0.0,1.0,0.201299,0.97953,0.120537,0.992709,0,0,0
4,2024-01-01 04:00:00,0,0,0,-1.518023,-0.16585,0.214479,0.012464,0.013201,3529.798777,...,0.5,0.0,1.0,0.201299,0.97953,0.120537,0.992709,0,0,0
