In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

# Step 1: Mock Data
def generate_mock_data(num_currency_pairs=5, num_clients_per_pair=10, days=2):
    records = []
    for ccy in range(num_currency_pairs):
        for client in range(num_clients_per_pair):
            for hour in range(24 * days):
                timestamp = pd.Timestamp("2024-01-01") + pd.Timedelta(hours=hour)
                market_volume = np.random.uniform(1e5, 1e6)
                client_spread = np.random.uniform(0.1, 1.0)
                market_spread = np.random.uniform(0.05, 0.5)
                bid_edge = np.random.uniform(0.01, 0.05)
                ask_edge = np.random.uniform(0.01, 0.05)
                flow_volume = np.random.uniform(0, market_volume * 0.1)
                market_total_volume = market_volume + np.random.uniform(0, 5e5)

                records.append({
                    'TimeStamp': timestamp,
                    'client_id': client,
                    'segment_id': client % 3,
                    'currency_pair_id': ccy,
                    'market_volume': market_volume,
                    'client_spread': client_spread,
                    'market_spread': market_spread,
                    'bid_edge': bid_edge,
                    'ask_edge': ask_edge,
                    'flow_volume': flow_volume,
                    'market_total_volume': market_total_volume
                })
    df = pd.DataFrame(records)
    df['flow_ratio'] = df['flow_volume'] / df['market_total_volume']
    return df

df = generate_mock_data()



In [2]:
df.head()

Unnamed: 0,TimeStamp,client_id,segment_id,currency_pair_id,market_volume,client_spread,market_spread,bid_edge,ask_edge,flow_volume,market_total_volume,flow_ratio
0,2024-01-01 00:00:00,0,0,0,226873.893794,0.941218,0.405161,0.026663,0.025619,3228.497231,440295.8,0.007333
1,2024-01-01 01:00:00,0,0,0,113387.90982,0.71004,0.175491,0.045596,0.032908,528.723994,369662.6,0.00143
2,2024-01-01 02:00:00,0,0,0,593987.392891,0.84069,0.303079,0.026762,0.018154,34677.859018,613054.0,0.056566
3,2024-01-01 03:00:00,0,0,0,473642.46085,0.194164,0.230058,0.045883,0.049876,34808.866067,679426.4,0.051233
4,2024-01-01 04:00:00,0,0,0,597459.747618,0.298175,0.484759,0.037752,0.020775,34016.8668,1017944.0,0.033417


In [11]:
import numpy as np
import pandas as pd

def add_time_features(df, time_col='TimeStamp'):
    df = df.copy()
    
    # Ensure datetime
    df[time_col] = pd.to_datetime(df[time_col])

    # Extract time components
    df['hour'] = df[time_col].dt.hour
    df['day_of_week'] = df[time_col].dt.dayofweek
    df['day_of_month'] = df[time_col].dt.day
    df['week_of_year'] = df[time_col].dt.isocalendar().week.astype(int)

    # Normalize with sin/cos transformation to represent cycles
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

    df['dow_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['dow_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

    df['dom_sin'] = np.sin(2 * np.pi * df['day_of_month'] / 31)
    df['dom_cos'] = np.cos(2 * np.pi * df['day_of_month'] / 31)

    df['woy_sin'] = np.sin(2 * np.pi * df['week_of_year'] / 52)
    df['woy_cos'] = np.cos(2 * np.pi * df['week_of_year'] / 52)

    return df


In [12]:
df = add_time_features(df)

In [13]:

class FlowDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        time_hour = row['TimeStamp'].hour + 24 * row['TimeStamp'].dayofweek
        features = np.array([
            time_hour / 168,  # normalize to week cycle
            row['market_volume'],
            row['client_spread'],
            row['market_spread'],
            row['bid_edge'],
            row['ask_edge']
        ], dtype=np.float32)

        return {
            'features': torch.tensor(features),
            'client_id': torch.tensor(row['client_id'], dtype=torch.long),
            'segment_id': torch.tensor(row['segment_id'], dtype=torch.long),
            'currency_pair_id': torch.tensor(row['currency_pair_id'], dtype=torch.long),
            'target': torch.tensor(row['flow_ratio'], dtype=torch.float32)
        }


import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

# Make sure this matches your model definition
class FlowRatioModel(nn.Module):
    def __init__(self, num_numeric, cat_dims, emb_dims):
        super().__init__()
        self.embeddings = nn.ModuleList(
            [nn.Embedding(cat_dim, emb_dim) for cat_dim, emb_dim in zip(cat_dims, emb_dims)]
        )
        total_emb_dim = sum(emb_dims)
        self.mlp = nn.Sequential(
            nn.Linear(num_numeric + total_emb_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x_num, x_cat):
        embs = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        x = torch.cat([x_num] + embs, dim=1)
        return self.mlp(x)

class FlowRatioDataset(torch.utils.data.Dataset):
    def __init__(self, df, numeric_features, cat_features, y_column=None):
        self.x_num = torch.tensor(df[numeric_features].values, dtype=torch.float32)
        self.x_cat = torch.tensor(df[cat_features].values, dtype=torch.long)
        self.y = None if y_column is None else torch.tensor(df[y_column].values, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        return len(self.x_num)

    def __getitem__(self, idx):
        return self.x_num[idx], self.x_cat[idx], self.y[idx] if self.y is not None else 0.0

# ----------------------------
# TRAINING FUNCTION
# ----------------------------
def train_model(df_train, numeric_features, cat_features, y_column, device='cpu'):
    global client_id_map, currency_pair_map, scaler

    # Encode categorical vars
    client_id_map = {val: idx for idx, val in enumerate(df_train['client_id'].unique())}
    currency_pair_map = {val: idx for idx, val in enumerate(df_train['currency_pair'].unique())}

    df_train['client_id_enc'] = df_train['client_id'].map(client_id_map)
    df_train['currency_pair_enc'] = df_train['currency_pair'].map(currency_pair_map)

    # Normalize numeric vars
    scaler = StandardScaler()
    df_train[numeric_features] = scaler.fit_transform(df_train[numeric_features])

    # Dataset + Model
    train_dataset = FlowRatioDataset(df_train, numeric_features, ['client_id_enc', 'currency_pair_enc'], y_column)
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

    model = FlowRatioModel(
        num_numeric=len(numeric_features),
        cat_dims=[len(client_id_map), len(currency_pair_map)],
        emb_dims=[32, 16]  # Tuneable
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    # Train loop (5 epochs)
    for epoch in range(5):
        model.train()
        epoch_loss = 0
        for x_num, x_cat, y in train_loader:
            x_num, x_cat, y = x_num.to(device), x_cat.to(device), y.to(device)

            pred = model(x_num, x_cat)
            loss = loss_fn(pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        print(f"Epoch {epoch+1} loss: {epoch_loss:.4f}")

    return model

import torch.nn as nn

class FlowPredictor(nn.Module):
    def __init__(self, num_clients, num_segments, num_currency_pairs, emb_dim=8):
        super().__init__()
        self.client_emb = nn.Embedding(num_clients, emb_dim)
        self.segment_emb = nn.Embedding(num_segments, emb_dim)
        self.currency_emb = nn.Embedding(num_currency_pairs, emb_dim)

        self.fc1 = nn.Linear(6 + emb_dim * 3, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 1)

    def forward(self, features, client_id, segment_id, currency_pair_id):
        c_emb = self.client_emb(client_id)
        s_emb = self.segment_emb(segment_id)
        cur_emb = self.currency_emb(currency_pair_id)
        x = torch.cat([features, c_emb, s_emb, cur_emb], dim=1)
        x = self.relu(self.fc1(x))
        return self.fc2(x).squeeze()


def train_model(df, num_epochs=10):
    dataset = FlowDataset(df)
    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

    model = FlowPredictor(num_clients=200, num_segments=3, num_currency_pairs=30)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in dataloader:
            optimizer.zero_grad()
            pred = model(batch['features'], batch['client_id'], batch['segment_id'], batch['currency_pair_id'])
            loss = loss_fn(pred, batch['target'])
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

    return model


In [8]:
model = train_model(df)

Epoch 1, Loss: 14880005849.8125
Epoch 2, Loss: 114923443.8436
Epoch 3, Loss: 5175766.5293
Epoch 4, Loss: 107124.1870
Epoch 5, Loss: 2285.8702
Epoch 6, Loss: 43.1897
Epoch 7, Loss: 3.8002
Epoch 8, Loss: 3.3665
Epoch 9, Loss: 3.3577
Epoch 10, Loss: 3.3325


In [9]:
model

FlowPredictor(
  (client_emb): Embedding(200, 8)
  (segment_emb): Embedding(3, 8)
  (currency_emb): Embedding(30, 8)
  (fc1): Linear(in_features=30, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)

In [10]:
df.head()

Unnamed: 0,TimeStamp,client_id,segment_id,currency_pair_id,market_volume,client_spread,market_spread,bid_edge,ask_edge,flow_volume,market_total_volume,flow_ratio
0,2024-01-01 00:00:00,0,0,0,226873.893794,0.941218,0.405161,0.026663,0.025619,3228.497231,440295.8,0.007333
1,2024-01-01 01:00:00,0,0,0,113387.90982,0.71004,0.175491,0.045596,0.032908,528.723994,369662.6,0.00143
2,2024-01-01 02:00:00,0,0,0,593987.392891,0.84069,0.303079,0.026762,0.018154,34677.859018,613054.0,0.056566
3,2024-01-01 03:00:00,0,0,0,473642.46085,0.194164,0.230058,0.045883,0.049876,34808.866067,679426.4,0.051233
4,2024-01-01 04:00:00,0,0,0,597459.747618,0.298175,0.484759,0.037752,0.020775,34016.8668,1017944.0,0.033417


In [14]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

# Make sure this matches your model definition
class FlowRatioModel(nn.Module):
    def __init__(self, num_numeric, cat_dims, emb_dims):
        super().__init__()
        self.embeddings = nn.ModuleList(
            [nn.Embedding(cat_dim, emb_dim) for cat_dim, emb_dim in zip(cat_dims, emb_dims)]
        )
        total_emb_dim = sum(emb_dims)
        self.mlp = nn.Sequential(
            nn.Linear(num_numeric + total_emb_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x_num, x_cat):
        embs = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        x = torch.cat([x_num] + embs, dim=1)
        return self.mlp(x)

class FlowRatioDataset(torch.utils.data.Dataset):
    def __init__(self, df, numeric_features, cat_features, y_column=None):
        self.x_num = torch.tensor(df[numeric_features].values, dtype=torch.float32)
        self.x_cat = torch.tensor(df[cat_features].values, dtype=torch.long)
        self.y = None if y_column is None else torch.tensor(df[y_column].values, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        return len(self.x_num)

    def __getitem__(self, idx):
        return self.x_num[idx], self.x_cat[idx], self.y[idx] if self.y is not None else 0.0

# ----------------------------
# TRAINING FUNCTION
# ----------------------------
def train_model(df_train, numeric_features, cat_features, y_column, device='cpu'):
    global client_id_map, currency_pair_map, scaler

    # Encode categorical vars
    client_id_map = {val: idx for idx, val in enumerate(df_train['client_id'].unique())}
    currency_pair_map = {val: idx for idx, val in enumerate(df_train['currency_pair'].unique())}

    df_train['client_id_enc'] = df_train['client_id'].map(client_id_map)
    df_train['currency_pair_enc'] = df_train['currency_pair'].map(currency_pair_map)

    # Normalize numeric vars
    scaler = StandardScaler()
    df_train[numeric_features] = scaler.fit_transform(df_train[numeric_features])

    # Dataset + Model
    train_dataset = FlowRatioDataset(df_train, numeric_features, ['client_id_enc', 'currency_pair_enc'], y_column)
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

    model = FlowRatioModel(
        num_numeric=len(numeric_features),
        cat_dims=[len(client_id_map), len(currency_pair_map)],
        emb_dims=[32, 16]  # Tuneable
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    # Train loop (5 epochs)
    for epoch in range(5):
        model.train()
        epoch_loss = 0
        for x_num, x_cat, y in train_loader:
            x_num, x_cat, y = x_num.to(device), x_cat.to(device), y.to(device)

            pred = model(x_num, x_cat)
            loss = loss_fn(pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        print(f"Epoch {epoch+1} loss: {epoch_loss:.4f}")

    return model


In [15]:
def predict_flow_ratio(model, df_test, numeric_features, device='cpu'):
    df_test = df_test.copy()

    df_test['client_id_enc'] = df_test['client_id'].map(client_id_map)
    df_test['currency_pair_enc'] = df_test['currency_pair'].map(currency_pair_map)

    # Unknown mappings -> fill with 0 or handle better
    df_test['client_id_enc'] = df_test['client_id_enc'].fillna(0).astype(int)
    df_test['currency_pair_enc'] = df_test['currency_pair_enc'].fillna(0).astype(int)

    df_test[numeric_features] = scaler.transform(df_test[numeric_features])

    test_dataset = FlowRatioDataset(df_test, numeric_features, ['client_id_enc', 'currency_pair_enc'])
    test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

    model.eval()
    preds = []

    with torch.no_grad():
        for x_num, x_cat, _ in test_loader:
            x_num, x_cat = x_num.to(device), x_cat.to(device)
            out = model(x_num, x_cat)
            preds.append(out.cpu().numpy())

    preds = np.concatenate(preds).flatten()
    df_test['predicted_flow_ratio'] = preds

    return df_test


In [17]:
numeric_features = ['hour', 'market_volume', 'client_spread', 'market_spread']
y_column = 'flow_ratio'

df_train = df
model = train_model(df_train, numeric_features, cat_features=['client_id_enc', 'currency_pair_enc'], y_column=y_column)
df_with_preds = predict_flow_ratio(model, df_test, numeric_features)


KeyError: 'currency_pair'

In [18]:
df.head()

Unnamed: 0,TimeStamp,client_id,segment_id,currency_pair_id,market_volume,client_spread,market_spread,bid_edge,ask_edge,flow_volume,...,day_of_month,week_of_year,hour_sin,hour_cos,dow_sin,dow_cos,dom_sin,dom_cos,woy_sin,woy_cos
0,2024-01-01 00:00:00,0,0,0,226873.893794,0.941218,0.405161,0.026663,0.025619,3228.497231,...,1,1,0.0,1.0,0.0,1.0,0.201299,0.97953,0.120537,0.992709
1,2024-01-01 01:00:00,0,0,0,113387.90982,0.71004,0.175491,0.045596,0.032908,528.723994,...,1,1,0.258819,0.965926,0.0,1.0,0.201299,0.97953,0.120537,0.992709
2,2024-01-01 02:00:00,0,0,0,593987.392891,0.84069,0.303079,0.026762,0.018154,34677.859018,...,1,1,0.5,0.866025,0.0,1.0,0.201299,0.97953,0.120537,0.992709
3,2024-01-01 03:00:00,0,0,0,473642.46085,0.194164,0.230058,0.045883,0.049876,34808.866067,...,1,1,0.707107,0.707107,0.0,1.0,0.201299,0.97953,0.120537,0.992709
4,2024-01-01 04:00:00,0,0,0,597459.747618,0.298175,0.484759,0.037752,0.020775,34016.8668,...,1,1,0.866025,0.5,0.0,1.0,0.201299,0.97953,0.120537,0.992709
