In [132]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from datetime import datetime
from dateutil.relativedelta import relativedelta
from collections import defaultdict

# scikit-learn scaler
from sklearn.preprocessing import StandardScaler

def month_range(start, end):
    """
    Return the month-start labels ('YYYY-MM-01') for every calendar month
    from the month containing `start` to the month containing `end`, inclusive.

    :param start: first date, formatted 'YYYY-MM-DD'
    :param end: last date, formatted 'YYYY-MM-DD'
    :return: list of 'YYYY-MM-01' strings in chronological order;
             empty when `start` is later than `end`
    :raises ValueError: if either argument does not match 'YYYY-MM-DD'
    """
    first = datetime.strptime(start, "%Y-%m-%d")
    last = datetime.strptime(end, "%Y-%m-%d")
    if first > last:
        return []

    # Count months as a single integer so the day-of-month can never affect
    # which months are emitted (stepping a date with day 29-31 forward one
    # month at a time gets clamped and can skip the final month).
    first_month = first.year * 12 + (first.month - 1)
    last_month = last.year * 12 + (last.month - 1)

    return [
        f"{m // 12:04d}-{m % 12 + 1:02d}-01"
        for m in range(first_month, last_month + 1)
    ]

def drop_all_nan_columns_before_cutoff(arr, cutoff_idx):
    """
    Remove every column that has no observed (non-NaN) value in the rows
    strictly before `cutoff_idx`, i.e. in `arr[:cutoff_idx]`.

    The number of removed columns is printed.

    :param arr: np.array of shape (n, m)
    :param cutoff_idx: row index; only rows [0, cutoff_idx) are inspected
    :return: np.array of shape (n, m') where m' <= m, with all n rows kept
    """
    head_is_nan = np.isnan(arr[:cutoff_idx, :])
    # True for the columns that are NaN in every inspected row
    drop_col = head_is_nan.all(axis=0)

    print(f"Dropping {drop_col.sum()} columns with all NaNs before cutoff")

    keep_col = np.logical_not(drop_col)
    return arr[:, keep_col]


In [143]:
def read_and_scale_tables(csv_file_paths,
                          start_date="1960-01-01",
                          end_date="2024-01-01",
                          train_cutoff_str="2018-01-01"):
    """
    Read each CSV, align it to a shared monthly timeline, standardise it using
    statistics from the training period only, and return the results.

    Each CSV must have a "DATE_PARSED" column; every other column is treated as
    a numeric feature. Rows whose DATE_PARSED string is not one of the timeline
    labels ('YYYY-MM-01') are ignored. If a date occurs more than once, the
    last occurrence in the file wins. Months with no row stay NaN.

    Columns with no observation before the training cutoff are dropped, since
    no scaling statistics can be estimated for them.

    :param csv_file_paths: iterable of paths to the CSV files
    :param start_date: first month of the timeline, 'YYYY-MM-DD'
    :param end_date: last month of the timeline, 'YYYY-MM-DD'
    :param train_cutoff_str: first month that is NOT part of the training
        period; must be one of the timeline labels
    :return: (table_data_dict, scalers, monthly_dates) where
        - table_data_dict: dict[table_name] -> np.array (num_months, k_i),
          scaled, with NaN wherever the value is missing
        - scalers: dict[table_name] -> the fitted StandardScaler
        - monthly_dates: list of monthly date strings
    :raises KeyError: if train_cutoff_str is not on the monthly timeline
    """
    import os  # local import: only needed to derive table names from paths

    monthly_dates = month_range(start_date, end_date)
    date_to_idx = {d: i for i, d in enumerate(monthly_dates)}
    num_months = len(monthly_dates)

    table_data_dict = {}
    scalers = {}

    # Row index of the first month excluded from the training period
    train_cutoff_idx = date_to_idx[train_cutoff_str]

    for file_path in csv_file_paths:
        # File name without directory and without a trailing ".csv"
        table_name = os.path.basename(file_path)
        if table_name.endswith(".csv"):
            table_name = table_name[:-len(".csv")]
        print("Reading", table_name)

        df = pd.read_csv(file_path)
        feature_cols = [c for c in df.columns if c != "DATE_PARSED"]

        # Start from an all-NaN grid; months absent from the CSV stay missing
        array_data = np.full((num_months, len(feature_cols)), np.nan, dtype=np.float32)

        # Map every row to its position on the timeline in one vectorised step
        # (NaN for dates that are off the timeline).
        row_idx = df["DATE_PARSED"].astype(str).map(date_to_idx)
        # Keep on-timeline rows only, and for repeated dates keep the last row
        # so the result matches a sequential row-by-row overwrite.
        keep = (row_idx.notna() & ~row_idx.duplicated(keep="last")).to_numpy()
        if keep.any():
            target_rows = row_idx[keep].to_numpy(dtype=np.int64)
            array_data[target_rows] = df.loc[keep, feature_cols].to_numpy(dtype=np.float32)

        array_data = drop_all_nan_columns_before_cutoff(array_data, train_cutoff_idx)

        # Fit on the training rows only (up to but not including the cutoff).
        # StandardScaler disregards NaNs in fit and keeps them in transform, so
        # no imputation is needed. Mean-filling before fitting would leave the
        # mean intact but shrink the variance estimate by the fraction of
        # observed values, which badly mis-scales sparsely observed series.
        scaler = StandardScaler()
        scaler.fit(array_data[:train_cutoff_idx])

        # Missing entries come back as NaN
        scaled_data = scaler.transform(array_data)

        table_data_dict[table_name] = scaled_data
        scalers[table_name] = scaler

    return table_data_dict, scalers, monthly_dates


In [144]:
class EconDataset(Dataset):
    """
    Sliding-window dataset over several monthly-aligned tables.

    Each item is one window of `window_length` consecutive months, returned
    twice per table: untouched ("full_data") and with extra values hidden as
    NaN ("masked_data"). One masking mode is drawn per item and applied to all
    tables of that item.
    """

    def __init__(self,
                 table_data_dict,
                 monthly_dates,
                 window_length=60,        # e.g. 5 years if monthly
                 train=True,
                 test_start_date="2018-01-01",
                 p_mask_year=0.2,
                 p_mask_partial=0.3,
                 p_mask_none=0.5):
        """
        table_data_dict: dict[table_name] -> (num_months, k_i) array
        monthly_dates: list of date strings, one per row of the arrays
        window_length: number of months per sample
        train: True -> windows that end at or before test_start_date,
               False -> windows that start at or after test_start_date
        test_start_date: boundary month; must be present in monthly_dates
        p_mask_year: probability of hiding the last 12 months of the window
        p_mask_partial: probability of hiding a random third of the columns
        p_mask_none: probability of applying no additional masking
        """
        total_probability = p_mask_year + p_mask_partial + p_mask_none
        assert abs(total_probability - 1.0) < 1e-7, \
            "Mask probabilities must sum to 1.0"

        self.table_data_dict = table_data_dict
        self.monthly_dates = monthly_dates
        self.num_months = len(monthly_dates)
        self.window_length = window_length

        self.p_mask_year = p_mask_year
        self.p_mask_partial = p_mask_partial
        self.p_mask_none = p_mask_none

        self.test_start_idx = self.monthly_dates.index(test_start_date)

        # First and last admissible window start, depending on the split
        if train:
            first_start = 0
            last_start = self.test_start_idx - window_length
        else:
            first_start = self.test_start_idx
            last_start = self.num_months - window_length
        self.start_indices = list(range(first_start, last_start + 1))

        self.table_names = list(self.table_data_dict.keys())
        self.table_shapes = [self.table_data_dict[name].shape[1] for name in self.table_names]

    def __len__(self):
        """Number of windows available in this split."""
        return len(self.start_indices)

    def _draw_mask_mode(self):
        """Draw 'year', 'partial' or 'none' according to the configured probabilities."""
        draw = np.random.rand()
        if draw < self.p_mask_year:
            return 'year'
        if draw < self.p_mask_year + self.p_mask_partial:
            return 'partial'
        return 'none'

    def __getitem__(self, idx):
        """
        Return {"full_data": {table -> (L, k_i)}, "masked_data": {table -> (L, k_i)}}
        for the idx-th window of this split.
        """
        window_start = self.start_indices[idx]
        window_stop = window_start + self.window_length

        # One mode per sample, shared by all of its tables
        mask_mode = self._draw_mask_mode()

        targets = {}    # unmasked ground truth
        corrupted = {}  # model input with the extra masking applied

        for name in self.table_names:
            window = self.table_data_dict[name][window_start:window_stop]  # (L, k_i)

            target = window.copy()
            hidden = window.copy()

            if mask_mode == 'year':
                # Blank the final 12 months (the whole window if it is shorter)
                first_hidden_row = max(0, self.window_length - 12)
                hidden[first_hidden_row:, :] = np.nan
            elif mask_mode == 'partial':
                # Blank a random third of the columns over the full window
                n_cols = window.shape[1]
                hidden_cols = np.random.choice(n_cols, size=int(n_cols/3), replace=False)
                hidden[:, hidden_cols] = np.nan
            # 'none': the input stays identical to the target

            targets[name] = target
            corrupted[name] = hidden

        return {
            "full_data": targets,
            "masked_data": corrupted
        }


In [145]:
def econ_collate_fn(batch):
    """
    Collate a list of dataset items into batched float32 tensors.

    batch: list of B items, each shaped like
        {
          "full_data":   {table_name -> (L, k_i) array},
          "masked_data": {table_name -> (L, k_i) array}
        }

    Returns
        {
          "full_data":   {table_name -> (B, L, k_i)}  ground truth, NaNs kept,
          "masked_data": {table_name -> (B, L, k_i)}  model input, NaNs set to 0.0,
          "mask":        {table_name -> (B, L, k_i)}  1.0 where ground truth is
                                                      not NaN, else 0.0
        }

    The table names are taken from the first item of the batch.
    """
    table_names = batch[0]["full_data"].keys()

    def _stack(field, tn):
        # One float32 tensor per sample, stacked on a new leading batch axis
        per_sample = [torch.tensor(sample[field][tn], dtype=torch.float32) for sample in batch]
        return torch.stack(per_sample, dim=0)

    collated = {"full_data": {}, "masked_data": {}, "mask": {}}

    for tn in table_names:
        targets = _stack("full_data", tn)    # (B, L, k_i)
        inputs = _stack("masked_data", tn)   # (B, L, k_i)

        collated["full_data"][tn] = targets
        # Missing inputs are presented to the model as zeros
        collated["masked_data"][tn] = inputs.masked_fill(torch.isnan(inputs), 0.0)
        # Validity refers to the ground truth, not to the masked input
        collated["mask"][tn] = torch.isnan(targets).logical_not().float()

    return collated


In [146]:
class TableEmbedding(nn.Module):
    """Two-layer MLP (Linear -> ReLU -> Linear) applied to the last axis of the input."""

    def __init__(self, k_in, embed_dim):
        """
        k_in: size of the input's last axis
        embed_dim: size of the hidden layer and of the output's last axis
        """
        super().__init__()
        self.l1 = nn.Linear(k_in, embed_dim)
        self.l2 = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """
        x: (B, L, k_in)
        returns: (B, L, embed_dim)
        """
        hidden = torch.relu(self.l1(x))
        return self.l2(hidden)


In [147]:
class Flattened2DTransformer(nn.Module):
    """
    Transformer encoder over a (time x table) grid of embeddings.

    The grid (B, L, N, E) is flattened to a sequence of L*N tokens, a time
    encoding is added so that all N tokens of the same time step share the
    same positional signal, and the sequence is passed through a standard
    nn.TransformerEncoder.

    The time encoding is a fixed sinusoidal encoding computed on the fly. It
    has no parameters, so it is identical on every call, works for any L, and
    adds nothing to the state_dict. (Creating a fresh nn.Embedding inside
    forward would instead add new random, untrained values on every call.)
    """

    def __init__(self, embed_dim=32, n_heads=4, dim_feedforward=128, num_layers=2, dropout=0.1):
        """
        embed_dim: token width E (d_model of the encoder)
        n_heads: number of attention heads
        dim_feedforward: hidden width of each encoder layer's feed-forward block
        num_layers: number of stacked encoder layers
        dropout: dropout rate used inside the encoder layers
        """
        super().__init__()

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=n_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation='relu'
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.embed_dim = embed_dim
        # Kept as an attribute for compatibility; forward() does not apply it.
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def _time_encoding(length, dim, device=None, dtype=torch.float32):
        """
        Build the sinusoidal encoding table of shape (length, dim):
        even channels hold sin(t * w_i), odd channels hold cos(t * w_i),
        with w_i = 10000^(-2i/dim). Odd `dim` is supported.
        """
        import math  # local import: only this helper needs it

        position = torch.arange(length, device=device, dtype=torch.float32).unsqueeze(1)  # (length, 1)
        freq = torch.exp(
            torch.arange(0, dim, 2, device=device, dtype=torch.float32) * (-math.log(10000.0) / dim)
        )  # (ceil(dim / 2),)
        angles = position * freq  # (length, ceil(dim / 2))

        enc = torch.zeros(length, dim, device=device, dtype=torch.float32)
        enc[:, 0::2] = torch.sin(angles)
        # For odd dim there is one fewer odd channel than even channels
        enc[:, 1::2] = torch.cos(angles[:, : dim // 2])
        return enc.to(dtype)

    def forward(self, x: torch.Tensor, src_key_padding_mask=None):
        """
        x: (B, L, N, E)
        src_key_padding_mask: optional (B, L*N) bool tensor, True = ignore this
            token as an attention key; flattened in the same (time-major,
            table-minor) order as x
        returns: (B, L, N, E)
        """
        B, L, N, E = x.shape
        S = L * N

        # Flatten the grid; token s = t * N + n. reshape also accepts
        # non-contiguous input.
        x = x.reshape(B, S, E)  # (B, S, E)

        # Repeat each time step's encoding N times to follow the token order
        time_enc = self._time_encoding(L, E, device=x.device, dtype=x.dtype)  # (L, E)
        x = x + time_enc.repeat_interleave(N, dim=0)  # broadcast over the batch

        # The encoder is built with the default sequence-first layout
        x = x.transpose(0, 1)  # -> (S, B, E)

        out = self.transformer(x, src_key_padding_mask=src_key_padding_mask)  # (S, B, E)
        return out.transpose(0, 1).reshape(B, L, N, E)


In [148]:
class TableDecoder(nn.Module):
    """Single linear projection from the embedding space back to a table's feature columns."""

    def __init__(self, embed_dim, k_out):
        """
        embed_dim: size of the input's last axis
        k_out: size of the output's last axis
        """
        super().__init__()
        self.linear = nn.Linear(in_features=embed_dim, out_features=k_out)

    def forward(self, x):
        """
        x: (B, L, embed_dim)
        -> (B, L, k_out)
        """
        projected = self.linear(x)
        return projected


In [149]:
class EconModel(nn.Module):
    """
    Multi-table reconstruction model.

    Each table gets its own embedding MLP and linear decoder. The embeddings of
    all tables are stacked into a (time x table) grid, mixed by a shared
    Flattened2DTransformer, and decoded back table by table.
    """

    def __init__(self, table_names, table_shapes,
                 embed_dim=32, n_heads=4, ff_dim=128, num_layers=2):
        """
        table_names: list of table identifiers (used as ModuleDict keys)
        table_shapes: list of feature counts k_i, aligned with table_names
        embed_dim: shared embedding width E
        n_heads, ff_dim, num_layers: transformer hyper-parameters
        """
        super().__init__()

        self.table_names = table_names
        self.N = len(table_names)

        self.table_embeds = nn.ModuleDict()
        self.table_decoders = nn.ModuleDict()

        # One embedding / decoder pair per table; the decoder restores k_in columns
        for tn, k_in in zip(table_names, table_shapes):
            self.table_embeds[tn] = TableEmbedding(k_in, embed_dim)
            self.table_decoders[tn] = TableDecoder(embed_dim, k_in)

        # 2D Transformer core
        self.core_transformer = Flattened2DTransformer(
            embed_dim=embed_dim,
            n_heads=n_heads,
            dim_feedforward=ff_dim,
            num_layers=num_layers
        )
        self.embed_dim = embed_dim

    def forward(self, batch_data):
        """
        batch_data:
          {
            "full_data": {tn -> (B, L, k_i)},
            "masked_data": {tn -> (B, L, k_i)},
            "mask": {tn -> (B, L, k_i)}
          }
        Only "masked_data" is read here (missing inputs are expected to be 0);
        the loss against "full_data" is computed by the caller.
        Returns a dict {tn -> (B, L, k_i)} of predictions.
        """
        inputs = batch_data["masked_data"]

        # 1) Embed each table => (B, L, E). In the same pass, flag the
        #    (sample, month, table) cells whose input row is entirely zero.
        #    Those cells carry no information and are hidden as attention keys.
        embed_list = []
        empty_list = []
        for tn in self.table_names:
            x = inputs[tn]  # (B, L, k_i)
            embed_list.append(self.table_embeds[tn](x))
            empty_list.append(x.abs().sum(dim=2) == 0.0)  # (B, L) bool

        # stack => (B, L, N, E)
        embed_stack = torch.stack(embed_list, dim=2)
        B = embed_stack.shape[0]

        # (B, L, N) -> (B, L*N), same flattening order as the transformer; True = padded
        key_padding_mask = torch.stack(empty_list, dim=2).reshape(B, -1)

        # If every key of a sample were padded, attention would have nothing to
        # attend to; PyTorch's attention can then return NaN for that sample,
        # which would poison the whole batch loss. Un-pad such samples.
        fully_padded = key_padding_mask.all(dim=1, keepdim=True)  # (B, 1)
        key_padding_mask = key_padding_mask & ~fully_padded

        # 2) Mix information across time and tables
        out_2d = self.core_transformer(embed_stack, src_key_padding_mask=key_padding_mask)
        # shape: (B, L, N, E)

        # 3) Decode table by table; slot i of the N axis belongs to table i
        decoded = {}
        for i, tn in enumerate(self.table_names):
            decoded[tn] = self.table_decoders[tn](out_2d[:, :, i, :])  # (B, L, k_i)

        return decoded


In [150]:
def masked_mse_loss(pred, target, mask):
    """
    Mean squared error over the entries where ground truth exists.

    pred: (B, L, k_i)
    target: (B, L, k_i); may contain NaN wherever mask is 0
    mask: (B, L, k_i)  # 1 where ground truth is valid, 0 where no ground truth
    Returns average MSE over valid entries, or a zero that is still attached
    to the autograd graph of `pred` when no entry is valid.
    """
    # Multiplying by a 0 mask does not neutralise NaN (NaN * 0 == NaN), so the
    # invalid targets must be replaced before the subtraction. Otherwise one
    # missing ground-truth value turns the loss and every gradient into NaN.
    safe_target = torch.where(mask != 0, target, torch.zeros_like(target))

    sq_err = (pred - safe_target) ** 2
    sq_err = sq_err * mask  # zero out missing
    valid_count = mask.sum()
    if valid_count > 0:
        return sq_err.sum() / valid_count
    # Nothing to score: every term is already zeroed by the mask. Returning
    # the sum keeps the result on the graph so backward() still works when
    # this is the only loss term.
    return sq_err.sum()


In [157]:
from tqdm.notebook import tqdm


def _move_batch_to_device(batch_data, device):
    """Return a copy of the collated batch with every tensor moved to `device`."""
    return {
        field: {tn: tensor.to(device) for tn, tensor in tables.items()}
        for field, tables in batch_data.items()
    }


def _run_epoch(model, loader, table_names, device, optimizer=None, desc=""):
    """
    Run one pass over `loader` and return the mean per-batch loss.

    With an optimizer the model is put in train mode and updated after every
    batch. Without one it is put in eval mode and gradients are disabled.
    Returns NaN when the loader yields no batch.
    """
    is_train = optimizer is not None
    model.train(is_train)

    total_loss = 0.0
    n_batches = 0
    with torch.set_grad_enabled(is_train):
        # Wrap the loader itself (not enumerate(...)) so tqdm knows the total
        for batch_data in tqdm(loader, desc=desc):
            batch_data = _move_batch_to_device(batch_data, device)

            if is_train:
                optimizer.zero_grad()
            outputs = model(batch_data)  # dict {tn -> (B, L, k_i)}

            # The batch loss is the sum of the per-table masked MSEs
            loss_val = 0.0
            for tn in table_names:
                loss_val += masked_mse_loss(
                    outputs[tn],
                    batch_data["full_data"][tn],
                    batch_data["mask"][tn],
                )

            if is_train:
                loss_val.backward()
                optimizer.step()

            total_loss += loss_val.item()
            n_batches += 1

    return total_loss / n_batches if n_batches else float("nan")


def train_econ_model(csv_file_paths,
                     epochs=5,
                     batch_size=8,
                     window_length=60,
                     embed_dim=32,
                     lr=1e-3,
                     p_mask_year=0.2,
                     p_mask_partial=0.3,
                     p_mask_none=0.5,
                     start_date="1960-01-01",
                     end_date="2024-01-01",
                     test_start_date="2018-01-01"):
    """
    Full pipeline:
    1) Read & scale data with scikit-learn
    2) Create train/test datasets
    3) Model + optimizer
    4) Training loop with masked MSE

    csv_file_paths: paths of the per-table CSV files
    epochs, batch_size, lr: optimisation settings (Adam)
    window_length: months per sample
    embed_dim: embedding width of the model
    p_mask_year, p_mask_partial, p_mask_none: masking-mode probabilities,
        used for both the train and the test dataset
    start_date, end_date: bounds of the monthly timeline
    test_start_date: first test month; also the cutoff used to fit the scalers

    Prints train and test loss after every epoch and returns the trained model.
    A split that yields no batch reports a loss of nan.
    """
    # 1) read & scale
    table_data_dict, scalers, monthly_dates = read_and_scale_tables(
        csv_file_paths,
        start_date=start_date,
        end_date=end_date,
        train_cutoff_str=test_start_date
    )

    # 2) create datasets: identical settings, only the split differs
    dataset_kwargs = dict(
        window_length=window_length,
        test_start_date=test_start_date,
        p_mask_year=p_mask_year,
        p_mask_partial=p_mask_partial,
        p_mask_none=p_mask_none
    )
    train_dataset = EconDataset(table_data_dict, monthly_dates, train=True, **dataset_kwargs)
    test_dataset = EconDataset(table_data_dict, monthly_dates, train=False, **dataset_kwargs)

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=econ_collate_fn
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=econ_collate_fn
    )

    # 3) model + optimizer
    table_names = list(table_data_dict.keys())
    table_shapes = [table_data_dict[tn].shape[1] for tn in table_names]

    model = EconModel(
        table_names,
        table_shapes,
        embed_dim=embed_dim,
        n_heads=4,
        ff_dim=128,
        num_layers=2
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # 4) Training loop
    for ep in range(epochs):
        avg_train_loss = _run_epoch(
            model, train_loader, table_names, device, optimizer=optimizer, desc="Training"
        )
        avg_test_loss = _run_epoch(
            model, test_loader, table_names, device, desc="Testing"
        )

        print(f"Epoch {ep+1}/{epochs} - Train Loss: {avg_train_loss:.4f} | Test Loss: {avg_test_loss:.4f}")

    print("Training complete!")
    return model


In [158]:
import json

# Iterating the loaded JSON yields the table names (its keys if it is a dict);
# each name is expected to have a matching Data/<name>.csv file.
with open("Data/all_data.json", "r") as f:
    all_data = json.load(f)

csv_file_paths = [f"Data/{table_name}.csv" for table_name in all_data]
# Bind the returned model so it can be reused after training, instead of
# discarding it once the cell finishes.
model = train_econ_model(csv_file_paths)

Reading BALANCE-PAIEMENTS
Dropping 0 columns with all NaNs before cutoff
Reading CHOMAGE-TRIM-NATIONAL
Dropping 0 columns with all NaNs before cutoff
Reading CLIMAT-AFFAIRES
Dropping 0 columns with all NaNs before cutoff
Reading CNA-2020-CONSO-MEN
Dropping 1 columns with all NaNs before cutoff
Reading CNA-2020-CONSO-SI
Dropping 0 columns with all NaNs before cutoff
Reading CNA-2020-CPEB
Dropping 0 columns with all NaNs before cutoff
Reading CNA-2020-CSI
Dropping 0 columns with all NaNs before cutoff
Reading CNA-2020-EMPLOI
Dropping 0 columns with all NaNs before cutoff
Reading CNA-2020-ERE
Dropping 2 columns with all NaNs before cutoff
Reading CNA-2020-FBCF-SI
Dropping 0 columns with all NaNs before cutoff
Reading CNA-2020-PIB
Dropping 0 columns with all NaNs before cutoff
Reading CNA-2020-TEI
Dropping 0 columns with all NaNs before cutoff
Reading CNT-2020-CB
Dropping 0 columns with all NaNs before cutoff
Reading CNT-2020-CSI
Dropping 0 columns with all NaNs before cutoff
Reading CNT-2



Training: 0it [00:00, ?it/s]

KeyboardInterrupt: 