In [7]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from datetime import datetime
from dateutil.relativedelta import relativedelta
from collections import defaultdict

# scikit-learn scaler
from sklearn.preprocessing import StandardScaler

def month_range(start, end):
    """
    Return the list of month starts (as strings 'YYYY-MM-01') from the month
    containing `start` through the month containing `end`, inclusive.

    Both bounds are normalised to the first day of their month before
    iterating. Without that, a `start` whose day-of-month is later than the
    day-of-month of `end` (e.g. '2020-01-15' .. '2020-03-01') would silently
    skip the final month.

    :param start: str, first date, formatted 'YYYY-MM-DD'
    :param end: str, last date, formatted 'YYYY-MM-DD'
    :return: list of str; empty when `end` falls in an earlier month than `start`
    :raises ValueError: if either argument does not match '%Y-%m-%d'
    """
    first = datetime.strptime(start, "%Y-%m-%d")
    last = datetime.strptime(end, "%Y-%m-%d")

    # Linear month index (year * 12 + zero-based month): stepping by one month
    # becomes plain integer arithmetic, independent of the day of month.
    first_month_idx = first.year * 12 + (first.month - 1)
    last_month_idx = last.year * 12 + (last.month - 1)

    return [
        f"{month_idx // 12:04d}-{month_idx % 12 + 1:02d}-01"
        for month_idx in range(first_month_idx, last_month_idx + 1)
    ]

def drop_all_nan_columns_before_cutoff(arr, cutoff_idx):
    """
    Remove every column that holds only NaNs in the rows strictly before
    `cutoff_idx` (the slice is `arr[:cutoff_idx]`, so the cutoff row itself
    is not inspected).

    Prints how many columns were removed when that number is non-zero.

    :param arr: np.array of shape (n, m)
    :param cutoff_idx: int, number of leading rows to inspect
    :return: np.array of shape (n, m') where m' <= m
    """
    leading_rows = arr[:cutoff_idx, :]
    empty_cols = np.isnan(leading_rows).all(axis=0)

    n_empty = empty_cols.sum()
    if n_empty > 0:
        print(f"Dropping {n_empty} columns with all NaNs before cutoff")

    return arr[:, np.logical_not(empty_cols)]


In [8]:
from typing import List, Tuple

def read_and_scale_tables(
        csv_file_paths,
        meta_file_paths,
        start_date="1960-01-01",
        end_date="2024-01-01",
        train_cutoff_str="2018-01-01"):
    """
    Read each data CSV together with its metadata CSV, align the data on a
    monthly timeline, drop columns with no training observations, and
    standardise every table with a scaler fitted on the training rows only.

    Each data CSV must contain a "DATE_PARSED" column; every other column is
    treated as a feature. Each metadata CSV must contain "TITLE_FR" (matched
    against the feature column names) and "FREQ" columns.

    Returns:
      - table_data_dict: dict[table_name] -> np.array of shape (num_months, k_i),
        scaled, with NaN wherever no value was available
      - meta_data_dict: dict[table_name] -> dict[col_index -> freq_str], indexed
        by the column positions of the *returned* array
      - scalers: dict[table_name] -> the fitted StandardScaler
      - monthly_dates: list of monthly date strings

    :param csv_file_paths: list of str (data CSVs, one per table)
    :param meta_file_paths: list of str (metadata CSVs, same length as above)
    :param start_date: str, earliest date
    :param end_date: str, latest date
    :param train_cutoff_str: str, date boundary for training; rows strictly
        before it are used to fit the scaler and to decide which columns to drop
    :raises AssertionError: if the two path lists differ in length
    :raises KeyError: if train_cutoff_str is not on the monthly timeline
    :raises ValueError: if a feature column has no row in the metadata CSV
    """
    assert len(csv_file_paths) == len(meta_file_paths), \
        "Must have matching data and meta files"

    monthly_dates = month_range(start_date, end_date)
    date_to_idx = {d: i for i, d in enumerate(monthly_dates)}
    num_months = len(monthly_dates)

    train_cutoff_idx = date_to_idx[train_cutoff_str]

    table_data_dict = {}
    meta_data_dict = {}
    scalers = {}

    for data_path, meta_path in zip(csv_file_paths, meta_file_paths):
        table_name = data_path.split("/")[-1].replace(".csv", "")
        print("Reading table:", table_name)

        # Read the data CSV
        df = pd.read_csv(data_path)
        feature_cols = [c for c in df.columns if c != "DATE_PARSED"]

        # Read the meta CSV
        df_meta = pd.read_csv(meta_path)

        # Map original column position -> frequency string (first matching meta row)
        col_to_freq = {}
        for col_index, col_name in enumerate(feature_cols):
            row_meta = df_meta[df_meta['TITLE_FR'] == col_name]
            if len(row_meta) == 0:
                raise ValueError(f"Could not find metadata for column {col_name}")
            col_to_freq[col_index] = row_meta['FREQ'].values[0]

        # Lay the rows out on the monthly timeline; months without a row stay NaN.
        # Rows whose date is not on the timeline are ignored.
        array_data = np.full((num_months, len(feature_cols)), np.nan, dtype=np.float32)
        for _, row in df.iterrows():
            date_str = str(row["DATE_PARSED"])
            if date_str in date_to_idx:
                idx = date_to_idx[date_str]
                array_data[idx] = row[feature_cols].values.astype(np.float32)

        # Decide which columns survive BEFORE dropping them. Computing this mask
        # on the already-reduced array always yields all-True, which would
        # re-index col_to_freq against the wrong (original) columns whenever a
        # non-trailing column is dropped.
        keep_mask = ~np.all(np.isnan(array_data[:train_cutoff_idx, :]), axis=0)

        # Drop columns that are all NaN before the cutoff (ideally done in the ETL)
        array_data = drop_all_nan_columns_before_cutoff(array_data, train_cutoff_idx)

        # Re-index the frequency map so that it follows the surviving columns
        col_to_freq = {
            new_i: col_to_freq[int(old_i)]
            for new_i, old_i in enumerate(np.flatnonzero(keep_mask))
        }

        # Fit StandardScaler on the training portion, with NaNs replaced by the
        # column mean of the training rows
        scaler = StandardScaler()
        train_data = array_data[:train_cutoff_idx]  # shape (train_cutoff_idx, k_i')
        col_means = np.nanmean(train_data, axis=0)
        train_filled = np.where(np.isnan(train_data), col_means, train_data)
        scaler.fit(train_filled)

        # Transform the entire array; NaNs are temporarily set to 0.0 so that
        # transform() accepts the input, then restored afterwards
        array_copy = array_data.copy()
        nan_mask = np.isnan(array_copy)
        array_copy[nan_mask] = 0.0
        scaled_data = scaler.transform(array_copy)
        scaled_data[nan_mask] = np.nan  # put NaNs back

        # Store results
        table_data_dict[table_name] = scaled_data
        meta_data_dict[table_name] = col_to_freq
        scalers[table_name] = scaler

    return table_data_dict, meta_data_dict, scalers, monthly_dates


In [9]:
def is_expected(freq_str, date_str):
    """
    Tell whether a series with reporting frequency `freq_str` should carry a
    value on `date_str`.

    freq_str: 'M' (every month), 'Q' (months 3, 6, 9, 12) or 'A' (January only);
              any other string is treated like 'M'
    date_str: date formatted 'YYYY-MM-DD', e.g. '2021-03-01'
    Returns True if a value is expected on that date, False otherwise.

    NOTE(review): is_expected_mask_for_col in this notebook keys quarterly
    series as 'T' with months 1, 4, 7, 10; here 'T' falls through to the
    default and is always expected. Confirm which convention is intended.
    """
    month = datetime.strptime(date_str, "%Y-%m-%d").month

    if freq_str == 'Q':
        # quarterly: last month of each quarter
        return month % 3 == 0
    if freq_str == 'A':
        # annual: January only
        return month == 1
    # 'M' and every unrecognised frequency: always expected
    return True

In [10]:
def is_expected_mask_for_col(freq_str, months_of_date):
    """
    Build the boolean "a value is expected here" mask for one column.

    freq_str: 'M' -> every month, 'T' -> months 1, 4, 7, 10, 'A' -> month 1
    months_of_date: array of month numbers, one per row of the timeline
    Returns a boolean mask, True where the frequency expects a value.
    Raises ValueError for any other frequency string.
    """
    if freq_str not in ('M', 'T', 'A'):
        # Unknown frequencies are rejected rather than silently defaulted
        raise ValueError(f"Unknown frequency string: {freq_str}")

    if freq_str == 'A':
        return (months_of_date == 1)
    if freq_str == 'T':
        return np.isin(months_of_date, [1, 4, 7, 10])
    # 'M': expected everywhere
    return np.ones_like(months_of_date, dtype=bool)

def generate_expected_and_truly_missing_masks_vectorized(
    table_array,    # shape (L, k_i), might contain np.nan
    freq_dict,      # dict[col_index -> freq_str], one entry per column 0..k_i-1
    date_list       # list of date strings 'YYYY-MM-DD' of length L
):
    """
    Split the NaN cells of `table_array` into two boolean masks of shape (L, k_i):
      - expected_missing_mask: NaN in a month where the column's frequency does
                               not expect a value
      - truly_missing_mask:    NaN in a month where the column's frequency does
                               expect a value

    Returns (expected_missing_mask, truly_missing_mask).
    Raises KeyError if a column index is absent from freq_dict, and whatever
    is_expected_mask_for_col raises for an unsupported frequency string.
    """
    n_rows, n_cols = table_array.shape

    # Month number of every row on the timeline, shape (L,)
    row_months = np.array(
        [datetime.strptime(date_str, "%Y-%m-%d").month for date_str in date_list],
        dtype=int,
    )

    # Resolve every column's frequency up front so that a missing key fails
    # before any mask is computed
    col_freqs = [freq_dict[col] for col in range(n_cols)]

    # Column by column: True where the frequency says a value should exist
    should_have_value = np.zeros((n_rows, n_cols), dtype=bool)
    for col, freq in enumerate(col_freqs):
        should_have_value[:, col] = is_expected_mask_for_col(freq, row_months)

    is_nan = np.isnan(table_array)  # shape (L, k_i)

    expected_missing_mask = is_nan & np.logical_not(should_have_value)
    truly_missing_mask = is_nan & should_have_value
    return expected_missing_mask, truly_missing_mask

In [11]:
import json

# Load the table listing; iterating over it below yields one table name per
# entry (the dict keys if the JSON is an object, the items if it is a list).
with open("Data/all_data.json", "r") as f:
    all_data = json.load(f)

# One data CSV and one "<name>_meta.csv" per table, built in the same order
csv_file_paths = [f"Data/{table_name}.csv" for table_name in all_data]
csv_file_paths_meta = [f"Data/{table_name}_meta.csv" for table_name in all_data]

# Align every table on a 1970-01 .. 2024-01 monthly timeline and scale it,
# fitting each scaler on the rows before 2018-01 only.
table_data_dict, meta_data_dict, scalers, monthly_dates = read_and_scale_tables(
    csv_file_paths,
    csv_file_paths_meta,
    start_date="1970-01-01",
    end_date="2024-01-01",
    train_cutoff_str="2018-01-01"
)

Reading table: BALANCE-PAIEMENTS
Reading table: CHOMAGE-TRIM-NATIONAL
Reading table: CLIMAT-AFFAIRES
Reading table: CNA-2020-CONSO-MEN
Reading table: CNA-2020-CONSO-SI
Reading table: CNA-2020-CPEB
Reading table: CNA-2020-CSI
Reading table: CNA-2020-EMPLOI
Reading table: CNA-2020-ERE
Reading table: CNA-2020-FBCF-SI
Reading table: CNA-2020-PIB
Reading table: CNA-2020-TEI
Reading table: CNT-2020-CB
Reading table: CNT-2020-CSI
Reading table: CNT-2020-OPERATIONS
Reading table: CNT-2020-PIB-EQB-RF
Reading table: COM-EXT
Reading table: COMPTES-ETAT
Reading table: CONSO-MENAGES-2020
Reading table: CONSTRUCTION-LOCAUX
Reading table: CONSTRUCTION-LOGEMENTS
Reading table: CREATIONS-ENTREPRISES-METHODE-2022
Reading table: DECES-MORTALITE
Reading table: DEFAILLANCES-ENTREPRISES
Reading table: DEMANDES-EMPLOIS-NATIONALES
Reading table: DETTE-NEGOCIABLE-ETAT
Reading table: DETTE-TRIM-APU-2020
Reading table: EMPLOI-BIT-TRIM
Reading table: EMPLOI-SALARIE-TRIM-NATIONAL
Reading table: ENQ-CONJ-ACT-IND
Re

In [12]:
# Display the monthly timeline returned by read_and_scale_tables.
# NOTE(review): this prints the whole list; a slice would keep the output short.
monthly_dates

['1970-01-01',
 '1970-02-01',
 '1970-03-01',
 '1970-04-01',
 '1970-05-01',
 '1970-06-01',
 '1970-07-01',
 '1970-08-01',
 '1970-09-01',
 '1970-10-01',
 '1970-11-01',
 '1970-12-01',
 '1971-01-01',
 '1971-02-01',
 '1971-03-01',
 '1971-04-01',
 '1971-05-01',
 '1971-06-01',
 '1971-07-01',
 '1971-08-01',
 '1971-09-01',
 '1971-10-01',
 '1971-11-01',
 '1971-12-01',
 '1972-01-01',
 '1972-02-01',
 '1972-03-01',
 '1972-04-01',
 '1972-05-01',
 '1972-06-01',
 '1972-07-01',
 '1972-08-01',
 '1972-09-01',
 '1972-10-01',
 '1972-11-01',
 '1972-12-01',
 '1973-01-01',
 '1973-02-01',
 '1973-03-01',
 '1973-04-01',
 '1973-05-01',
 '1973-06-01',
 '1973-07-01',
 '1973-08-01',
 '1973-09-01',
 '1973-10-01',
 '1973-11-01',
 '1973-12-01',
 '1974-01-01',
 '1974-02-01',
 '1974-03-01',
 '1974-04-01',
 '1974-05-01',
 '1974-06-01',
 '1974-07-01',
 '1974-08-01',
 '1974-09-01',
 '1974-10-01',
 '1974-11-01',
 '1974-12-01',
 '1975-01-01',
 '1975-02-01',
 '1975-03-01',
 '1975-04-01',
 '1975-05-01',
 '1975-06-01',
 '1975-07-

In [14]:
def test_data_integrity(table_data_dict, meta_data_dict, monthly_dates):
    # Test that all arrays have the same number of months as monthly_dates
    num_months = len(monthly_dates)
    for table_name, table_data in table_data_dict.items():
        assert table_data.shape[0] == num_months, \
            f"Table {table_name} has {table_data.shape[0]} months, expected {num_months}"

    # Test that all arrays have the same number of columns as meta_data_dict
    for table_name, table_data in table_data_dict.items():
        assert table_data.shape[1] == len(meta_data_dict[table_name]), \
            f"Table {table_name} has {table_data.shape[1]} columns, expected {len(meta_data_dict[table_name])}"

    # Test that are not null where True mask AND expected mask == 0
    for table_name, table_data in table_data_dict.items():
        expected_mask, truly_missing_mask = generate_expected_and_truly_missing_masks_vectorized(
            table_data, meta_data_dict[table_name], monthly_dates
        )

        place_with_value = np.where(~(expected_mask | truly_missing_mask))

        # Check if there are any nans where we expect a value
        assert not np.isnan(table_data[place_with_value]).any(), \
            f"Table {table_name} has NaNs where we expect a value"

    print("Data integrity test passed!")

# Run the integrity checks on the tables loaded above; an AssertionError here
# means the arrays, the frequency maps and the timeline disagree.
print("Testing data integrity...")
test_data_integrity(table_data_dict, meta_data_dict, monthly_dates)

Testing data integrity...
Data integrity test passed!


`truly_missing_mask` is True wherever a value is missing although the column's reporting frequency says a value should have been reported for that month.

`expected_missing_mask` is True wherever a value is missing and that absence is expected, because the column's reporting frequency does not report in that month.

In [15]:
import numpy as np
import torch
from torch.utils.data import Dataset
from typing import Optional, Dict, List

class EconDataset(Dataset):
    """
    Random-window dataset over a set of monthly tables.

    Every call to __getitem__ draws a fresh random time window (the index is
    ignored) from either the training span [0, test_start_idx) or the test
    span [test_start_idx, num_months), together with a (window_length,
    num_tables) mask produced by one of five masking modes.

    Randomness comes from NumPy's global generator (np.random), so passing
    `seed` reseeds that global state.
    """

    def __init__(self,
                 table_data_dict: Dict[str, np.ndarray],
                 monthly_dates: List[str],
                 min_window_length_year: int = 1,
                 max_window_length_year: Optional[int] = None,
                 train: bool = True,
                 test_start_date: str = "2018-01-01",
                 number_of_samples: int = 100_000,
                 # -- Masking probabilities
                 p_1_none: float = 0.1,
                 p_2_uniform: float = 0.2,
                 p_3_last1yr: float = 0.2,
                 p_4_last2yr: float = 0.2,
                 p_5_table: float = 0.3,
                 p_uniform: float = 0.3,     # Probability to mask each cell in uniform masking
                 seed: Optional[int] = None):
        """
        table_data_dict: dict[table_name] -> (num_months, k_i) scaled arrays
        monthly_dates: list of str, aligned to the arrays in table_data_dict
        min_window_length_year: minimum window length (in years)
        max_window_length_year: maximum window length (in years); None means
            "as long as the split allows"
        train: whether this dataset is for training or test
        test_start_date: str, e.g. "2018-01-01"; must be present in monthly_dates
        number_of_samples: value reported by __len__, i.e. how many random
            windows one pass over the dataset draws
        p_1_none, p_2_uniform, p_3_last1yr, p_4_last2yr, p_5_table: probabilities
            of the 5 masking modes; they must sum to 1
        p_uniform: for the uniform random mask, each (month, table) cell has
            this probability of being masked
        seed: optional random seed for reproducibility

        Raises AssertionError if the mode probabilities do not sum to 1, and
        ValueError if test_start_date is not in monthly_dates.
        """
        super().__init__()

        # -- Basic checks
        p_sum = p_1_none + p_2_uniform + p_3_last1yr + p_4_last2yr + p_5_table
        assert abs(p_sum - 1.0) < 1e-7, "Mask probabilities must sum to 1.0!"

        self.table_data_dict = table_data_dict
        self.monthly_dates = monthly_dates
        self.num_months = len(monthly_dates)
        self.train = train

        self.min_window_length_months = 12 * min_window_length_year
        if max_window_length_year is not None:
            self.max_window_length_months = 12 * max_window_length_year
        else:
            # No explicit bound: allow up to the whole timeline. __getitem__
            # further clamps this to the months available in the chosen split.
            self.max_window_length_months = self.num_months

        # -- Masking probabilities
        self.p_1_none = p_1_none
        self.p_2_uniform = p_2_uniform
        self.p_3_last1yr = p_3_last1yr
        self.p_4_last2yr = p_4_last2yr
        self.p_5_table = p_5_table
        self.p_uniform = p_uniform

        # -- Build train/test boundary
        self.test_start_idx = self.monthly_dates.index(test_start_date)

        # Kept for reference; sampling in __getitem__ does not read it.
        self.all_possible_starts = list(range(0, self.num_months))

        self.number_of_samples = number_of_samples

        self.table_names = list(table_data_dict.keys())
        self.num_tables = len(self.table_names)

        # Optionally set the random seed (this reseeds NumPy's global generator)
        if seed is not None:
            np.random.seed(seed)

    def __len__(self):
        """Return the configured number of random samples."""
        return self.number_of_samples

    def __getitem__(self, idx):
        """
        Draw one random window; `idx` is ignored.

        Returns a dict:
            {
              "full_data": { table_name -> np.ndarray of shape (window_length, k_i) },
              "mask": np.ndarray of shape (window_length, num_tables), float32,
                      1.0 where the (month, table) cell is masked
            }
        The "full_data" arrays are slices of the stored tables, not copies.

        Raises ValueError when even the minimum window length does not fit in
        the selected split.
        """
        # 1) Work out which span of the timeline this split may use
        if self.train:
            span_start = 0
            available_months = self.test_start_idx
            split_label = "training"
        else:
            span_start = self.test_start_idx
            available_months = self.num_months - self.test_start_idx
            split_label = "test"

        # 2) Sample a window length that is guaranteed to fit in the split.
        #    Sampling up to max_window_length_months unconditionally would make
        #    this method fail at random whenever the drawn length exceeded the
        #    split; clamping leaves the draw unchanged whenever the configured
        #    maximum already fits.
        longest_window = min(self.max_window_length_months, available_months)
        if longest_window < self.min_window_length_months:
            raise ValueError(
                f"Not enough months for the requested window in {split_label} set."
            )
        window_length = np.random.randint(self.min_window_length_months,
                                          longest_window + 1)

        # 3) Pick a random start so that the whole window stays inside the split
        last_start = span_start + available_months - window_length
        start_idx = np.random.randint(span_start, last_start + 1)
        end_idx = start_idx + window_length

        # 4) Decide which masking mode to apply (cumulative thresholds)
        r = np.random.rand()
        if r < self.p_1_none:
            mask_mode = "none"
        elif r < self.p_1_none + self.p_2_uniform:
            mask_mode = "uniform"
        elif r < self.p_1_none + self.p_2_uniform + self.p_3_last1yr:
            mask_mode = "last1yr"
        elif r < self.p_1_none + self.p_2_uniform + self.p_3_last1yr + self.p_4_last2yr:
            mask_mode = "last2yr"
        else:
            mask_mode = "table"

        # 5) Gather the unmasked data slices, one per table
        full_data = {
            tn: self.table_data_dict[tn][start_idx:end_idx, :]
            for tn in self.table_names
        }

        # 6) Build the mask; it starts all zeros, i.e. nothing masked
        mask = np.zeros((window_length, self.num_tables), dtype=np.float32)

        if mask_mode == "uniform":
            # Each (month, table) cell is masked independently with probability p_uniform
            random_matrix = np.random.rand(window_length, self.num_tables)
            mask[random_matrix < self.p_uniform] = 1.0

        elif mask_mode == "last1yr":
            # Mask the final 12 months (the whole window if it is shorter)
            omit_start = max(0, window_length - 12)
            mask[omit_start:, :] = 1.0

        elif mask_mode == "last2yr":
            # Mask the final 24 months (the whole window if it is shorter)
            omit_start = max(0, window_length - 24)
            mask[omit_start:, :] = 1.0

        elif mask_mode == "table":
            # Mask between 1 and num_tables distinct tables for every timestep
            n_mask_tables = np.random.randint(1, self.num_tables + 1)
            table_indices_to_mask = np.random.choice(self.num_tables,
                                                     size=n_mask_tables,
                                                     replace=False)
            mask[:, table_indices_to_mask] = 1.0

        # 7) Return the result
        return {
            "full_data": full_data,      # Dict[str, np.ndarray]
            "mask": mask                 # np.ndarray of shape (window_length, num_tables)
        }


In [16]:
# Training-split dataset drawing windows of 1 to 5 years.
# No seed is passed, so the sample below differs from run to run.
dataset = EconDataset(
    table_data_dict=table_data_dict,
    monthly_dates=monthly_dates,
    min_window_length_year=1,
    max_window_length_year=5,
    train=True)

# EconDataset.__getitem__ ignores the index: this is simply one random draw.
x = dataset[1]

In [21]:
# Inspect one table's slice of the sampled window; NaN marks cells with no value.
x["full_data"]["SALAIRES-ANNUELS"]

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=float32)

In [145]:
def econ_collate_fn(batch):
    """
    Collate a list of samples into per-table batched float32 tensors.

    batch: list of B samples, each a dict
        {
          "full_data":   {table_name -> (L, k_i) array},
          "masked_data": {table_name -> (L, k_i) array}
        }
      Table names are taken from the first sample. All samples must share the
      same L for a given table, because the tensors are combined with
      torch.stack.

    Returns a dict:
      {
        "full_data":   {table_name -> (B, L, k_i)}, NaNs preserved,
        "masked_data": {table_name -> (B, L, k_i)}, NaNs replaced by 0.0,
        "mask":        {table_name -> (B, L, k_i)}, 1.0 where full_data is not NaN
      }
    """
    collated_full = {}
    collated_input = {}
    collated_valid = {}

    for name in batch[0]["full_data"].keys():
        truths, inputs, valids = [], [], []

        for item in batch:
            truth = torch.tensor(item["full_data"][name], dtype=torch.float32)
            observed = torch.tensor(item["masked_data"][name], dtype=torch.float32)

            # Ground truth is usable wherever it is not NaN; stored as float 0/1
            valids.append(torch.isnan(truth).logical_not().float())
            # The model input must be NaN-free: missing cells become 0.0
            inputs.append(
                torch.where(torch.isnan(observed), torch.zeros_like(observed), observed)
            )
            truths.append(truth)

        # Stack along a new leading batch dimension => (B, L, k_i)
        collated_full[name] = torch.stack(truths, dim=0)
        collated_input[name] = torch.stack(inputs, dim=0)
        collated_valid[name] = torch.stack(valids, dim=0)

    return {
        "full_data": collated_full,
        "masked_data": collated_input,
        "mask": collated_valid
    }


In [146]:
class TableEmbedding(nn.Module):
    """Two-layer perceptron (Linear -> ReLU -> Linear) applied to the last axis of a table."""

    def __init__(self, k_in, embed_dim):
        """
        k_in: size of the input's last dimension
        embed_dim: size of the hidden layer and of the output
        """
        super(TableEmbedding, self).__init__()
        self.l1 = nn.Linear(k_in, embed_dim)
        self.l2 = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """
        x: (B, L, k_in)
        returns: (B, L, embed_dim)
        """
        hidden = F.relu(self.l1(x))
        return self.l2(hidden)


In [147]:
import torch
import torch.nn as nn

class DoubleAttention(nn.Module):
    """
    Perform two-step self-attention on data of shape (B, L, N, E):
    1) Attention over the N dimension (tables), independently for every (batch, time) pair
    2) Attention over the L dimension (time), independently for every (batch, table) pair

    The second step consumes the output of the first. No attention mask or
    key-padding mask is applied.
    """
    def __init__(self, embed_dim, num_heads):
        """
        embed_dim: embedding size E of the input
        num_heads: number of attention heads used by both attention modules
        """
        super(DoubleAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads

        # Two independent MultiheadAttention modules:
        # - attnN: handles attention across N
        # - attnL: handles attention across L
        self.attnN = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.attnL = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)

    def forward(self, x):
        """
        x: Tensor of shape (B, L, N, E); it does not need to be contiguous
        returns: contiguous Tensor of shape (B, L, N, E) after double attention
        """
        B, L, N, E = x.shape

        # --- Attention across N ---
        # Fold (B, L) into the batch axis so that N is the sequence length.
        # reshape (rather than view) also accepts non-contiguous inputs such as
        # permuted tensors, on which view raises a RuntimeError.
        per_step = x.reshape(B * L, N, E)
        attn_outN, _ = self.attnN(per_step, per_step, per_step)  # (B*L, N, E)

        # Back to (B, L, N, E), then swap L and N => (B, N, L, E)
        xN = attn_outN.reshape(B, L, N, E).permute(0, 2, 1, 3)

        # --- Attention across L ---
        # Fold (B, N) into the batch axis so that L is the sequence length
        per_table = xN.reshape(B * N, L, E)
        attn_outL, _ = self.attnL(per_table, per_table, per_table)  # (B*N, L, E)

        # Back to (B, N, L, E), then restore the (B, L, N, E) layout
        return attn_outL.reshape(B, N, L, E).permute(0, 2, 1, 3).contiguous()

In [148]:
class TableDecoder(nn.Module):
    """Single linear projection from the embedding space back to a table's columns."""

    def __init__(self, embed_dim, k_out):
        """
        embed_dim: size of the input's last dimension
        k_out: size of the output's last dimension
        """
        super(TableDecoder, self).__init__()
        self.linear = nn.Linear(embed_dim, k_out)

    def forward(self, x):
        """
        x: (B, L, embed_dim)
        returns: (B, L, k_out)
        """
        projected = self.linear(x)
        return projected


In [149]:
class EconModel(nn.Module):
    """
    Per-table embedding -> shared transformer core -> per-table decoding.

    NOTE(review): the core is built from Flattened2DTransformer, which is not
    defined in the visible part of this notebook; it must be in scope before
    this class is instantiated.
    """

    def __init__(self, table_names, table_shapes,
                 embed_dim=32, n_heads=4, ff_dim=128, num_layers=2):
        """
        table_names: list of table names, fixing the table order along the N axis
        table_shapes: list of column counts k_i, aligned with table_names
        embed_dim: embedding size E shared by all tables
        n_heads, ff_dim, num_layers: forwarded to the transformer core
        """
        super().__init__()

        self.table_names = table_names
        self.N = len(table_names)

        self.table_embeds = nn.ModuleDict()
        self.table_decoders = nn.ModuleDict()

        # One embedding and one decoder per table, each sized to that table's k_i
        for tn, k_in in zip(table_names, table_shapes):
            self.table_embeds[tn] = TableEmbedding(k_in, embed_dim)
            self.table_decoders[tn] = TableDecoder(embed_dim, k_in)

        # 2D Transformer core
        self.core_transformer = Flattened2DTransformer(
            embed_dim=embed_dim,
            n_heads=n_heads,
            dim_feedforward=ff_dim,
            num_layers=num_layers
        )
        self.embed_dim = embed_dim

    def forward(self, batch_data):
        """
        batch_data: dict containing at least
          {
            "masked_data": {tn -> (B, L, k_i)}, with 0 where a value is missing
          }
        Only "masked_data" is read here; the loss against "full_data" is
        computed by the caller.

        Returns a dict {tn -> (B, L, k_i)} of predictions.
        """
        masked_inputs = batch_data["masked_data"]

        # 1) In a single pass over the tables, embed each one => (B, L, E) and
        #    record which (batch, time) rows are entirely zero. Such rows are
        #    treated as padding, on the assumption that missing cells were set
        #    to 0 upstream.
        embed_list = []
        zero_mask_list = []
        for tn in self.table_names:
            x = masked_inputs[tn]  # (B, L, k_i)
            embed_list.append(self.table_embeds[tn](x))
            zero_mask_list.append(x.abs().sum(dim=2) == 0.0)  # (B, L) boolean

        # stack => (B, L, N, E)
        embed_stack = torch.stack(embed_list, dim=2)

        # stack => (B, L, N), then flatten to (B, L*N) with time as the outer
        # index and table as the inner one; True marks a padded position
        zero_mask_stacked = torch.stack(zero_mask_list, dim=2)
        key_padding_mask = zero_mask_stacked.reshape(zero_mask_stacked.shape[0], -1)

        # 2) pass through the transformer; the result is indexed below as (B, L, N, E)
        out_2d = self.core_transformer(embed_stack, src_key_padding_mask=key_padding_mask)

        # 3) decode table by table
        decoded = {}
        for i, tn in enumerate(self.table_names):
            table_repr = out_2d[:, :, i, :]  # (B, L, E)
            decoded[tn] = self.table_decoders[tn](table_repr)  # (B, L, k_i)

        return decoded


In [150]:
def masked_mse_loss(pred, target, mask):
    """
    Mean squared error over the entries selected by `mask`.

    pred: (B, L, k_i)
    target: (B, L, k_i); may hold NaN (or any placeholder) wherever mask is 0
    mask: (B, L, k_i)  # 1 where ground truth is valid, 0 where no ground truth
    Returns the sum of masked squared errors divided by mask.sum(), or a zero
    that is still attached to the autograd graph when the mask selects nothing.
    """
    # Neutralise the target wherever it is masked out. Multiplying the squared
    # error by 0 is not enough: NaN * 0 is NaN, and the NaN would also leak
    # into the gradient of `pred`.
    selected = mask != 0
    safe_target = torch.where(selected, target, torch.zeros_like(target))

    sq_err = (pred - safe_target) ** 2 * mask
    valid_count = mask.sum()
    if valid_count > 0:
        return sq_err.sum() / valid_count

    # Nothing to score: return a zero that still depends on `pred`, so that a
    # later .backward() on the accumulated loss does not fail for lack of a graph.
    return sq_err.sum() * 0.0


In [157]:
from tqdm.notebook import tqdm


def train_econ_model(csv_file_paths,
                     epochs=5,
                     batch_size=8,
                     window_length=60,
                     embed_dim=32,
                     lr=1e-3,
                     p_mask_year=0.2,
                     p_mask_partial=0.3,
                     p_mask_none=0.5):
    """
    Full pipeline:
    1) Read & scale data with scikit-learn
    2) Create train/test datasets
    3) Model + optimizer
    4) Training loop with masked MSE

    :param csv_file_paths: list of str, data CSVs (one per table)
    :param epochs: number of passes over the training loader
    :param batch_size: batch size for both loaders
    :param window_length: forwarded to EconDataset as `window_length`
    :param embed_dim: embedding size passed to EconModel
    :param lr: Adam learning rate
    :param p_mask_year, p_mask_partial, p_mask_none: forwarded to EconDataset
    :return: the trained EconModel

    NOTE(review): this function does not match the other definitions visible in
    this notebook and looks like it targets an earlier version of them:
      - read_and_scale_tables requires `meta_file_paths` and returns four values;
      - EconDataset.__init__ has no `window_length` or `p_mask_*` parameters;
      - econ_collate_fn reads sample["masked_data"], while
        EconDataset.__getitem__ returns only "full_data" and "mask".
    """
    # 1) read & scale
    # NOTE(review): no meta file paths are passed and only three values are
    # unpacked, which disagrees with read_and_scale_tables as defined above.
    table_data_dict, scalers, monthly_dates = read_and_scale_tables(
        csv_file_paths,
        start_date="1960-01-01",
        end_date="2024-01-01",
        train_cutoff_str="2018-01-01"
    )

    # 2) create datasets
    # NOTE(review): these keyword arguments are not accepted by the EconDataset
    # class defined in this notebook.
    train_dataset = EconDataset(
        table_data_dict,
        monthly_dates,
        window_length=window_length,
        train=True,
        test_start_date="2018-01-01",
        p_mask_year=p_mask_year,
        p_mask_partial=p_mask_partial,
        p_mask_none=p_mask_none
    )

    test_dataset = EconDataset(
        table_data_dict,
        monthly_dates,
        window_length=window_length,
        train=False,
        test_start_date="2018-01-01",
        p_mask_year=p_mask_year,
        p_mask_partial=p_mask_partial,
        p_mask_none=p_mask_none
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=econ_collate_fn
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=econ_collate_fn
    )

    # 3) model + optimizer
    table_names = list(table_data_dict.keys())
    # Column count k_i of every table, in the same order as table_names
    table_shapes = [table_data_dict[tn].shape[1] for tn in table_names]

    model = EconModel(
        table_names,
        table_shapes,
        embed_dim=embed_dim,
        n_heads=4,
        ff_dim=128,
        num_layers=2
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # 4) Training loop
    for ep in range(epochs):
        model.train()
        total_train_loss = 0.0
        for b_idx, batch_data in tqdm(enumerate(train_loader), desc="Training"):
            # move data to device
            for tn in batch_data["full_data"]:
                batch_data["full_data"][tn] = batch_data["full_data"][tn].to(device)
                batch_data["masked_data"][tn] = batch_data["masked_data"][tn].to(device)
                batch_data["mask"][tn] = batch_data["mask"][tn].to(device)

            optimizer.zero_grad()
            outputs = model(batch_data)  # dict {tn -> (B, L, k_i)}

            # compute loss: sum of the per-table masked MSEs
            loss_val = 0.0
            for tn in table_names:
                pred = outputs[tn]
                tgt = batch_data["full_data"][tn]
                msk = batch_data["mask"][tn]
                loss_val += masked_mse_loss(pred, tgt, msk)

            loss_val.backward()
            optimizer.step()

            total_train_loss += loss_val.item()

        # b_idx is the last batch index, so b_idx + 1 is the number of batches.
        # NOTE(review): b_idx is undefined here if the loader yields no batch.
        avg_train_loss = total_train_loss / (b_idx + 1)

        # Evaluate
        model.eval()
        total_test_loss = 0.0
        with torch.no_grad():
            for b_idx, batch_data in tqdm(enumerate(test_loader), desc="Testing"):
                for tn in batch_data["full_data"]:
                    batch_data["full_data"][tn] = batch_data["full_data"][tn].to(device)
                    batch_data["masked_data"][tn] = batch_data["masked_data"][tn].to(device)
                    batch_data["mask"][tn] = batch_data["mask"][tn].to(device)

                outputs = model(batch_data)
                loss_val = 0.0
                for tn in table_names:
                    pred = outputs[tn]
                    tgt = batch_data["full_data"][tn]
                    msk = batch_data["mask"][tn]
                    loss_val += masked_mse_loss(pred, tgt, msk)

                total_test_loss += loss_val.item()

            avg_test_loss = total_test_loss / (b_idx + 1)

        print(f"Epoch {ep+1}/{epochs} - Train Loss: {avg_train_loss:.4f} | Test Loss: {avg_test_loss:.4f}")

    print("Training complete!")
    return model


In [158]:
import json

# Load the table index. Iterating over it yields the table names, which are
# also used as the CSV file stems under Data/. JSON text is UTF-8, so the
# encoding is pinned instead of relying on the platform default.
with open("Data/all_data.json", "r", encoding="utf-8") as f:
    all_data = json.load(f)

# One CSV path per table, in the index's iteration order.
csv_file_paths = [f"Data/{table_name}.csv" for table_name in all_data]

# Keep whatever train_econ_model returns bound to a name so the result of the
# (long) training run stays reachable from later cells instead of existing
# only as this cell's output.
econ_model = train_econ_model(csv_file_paths)

Reading BALANCE-PAIEMENTS
Dropping 0 columns with all NaNs before cutoff
Reading CHOMAGE-TRIM-NATIONAL
Dropping 0 columns with all NaNs before cutoff
Reading CLIMAT-AFFAIRES
Dropping 0 columns with all NaNs before cutoff
Reading CNA-2020-CONSO-MEN
Dropping 1 columns with all NaNs before cutoff
Reading CNA-2020-CONSO-SI
Dropping 0 columns with all NaNs before cutoff
Reading CNA-2020-CPEB
Dropping 0 columns with all NaNs before cutoff
Reading CNA-2020-CSI
Dropping 0 columns with all NaNs before cutoff
Reading CNA-2020-EMPLOI
Dropping 0 columns with all NaNs before cutoff
Reading CNA-2020-ERE
Dropping 2 columns with all NaNs before cutoff
Reading CNA-2020-FBCF-SI
Dropping 0 columns with all NaNs before cutoff
Reading CNA-2020-PIB
Dropping 0 columns with all NaNs before cutoff
Reading CNA-2020-TEI
Dropping 0 columns with all NaNs before cutoff
Reading CNT-2020-CB
Dropping 0 columns with all NaNs before cutoff
Reading CNT-2020-CSI
Dropping 0 columns with all NaNs before cutoff
Reading CNT-2



Training: 0it [00:00, ?it/s]

KeyboardInterrupt: 

In [28]:
import os
import requests
from dotenv import load_dotenv

# load_dotenv() reads variables from a local .env file (if one exists) into
# the process environment, where NYT_KEY is then looked up.
load_dotenv()
key = os.getenv("NYT_KEY")
if not key:
    # Fail here rather than sending "api-key=None" and getting an opaque
    # authentication error back from the API.
    raise RuntimeError("NYT_KEY is not set; add it to the environment or a .env file")

# Archive endpoint for 1970/1 (year and month are part of the URL path).
# The key is passed via `params` so requests handles the URL encoding, and the
# timeout keeps a stalled connection from hanging the kernel indefinitely.
result = requests.get(
    "https://api.nytimes.com/svc/archive/v1/1970/1.json",
    params={"api-key": key},
    timeout=60,
)

# Surface HTTP errors (bad key, rate limiting, ...) at the request site instead
# of as a KeyError when later cells index into result.json().
result.raise_for_status()

In [42]:
# Decode the response body once and reuse it: every result.json() call parses
# the whole payload again.
docs = result.json()['response']['docs']

# Show one sample document to inspect the available fields.
print(docs[10])

# Scan the first 500 documents for a headline containing the placeholder
# title below and print the first match.
for d in docs[:500]:
    if 'Front Page 2 -- No Title' in d['headline']['main']:
        print(d)
        break

# Keep only documents that carry a 'print_page' value below 3.
filtered_articles = [d for d in docs if 'print_page' in d and int(d['print_page']) < 3]
print(len(filtered_articles))


{'abstract': "Prosecution of charges involving ousted Water Supply, Gas and Electricity Dept Comr Marcus continues; M Kaufman indicted for '68 perjury concerning attempt to bribe City Planning Comm member to delay bldg application by competitor S Sommer; is charged with denying bribe in testimony before grand jury probing incident and role of Kaufman, Marcus, informer H Itkin, real estate operator R Elyachar 'and others'; Itkin has testified that city official shared $10,000 payoff with him and Marcus; Elyachar pleaded guilty to perjury last Sept, is reptd cooperating with probe; Kaufman pleads not guilty", 'web_url': 'https://www.nytimes.com/1970/01/01/archives/builder-is-accused-of-perjury-in-cityplanning-bribery-case.html', 'snippet': "Prosecution of charges involving ousted Water Supply, Gas and Electricity Dept Comr Marcus continues; M Kaufman indicted for '68 perjury concerning attempt to bribe City Planning Comm member to delay bldg application by competitor S Sommer; is cha..."

In [12]:
# Report how many documents the archive response contains.
n_docs = len(result.json()['response']['docs'])
print(n_docs)

6560


In [13]:
# Headers sent with article-page requests: a Googlebot user agent string and a
# Google referer. Reused by later cells that fetch article HTML.
headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Referer': 'https://www.google.com/',
}

# Fetch one archive article page. requests applies no timeout by default, so
# one is set explicitly to keep a stalled connection from blocking the kernel.
response = requests.get(
    "https://www.nytimes.com/1979/01/01/archives/israelis-decide-to-continue-talks-with-egypt-on-treaty-israel-would.html",
    headers=headers,
    timeout=30,
)

In [14]:
# Dump the raw HTML of the fetched article page for manual inspection.
page_html = response.text
print(page_html)

<!DOCTYPE html>
<html lang="en" class=" story nytapp-vi-article "  xmlns:og="http://opengraphprotocol.org/schema/">
  <head>
    
    
    <meta charset="utf-8" />
    <title data-rh="true">Israelis Decide to Continue Talks With Egypt on Treaty - The New York Times</title>
    <meta data-rh="true" name="robots" content="noarchive, max-image-preview:large"/><meta data-rh="true" name="description" content="Begins says Israel will continue talks with Egypt; illus (M)"/><meta data-rh="true" property="twitter:url" content="https://www.nytimes.com/1979/01/01/archives/israelis-decide-to-continue-talks-with-egypt-on-treaty-israel-would.html"/><meta data-rh="true" property="twitter:title" content="Israelis Decide to Continue Talks With Egypt on Treaty (Published 1979)"/><meta data-rh="true" property="twitter:description" content="Begins says Israel will continue talks with Egypt; illus (M)"/><meta data-rh="true" property="twitter:image" content="https://static01.nyt.com/newsgraphics/images/icon

In [24]:
from newspaper import Article

# Walk every article URL in the archive response and print its title and text.
for url in [x['web_url'] for x in result.json()['response']['docs']]:
    # Fetch the HTML ourselves so the custom headers are sent; newspaper is
    # then used for parsing only (download() is given the HTML directly).
    try:
        page = requests.get(url, headers=headers, timeout=30)
        page.raise_for_status()
    except requests.RequestException as exc:
        # One unreachable or blocked page should not abort the whole crawl,
        # and an HTTP error page should not be parsed as if it were an article.
        print(f"Skipping {url}: {exc}")
        continue

    html = page.text
    article = Article(url)
    article.download(input_html=html)
    article.parse()
    # No article.nlp() call: only the title and text are printed below, and
    # both are populated by parse().
    print("------------------")
    print(article.title)
    print(article.text)
    print()


------------------
Israelis Decide to Continue Talks With Egypt on Treaty
A low point was reached on Dec. 15 when the Israeli Cabinet, in a unanimous vote, appeared to close the door to further negotiations by rejecting all Egyptian proposals to amend the draft treaty. Since then, Mr. Begin has said that Israel is willing to discuss some but not all the Egyptian demands. The Government decision today and Prime Minister Begin's later remarks are part of the gradual movement back to the negotiating table.

Israel Would Review Security

In announcing agreement to resume the talks, Mr. Begin told reporters that Israel was prepared to discuss with Egypt its demand to review security arrangments in the Sinai Peninsula five years after a peace treaty is signed. Sinai, now under Israeli occupation, would be returned to Egypt under the peace treaty, but Egypt wants to eventually renegotiate the size of the military force it can deploy there.

Mr. Begin also said that Israel was prepared to disc

KeyboardInterrupt: 

In [29]:
# Collect the web URL of every document in the archive response, in order.
urls = [
    doc['web_url']
    for doc in result.json()['response']['docs']
]


In [30]:
# Preview at most the first 20 collected URLs next to their positions.
for i, url in zip(range(20), urls):
    print(f"{i} {url}")

0 https://dealbook.nytimes.com/2006/06/21/facebook-and-that-2-billion/
1 https://www.nytimes.com/1970/01/01/archives/hail-and-farewell.html
2 https://www.nytimes.com/1970/01/01/archives/front-page-2-no-title.html
3 https://www.nytimes.com/1970/01/01/archives/mississippi-adds-3-negroes-to-storm-relief-unit-governor-moves-to.html
4 https://www.nytimes.com/1970/01/01/archives/icc-aide-is-named-as-acting-chairman.html
5 https://www.nytimes.com/1970/01/01/archives/letters-to-the-editor-of-the-times.html
6 https://www.nytimes.com/1970/01/01/archives/traffic-snarled-by-freezing-rain-road-and-rail-facilities-are.html
7 https://www.nytimes.com/1970/01/01/archives/business-tax-bills-signed-by-shafer.html
8 https://www.nytimes.com/1970/01/01/archives/article-4-no-title.html
9 https://www.nytimes.com/1970/01/01/archives/cigarette-maker-loses-courtr-test-set-back-in-dispute-over-tv.html
10 https://www.nytimes.com/1970/01/01/archives/builder-is-accused-of-perjury-in-cityplanning-bribery-case.html
11