In [1]:
import os
# Non-secret runtime settings, exported before the project imports further down
# (presumably read by the project's config modules at import time -- confirm).
os.environ.update({
    'ENV': 'prod',
    'REGION': 'apse1',
    'TENANT': 'in',
    'RECO_S3_BUCKET': 'p13n-reco-offline-prod',
    'COUNTRY_KEY': 'in',
    'AWS_REGION': 'ap-southeast-1',
    'USE_REAL_CMS3': 'True',
})

# SECURITY: an RSA private key used to be assigned to RECO_CREDENTIAL right here in
# plain text. A key that has been stored in a notebook must be treated as leaked and
# rotated. The credential is now expected to come from outside the notebook (kernel
# environment, secrets manager, ...) and is never written into the source.
if not os.environ.get('RECO_CREDENTIAL'):
    import warnings

    warnings.warn(
        "RECO_CREDENTIAL is not set. Export it in the kernel environment (or load it "
        "from your secrets manager) before running this notebook; it is intentionally "
        "no longer embedded in the source.",
        RuntimeWarning,
    )

import argparse, gc
import json
import os, time
import numpy as np
import s3fs, pickle
import pyarrow
import tensorflow as tf
from tqdm import tqdm

# Run TensorFlow in 1.x graph/session mode; the rest of the cell uses sessions.
tfv1 = tf.compat.v1
tfv1.disable_v2_behavior()

# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
# On a machine without GPUs the list is empty and the loop is a no-op.
physical_devices = tf.config.list_physical_devices('GPU')
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)

import tensorflow_addons as tfa
import tensorflow_recommenders_addons as tfra
from model.losses import masked_binary_entropy_loss
from model.metrics import MaskedAUC

from common.config.utils import data_path, model_path
from common.config import TENANT
from tpfy.tf_model.tpfy_model_v3_mtl import TpfyModelV3, TpfyMtlModelConfig
from tpfy.etl.schema import TpfyMtlDatasetSchema
from model.parquet_dataset import TFParquetDataset
from tpfy.common import TpfyDataPath
from omegaconf import OmegaConf
from dataclasses import dataclass
from tpfy.train_v3_mtl import make_example_mtl, TpfyTrainConfig, TpfyConfig
from tpfy.helper import load_model_weights_from_s3

def create_dataset(date, path = None):
    """Build the batched tf.data pipeline of MTL examples.

    Args:
        date: Value substituted (together with the global ``variant``) into the
            extracted-examples path template when ``path`` is not given.
        path: Optional explicit dataset location; when truthy it is used as-is and
            the template is not consulted.

    Returns:
        The dataset produced by ``TFParquetDataset.create_tf_dataset`` with the
        global ``batch_size``, mapped through ``make_example_mtl``.
    """
    if not path:
        # Template lookup is keyed by tenant; it carries two %-placeholders.
        template = data_path(
            TpfyDataPath.S3_TPFY_IMPR_V3_DAILY_MTL_EXTRACTED_EXAMPLES, TENANT
        )
        path = template % (variant, date)

    parquet_source = TFParquetDataset([path], TpfyMtlDatasetSchema, shuffle_files=True)
    batched = parquet_source.create_tf_dataset(batch_size)
    return batched.map(make_example_mtl)

def load_and_compile_model():
    """Instantiate ``TpfyModelV3`` from the global ``hparams``/``args`` and compile it.

    The model is compiled with one masked binary cross-entropy loss and one masked
    AUC metric per task, and an AdamW optimizer wrapped for dynamic embeddings.

    Returns:
        The compiled model instance.
    """
    banner = '=' * 80
    print(f"\n{banner}")
    print("BUILDING MODEL")
    print(banner)

    tpfy_model = TpfyModelV3(
        hparams.model,
        click_ns=args.click_ns,
        enable_random_watch=hparams.train.enable_random_watch,
    )

    # compile() needs an optimizer even though no training happens here, so the
    # weight decay and learning rate values are placeholders.
    base_optimizer = tfa.optimizers.AdamW(
        weight_decay=0.0,
        learning_rate=0.001,
        epsilon=1e-4,
    )

    # Task name -> value passed as ``from_logits`` to that task's loss and metric.
    from_logits_by_task = {
        "click": True,
        "watch": True,
        "random_watch": False,
        "paywall_view": True,
        "add_watchlist": True,
    }
    losses = {
        task: masked_binary_entropy_loss(from_logits=flag)
        for task, flag in from_logits_by_task.items()
    }
    metrics = {
        task: MaskedAUC(from_logits=flag)
        for task, flag in from_logits_by_task.items()
    }

    wrapped_optimizer = tfra.dynamic_embedding.DynamicEmbeddingOptimizer(base_optimizer)
    tpfy_model.compile(optimizer=wrapped_optimizer, loss=losses, metrics=metrics)
    print("Model compiled")

    return tpfy_model

def get_activations_and_labels(next_batch, last_layer_tensor):
    """Fetch one batch's labels, metadata and layer activations in a single step.

    Labels, metadata and activations are requested in ONE ``session.run`` call.
    Running them in separate calls (as this function used to) lets the iterator
    advance between calls, and the fetched features were never fed back, so the
    activations did not belong to the labels/metadata they were returned with.

    For the activations to follow the batch, ``last_layer_tensor`` must be computed
    from the feature tensors of ``next_batch``; a tensor built on constant inputs
    will not vary with the iterator.

    Args:
        next_batch: ``(features, labels, metadata)`` structure of graph tensors, as
            returned by the dataset iterator's ``get_next()``.
        last_layer_tensor: Graph tensor holding the layer output to extract.

    Returns:
        Tuple ``(activation_values, labels, metadata)`` of evaluated values.

    Raises:
        Whatever ``session.run`` raises, including the end-of-data error of an
        exhausted iterator.
    """
    # The feature values themselves are not needed on the host, only the model's
    # activation computed from them, so they are left out of the fetch list.
    _, label_tensors, metadata_tensors = next_batch
    labels, metadata, activation_values = session.run(
        [label_tensors, metadata_tensors, last_layer_tensor]
    )
    return activation_values, labels, metadata

def compute_A_b(A, b, arm_index, next_batch, last_layer_tensor, lambda_reg=1.0):
    """Fold one batch into the per-arm (disjoint) LinUCB statistics.

    For every row whose ``click`` label is not ``-1`` the L2-normalised activation
    ``x`` is added to its content's arm: ``A[arm] += x xᵀ`` and ``b[arm] += x * y``.
    Arms seen for the first time are created as ``lambda_reg * I`` and a zero vector,
    with the dimension taken from the activations themselves.

    ``A``, ``b`` and ``arm_index`` are updated in place and also returned.

    Args:
        A: List of ``(d, d)`` arrays, one per arm.
        b: List of ``(d,)`` arrays, one per arm.
        arm_index: Dict mapping content id -> position in ``A``/``b``.
        next_batch: Iterator output passed through to ``get_activations_and_labels``.
        last_layer_tensor: Activation tensor passed through likewise.
        lambda_reg: Ridge term used to initialise a new arm's matrix.

    Returns:
        ``(A, b, arm_index)`` -- always a 3-tuple, including when the batch has no
        valid label (the early exit used to return ``A`` alone, which broke callers
        that unpack three values).
    """
    activation_values, labels, metadata = get_activations_and_labels(next_batch, last_layer_tensor)

    # Flatten to one entry per example so (B,) and (B, 1) label layouts both work.
    clicks = np.asarray(labels['click']).reshape(-1)
    valid = clicks != -1  # -1 marks rows to be skipped
    if not valid.any():
        return A, b, arm_index

    H = np.asarray(activation_values)[valid]
    y = clicks[valid]
    content_ids = np.asarray(metadata['content_id']).reshape(-1)[valid]

    # Unit-normalise each activation row; the epsilon guards all-zero rows.
    H = H / (np.linalg.norm(H, axis=1, keepdims=True) + 1e-8)
    d = H.shape[1]

    unique_ids, inverse = np.unique(content_ids, return_inverse=True)
    inverse = inverse.reshape(-1)
    for u_idx, cid in enumerate(unique_ids):
        if cid not in arm_index:
            arm_index[cid] = len(A)
            A.append(lambda_reg * np.eye(d))
            b.append(np.zeros(d))

        rows = inverse == u_idx
        H_arm = H[rows]
        slot = arm_index[cid]
        # H_armᵀ H_arm is the sum of the rows' outer products, without building a
        # (B, d, d) intermediate.
        A[slot] += H_arm.T @ H_arm
        b[slot] += H_arm.T @ y[rows]

    return A, b, arm_index

def _save_linucb_state(out_dir, step, A, b, arm_index):
    """Write the per-arm matrices, vectors and the arm lookup under step-tagged names."""
    np.save(f'{out_dir}/A_{step}.npy', A)
    np.save(f'{out_dir}/b_{step}.npy', b)
    with open(f'{out_dir}/arm_index_{step}.pkl', 'wb') as handle:
        pickle.dump(arm_index, handle)


def run(args):
    """Stream the dataset through the model and accumulate per-arm LinUCB statistics.

    Steps: build the dataset iterator, build the model on the iterator's feature
    tensors, initialise variables/tables, restore weights from S3, look up the
    requested layer's output tensor by name, then call ``compute_A_b`` once per batch
    until the iterator is exhausted. State is written to disk every 1000 batches and
    once more at the end.

    Args:
        args: Parsed CLI namespace; reads ``date``, ``model_name``, ``checkpoint``,
            ``clear_nn`` and ``layer_name``.

    Returns:
        ``(A, b, arm_index)`` as accumulated over the whole dataset.
    """
    tf_dataset = create_dataset(args.date)
    iterator = tf_dataset.make_one_shot_iterator()
    next_batch = iterator.get_next()
    features, _, _ = next_batch

    tpfy_model = load_and_compile_model()

    # Build the graph on the iterator's *symbolic* features. Calling the model on an
    # evaluated (numpy) sample batch would make the layer output independent of the
    # iterator, so every later evaluation would return the same activations. For the
    # activations to line up with a batch's labels, both must be fetched in the same
    # session.run call.
    tpfy_model(features, training=False)
    session.run([
        tfv1.global_variables_initializer(),
        tfv1.local_variables_initializer(),
        tfv1.tables_initializer()
    ])

    plain_weights = load_model_weights_from_s3(
        args.model_name,
        use_s3=True,
        checkpoint_name=args.checkpoint
    )
    # Checkpoint keys carry a 'train/' component that the restore ops do not expect.
    plain_weights = {k.replace('train/', ''): v for k, v in plain_weights.items()}
    restore_ops = tpfy_model.restore_plain_weights_ops(
        plain_weights,
        clear_nn=args.clear_nn
    )
    session.run(restore_ops)

    layer_tensor = tfv1.get_default_graph().get_tensor_by_name(
        f'tpfy_model_v3/deepfm/{args.layer_name}:0'
    )

    out_dir = f'disjoint_neural_linucb/disjoined_neural_linUCB_offline_matrices_{args.date}_{args.checkpoint}'
    os.makedirs(out_dir, exist_ok=True)

    arm_index = {}          # content_id -> position in A / b
    A = []
    b = []

    run_ = 0
    start = time.time()
    while True:
        if run_ and run_ % 100 == 0:
            if run_ % 1000 == 0:
                _save_linucb_state(out_dir, run_, A, b, arm_index)
            gc.collect()
            print(f'Length of arm index : {len(arm_index)}')
            print(f'Run {run_} completed in {time.time() - start} s!')
            start = time.time()
        try:
            # compute_A_b appends to / adds into A, b and arm_index in place, so the
            # return value is not needed to carry state across iterations.
            compute_A_b(A, b, arm_index, next_batch, layer_tensor)
        except tf.errors.OutOfRangeError:
            print("End of dataset reached")
            break
        run_ += 1

    # Persist the final state too; otherwise everything accumulated after the last
    # multiple-of-1000 checkpoint would be lost.
    _save_linucb_state(out_dir, run_, A, b, arm_index)
    return A, b, arm_index
        
if __name__ == '__main__':
    # ---- command line -------------------------------------------------------
    parser = argparse.ArgumentParser(description="TPFY Exploration offline Training.")
    parser.add_argument("model_name", type=str)
    parser.add_argument("date", type=str)
    parser.add_argument("--click_ns", default=0.08, type=float)
    parser.add_argument("--variant", default="cms3", type=str)
    parser.add_argument("--batch_size", default=512, type=int)
    parser.add_argument("--clear_nn", action="store_true")
    parser.add_argument("--checkpoint", type=str)
    parser.add_argument("--layer_name", default='Relu', type=str)
    args = parser.parse_args()

    # ---- hyper-parameters ---------------------------------------------------
    # The tenant-specific YAML is merged over the structured TpfyConfig schema.
    config_name = f"tpfy/tpfy_config/mtl-{TENANT}.yaml"
    if not os.path.exists(config_name):
        raise FileNotFoundError(f"Config file {config_name} not found")

    hparams: TpfyConfig = OmegaConf.merge(
        OmegaConf.structured(TpfyConfig), OmegaConf.load(config_name)
    )
    print(f"\nLoaded config: {config_name}")

    # A non-zero --batch_size takes precedence over the value from the YAML.
    if args.batch_size:
        hparams.train.batch_size = args.batch_size
    batch_size = hparams.train.batch_size
    print(f"Batch size: {batch_size}")

    # ---- dataset variant ----------------------------------------------------
    # A non-empty variant is stored with a leading "-"; an empty one is kept as-is.
    variant = (
        args.variant
        if (not args.variant or args.variant.startswith("-"))
        else "-" + args.variant
    )

    # ---- go -----------------------------------------------------------------
    # Module-level names set above (args, hparams, batch_size, variant, session) are
    # read as globals by the functions defined earlier in this cell.
    session = tfv1.keras.backend.get_session()
    print("Start training")
    run(args)

2026-02-13 13:35:44.767551: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2026-02-13 13:35:44.767579: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Instructions for updating:
non-resource variables are not supported in the long term


2026-02-13 13:35:45.748181: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2026-02-13 13:35:45.748205: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2026-02-13 13:35:45.748220: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ip-10-11-72-173): /proc/driver/nvidia/version does not exist


In [207]:
# Reload the statistics checkpointed at step 2000 of the run for date 2026-02-09 and
# checkpoint 1770723470 (files are written by run() as A_<step>.npy / b_<step>.npy).
A = np.load('disjoint_neural_linucb/disjoined_neural_linUCB_offline_matrices_2026-02-09_1770723470/A_2000.npy')
b = np.load('disjoint_neural_linucb/disjoined_neural_linUCB_offline_matrices_2026-02-09_1770723470/b_2000.npy')

In [208]:
# Sanity check: expect one (d, d) matrix in A and one length-d vector in b per arm.
A.shape, b.shape

((6884, 128, 128), (6884, 128))

In [209]:
# Peek at the accumulated per-arm vectors (each row is the sum of activation * click label
# over that arm's valid rows; an all-zero row presumably means no positive label yet -- confirm).
b

array([[ 3.95477964,  0.60653103,  0.85504176, ...,  1.19097164,
         4.38013578,  0.64074258],
       [17.30481672,  1.89125678,  6.88386854, ...,  5.50487728,
        18.29434566,  1.98321564],
       [ 2.86945255,  0.38247161,  1.08902032, ...,  0.9394835 ,
         3.60963462,  0.1999401 ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.06999478,  0.        ,  0.        , ...,  0.        ,
         0.12473621,  0.        ]])