In [1]:
import sys
import os
import json
import tensorflow as tf
import tqdm
from models.model import *
from ltv_utils import *
from losses.custom_loss import *
# Render floats in pandas output with 4 decimal places (change the format spec to adjust).
# NOTE(review): `pd` is not imported explicitly in this cell; it presumably arrives via one
# of the star imports above - confirm, or add `import pandas as pd`.
pd.set_option('display.float_format', '{:.4f}'.format)
import warnings
# Hide every UserWarning for the rest of the session.
warnings.simplefilter(action='ignore', category=UserWarning)



def parse_function(serialized_example):
    """Decode one serialized Example record into a dict of tensors.

    Scalar columns are parsed as single values of the listed dtype. The four
    grouped feature vectors are parsed as fixed-length float32 vectors whose
    length is the number of feature names configured for that group in the
    module-level ``group_2_features`` mapping (looked up when this function
    runs, so the mapping must exist before the first call).

    Args:
        serialized_example: A scalar string tensor holding one serialized
            record.

    Returns:
        The dict produced by ``tf.io.parse_single_example``, keyed by the
        feature names declared below.
    """
    # (name, dtype) for every scalar column, in declaration order.
    scalar_columns = (
        ('deviceid', tf.string),
        ('install_date', tf.string),
        ('dim_os_name1', tf.string),
        ('creative_classify1', tf.string),
        ('total_pay_amount1', tf.float32),
        ('channel1', tf.string),
        ('b2_sale_amt_bias', tf.int64),
        ('b2_sale_amt_7d', tf.int64),
        ('install_time', tf.string),
        ('install_order_diff', tf.int64),
        ('all_install_order_7d_diff', tf.int64),
        ('is_a1x_a33', tf.int64),
        ('platform_label', tf.string),
    )
    # Grouped feature vectors; each one is as long as its configured name list.
    vector_groups = (
        'user_dense_price_features',
        'user_dense_duration_features',
        'user_dense_features',
        'user_sparse_features',
    )

    schema = {
        column: tf.io.FixedLenFeature([], dtype)
        for column, dtype in scalar_columns
    }
    for group in vector_groups:
        width = len(group_2_features[group])
        schema[group] = tf.io.FixedLenFeature([width], tf.float32)

    return tf.io.parse_single_example(serialized_example, schema)


# Read the feature-group configuration, then build the train/valid/test datasets
# from the TFRecord part files.
group_2_features = read_feature_json_config('features/feature_list.json')

train_file_name = 'data/ltv_0522_0603_multi_window_model_train/part-r-00000'
valid_file_name = 'data/ltv_0522_0603_multi_window_model_valid/part-r-00000'
test_file_name = 'data/ltv_0522_0603_multi_window_model_test/part-r-00000'

# NOTE(review): 10240 is presumably the batch size expected by the helper - confirm.
train_dataset, valid_dataset, test_dataset = get_trian_valid_test_dateset(
    parse_function,
    10240,
    train_file_name,
    valid_file_name,
    test_file_name,
)

# Expose each configured feature group under its own module-level name.
(
    user_dense_price_features,
    user_dense_duration_features,
    user_dense_features,
    user_sparse_features,
) = (
    group_2_features[group_name]
    for group_name in (
        'user_dense_price_features',
        'user_dense_duration_features',
        'user_dense_features',
        'user_sparse_features',
    )
)


2025-06-30 17:48:35.621064: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-06-30 17:48:35.784442: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-30 17:48:35.789736: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-06-30 17:48:35.789755: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore 

In [2]:
# NOTE(review): MODEL_HOUR is not referenced by any cell visible in this notebook;
# confirm whether it is still needed (e.g. read by an imported helper) or can be removed.
MODEL_HOUR = 0 

In [3]:
def create_tf_dataset(dataset):
    """Turn a batched dict dataset into a ``(features, labels)`` dataset.

    Every batch of ``dataset`` is split into:

    * features - the batch dict without the two label columns;
    * labels - a float32 tensor of shape ``(batch, 2)`` where column 0 is
      ``b2_sale_amt_7d`` with negative values clipped to 0 and column 1 is
      ``total_pay_amount1``.

    Args:
        dataset: An iterable dataset that yields batched dicts containing
            at least ``'b2_sale_amt_7d'`` and ``'total_pay_amount1'``.

    Returns:
        A ``tf.data.Dataset`` built with ``from_generator`` that yields
        ``(features, labels)`` pairs. The feature signature keeps each
        field's dtype and trailing dimensions, with a free batch dimension.
    """
    label_keys = ('b2_sale_amt_7d', 'total_pay_amount1')

    # Peek at a single batch only to learn each feature's dtype and trailing shape.
    sample_batch = next(iter(dataset))
    feature_specs = {
        name: tf.TensorSpec(shape=(None,) + value.shape[1:], dtype=value.dtype)
        for name, value in sample_batch.items()
        if name not in label_keys
    }

    def generator():
        for batch in dataset:
            # Build a new feature dict instead of popping from the upstream batch,
            # so the dict yielded by `dataset` is left untouched.
            features = {
                name: value for name, value in batch.items()
                if name not in label_keys
            }

            b2_7d = tf.cast(tf.reshape(batch['b2_sale_amt_7d'], (-1, 1)), tf.float32)
            # Replace negative 7-day amounts with 0.
            b2_7d = tf.maximum(b2_7d, 0.0)

            # Cast so the packed labels always match the float32 signature below.
            total_amt_1h = tf.cast(
                tf.reshape(batch['total_pay_amount1'], (-1, 1)), tf.float32
            )

            # Labels travel together as one (batch, 2) tensor.
            yield features, tf.concat([b2_7d, total_amt_1h], axis=1)

    output_signature = (
        feature_specs,
        tf.TensorSpec(shape=(None, 2), dtype=tf.float32),
    )

    return tf.data.Dataset.from_generator(generator, output_signature=output_signature)

In [4]:

# Names of the features handed to the model as `emb_features`.
emb_features = [
    'creative_classify',
    'dim_device_manufacture',
    'car_add_type_most',
    'show_order_is_2arrival_latest',
    'selecttirecount_most',
    'show_order_is_2arrival_most',
    'selecttirecount_latest',
    'new_sitename',
    'advsite',
    'car_add_type_latest',
    'platform_level',
    'tire_list_click_avg_index',
    'tire_list_click_most_pid_level',
    'tire_order_page_most_pid_level',
]

# NOTE(review): the meaning of the leading positional arguments (5, [200], [200, 128])
# is defined by MULTI_HEAD_LTV_MODEL, which is not visible here - confirm before tuning.
model = MULTI_HEAD_LTV_MODEL(
    5,
    [200],
    [200, 128],
    'user_dense_features',
    'user_dense_price_features',
    'user_dense_duration_features',
    'user_sparse_features',
    user_sparse_features,
    emb_features,
)

# Pull one training batch and record the shape of every field in it.
sample = next(iter(train_dataset))
input_shape = {field: tensor.shape for field, tensor in sample.items()}


In [5]:
# Project-implemented loss, configured in 'tweedie' mode.
loss_fn = UnifiedLTVLoss('tweedie')

# Stop training once val_loss has not improved for 5 consecutive epochs, then
# roll the model back to the weights of the best epoch.
# NOTE(review): the recorded training progress bar reports only `rmse`; confirm that
# `val_loss` is actually logged, otherwise this callback has nothing to monitor.
early_stopping = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=5,
    monitor='val_loss',
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=loss_fn,
    metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse')],
)


In [6]:
# Train for up to 30 epochs on the (features, labels) view of the training data,
# validating on the validation split and stopping early via `early_stopping`.
# NOTE(review): in the recorded run the printed tensors and the rmse metric become NaN
# from the second step onward, and the first step already shows -nan wherever the
# printed prediction is negative. Presumably the loss applies a fractional power or log
# to raw, unconstrained predictions - verify in UnifiedLTVLoss and the model head.
model.fit(
    create_tf_dataset(train_dataset),
    epochs=30,
    validation_data = create_tf_dataset(valid_dataset),
    callbacks= [early_stopping]
) 

Epoch 1/30
[[0.233561188]
 [-0.511621118]
 [0.585699141]
 ...
 [-0.296288133]
 [0.16384989]
 [0.341981322]]
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]
[[0.966563344]
 [-nan]
 [1.53061962]
 ...
 [-nan]
 [0.809567511]
 [1.16958344]]
      1/Unknown - 6s 6s/step - rmse: 237.0287[[nan]
 [nan]
 [nan]
 ...
 [nan]
 [nan]
 [nan]]
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]
[[nan]
 [nan]
 [nan]
 ...
 [nan]
 [nan]
 [nan]]
      2/Unknown - 7s 195ms/step - rmse: nan  [[nan]
 [nan]
 [nan]
 ...
 [nan]
 [nan]
 [nan]]
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]
[[nan]
 [nan]
 [nan]
 ...
 [nan]
 [nan]
 [nan]]
      3/Unknown - 7s 228ms/step - rmse: nan[[nan]
 [nan]
 [nan]
 ...
 [nan]
 [nan]
 [nan]]
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [269]]
[[nan]
 [nan]
 [nan]
 ...
 [nan]
 [nan]
 [nan]]
      4/Unknown - 7s 236ms/step - rmse: nan[[nan]
 [nan]
 [nan]
 ...
 [nan]
 [nan]
 [nan]]
[[1026]
 [0]
 [0]
 ...
 [318]
 [0]
 [0]]
[[nan]
 [nan]
 [nan]
 ...
 [nan]
 [nan]
 [nan]]
      5/Unknown - 7s 242ms/step - rmse: nan[[nan]
 [nan]
 [nan]
 


KeyboardInterrupt



In [None]:
# Run the model's project-defined `evaluate_exp` on the validation split.
# NOTE(review): what it reports is defined in the model class, not visible here.
model.evaluate_exp(create_tf_dataset(valid_dataset))

In [None]:
# Run the model's project-defined `evaluate_rank` on the validation split and keep
# its return value for inspection.
hour_model_pred = model.evaluate_rank(create_tf_dataset(valid_dataset))

In [None]:
# Display the value returned by `evaluate_rank` for the validation split.
hour_model_pred

In [None]:
# Repeat `evaluate_rank` on the test split and display the result.
# Note: this rebinds `hour_model_pred`, replacing the validation-split result.
hour_model_pred = model.evaluate_rank(create_tf_dataset(test_dataset))
hour_model_pred