In [1]:
import sys
import os
import json
import tensorflow as tf
import tqdm
from models.model import *
from ltv_utils import *
from losses.custom_loss import *
pd.set_option('display.float_format', '{:.4f}'.format)  # 保留10位小数，可调整
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)



def parse_function(serialized_example):
    feature_description = {
        'deviceid': tf.io.FixedLenFeature([], tf.string),
        'install_date': tf.io.FixedLenFeature([], tf.string),
        'dim_os_name1': tf.io.FixedLenFeature([], tf.string),
        'creative_classify1': tf.io.FixedLenFeature([], tf.string),
        'total_pay_amount1':  tf.io.FixedLenFeature([], tf.float32),
         'channel1': tf.io.FixedLenFeature([], tf.string),
        'b2_sale_amt_bias':  tf.io.FixedLenFeature([], tf.int64),
         'b2_sale_amt_7d': tf.io.FixedLenFeature([], tf.int64),
         'install_time': tf.io.FixedLenFeature([], tf.string),
        'install_order_diff':  tf.io.FixedLenFeature([], tf.int64),
        'all_install_order_7d_diff':  tf.io.FixedLenFeature([], tf.int64),
        'is_a1x_a33':  tf.io.FixedLenFeature([], tf.int64),
        'platform_label':  tf.io.FixedLenFeature([], tf.string),
        'user_dense_price_features': tf.io.FixedLenFeature([len(group_2_features['user_dense_price_features'])], tf.float32),
        'user_dense_duration_features': tf.io.FixedLenFeature([len(group_2_features['user_dense_duration_features'])], tf.float32),
        'user_dense_features': tf.io.FixedLenFeature([len(group_2_features['user_dense_features'])], tf.float32),
        'user_sparse_features': tf.io.FixedLenFeature([len(group_2_features['user_sparse_features'])], tf.float32)
    }
    example = tf.io.parse_single_example(serialized_example, feature_description)
    return example


# load tf records
group_2_features = read_feature_json_config('features/feature_list.json')

train_file_name = 'data/ltv_0522_0603_multi_window_model_train/part-r-00000'
valid_file_name = 'data/ltv_0522_0603_multi_window_model_valid/part-r-00000'

train_dataset, valid_dataset, _ = get_trian_valid_test_dateset(parse_function, 10000, train_file_name, valid_file_name)



user_dense_price_features = group_2_features['user_dense_price_features']
user_dense_duration_features = group_2_features['user_dense_duration_features']
user_dense_features = group_2_features['user_dense_features']
user_sparse_features = group_2_features['user_sparse_features']


In [2]:
MODEL_HOUR = 0 

In [16]:
def judge_is_1h(temp_var):
    
    hour = tf.cast(tf.gather(temp_var,  indices=0, axis = 0) - 1, tf.int64)  
    if hour == 0:
        return True 
    return False 

In [3]:

emb_features = [
'creative_classify','dim_device_manufacture', 'car_add_type_most','show_order_is_2arrival_latest', 'selecttirecount_most', 'show_order_is_2arrival_most','selecttirecount_latest',
 'new_sitename','advsite','car_add_type_latest','platform_level', 'tire_list_click_avg_index','tire_list_click_most_pid_level','tire_order_page_most_pid_level',
]

sample = next(iter(train_dataset))
input_shape = {k: v.shape for k, v in sample.items()}


In [4]:
class NO_Process_Layer(layers.Layer):
    def __init__(self, dense_cnt_features, dense_price_features, dense_duration_features, user_sparse_features):
        super().__init__()

        self.features = [dense_cnt_features
                        ,dense_price_features
                        , dense_duration_features
                        ,user_sparse_features
                         ]
        self.concat_layer = layers.Concatenate()
    def call(self, inputs):
        processed_dense_features = []
        for field, input_tensor in inputs.items():
            if field in self.features:
                input_tensor = tf.cast(input_tensor, tf.float32)  # 显式转 float
                processed_dense_features.append(input_tensor)
        return self.concat_layer(processed_dense_features)

In [5]:
# #多头model
# model = MULTI_HEAD_LTV_MODEL(5, [160], [100], 'user_dense_features', 'user_dense_price_features', 'user_dense_duration_features',
#                             'user_sparse_features',user_sparse_features, emb_features)

concat_layer = NO_Process_Layer('user_dense_features', 'user_dense_price_features', 'user_dense_duration_features',
                            'user_sparse_features')


In [None]:
import tensorflow as tf
import json

# 第一步：读取 feature 配置
def read_feature_json_config(json_path):
    with open(json_path, 'r') as f:
        return json.load(f)

group_2_features = read_feature_json_config('features/feature_list.json')

# 第二步：定义 parse function
def parse_function(serialized_example):
    feature_description = {
        'deviceid': tf.io.FixedLenFeature([], tf.string),
        'install_date': tf.io.FixedLenFeature([], tf.string),
        'dim_os_name1': tf.io.FixedLenFeature([], tf.string),
        'creative_classify1': tf.io.FixedLenFeature([], tf.string),
        'total_pay_amount1': tf.io.FixedLenFeature([], tf.float32),
        'channel1': tf.io.FixedLenFeature([], tf.string),
        'b2_sale_amt_bias': tf.io.FixedLenFeature([], tf.int64),
        'b2_sale_amt_7d': tf.io.FixedLenFeature([], tf.int64),
        'install_time': tf.io.FixedLenFeature([], tf.string),
        'install_order_diff': tf.io.FixedLenFeature([], tf.int64),
        'all_install_order_7d_diff': tf.io.FixedLenFeature([], tf.int64),
        'is_a1x_a33': tf.io.FixedLenFeature([], tf.int64),
        'platform_label': tf.io.FixedLenFeature([], tf.string),
        'user_dense_price_features': tf.io.FixedLenFeature(
            [len(group_2_features['user_dense_price_features'])], tf.float32),
        'user_dense_duration_features': tf.io.FixedLenFeature(
            [len(group_2_features['user_dense_duration_features'])], tf.float32),
        'user_dense_features': tf.io.FixedLenFeature(
            [len(group_2_features['user_dense_features'])], tf.float32),
        'user_sparse_features': tf.io.FixedLenFeature(
            [len(group_2_features['user_sparse_features'])], tf.float32),
    }
    example = tf.io.parse_single_example(serialized_example, feature_description)
    return example


# 第三步：拼接 layer
def NO_Process_Layer(example):
    parts = [
        example['user_dense_features'],
        example['user_dense_price_features'],
        example['user_dense_duration_features'],
        example['user_sparse_features']
    ]
    return tf.concat(parts, axis=-1)


# 第四步：写入 TFRecords 带上新 key
def serialize_example(example, dense_vector):
    feature = {}

    for key, value in example.items():
        if isinstance(value, tf.Tensor):
            value = value.numpy()

        if isinstance(value, bytes):
            feature[key] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
        elif isinstance(value, str):
            feature[key] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode()]))
        elif isinstance(value, float):
            feature[key] = tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
        elif isinstance(value, int):
            feature[key] = tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
        elif isinstance(value, (list, tuple)) or hasattr(value, 'shape'):
            if value.dtype == tf.float32 or value.dtype == float:
                feature[key] = tf.train.Feature(float_list=tf.train.FloatList(value=value.flatten()))
            elif value.dtype == tf.int64 or value.dtype == int:
                feature[key] = tf.train.Feature(int64_list=tf.train.Int64List(value=value.flatten()))

    # 添加新的 dense_vector 字段
    dense_vector = dense_vector.numpy().flatten()
    feature['dense_vector'] = tf.train.Feature(float_list=tf.train.FloatList(value=dense_vector))

    # 构造 Example
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()


# 第五步：读取数据集 & 保存新 tfrecord
def process_and_save_tfrecord(input_tfrecord, output_tfrecord, num_samples=None):
    dataset = tf.data.TFRecordDataset(input_tfrecord)
    dataset = dataset.map(parse_function)

    if num_samples:
        dataset = dataset.take(num_samples)

    with tf.io.TFRecordWriter(output_tfrecord) as writer:
        for i, example in enumerate(dataset):
            dense_vector = concat_layer(example)
            if judge_is_1h(example['user_sparse_features']):
                serialized = serialize_example(example, dense_vector)
                writer.write(serialized)
                if i % 1000 == 0:
                    print(f"Processed {i} records.")
            else:
                continue 

# 调用处理函数
process_and_save_tfrecord(
    'data/ltv_0522_0603_multi_window_model_train/part-r-00000',
    'data/processed/part-r-00000_with_dense_vector.tfrecord',
    num_samples=None  # 可设置为1000用于测试
)


Processed 0 records.
Processed 2000 records.
Processed 6000 records.
Processed 8000 records.
Processed 21000 records.
Processed 22000 records.
Processed 25000 records.
Processed 31000 records.
Processed 37000 records.
Processed 39000 records.
Processed 44000 records.
Processed 45000 records.
Processed 46000 records.
Processed 70000 records.
Processed 78000 records.
Processed 81000 records.
Processed 85000 records.
Processed 92000 records.
Processed 95000 records.
Processed 97000 records.
Processed 113000 records.
Processed 116000 records.
Processed 117000 records.
Processed 122000 records.
Processed 125000 records.
Processed 127000 records.
Processed 133000 records.
Processed 140000 records.
Processed 144000 records.
Processed 145000 records.
Processed 151000 records.
Processed 154000 records.
Processed 156000 records.
Processed 157000 records.
Processed 158000 records.
Processed 179000 records.
Processed 185000 records.
Processed 188000 records.
