In [1]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow import keras
import pandas as pd 
import numpy as np 
from tensorflow.keras.callbacks import EarlyStopping
# Print NumPy floats with 6 digits of precision and without scientific notation.
np.set_printoptions(precision=6, suppress=True)

In [2]:
import tensorflow as tf

# List the GPUs TensorFlow can see and report how this TensorFlow build was compiled.
print("可用 GPU 设备：", tf.config.list_physical_devices('GPU'))
# NOTE(review): is_built_with_cuda() reports whether the build has CUDA support,
# not whether a GPU is actually being used at runtime.
print("是否使用 GPU：", tf.test.is_built_with_cuda())  # checks that this TensorFlow build supports CUDA
# NOTE(review): is_built_with_gpu_support() reports GPU (CUDA/ROCm) build support;
# it is not a cuDNN-specific check, despite the printed label.
print("是否使用 cuDNN：", tf.test.is_built_with_gpu_support())  # checks that this build has GPU support

# Hide all CUDA devices from code that reads this environment variable.
# NOTE(review): this runs after TensorFlow has already enumerated the GPUs above, so it
# may have no effect in this process. If CPU-only execution is intended, set the variable
# before TensorFlow is imported — confirm the intent.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"


可用 GPU 设备： [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
是否使用 GPU： True
是否使用 cuDNN： True


In [3]:
import sys 
sys.path.append("../data/liuliang_data")
from features_config import *

In [46]:
class Dense_Process_Layer(layers.Layer):
    """Log-bucketise the dense features of an input dict and concatenate them.

    For every entry of ``inputs`` whose key is listed in ``dense_features`` the
    value is cast to float32 and replaced by an integer-valued bucket:

    * price features:  ``floor(log1p(x + 9) / ln(10))``  (i.e. ``floor(log10(x + 10))``)
    * other features:  ``floor(log1p(x + 1) / ln(2))``   (i.e. ``floor(log2(x + 2))``)

    Each bucketed feature gets a trailing axis of size 1 and all of them are
    concatenated, in the iteration order of ``inputs``.
    """

    def __init__(self, sparse_features, dense_features, price_features):
        super().__init__()
        self.sparse_features = sparse_features
        self.dense_features = dense_features
        self.price_features = price_features
        self.concat_layer = layers.Concatenate()

    def call(self, inputs):
        # Natural logs of the two bases, used to convert log1p() results to base 2 / base 10.
        ln_two = tf.math.log(tf.constant(2.0, dtype=tf.float32))
        ln_ten = tf.math.log(tf.constant(10.0, dtype=tf.float32))

        bucketed_columns = []
        for feature_name, raw_values in inputs.items():
            if feature_name not in self.dense_features:
                continue  # sparse features, labels and the mask are not handled here

            values = tf.cast(raw_values, tf.float32)
            if feature_name in self.price_features:
                log_scaled = tf.math.log1p(values + 9) / ln_ten
            else:
                log_scaled = tf.math.log1p(values + 1) / ln_two

            bucketed_columns.append(tf.expand_dims(tf.floor(log_scaled), 1))

        return self.concat_layer(bucketed_columns)


In [5]:
class DNN(layers.Layer):
    """A stack of fully connected ReLU layers.

    Args:
        units: iterable with the width of each Dense layer, applied in order.
    """

    def __init__(self, units = [256, 64]):
        super().__init__()
        hidden_layers = []
        for width in units:
            hidden_layers.append(layers.Dense(width, activation='relu'))
        self.dnn = keras.Sequential(hidden_layers)

    def call(self, x):
        hidden = self.dnn(x)
        return hidden
        
import tensorflow as tf


class MultiLoss(tf.keras.losses.Loss):
    """Mean of per-column, mask-weighted binary cross-entropy losses.

    The loss is called as ``loss(y_true, (y_pred, mask))``: the second argument
    is a pair holding the predicted probabilities and a mask. The code indexes
    ``y_true``, ``y_pred`` and ``mask`` with ``[:, j]`` for the same ``j``.
    Column ``j`` of the mask is passed as ``sample_weight`` for column ``j`` of
    the labels.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Predictions are probabilities (already passed through a sigmoid), not logits.
        self.bce_loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)

    def call(self, y_true, y_pred_mask):
        # Unpack the (predictions, mask) pair supplied in place of y_pred.
        predictions, mask = y_pred_mask
        return self.compute_loss(y_true, predictions, mask)

    def compute_loss(self, y_true, y_pred, mask):
        """Average the masked BCE over every label column of ``y_true``."""
        n_columns = tf.shape(y_true)[1]
        total = 0.0
        n_terms = tf.constant(0.0, dtype=tf.float32)  # kept as a float32 tensor

        for col in range(n_columns):
            col_pred = tf.expand_dims(y_pred[:, col], axis=1)
            col_true = tf.expand_dims(y_true[:, col], axis=1)
            col_weight = tf.expand_dims(mask[:, col], axis=1)

            total += self.bce_loss(col_true, col_pred, sample_weight=col_weight)
            n_terms += 1

        if n_terms > 0:
            mean_loss = total / n_terms
        else:
            mean_loss = tf.constant(0.0, dtype=tf.float32)
        return mean_loss

        
from sklearn.metrics import roc_auc_score

class EveryDayModel(Model):
    """Multi-task model emitting seven probabilities per sample (heads day1..day7).

    Architecture (as implemented below):

    * dense features go through ``Dense_Process_Layer``;
    * every sparse feature is looked up in its own 8-dimensional embedding
      table with 10000 rows;
    * the concatenation of both feeds a shared ``DNN`` tower;
    * seven ``Dense(1)`` heads sit on top. The day-7 head is ``sigmoid(day7)``;
      each of the day-1..day-6 outputs is ``sigmoid(day_k) * day7_probability``,
      so those six outputs can never exceed the day-7 output.

    Training/evaluation are customised: batches are dicts that carry the
    features, the label columns named in ``label_cols`` and a ``'mask'`` entry,
    and the compiled loss is called as ``loss(labels, (preds, mask))``.

    Note that ``evaluate`` and ``predict`` override the Keras methods with
    project-specific semantics (see their docstrings).

    Args:
        sparse_features: names of the categorical features to embed.
        dense_features: names of the numeric features.
        price_features: subset of dense features treated as prices by the
            dense pre-processing layer.
        label_cols: ordered label column names; column ``k`` is compared with
            output ``k``. The network always produces 7 outputs.
        units: widths of the shared DNN tower.
    """

    def __init__(self, sparse_features, dense_features, price_features, label_cols, units = [256, 128, 64]):
        super().__init__()
        self.sparse_features = sparse_features
        self.label_cols = label_cols

        # One embedding table per sparse feature.
        num_bins = 10000
        self.embedding_dict = {}
        for name in sparse_features:
            self.embedding_dict[name] = layers.Embedding(num_bins, 8, name = name)
        self.dense_process_layer = Dense_Process_Layer(sparse_features, dense_features, price_features)

        # Fix: `call` concatenates through `self.concat_embedding`, which was never
        # created, so the forward pass raised AttributeError on a fresh kernel.
        self.concat_embedding = layers.Concatenate()
        # Created once here instead of instantiating a new layer on every call.
        self.concat_output = layers.Concatenate()

        # Shared tower followed by one head per day (multi-task).
        self.dnn = DNN(units)

        self.day1 = keras.layers.Dense(1)
        self.day2 = keras.layers.Dense(1)
        self.day3 = keras.layers.Dense(1)
        self.day4 = keras.layers.Dense(1)
        self.day5 = keras.layers.Dense(1)
        self.day6 = keras.layers.Dense(1)
        self.day7 = keras.layers.Dense(1)

    def call(self, inputs):
        """Return a ``(batch, 7)`` tensor of probabilities for a dict of feature tensors."""
        dense_input = self.dense_process_layer(inputs)
        embeddings = [dense_input]

        # Keys that are not sparse features (dense features, labels, mask) are skipped.
        for name, feature in inputs.items():
            if name in self.embedding_dict:
                embeddings.append(self.embedding_dict[name](feature))
        embedding_input = self.concat_embedding(embeddings)

        base_output = self.dnn(embedding_input)

        # These are probabilities (post-sigmoid), not logits.
        prob_7 = tf.sigmoid(self.day7(base_output))
        early_heads = (self.day1, self.day2, self.day3, self.day4, self.day5, self.day6)
        early_probs = [tf.sigmoid(head(base_output)) * prob_7 for head in early_heads]

        return self.concat_output(early_probs + [prob_7])

    def _stack_labels(self, inputs):
        """Build the ``(batch, n_labels)`` label tensor from the batch dict.

        Uses ``self.label_cols`` rather than a notebook-level global, so the model
        works regardless of the order in which notebook cells were executed.
        """
        columns = [tf.expand_dims(inputs[lc_name], 1) for lc_name in self.label_cols]
        return layers.concatenate(columns)

    def _batch_loss(self, inputs):
        """Forward pass plus masked loss (scalar) for one batch dict."""
        labels = self._stack_labels(inputs)
        preds = self(inputs)
        return tf.reduce_mean(self.loss(labels, (preds, inputs['mask'])))

    def train_step(self, inputs):
        """One optimisation step on a batch dict; returns ``{'loss': loss}``."""
        with tf.GradientTape() as tape:
            loss = self._batch_loss(inputs)
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        return {'loss': loss}

    def evaluate(self, x, y=None, batch_size=None, steps=None, **kwargs):
        """Average the masked loss over all batches of the dataset ``x``.

        Only ``x`` is used; the remaining parameters are accepted (and ignored)
        so that ``fit(validation_data=...)`` can call this override.

        Returns:
            ``{'loss': mean_of_batch_losses}``.

        Raises:
            ValueError: if ``x`` yields no batches (previously a ZeroDivisionError).
        """
        total_loss = 0.0
        num_batches = 0

        for batch in x:
            total_loss += self._batch_loss(batch)
            num_batches += 1

        if num_batches == 0:
            raise ValueError("evaluate() received an empty dataset")

        return {'loss': total_loss / num_batches}

    def predict(self, inputs, pred_index = -1):
        """Score one output head against its label over the whole dataset.

        Previously this returned after the first batch, silently ignoring the
        rest of the dataset; predictions and labels of all batches are now
        gathered before the metrics are computed (identical result when the
        dataset has a single batch).

        Args:
            inputs: iterable of batch dicts containing features and labels.
            pred_index: index of the output head / label column to score.

        Returns:
            dict with
            ``bias``  – ``(sum(preds) - sum(labels)) / sum(labels)`` as a NumPy scalar
            (inf/nan when the labels sum to zero),
            ``AUC``   – ROC AUC from scikit-learn,
            ``preds`` – the first 10 predictions (tensor),
            ``true_orders`` / ``pred_orders`` – the two sums (tensors).

        Raises:
            ValueError: if ``inputs`` yields no batches.
        """
        label_name = self.label_cols[pred_index]
        batch_preds = []
        batch_labels = []

        for batch in inputs:
            batch_labels.append(batch[label_name])
            batch_preds.append(self(batch)[:, pred_index])

        if not batch_preds:
            raise ValueError("predict() received an empty dataset")

        preds = tf.concat(batch_preds, axis=0)
        labels = tf.concat(batch_labels, axis=0)

        # Expected vs. actual number of positives.
        pred_orders = tf.reduce_sum(preds)
        # Labels may be integers; cast so the arithmetic below is float32 throughout.
        true_orders = tf.cast(tf.reduce_sum(labels), dtype=tf.float32)
        bias = (pred_orders - true_orders) / true_orders

        # scikit-learn needs NumPy arrays.
        auc = roc_auc_score(y_true=labels.numpy(), y_score=preds.numpy())

        return {
            'bias': bias.numpy(),
            'AUC': auc,
            'preds': preds[:10],
            'true_orders': true_orders,
            'pred_orders': pred_orders
        }

In [6]:
import datetime
def get_delta_date_str(date_str, delta):
    """Shift a ``YYYY-MM-DD`` date string by ``delta`` days (may be negative).

    Returns the shifted date, again formatted as ``YYYY-MM-DD``.
    """
    date_format = "%Y-%m-%d"
    base_day = datetime.datetime.strptime(date_str, date_format)
    shifted_day = base_day + datetime.timedelta(days=delta)
    return shifted_day.strftime(date_format)

def str2date(date_str):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.datetime`` at midnight."""
    parsed = datetime.datetime.strptime(date_str, "%Y-%m-%d")
    return parsed

def date2str(date):
    """Format a date/datetime-like object (anything with ``strftime``) as ``YYYY-MM-DD``."""
    date_format = '%Y-%m-%d'
    return date.strftime(date_format)

def get_train_test_data(data, startdate, enddate, testdate):
    """Split ``data`` by its ``activate_date`` column.

    Args:
        data: DataFrame with an ``activate_date`` column comparable to the
            given date values.
        startdate, enddate: inclusive bounds of the training window.
        testdate: the single date whose rows form the test set.

    Returns:
        ``(train_data, test_data)`` — independent copies with a fresh 0..n-1 index.
    """
    activate_date = data['activate_date']
    in_train_window = (activate_date >= startdate) & (activate_date <= enddate)
    on_test_day = activate_date == testdate

    train_data = data[in_train_window].reset_index(drop=True).copy()
    test_data = data[on_test_day].reset_index(drop=True).copy()
    return train_data, test_data


from sklearn.model_selection import train_test_split


def process_mask_col(train_data, mask_dates):
    """Attach a 7-element label mask to every row and split off a validation set.

    Rows whose ``dt`` equals ``mask_dates[i]`` get a mask with ``i + 1`` leading
    ones followed by zeros; all other rows get an all-ones mask. Rows dated in
    ``mask_dates`` always stay in the training set; the remaining rows are split
    85% / 15% (``random_state=42``) into training and validation.

    The masks are now built once from a date -> mask lookup before the split.
    The previous version assigned into a filtered slice via a row-wise ``apply``,
    which triggers pandas' SettingWithCopyWarning and depended on how an empty
    selection is assigned.

    Args:
        train_data: DataFrame with a ``dt`` column. A ``mask`` column is added
            to it in place.
        mask_dates: ordered date values matched against ``dt``.

    Returns:
        ``(train_data, valid_data)``; the training frame has a fresh index.
    """
    n_labels = 7
    full_mask = np.ones((n_labels,))
    partial_masks = {
        msk_dt: np.concatenate([np.ones((i + 1,)), np.zeros((n_labels - i - 1,))])
        for i, msk_dt in enumerate(mask_dates)
    }
    train_data['mask'] = [partial_masks.get(dt, full_mask) for dt in train_data['dt']]

    is_recent = train_data['dt'].isin(mask_dates)
    older_data, recent_data = train_data[~is_recent], train_data[is_recent]

    # Only fully-labelled (older) rows take part in the train/validation split.
    older_train, valid_data = train_test_split(older_data, test_size=0.15, random_state=42)

    train_data = pd.concat([older_train, recent_data], ignore_index=True)
    print("数据集信息..." )
    return train_data , valid_data

def create_tf_dataset(data, features, batch):
    """Turn a DataFrame into a batched ``tf.data.Dataset`` of column dicts.

    The columns taken are ``features`` plus the module-level ``label_col`` list.
    The ``'mask'`` column (whose cells are arrays) is stacked into a single 2-D
    array; every other column is converted from its ``.values``.

    Args:
        data: source DataFrame.
        features: list of column names to include besides the labels.
        batch: batch size.
    """
    tensors = {}
    for name, column in data[features + label_col].items():
        if name == 'mask':
            tensors[name] = tf.constant(np.array(column.tolist()))
        else:
            tensors[name] = tf.constant(column.values)

    dataset = tf.data.Dataset.from_tensor_slices(tensors)
    return dataset.batch(batch)

In [10]:
# Basic training configuration.
res_csv = None
folder_name = 'deep_res'
# Number of consecutive prediction days train_model() iterates over, starting at Pred_Date.
Pred_Days = 10
Pred_Date = '2023-05-01'
# Label columns, in the order they are stacked for the loss.
label_col = ['label_1','label_2','label_3','label_4','label_5','label_6','label']
# Load the data and handle missing values.
import platform 

data = pd.read_csv("../data/liuliang_data/完整toy_liuliang_data.csv", index_col= 0)
# Keep only rows whose `dt` is after 2023-04-13.
data = data[data.dt > '2023-04-13']
# NOTE(review): `features` is not defined in this notebook; presumably it comes from
# the `from features_config import *` cell — confirm.
data.loc[:, features] = data.loc[:, features].fillna(0)

# The training window start is computed elsewhere as `today - train_days - 7` days.
train_days = 20

# Dense features are all features that are not categorical.
dense_features = [feature for feature in features if feature not in category_features]
# NOTE(review): `category_features` and `price_fatures` (sic) are presumably also
# provided by features_config — confirm the spelling against that module.
sparse_features, dense_features, price_feature = category_features, dense_features, price_fatures


from tensorflow.keras.callbacks import EarlyStopping


#### debug：只保留最后一个输出头

In [53]:
class EpochLogger(tf.keras.callbacks.Callback):
    """Print the training and validation loss once every 10 epochs.

    Missing metrics are printed as ``N/A``. The previous version applied the
    ``.4f`` format to the ``'N/A'`` fallback string, which raises ValueError
    whenever ``val_loss`` is absent, and it subscripted ``logs`` even when it
    was ``None``.
    """

    @staticmethod
    def _fmt(value):
        """Format a metric with 4 decimals, or ``N/A`` when it is missing."""
        return 'N/A' if value is None else f"{value:.4f}"

    def on_epoch_end(self, epoch, logs=None):
        # `epoch` is zero-based; report on epochs 10, 20, 30, ...
        if (epoch + 1) % 10 != 0:
            return
        logs = logs or {}
        print(f"Epoch {epoch + 1}: Loss = {self._fmt(logs.get('loss'))}, Val Loss = {self._fmt(logs.get('val_loss'))}")


def train_model(data):
    """Train and score a fresh model for each of ``Pred_Days`` consecutive test dates.

    For every offset ``delta`` in ``range(Pred_Days)`` the test date is
    ``Pred_Date + delta`` days. The training window runs from
    ``train_days + 7`` days before the test date up to the day before it, and
    the six most recent training days are passed to ``process_mask_col`` as
    ``mask_dates``. A new ``EveryDayModel`` is fitted (50 epochs, early stopping
    on ``val_loss``) and its metrics for the last label are printed.

    Relies on notebook-level configuration (``Pred_Days``, ``Pred_Date``,
    ``train_days``, ``features``, ``label_col``, the feature lists) and on
    ``format_dict`` being defined before this function is called.

    Args:
        data: the full DataFrame to slice the training/test windows from.

    Returns:
        The model trained for the last test date, or ``None`` when
        ``Pred_Days`` is 0. (Previously the function returned nothing although
        the caller stores its result in ``model``.)
    """
    model = None

    for delta in range(0, Pred_Days):
        today = get_delta_date_str(Pred_Date, delta)
        startdate = get_delta_date_str(today, -train_days - 7)
        enddate = get_delta_date_str(today, -1)
        testdate = today

        print(f"训练集开始：{startdate}, 训练集结束：{enddate}, 测试集：{testdate}")

        # The six days ending yesterday, most recent first.
        mask_dates = [date2str(j) for j in pd.date_range(str2date(enddate), periods=6, freq='-1D')]

        # Slice the training / test frames by date.
        train_data, test_data = get_train_test_data(data, startdate, enddate, testdate)
        # Add the mask column and split off the validation set.
        train_data, valid_data = process_mask_col(train_data, mask_dates)
        # Shuffle rows once before batching (unseeded, so runs are not bit-reproducible).
        train_data = train_data.sample(frac = 1.0)

        train_dataset = create_tf_dataset(train_data, features + ['mask'], 2048)
        valid_dataset = create_tf_dataset(valid_data, features + ['mask'], 2048)
        test_dataset = create_tf_dataset(test_data, features, 200000)

        model = EveryDayModel(sparse_features, dense_features, price_feature, label_col, [256, 128, 64])
        model.compile(loss = MultiLoss(), optimizer = tf.keras.optimizers.Adam(learning_rate=0.0002))

        # A separate callback per model so no early-stopping state is shared between days.
        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=30,
            restore_best_weights=True)

        model.fit(train_dataset, validation_data = valid_dataset, epochs=50, verbose=0,
                  callbacks=[EpochLogger(), early_stopping])
        format_dict(model.predict(test_dataset, -1))

    return model



# Run the rolling per-day train/score loop defined by train_model on the loaded data.
model = train_model(data)

训练集开始：2023-04-04, 训练集结束：2023-04-30, 测试集：2023-05-01
数据集信息...
Epoch 10: Loss = 0.0823, Val Loss = 0.1326
Epoch 20: Loss = 0.0753, Val Loss = 0.1320
Epoch 30: Loss = 0.0702, Val Loss = 0.1318
Epoch 40: Loss = 0.0661, Val Loss = 0.1320
Epoch 50: Loss = 0.0626, Val Loss = 0.1319
bias: 0.25350651144981384
AUC: 0.7362075512653528
preds: [0.02053908072412014, 0.015673767775297165, 0.014595883898437023, 0.015665803104639053, 0.03495329990983009, 0.03912032023072243, 0.029007256031036377, 0.030131971463561058, 0.06108703836798668, 0.03760207071900368]
true_orders: 641.0
pred_orders: 803.4976806640625
训练集开始：2023-04-05, 训练集结束：2023-05-01, 测试集：2023-05-02
数据集信息...
Epoch 10: Loss = 0.1452, Val Loss = 0.1327
Epoch 20: Loss = 0.1422, Val Loss = 0.1318
Epoch 30: Loss = 0.1398, Val Loss = 0.1316
Epoch 40: Loss = 0.1375, Val Loss = 0.1317
Epoch 50: Loss = 0.1353, Val Loss = 0.1320
bias: 0.2856878638267517
AUC: 0.7344888737059306
preds: [0.010641887784004211, 0.03650704026222229, 0.0281660296022892, 0.29424

In [11]:
import tensorflow as tf

def format_dict(data: dict) -> None:
    """Print every entry of ``data`` on its own line as ``key: value``.

    ``tf.Tensor`` values are converted with ``.numpy().tolist()`` first so they
    print as plain Python numbers/lists; all other values are printed as-is.

    The function only prints; it returns ``None``. The previous ``-> str``
    annotation was incorrect.
    """
    lines = []
    for key, value in data.items():
        if isinstance(value, tf.Tensor):
            value = value.numpy().tolist()
        lines.append(f"{key}: {value}")
    print("\n".join(lines))

    


### 调整 预处理   

In [22]:
# pk 加上正则化的模型  且 不加log的深度模型 
class Dense_Process_Layer(layers.Layer):
    """Dense-feature pre-processing variant without the log transform.

    Every entry of ``inputs`` whose key is in ``dense_features`` is cast to
    float32, given a trailing axis of size 1 and concatenated with the others
    in the iteration order of ``inputs``. Price and non-price features are
    treated the same here. Defining this class replaces any earlier
    ``Dense_Process_Layer`` in the notebook namespace.
    """

    def __init__(self, sparse_features, dense_features, price_features):
        super().__init__()
        self.sparse_features = sparse_features
        self.dense_features = dense_features
        self.price_features = price_features
        self.concat_layer = layers.Concatenate()

    def call(self, inputs):
        dense_columns = [
            tf.expand_dims(tf.cast(raw_values, tf.float32), 1)
            for feature_name, raw_values in inputs.items()
            if feature_name in self.dense_features
        ]
        return self.concat_layer(dense_columns)

def process_mask_col(train_data, mask_dates):
    """Variant of ``process_mask_col`` that drops the most recent days entirely.

    Every row gets an all-ones 7-element mask. Rows whose ``dt`` is in
    ``mask_dates`` are discarded, and the remaining rows are split 85% / 15%
    (``random_state=42``) into training and validation. Defining this function
    replaces any earlier ``process_mask_col`` in the notebook namespace.

    The previous body still computed a per-date partial mask in a loop and
    split off the recent rows, but used neither (the recent frame was set to
    ``None`` before concatenation). That dead code is removed; the result is
    unchanged.

    Args:
        train_data: DataFrame with a ``dt`` column. A ``mask`` column is added
            to it in place.
        mask_dates: date values matched against ``dt``; matching rows are dropped.

    Returns:
        ``(train_data, valid_data)``; the training frame has a fresh index.
    """
    train_data['mask'] = [np.ones((7,))] * len(train_data)

    older_data = train_data[~train_data.dt.isin(mask_dates)]
    older_train, valid_data = train_test_split(older_data, test_size=0.15, random_state=42)

    train_data = older_train.reset_index(drop=True)
    print("数据集信息..." )
    return train_data , valid_data

In [40]:
# Single-day experiment: standardise the dense features, then train and score one model.
# Initialise the StandardScaler.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()



# NOTE(review): this callback is created but not passed to fit() below
# (the `callbacks=` argument is commented out).
early_stopping = EarlyStopping(
    monitor='val_loss',       
    patience=10,               
    restore_best_weights=True )

# Offset (in days) of the test date from Pred_Date.
delta = 0
delta_sample_days = train_days

today = get_delta_date_str(Pred_Date, delta)
# Training window: from (train_days + 7) days before the test date up to the day before it.
startdate, enddate, testdate = get_delta_date_str(today, -train_days - 7), get_delta_date_str(today, -1), today


print(f"训练集开始：{startdate}, 训练集结束：{enddate}, 测试集：{testdate}")

# The six days ending yesterday (most recent first); passed to process_mask_col below.
mask_dates = [date2str(j) for j in pd.date_range(str2date(get_delta_date_str(today, -1)),periods=6,freq='-1D')]

# Slice the training and test sets by date range.
train_data, test_data = get_train_test_data(data, startdate, enddate, testdate)


# Z-score standardise the dense columns; the test set reuses the mean and standard
# deviation fitted on the training set.
# NOTE(review): the scaler is fitted before process_mask_col splits off the validation
# rows, so validation rows contribute to the fitted statistics.
train_data[dense_features] = scaler.fit_transform(train_data[dense_features])
test_data[dense_features] = scaler.transform(test_data[dense_features])

# Add the mask column to the training data and split off the validation set.
train_data, valid_data = process_mask_col(train_data, mask_dates)

train_dataset = create_tf_dataset(train_data, features + ['mask'], 2048)
valid_dataset = create_tf_dataset(valid_data, features + ['mask'], 2048)
# From here on `test_data` is a tf.data.Dataset, no longer a DataFrame.
test_data = create_tf_dataset(test_data, features, 200000)

# Build and compile the model.
model = EveryDayModel(sparse_features, dense_features, price_feature, label_col,[256,128,64] )
model.compile(loss = MultiLoss(), optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001) )

# debug 
# model.fit(valid_dataset)
# print(model.evaluate(valid_dataset))
model.fit(train_dataset, validation_data = valid_dataset, epochs=50
          # ,callbacks=[early_stopping]
         ) 
# Print the metrics for the last label column.
format_dict(model.predict(test_data, -1))

训练集开始：2023-04-04, 训练集结束：2023-04-30, 测试集：2023-05-01
数据集信息...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
bias: 0.02885429933667183
AUC: 0.75652872389508
preds: [0.012203136458992958, 0.011783233843743801, 0.011699656024575233, 0.013902286998927593, 0.017501840367913246, 0.021979747340083122, 0.014634850434958935, 0.022715294733643532, 0.06065065786242485, 0.023245107382535934]
true_orders: 641.0
pred_orders: 659.49560546875


## 结论：加入近期样本后，AUC 能提升近 1 个点，但 bias 也会扩大，主要原因是近期样本的下单率与预测日的下单率差距（gap）较大。

In [34]:
# Continue training the existing `model` for 40 more epochs, then print the
# metrics for the last label column.
model.fit(train_dataset, validation_data = valid_dataset, epochs=40) 
format_dict(model.predict(test_data, -1))

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
bias: 0.012142663821578026
AUC: 0.7474684321316727
preds: [0.01302121952176094, 0.012627107091248035, 0.00888181384652853, 0.01409940142184496, 0.017727429047226906, 0.04538991302251816, 0.01447009202092886, 0.020062247291207314, 0.0710325837135315, 0.021725211292505264]
true_orders: 641.0
pred_orders: 648.783447265625


In [39]:
# Train the existing `model` for one more epoch, then print the metrics for the
# last label column.
model.fit(train_dataset, validation_data = valid_dataset, epochs=1) 
format_dict(model.predict(test_data, -1))

bias: 0.016261251643300056
AUC: 0.7453388198980566
preds: [0.01309780403971672, 0.01244854461401701, 0.008600215427577496, 0.013786629773676395, 0.017676163464784622, 0.047614872455596924, 0.014451979659497738, 0.019909843802452087, 0.07463574409484863, 0.021418534219264984]
true_orders: 641.0
pred_orders: 651.4234619140625
