In [1]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow import keras
import pandas as pd 
import numpy as np 
from tensorflow.keras.callbacks import EarlyStopping
np.set_printoptions(precision=4, suppress=True)

In [2]:
import sys 
sys.path.append("../data/liuliang_data")
from features_config import *

In [27]:
class Dense_Process_Layer(layers.Layer):
    """Log-bucketise the dense (numeric) features and concatenate them.

    Every entry of the input dict whose key is listed in ``dense_features``
    is cast to float32 and replaced by ``floor(log1p(x + c) / ln(b))``:
    ``c = 9, b = 10`` for names in ``price_features`` and ``c = 1, b = 2``
    for every other dense feature.  Because ``log1p`` itself adds one, the
    effective transforms are ``floor(log10(x + 10))`` and
    ``floor(log2(x + 2))``.
    NOTE(review): confirm the extra ``+ 1`` coming from ``log1p`` is intended.
    """

    def __init__(self, sparse_features, dense_features, price_features):
        super().__init__()
        self.sparse_features = sparse_features
        self.dense_features = dense_features
        self.price_features = price_features
        self.concat_layer = layers.Concatenate()

    def call(self, inputs):
        """Return the bucketised dense features joined by ``self.concat_layer``.

        ``inputs`` is a mapping of feature name -> tensor; keys that are not
        dense features are skipped.  Each tensor is presumably 1-D
        ``(batch,)`` so that ``expand_dims`` produces ``(batch, 1)`` columns
        -- TODO confirm against the dataset pipeline.
        """
        bucketised = []
        for name, raw_value in inputs.items():
            if name not in self.dense_features:
                continue

            value = tf.cast(raw_value, tf.float32)
            # Price features use a base-10 scale, all others base-2.
            if name in self.price_features:
                shift, base = 9, 10.0
            else:
                shift, base = 1, 2.0

            log_base = tf.math.log(tf.constant(base, dtype=tf.float32))
            bucket = tf.floor(tf.math.log1p(value + shift) / log_base)
            bucketised.append(tf.expand_dims(bucket, 1))

        return self.concat_layer(bucketised)


In [32]:
class DNN(layers.Layer):
    """Feed-forward trunk: one ReLU ``Dense`` layer per entry of ``units``."""

    def __init__(self, units = [256, 64]):
        super().__init__()
        hidden_layers = []
        for width in units:
            hidden_layers.append(layers.Dense(width, activation = 'relu'))
        # The layers are applied in the order given by ``units``.
        self.dnn = keras.Sequential(hidden_layers)

    def call(self, x):
        """Run ``x`` through the stacked dense layers and return the result."""
        hidden = self.dnn(x)
        return hidden
        
import tensorflow as tf


class MultiLoss(tf.keras.losses.Loss):
    """Masked multi-task binary cross-entropy.

    The loss is called with ``y_pred_mask = (y_pred, mask)``.  For every
    label column ``j`` a ``BinaryCrossentropy`` (probabilities, not logits)
    is evaluated on column ``j`` of ``y_true`` / ``y_pred`` with column ``j``
    of ``mask`` as ``sample_weight``; the per-column losses are averaged.

    NOTE(review): each per-column loss uses the default reduction of
    ``BinaryCrossentropy``, so the divisor is presumably the batch size and
    not the number of unmasked rows -- confirm this is the intended scaling.
    """

    def __init__(self, **kwargs):
        super(MultiLoss, self).__init__(**kwargs)
        self.bce_loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)

    def call(self, y_true, y_pred_mask):
        """Unpack ``(y_pred, mask)`` and delegate to ``compute_loss``."""
        y_pred, mask = y_pred_mask
        return self.compute_loss(y_true, y_pred, mask)

    def compute_loss(self, y_true, y_pred, mask):
        """Return the mean over label columns of the mask-weighted BCE.

        Args:
            y_true: labels, indexed as ``[:, j]`` (2-D).
            y_pred: predicted probabilities, same layout as ``y_true``.
            mask: per-sample, per-column weights, same layout as ``y_true``.

        Returns:
            A scalar float32 tensor; ``0.0`` when there are no label columns.

        Raises:
            ValueError: if the number of label columns is not statically known.
        """
        # Use the static column count so the loop is an ordinary Python loop
        # in both eager and graph mode instead of relying on a tensor-valued
        # ``range`` bound.
        num_classes = y_true.shape[1]
        if num_classes is None:
            num_classes = y_pred.shape[1]
        if num_classes is None:
            raise ValueError(
                "MultiLoss requires a statically known number of label columns"
            )
        num_classes = int(num_classes)
        if num_classes == 0:
            return tf.constant(0.0, dtype=tf.float32)

        per_task_losses = []
        for j in range(num_classes):
            # ``j:j + 1`` keeps the column axis, i.e. shape (batch, 1).
            per_task_losses.append(
                self.bce_loss(
                    y_true[:, j:j + 1],
                    y_pred[:, j:j + 1],
                    sample_weight=mask[:, j:j + 1],
                )
            )

        return tf.add_n(per_task_losses) / num_classes

        
from sklearn.metrics import roc_auc_score

class EveryDayModel(Model):
    """Multi-task network with one sigmoid output per label column.

    Sparse features go through per-feature embeddings (10000 rows, width 8),
    dense features through ``Dense_Process_Layer``.  Their concatenation
    feeds a shared ``DNN`` trunk followed by seven single-unit heads
    (``day1`` .. ``day7``).

    ``train_step``, ``evaluate`` and ``predict`` are overridden.  Batches are
    dicts that carry the features together with the label columns named in
    ``label_cols`` and a ``'mask'`` entry, so no separate ``y`` is consumed.
    """

    def __init__(self, sparse_features, dense_features, price_features, label_cols, units = [256, 128, 64]):
        super().__init__()
        self.embedding_dict = {}
        self.sparse_features = sparse_features
        self.label_cols = label_cols
        # Embedding table size; assumes sparse ids are already integers in
        # [0, num_bins) -- TODO confirm, nothing here hashes or clips them.
        num_bins = 10000
        for name in sparse_features:
            self.embedding_dict[name] = layers.Embedding(num_bins, 8, name = name)
        self.dense_process_layer = Dense_Process_Layer(sparse_features, dense_features, price_features)
        self.concat_embedding = layers.Concatenate()

        # Shared trunk for all tasks.
        self.dnn = DNN(units )

        # One single-unit head per label column.
        self.day1 = keras.layers.Dense(1)
        self.day2 = keras.layers.Dense(1)
        self.day3 = keras.layers.Dense(1)
        self.day4 = keras.layers.Dense(1)
        self.day5 = keras.layers.Dense(1)
        self.day6 = keras.layers.Dense(1)
        self.day7 = keras.layers.Dense(1)

    def _stack_labels(self, inputs):
        """Gather ``self.label_cols`` from a batch dict into one 2-D tensor."""
        columns = [tf.expand_dims(inputs[name], 1) for name in self.label_cols]
        return tf.concat(columns, axis=1)

    def call(self, inputs):
        """Return the seven per-day probabilities, day 1 first and day 7 last.

        ``inputs`` is a dict of feature name -> tensor; keys without an
        embedding and not listed as dense features are ignored.
        """
        feature_blocks = [self.dense_process_layer(inputs)]
        for name, value in inputs.items():
            if name in self.embedding_dict:
                feature_blocks.append(self.embedding_dict[name](value))

        hidden = self.dnn(self.concat_embedding(feature_blocks))

        prob_7 = tf.sigmoid(self.day7(hidden))
        # NOTE(review): the day-7 probability scales each head's
        # pre-activation, so a small prob_7 pulls the day-i output toward
        # sigmoid(0) = 0.5 rather than toward 0 -- confirm this gating is
        # what is intended.
        early_heads = (self.day1, self.day2, self.day3, self.day4, self.day5, self.day6)
        early_probs = [tf.sigmoid(head(hidden) * prob_7) for head in early_heads]

        # Plain tf.concat avoids building a new Concatenate layer per call.
        return tf.concat(early_probs + [prob_7], axis=-1)

    def train_step(self, inputs):
        """Run one optimisation step on a batch dict and return ``{'loss': ...}``."""
        mask_s = inputs['mask']
        # Labels come from the columns given to the constructor, not from a
        # module-level variable.
        labels = self._stack_labels(inputs)

        with tf.GradientTape() as tape:
            preds = self(inputs)
            loss = tf.reduce_mean(self.loss(labels, (preds, mask_s)))
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        return {'loss': loss}

    def evaluate(self, x, y=None, batch_size=None, steps=None, **kwargs):
        """Return ``{'loss': mean of the per-batch losses}`` over dataset ``x``.

        ``x`` is an iterable of batch dicts.  ``y``, ``batch_size``, ``steps``
        and any extra keyword arguments are accepted for signature
        compatibility but are not used.

        Raises:
            ValueError: if ``x`` yields no batches.
        """
        total_loss = 0.0
        num_batches = 0

        for batch in x:
            labels = self._stack_labels(batch)
            preds = self(batch)
            total_loss += tf.reduce_mean(self.loss(labels, (preds, batch['mask'])))
            num_batches += 1

        if num_batches == 0:
            raise ValueError("evaluate() received an empty dataset")

        return {'loss': total_loss / num_batches}

    def predict(self, inputs):
        """Score the last output column against ``'label'`` over all batches.

        Unlike the stock Keras ``predict`` this returns a metrics dict:
        ``bias`` (relative error of the summed predictions versus the summed
        labels), ``AUC``, the first ten predictions, and both sums.  Every
        batch of ``inputs`` contributes; ``None`` is returned when ``inputs``
        yields no batches.
        """
        batch_preds = []
        batch_labels = []
        for batch in inputs:
            batch_labels.append(batch['label'])
            # The last output column is the day-7 head.
            batch_preds.append(self(batch)[:, -1])

        if not batch_preds:
            return None

        preds = tf.concat(batch_preds, axis=0)
        labels = tf.concat(batch_labels, axis=0)

        # Expected versus observed number of positives.
        pred_orders = tf.reduce_sum(preds)
        true_orders = tf.cast(tf.reduce_sum(labels), dtype=tf.float32)
        bias = (pred_orders - true_orders) / true_orders

        # sklearn works on numpy arrays.
        auc = roc_auc_score(y_true=labels.numpy(), y_score=preds.numpy())

        return {
            'bias': bias.numpy(),
            'AUC': auc,
            'preds': preds[:10],
            'true_orders': true_orders,
            'pred_orders':pred_orders
        }

In [33]:
import datetime
def get_delta_date_str(date_str, delta):
    """Shift a ``YYYY-MM-DD`` date string by ``delta`` days (may be negative).

    Returns the shifted date in the same ``YYYY-MM-DD`` format.
    """
    parsed = datetime.datetime.strptime(date_str, "%Y-%m-%d")
    shifted = parsed + datetime.timedelta(days=delta)
    return shifted.strftime('%Y-%m-%d')

def str2date(date_str):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.datetime`` (midnight)."""
    parsed = datetime.datetime.strptime(date_str, "%Y-%m-%d")
    return parsed

def date2str(date):
    """Format a date-like object (anything with ``strftime``) as ``YYYY-MM-DD``."""
    formatted = date.strftime('%Y-%m-%d')
    return formatted

def get_train_test_data(data, startdate, enddate, testdate):
    """Split ``data`` by its ``activate_date`` column.

    Returns ``(train_data, test_data)``: rows with ``startdate <=
    activate_date <= enddate`` and rows with ``activate_date == testdate``.
    Both are independent copies with a fresh 0..n-1 index.
    """
    activate_date = data['activate_date']

    in_train_window = (activate_date >= startdate) & (activate_date <= enddate)
    on_test_day = activate_date == testdate

    train_data = data[in_train_window].reset_index(drop=True).copy()
    test_data = data[on_test_day].reset_index(drop=True).copy()
    return train_data, test_data


from sklearn.model_selection import train_test_split


def process_mask_col(train_data, mask_dates):
    """Attach a 7-element label mask to every row and split off a validation set.

    Rows whose ``dt`` is not in ``mask_dates`` keep an all-ones mask and are
    split 85/15 into train and validation (fixed ``random_state``).  Rows
    whose ``dt`` equals ``mask_dates[i]`` get ``i + 1`` leading ones followed
    by zeros and are appended to the training part only.

    Args:
        train_data: frame with a ``dt`` column; it receives a ``'mask'``
            column in place (all ones), as before.
        mask_dates: dates whose later label columns are masked out; position
            ``i`` keeps the first ``i + 1`` columns.

    Returns:
        ``(train_data, valid_data)``.
    """
    num_labels = 7
    train_data['mask'] = [np.ones((num_labels,))] * len(train_data)

    is_recent = train_data['dt'].isin(mask_dates)
    # Copy so the mask writes below edit an independent frame rather than a
    # slice of ``train_data``.
    recent_data = train_data[is_recent].copy()
    train_data, valid_data = train_test_split(train_data[~is_recent], test_size=0.15, random_state=42)

    for i, msk_dt in enumerate(mask_dates):
        selected = recent_data['dt'] == msk_dt
        num_selected = int(selected.sum())
        if num_selected == 0:
            # No rows for this date: nothing to mask.
            continue
        temp_mask = np.concatenate([np.ones((i + 1,)), np.zeros((num_labels - i - 1,))])
        # Index-aligned Series so each selected row receives the whole array
        # as a single cell value.
        recent_data.loc[selected, 'mask'] = pd.Series(
            [temp_mask] * num_selected, index=recent_data.index[selected]
        )

    train_data = pd.concat([train_data, recent_data], ignore_index=True)
    print("数据集信息..." )
    return train_data , valid_data

def create_tf_dataset(data, features, batch):
    """Build a batched ``tf.data.Dataset`` of dict elements from a DataFrame.

    The columns ``features`` plus the module-level ``label_col`` are each
    turned into a constant tensor keyed by column name.  The ``'mask'``
    column holds one array per row and is stacked into a 2-D array first.
    """
    tensors = {}
    for name, column in data[features + label_col].items():
        if name == 'mask':
            tensors[name] = tf.constant(np.array(column.tolist()))
        else:
            tensors[name] = tf.constant(column.values)

    dataset = tf.data.Dataset.from_tensor_slices(tensors)
    return dataset.batch(batch)

In [34]:
# Basic training configuration.
res_csv = None
folder_name = 'deep_res'
Pred_Days = 10            # number of consecutive prediction days
Pred_Date = '2023-05-01'  # first prediction day
label_col = ['label_1','label_2','label_3','label_4','label_5','label_6','label']
train_days = 20

# Load the data: macOS ('Darwin') reads the file prefixed with "完整"
# ("complete"), every other platform reads the plain toy file.
import platform
data = pd.read_csv(
    "../data/liuliang_data/完整toy_liuliang_data.csv"
    if platform.system() == 'Darwin'
    else "../data/liuliang_data/toy_liuliang_data.csv",
    index_col= 0,
)

# Fill missing feature values with 0.
data.loc[:, features] = data.loc[:, features].fillna(0)

# Feature groups: everything that is not categorical is treated as dense.
sparse_features = category_features
dense_features = [name for name in features if name not in category_features]
price_feature = price_fatures


from tensorflow.keras.callbacks import EarlyStopping

# The EarlyStopping callback itself is configured inside train_model.



In [None]:


def train_model(data):
    """Train and score one ``EveryDayModel`` per prediction day.

    For ``delta`` in ``0 .. Pred_Days - 1`` the prediction day is
    ``Pred_Date + delta``.  The training window runs from ``train_days + 7``
    days before that day up to the day before it, and the six most recent
    training days get partial label masks via ``process_mask_col``.  A fresh
    model is fitted with early stopping on the validation loss and the
    metrics returned by ``model.predict`` are printed.

    The ``return`` used to sit inside the loop, which ended the function
    after the first day; it now runs after all ``Pred_Days`` iterations.

    Args:
        data: full DataFrame holding features, labels and date columns.

    Returns:
        The batched training dataset of the last processed day, or ``None``
        when ``Pred_Days`` is 0.
    """
    train_dataset = None

    for delta in range(0, Pred_Days):
        today = get_delta_date_str(Pred_Date, delta)
        startdate = get_delta_date_str(today, -train_days - 7)
        enddate = get_delta_date_str(today, -1)
        testdate = today

        print(f"训练集开始：{startdate}, 训练集结束：{enddate}, 测试集：{testdate}")

        # The six days ending at ``enddate`` (newest first) only have part of
        # their label columns available and are masked accordingly.
        mask_dates = [date2str(j) for j in pd.date_range(str2date(enddate),periods=6,freq='-1D')]

        # Select train / test rows for this window.
        train_data, test_data = get_train_test_data(data, startdate, enddate, testdate)
        # Add the mask column and split off the validation set.
        train_data, valid_data = process_mask_col(train_data, mask_dates)

        train_dataset = create_tf_dataset(train_data, features + ['mask'], 2048)
        valid_dataset = create_tf_dataset(valid_data, features + ['mask'], 2048)
        # Large batch so the whole test day normally fits in one batch.
        test_dataset = create_tf_dataset(test_data, features, 200000)

        model = EveryDayModel(sparse_features, dense_features, price_feature, label_col,[128,64] )
        model.compile(loss = MultiLoss(), optimizer = tf.keras.optimizers.Adam(learning_rate=0.0003) )

        # A fresh callback per model keeps early-stopping state independent
        # between days.
        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True )

        model.fit(train_dataset, validation_data = valid_dataset, epochs=25,callbacks=[early_stopping])

        print(model.predict(test_dataset))

    return train_dataset


# Run training on the loaded data; t1 keeps the training dataset that train_model returns.
t1 = train_model(data)

训练集开始：2023-04-04, 训练集结束：2023-04-30, 测试集：2023-05-01
数据集信息...
Epoch 1/25
Epoch 2/25
Epoch 3/25

In [None]:
[1,1,1,1,1]  [0.2,0.1,0.3,0.5,0.8]  [1, 1, 0, 0, 0]