In [1]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow import keras
import pandas as pd 
import numpy as np 

# Print NumPy arrays with 4 digits of precision and without scientific notation.
np.set_printoptions(precision=4, suppress=True)




In [2]:
import sys 
# Make the directory that holds features_config.py importable.
sys.path.append("../data/liuliang_data")
# NOTE(review): wildcard import; the visible cells appear to rely on it for
# `features`, `category_features` and `price_fatures` -- confirm against
# features_config.py and consider importing those names explicitly.
from features_config import *

In [3]:
class Dense_Process_Layer(layers.Layer):
    """Log-scale the dense (numeric) inputs and pack them into one tensor.

    Every input whose key appears in ``dense_features`` is cast to float32
    and transformed with ``log1p(x) / log(base)``; the base is 10 for names
    listed in ``price_features`` and 2 for all other dense features.  Inputs
    whose key is not a dense feature are ignored.
    """

    def __init__(self, sparse_features, dense_features, price_features):
        super().__init__()
        self.sparse_features = sparse_features
        self.dense_features = dense_features
        self.price_features = price_features
        # Concatenate is built with its default axis.
        self.concat_layer = layers.Concatenate()

    def call(self, inputs):
        """Return the concatenation of the log-scaled dense features.

        Args:
            inputs: Mapping of feature name to tensor.  Columns are emitted in
                the iteration order of this mapping.
                (assumes each tensor is 1-D, one value per sample -- TODO confirm)
        """
        scaled_columns = []
        for feature_name, raw_values in inputs.items():
            if feature_name not in self.dense_features:
                continue
            values = tf.cast(raw_values, tf.float32)
            log_base = 10.0 if feature_name in self.price_features else 2.0
            # Change of base: log_b(1 + x) = log1p(x) / ln(b).
            scaled = tf.math.log1p(values) / tf.math.log(tf.constant(log_base, dtype=tf.float32))
            # Insert axis 1 so each feature becomes its own column.
            scaled_columns.append(tf.expand_dims(scaled, 1))

        return self.concat_layer(scaled_columns)


In [4]:
class DNN(layers.Layer):
    """Feed-forward stack of ReLU-activated ``Dense`` layers.

    Args:
        units: Iterable of layer widths; one ``Dense`` layer is created per
            entry, applied in the given order.
    """

    def __init__(self, units=(256, 64)):
        super().__init__()
        # The default is a tuple rather than a list: a mutable default would be
        # one shared object across every instantiation of this class.
        self.dnn = keras.Sequential(
            [layers.Dense(width, activation='relu') for width in units]
        )

    def call(self, x):
        """Run ``x`` through the stacked dense layers and return the result."""
        return self.dnn(x)
        


import tensorflow as tf

class MultiLoss(tf.keras.losses.Loss):
    """Masked binary cross-entropy summed over every (sample, task) entry.

    Only entries whose mask value equals 1 contribute; the result is a scalar
    equal to the sum of the element-wise binary cross-entropies of those
    entries.  Predictions are treated as probabilities (``from_logits=False``).

    Note:
        Keras' ``Loss.__call__`` forwards only ``(y_true, y_pred)`` to
        ``call`` and interprets a third argument as ``sample_weight``.  To
        apply a mask, invoke ``loss.call(y_true, y_pred, mask)`` directly.
    """

    def __init__(self, **kwargs):
        super(MultiLoss, self).__init__(**kwargs)
        # Retained so existing code that reads this attribute keeps working;
        # the masked sum below uses the functional element-wise form instead.
        self.bce_loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)

    def call(self, y_true, y_pred, mask=None):
        """Compute the masked cross-entropy sum.

        Args:
            y_true: Targets, same shape as ``y_pred``.
            y_pred: Predicted probabilities.
            mask: Optional tensor with the same shape as ``y_pred``; entries
                equal to 1 are kept, everything else is ignored.  ``None``
                keeps every entry.

        Returns:
            Scalar tensor holding the summed loss of the kept entries.
        """
        y_pred = tf.convert_to_tensor(y_pred)
        y_true = tf.cast(y_true, y_pred.dtype)

        # binary_crossentropy averages over the last axis; adding a trailing
        # axis of size 1 therefore yields one loss value per original entry.
        per_entry = tf.keras.losses.binary_crossentropy(
            tf.expand_dims(y_true, -1),
            tf.expand_dims(y_pred, -1),
            from_logits=False,
        )

        if mask is None:
            return tf.reduce_sum(per_entry)

        mask = tf.convert_to_tensor(mask)
        keep = tf.equal(mask, tf.cast(1, mask.dtype))
        # tf.where (instead of multiplying by the mask) keeps a non-finite loss
        # at an ignored position from leaking into the sum.
        return tf.reduce_sum(tf.where(keep, per_entry, tf.zeros_like(per_entry)))



class EveryDayModel(Model):
    """Multi-task model producing seven probabilities per sample.

    Sparse features are embedded, dense features are log-scaled by
    ``Dense_Process_Layer``, and the concatenation is passed through a shared
    ``DNN`` trunk.  Seven single-unit heads (``day1`` .. ``day7``) sit on top
    of the trunk.  The day-7 probability is ``sigmoid(day7(h))``; each of the
    other six outputs is ``sigmoid(day_i(h) * p7)``.

    Args:
        sparse_features: Names of inputs that receive an embedding table.
        dense_features: Names of numeric inputs.
        price_features: Subset of dense features scaled with log base 10.
        label_cols: Names of the label entries read from the batch in
            ``train_step``, in output-column order.
    """

    def __init__(self, sparse_features, dense_features, price_features, label_cols):
        super().__init__()
        self.embedding_dict = {}
        self.sparse_features = sparse_features
        self.label_cols = label_cols
        # Every embedding table has 1000 rows of width 8.
        # (assumes sparse ids are already encoded into [0, 1000) -- TODO confirm)
        num_bins = 1000
        for name in sparse_features:
            self.embedding_dict[name] = layers.Embedding(num_bins, 8, name = name)
        self.dense_process_layer = Dense_Process_Layer(sparse_features, dense_features, price_features)
        self.concat_embedding = layers.Concatenate()

        # Shared trunk for the multi-task heads.
        self.dnn = DNN([256, 64])

        # One single-unit head per day.
        self.day1 = keras.layers.Dense(1)
        self.day2 = keras.layers.Dense(1)
        self.day3 = keras.layers.Dense(1)
        self.day4 = keras.layers.Dense(1)
        self.day5 = keras.layers.Dense(1)
        self.day6 = keras.layers.Dense(1)
        self.day7 = keras.layers.Dense(1)

    def call(self, inputs, training=None):
        """Return a tensor whose last axis holds the day-1 .. day-7 probabilities.

        Args:
            inputs: Mapping of feature name to tensor.  Keys that are neither
                dense nor sparse features (labels, mask, ...) are ignored.
            training: Accepted for Keras compatibility; not used by any
                sub-layer here.
        """
        dense_input = self.dense_process_layer(inputs)
        embeddings = [dense_input]

        for name, values in inputs.items():
            if name in self.embedding_dict:
                embeddings.append(self.embedding_dict[name](values))
        embedding_input = self.concat_embedding(embeddings)

        # The trunk was previously constructed but never applied, so the heads
        # read the raw concatenated features; route them through it.
        hidden = self.dnn(embedding_input)

        prob_7 = tf.sigmoid(self.day7(hidden))

        # NOTE(review): each earlier-day logit is scaled by the day-7
        # probability *before* the sigmoid, exactly as in the original code.
        # If a product of probabilities was intended, this needs changing.
        early_heads = (self.day1, self.day2, self.day3, self.day4, self.day5, self.day6)
        early_probs = [tf.sigmoid(head(hidden) * prob_7) for head in early_heads]

        # tf.concat avoids creating a fresh Concatenate layer on every call.
        return tf.concat(early_probs + [prob_7], axis=-1)

    def train_step(self, inputs):
        """Run one optimisation step on a batch.

        Args:
            inputs: Batch dict holding the feature tensors, one entry per name
                in ``self.label_cols`` and a ``'mask'`` entry.

        Returns:
            Dict of metric name to current value, plus the batch ``'loss'``.
        """
        mask = inputs['mask']
        # Stack the per-day labels into columns, in the order of self.label_cols.
        labels = tf.stack([inputs[lc_name] for lc_name in self.label_cols], axis=1)

        with tf.GradientTape() as tape:
            preds = self(inputs, training=True)
            # Call `.call` directly: Loss.__call__ would treat the third
            # argument as sample_weight rather than as the mask.  This needs a
            # loss whose `call` accepts (y_true, y_pred, mask), e.g. MultiLoss.
            loss = tf.reduce_mean(self.loss.call(labels, preds, mask))

        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        self.compiled_metrics.update_state(labels, preds)
        results = {m.name: m.result() for m in self.metrics}
        results['loss'] = loss
        return results
        

In [5]:
import datetime
def get_delta_date_str(date_str, delta):
    """Shift a ``YYYY-MM-DD`` date string by ``delta`` days.

    Args:
        date_str: Date formatted as ``%Y-%m-%d``.
        delta: Number of days to add; negative values move backwards.

    Returns:
        The shifted date, formatted as ``%Y-%m-%d``.
    """
    day_format = "%Y-%m-%d"
    base_day = datetime.datetime.strptime(date_str, day_format)
    shifted_day = base_day + datetime.timedelta(days=delta)
    return shifted_day.strftime(day_format)

def str2date(date_str):
    """Parse a ``%Y-%m-%d`` string into a ``datetime.datetime``."""
    day_format = "%Y-%m-%d"
    parsed = datetime.datetime.strptime(date_str, day_format)
    return parsed

def date2str(date):
    """Format a date-like object (anything with ``strftime``) as ``%Y-%m-%d``."""
    day_format = '%Y-%m-%d'
    formatted = date.strftime(day_format)
    return formatted

def get_train_test_data(data, startdate, enddate, testdate):
    """Split ``data`` into a training window and a single test day.

    Args:
        data: DataFrame with an ``activate_date`` column.
        startdate: First ``activate_date`` value included in the training set.
        enddate: Last ``activate_date`` value included in the training set.
        testdate: ``activate_date`` value selecting the test set.

    Returns:
        ``(train_data, test_data)``: independent copies with a fresh 0..n-1 index.
    """
    activation = data['activate_date']

    # Both ends of the training window are inclusive.
    in_train_window = (activation >= startdate) & (activation <= enddate)
    on_test_day = activation == testdate

    train_data = data.loc[in_train_window].reset_index(drop=True).copy()
    test_data = data.loc[on_test_day].reset_index(drop=True).copy()

    return train_data, test_data

def process_mask_col(train_data, mask_dates, n_days=7):
    """Attach a per-row label-availability ``mask`` column to ``train_data``.

    Each row receives a float array of length ``n_days``.  Rows whose ``dt``
    equals ``mask_dates[i]`` keep ones in the first ``i + 1`` positions and
    zeros afterwards; every other row gets all ones.

    Args:
        train_data: DataFrame with a ``dt`` column.  It is modified in place.
        mask_dates: Sequence of ``dt`` values; position ``i`` selects the rows
            for which only the first ``i + 1`` entries stay unmasked.
        n_days: Length of each mask vector (default 7).

    Returns:
        The same ``train_data`` object, now with a ``mask`` column.

    Raises:
        ValueError: If ``mask_dates`` holds more than ``n_days`` entries.
    """
    if len(mask_dates) > n_days:
        raise ValueError(
            f"mask_dates has {len(mask_dates)} entries but n_days is {n_days}"
        )

    # Build the whole mask as one 2-D array instead of a row-wise apply().
    mask_matrix = np.ones((len(train_data), n_days))
    for offset, mask_date in enumerate(mask_dates):
        selected_rows = (train_data['dt'] == mask_date).to_numpy()
        mask_matrix[selected_rows, offset + 1:] = 0.0

    # One array per row; rows are distinct slices, so no two rows alias the
    # same data (the previous default shared a single array across all rows).
    train_data['mask'] = pd.Series(list(mask_matrix), index=train_data.index, dtype=object)

    return train_data

In [12]:
# Basic training configuration
res_csv = None
folder_name = 'deep_res'
Pred_Days = 10
Pred_Date = '2023-05-01'
label_col = ['label_1','label_2','label_3','label_4','label_5','label_6','label']

# Load the data set (a different file on Windows), then fill missing feature
# values with 0.
import platform
# `platform.os` is just the `os` module, so comparing it with a string is
# always False; platform.system() returns the OS name ('Windows', 'Linux', ...).
if platform.system() == 'Windows':
    data = pd.read_csv("../data/liuliang_data/完整toy_liuliang_data.csv", index_col= 0)
else:
    data = pd.read_csv("../data/liuliang_data/toy_liuliang_data.csv", index_col= 0)
data.loc[:, features] = data.loc[:, features].fillna(0)

train_days = 13
# Every feature that is not categorical is treated as dense (numeric).
dense_features = [feature for feature in features if feature not in category_features]
# NOTE(review): `price_fatures` is used as spelled here; presumably it is the
# name exported by features_config -- confirm before renaming.
sparse_features, dense_features, price_feature = category_features, dense_features, price_fatures


In [25]:


def train_model(data):
    """Build the batched training dataset for the first prediction window.

    For each offset in ``range(Pred_Days)`` the function derives the training
    window (the ``train_days + 7`` days before the prediction day) and the
    test day, tags the training rows with a label-availability mask and wraps
    features, labels and mask in a ``tf.data.Dataset`` batched by 512.

    NOTE(review): the function returns inside the first loop iteration, so
    only the window for ``Pred_Date`` itself is ever built, and the model that
    is constructed is neither compiled, fitted nor returned.  That behaviour
    is kept because the caller consumes the returned dataset.

    Args:
        data: Full DataFrame; must contain ``activate_date``, ``dt``, every
            name in ``features`` and every name in ``label_col``.

    Returns:
        The batched training dataset of the first window, or ``None`` when
        ``Pred_Days`` is 0.
    """
    for delta in range(0, Pred_Days):
        today = get_delta_date_str(Pred_Date, delta)
        startdate = get_delta_date_str(today, -train_days - 7)
        enddate = get_delta_date_str(today, -1)
        testdate = today

        print(f"训练集开始：{startdate}, 训练集结束：{enddate}, 测试集：{testdate}")

        # The six days before `today`, most recent first; rows on these days
        # get a partially zeroed mask from process_mask_col.
        mask_dates = [get_delta_date_str(today, -days_back) for days_back in range(1, 7)]

        # Slice the train / test sets by date range.
        train_data, test_data = get_train_test_data(data, startdate, enddate, testdate)
        # Add the mask column to the training rows.
        train_data = process_mask_col(train_data, mask_dates)

        train_inputs = {
            name: tf.constant(column.values)
            for name, column in train_data[features + label_col].items()
        }
        # The mask column was previously left out of the selection above, so it
        # never reached the dataset although the model's train_step reads it.
        train_inputs['mask'] = tf.constant(
            np.array(train_data['mask'].tolist(), dtype=np.float32)
        )

        train_dataset = tf.data.Dataset.from_tensor_slices(train_inputs)
        train_dataset = train_dataset.batch(512)

        # MODEL
        model = EveryDayModel(sparse_features, dense_features, price_feature, label_col )
        return train_dataset
        


# Build the batched training dataset returned for the first prediction window.
t1 = train_model(data)

训练集开始：2023-04-11, 训练集结束：2023-04-30, 测试集：2023-05-01


In [33]:
# Peek at the first batch only: gather its label columns into one tensor and print it.
for inputs in t1.take(1):
    label_cols = [tf.expand_dims(inputs[lc_name], 1) for lc_name in label_col]
    res = layers.concatenate(label_cols)
    print(res)

tf.Tensor(
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]], shape=(512, 7), dtype=int64)


In [14]:
# Map each column of `data` whose name is listed in `features` to a constant tensor of its values.
data_dict = {name:tf.constant(v.values) for name,v in data.items() if name in features}

In [15]:
# NOTE(review): in the visible cells `train_data` is only assigned inside
# train_model(), so this display presumably relies on leftover kernel state --
# confirm it survives Restart & Run All.
train_data

<tf.Tensor: shape=(20000, 7), dtype=float32, numpy=
array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], dtype=float32)>

tf.Tensor(
[[0 0 1 1 0 1 1]
 [1 1 0 0 0 1 0]
 [1 0 0 0 0 0 1]
 [1 1 1 1 0 0 1]
 [0 0 1 0 1 1 0]
 [0 1 0 1 1 0 1]
 [0 0 0 1 0 1 1]
 [0 1 0 1 1 1 0]
 [1 1 1 1 0 0 0]
 [0 1 1 1 0 0 0]
 [0 0 1 1 0 1 0]
 [0 0 1 0 1 1 1]
 [0 1 1 0 0 1 1]
 [1 0 1 1 0 1 0]
 [0 1 1 1 1 0 0]
 [0 1 1 1 1 0 1]
 [1 1 0 1 1 0 1]
 [0 1 0 1 0 1 1]
 [1 0 0 0 1 0 1]
 [1 1 1 1 0 1 0]
 [0 1 0 1 0 0 0]
 [1 1 0 0 0 1 0]
 [1 1 0 1 0 0 1]
 [0 0 0 0 0 0 0]
 [1 1 1 0 1 0 0]
 [0 0 1 0 1 0 0]
 [0 0 1 1 0 0 0]
 [0 0 1 1 0 0 0]
 [0 0 1 1 0 1 0]
 [1 1 0 0 0 0 0]
 [1 1 1 0 0 1 1]
 [0 1 0 1 1 0 1]], shape=(32, 7), dtype=int32)


In [57]:
# NOTE(review): `a` is not assigned in any visible cell; presumably leftover
# kernel state -- define it explicitly or remove this cell.
a

<tf.Tensor: shape=(32,), dtype=float32, numpy=
array([0.7945, 0.9919, 0.8439, 1.1343, 0.6553, 0.7432, 1.2322, 0.9226,
       1.4304, 0.8548, 0.874 , 0.9434, 1.0212, 1.0771, 1.3806, 1.0818,
       0.9163, 0.8238, 0.7817, 0.8627, 0.7375, 0.853 , 0.8797, 0.8234,
       0.8406, 1.0483, 2.0231, 0.6447, 0.7356, 0.8785, 1.0572, 0.7386],
      dtype=float32)>