In [17]:
import tensorflow as tf
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

Step 1 加载数据

In [18]:
data = pd.read_csv('../data/Sohu2022_data/rec_data/train-dataset.csv')
# test_data = pd.read_csv('../data/Sohu2022_data/rec_data/test-dataset.csv') 

Step 2 特征工程

将类别列重新映射

In [19]:
#需要做映射的列
dict_cols = [x for x in data.columns if x not in ['sampleId','label','pvId','userSeq','logTs',]]

In [20]:
names = globals()

In [21]:
from tqdm import tqdm
for col in tqdm(dict_cols):
    #基础数据
    names[str(col)+'_values' ] = list(data[col].unique())
    #基础映射表,维护
    names[str(col)+'_dict' ] = {value:idx for idx,value in enumerate(names[str(col)+'_values' ])}
    #字典大小
    names[str(col)+'_dict_size'] = len(names[str(col)+'_dict' ])
    #映射到data
    data[f"{col}_idx"] = data[col].map(names[str(col)+'_dict' ])

#如果需要添加新数据扩大映射表，参考特征工程.ipynb

100%|██████████| 8/8 [00:01<00:00,  6.14it/s]


对logTs进行处理

In [22]:
data['date'] = pd.to_datetime(data['logTs'], unit='ms') 
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day

超参数

In [23]:
EMBEEDING_SIZE = 64

Step 3 制作inputs

In [24]:
inputs = {
    'suv' : tf.keras.layers.Input(name='suv', shape=(), dtype='int32'),
    'operator' : tf.keras.layers.Input(name='operator', shape=(), dtype='int32'),
    'browserType' : tf.keras.layers.Input(name='browserType', shape=(), dtype='int32'),
    'deviceType' : tf.keras.layers.Input(name='deviceType', shape=(), dtype='int32'),
    'osType' : tf.keras.layers.Input(name='osType', shape=(), dtype='int32'),
    'province' : tf.keras.layers.Input(name='province', shape=(), dtype='int32'),
    'itemId' : tf.keras.layers.Input(name='itemId', shape=(), dtype='int32'),
    'city' : tf.keras.layers.Input(name='city', shape=(), dtype='int32'),
}

Step 4 搭建模型

In [25]:
itemId_embedding = tf.keras.layers.Embedding(itemId_dict_size+1, EMBEEDING_SIZE )(inputs['itemId'])
suv_embedding = tf.keras.layers.Embedding(suv_dict_size+1, EMBEEDING_SIZE )(inputs['suv'])
operator_embedding = tf.keras.layers.Embedding(operator_dict_size+1, EMBEEDING_SIZE )(inputs['operator'])
browserType_embedding = tf.keras.layers.Embedding(browserType_dict_size+1, EMBEEDING_SIZE )(inputs['browserType'])
deviceType_embedding = tf.keras.layers.Embedding(deviceType_dict_size+1, EMBEEDING_SIZE )(inputs['deviceType'])
osType_embedding = tf.keras.layers.Embedding(osType_dict_size+1, EMBEEDING_SIZE )(inputs['osType'])
province_embedding = tf.keras.layers.Embedding(province_dict_size+1, EMBEEDING_SIZE )(inputs['province'])
city_embedding = tf.keras.layers.Embedding(city_dict_size+1, EMBEEDING_SIZE )(inputs['city'])

In [26]:
all_features = tf.keras.layers.Concatenate(axis=-1)([itemId_embedding,suv_embedding,
                                            operator_embedding,browserType_embedding,
                                            deviceType_embedding,osType_embedding,
                                            province_embedding,city_embedding])

In [27]:
outputs = tf.keras.layers.Dense(256,activation='relu')(all_features)
outputs = tf.keras.layers.Dense(64,activation='relu')(outputs)
outputs = tf.keras.layers.Dense(1,activation='sigmoid')(outputs)

In [28]:
model = tf.keras.Model(inputs,outputs)

In [29]:
model.compile(
    loss='binary_crossentropy',
    optimizer='sgd',
    metrics=tf.keras.metrics.AUC(name='auc'))

Step 5 划分数据集：训练集：验证集：测试集 = 6：2：2

In [30]:
#训练集：验证集:测试集 = 6：2：2
import random
train_len = int(len(data)*0.6)
val_len = int(len(data)*0.2)
idx = list(data.index)
random.shuffle(idx)
#训练集
df_train = data.iloc[idx[:train_len]]
#验证集
df_val = data.iloc[idx[train_len:train_len+val_len]]
#测试集
df_test = data.iloc[idx[train_len+val_len:]]

In [31]:
x_train = [df_train['suv_idx'],
           df_train['operator_idx'],
           df_train['browserType_idx'],
           df_train['deviceType_idx'],
           df_train['osType_idx'],
           df_train['province_idx'],
           df_train['itemId_idx'],
           df_train['city_idx']]
y_train = df_train['label']

x_val = [df_val['suv_idx'],
         df_val['operator_idx'],
         df_val['browserType_idx'],
         df_val['deviceType_idx'],
         df_val['osType_idx'],
         df_val['province_idx'],
         df_val['itemId_idx'],
         df_val['city_idx']]
y_val = df_val['label']

x_test = [df_test['suv_idx'],
          df_test['operator_idx'],
          df_test['browserType_idx'],
          df_test['deviceType_idx'],
          df_test['osType_idx'],
          df_test['province_idx'],
          df_test['itemId_idx'],
          df_test['city_idx']]
y_test = df_test['label']

In [32]:
from datetime import datetime
TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.now())
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="../logs/MLP"+TIMESTAMP)
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint('../save/MLP/{epoch:02d}-{val_loss:.2f}.hdf5',monitor='val_loss')              
history = model.fit(x_train,y_train,epochs=1,batch_size=32,
                    validation_data=[x_val,y_val],
                    callbacks=[tensorboard_callback,checkpoint_callback,
                    ])



In [33]:
model.evaluate(x_test, y_test, batch_size=32)



[0.48845940828323364, 0.5317859649658203]