## 数据

使用criteo数据集，包括train.csv,test.csv等两个文件。具体数据字段主要分为以下三类：

Label - Target variable that indicates if an ad was clicked (1) or not (0).（待预测广告，被点击是1，没有被点击是0。）

I1-I13 - A total of 13 columns of integer features (mostly count features).（总共 13 列数值型特征，主要是计数特征。）

C1-C26 - A total of 26 columns of categorical features. The values of these features have been hashed onto 32 bits for anonymization purposes. （共有 26 列类别型特征。 出于匿名目的，特征值已散列到 32 位。）

In [2]:
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
# from deepctr.models import DeepFM,WDL
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
import deepctr

print(tf.__version__)
print(deepctr.__version__)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


1.13.1
0.9.3


In [10]:
# https://arxiv.org/pdf/1606.07792.pdf

# -*- coding:utf-8 -*-
"""
Author:
    Weichen Shen, weichenswc@163.com
Reference:
    [1] Cheng H T, Koc L, Harmsen J, et al. Wide & deep learning for recommender systems[C]//Proceedings of the 1st Workshop on Deep Learning for Recommender Systems. ACM, 2016: 7-10.(https://arxiv.org/pdf/1606.07792.pdf)
"""

from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Dense

from deepctr.feature_column import build_input_features, get_linear_logit, input_from_feature_columns
from deepctr.layers.core import PredictionLayer, DNN
from deepctr.layers.utils import add_func, combined_dnn_input


def WDL(linear_feature_columns, dnn_feature_columns, dnn_hidden_units=(256, 128, 64), l2_reg_linear=0.00001,
        l2_reg_embedding=0.00001, l2_reg_dnn=0, seed=1024, dnn_dropout=0, dnn_activation='relu',
        task='binary'):
    """Instantiates the Wide&Deep Learning architecture.
    :param linear_feature_columns: An iterable containing all the features used by linear part of the model.
    :param dnn_feature_columns: An iterable containing all the features used by deep part of the model.
    :param dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of DNN
    :param l2_reg_linear: float. L2 regularizer strength applied to wide part
    :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
    :param l2_reg_dnn: float. L2 regularizer strength applied to DNN
    :param seed: integer ,to use as random seed.
    :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
    :param dnn_activation: Activation function to use in DNN
    :param task: str, ``"binary"`` for  binary logloss or  ``"regression"`` for regression loss
    :return: A Keras model instance.
    """

    features = build_input_features(
        linear_feature_columns + dnn_feature_columns)

    inputs_list = list(features.values())

    linear_logit = get_linear_logit(features, linear_feature_columns, seed=seed, prefix='linear',
                                    l2_reg=l2_reg_linear)

    sparse_embedding_list, dense_value_list = input_from_feature_columns(features, dnn_feature_columns,
                                                                         l2_reg_embedding, seed)

    dnn_input = combined_dnn_input(sparse_embedding_list, dense_value_list)
    dnn_out = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, False, seed=seed)(dnn_input)
    dnn_logit = Dense(1, use_bias=False)(dnn_out)

    final_logit = add_func([dnn_logit, linear_logit])

    output = PredictionLayer(task)(final_logit)

    model = Model(inputs=inputs_list, outputs=output)
    return model

In [1]:
if __name__ == "__main__":
    data = pd.read_csv('/home/yqstar/ai_project/CTR_Algorithm/DeepCTR/examples/criteo_sample.txt')

    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]

    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field,and record dense feature field name

    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=4) for i, feat in enumerate(sparse_features)] + [DenseFeat(feat, 1, ) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2, random_state=2020)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4.Define Model,train,predict and evaluate
    model = WDL(linear_feature_columns, dnn_feature_columns, task='binary')
    model.compile("adam", "binary_crossentropy",metrics=['binary_crossentropy'])

    history = model.fit(train_model_input, 
                        train[target].values,
                        batch_size=256, 
                        epochs=1, 
                        verbose=2, 
                        validation_split=0.2)
    
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

NameError: name 'pd' is not defined

In [3]:
import pandas as pd
import itertools
import tensorflow as tf
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr.estimator import DeepFMEstimator
from deepctr.estimator.inputs import input_fn_pandas

if __name__ == "__main__":
    # 读取Criteo数据集
    data = pd.read_csv('./dataset/criteo_sample.txt')

    # Criteo数据集的sparse特征和dense特征字段名
    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]

    # 缺失值填充：sparse_features使用-1填充；dense_features使用0填充
    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )

    # label数据赋值target
    target = ['label']

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    # sparse特征采用LabelEncoder；dense特征采用最大最小化缩放。
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field,and record dense feature field name

    dnn_feature_columns = []
    linear_feature_columns = []

    for i, feat in enumerate(sparse_features):
        # tf.feature_column.categorical_column_with_identity(key, num_buckets, default_value=None):将数据转为OneHot数据
        # tf.feature_column.embedding_column()：OneHot数据转为embedding数据
        # tf.feature_column.numeric_column():
        dnn_feature_columns.append(tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity(feat, data[feat].max() + 1), 4))
        linear_feature_columns.append(tf.feature_column.categorical_column_with_identity(feat, data[feat].max() + 1))
    for feat in dense_features:
        dnn_feature_columns.append(tf.feature_column.numeric_column(feat))
        linear_feature_columns.append(tf.feature_column.numeric_column(feat))

    # 3.generate input data for model
    train, test = train_test_split(data, test_size=0.2, random_state=2021)

    # Not setting default value for continuous feature. filled with mean.
    # 构建input_fn函数，本示例使用input_fn_pandas函数
    train_model_input = input_fn_pandas(train, sparse_features + dense_features, 'label', shuffle=True)
    test_model_input = input_fn_pandas(test, sparse_features + dense_features, None, shuffle=False)

    # 4.Define Model,train,predict and evaluate
    model = DeepFMEstimator(linear_feature_columns, dnn_feature_columns, task='binary',
                            config=tf.estimator.RunConfig(tf_random_seed=2021))

    model.train(train_model_input)
    pred_ans_iter = model.predict(test_model_input)
    # pred_ans = list(map(lambda x: x['pred'], pred_ans_iter))
    predictions = list(itertools.islice(pred_ans_iter,1))

    
    #
    # print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    # print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpxfsjdlk4', '_tf_random_seed': 2021, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f02ac277630>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
make sure the activation function use training flag properly call() got an unexpected keyword argument 'training'
make sure the activation functi

In [4]:
predictions

[{'logits': array([-3.4694154], dtype=float32),
  'pred': array([0.03019512], dtype=float32)}]

In [29]:
# input_fn定义
## column定义
import pandas as pd

data = pd.read_csv("./dataset/criteo_sample.txt")

sparse_feat_list = ["C" + str(i) for i in range(1,27)]
dense_feat_list =  ["I" + str(i) for i in range(1,13)]

data[sparse_feat_list] = data[sparse_feat_list].fillna(-1)
data[dense_feat_list] = data[dense_feat_list].fillna(0)

# 特征列
for i,feat in enumerate(sparse_feat_list):
    tf.feature_column.categorical_column_with_identity(sparse_feat_list[i], 100)
    



In [30]:
data.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,0.0,3,260.0,0.0,17668.0,0.0,0.0,33.0,0.0,...,e5ba7672,87c6f83c,-1,-1,0429f84b,-1,3a171ecb,c0d61a5c,-1,-1
1,0,0.0,-1,19.0,35.0,30251.0,247.0,1.0,35.0,160.0,...,d4bb7bd8,6fc84bfb,-1,-1,5155d8a3,-1,be7c41b4,ded4aac9,-1,-1
2,0,0.0,0,2.0,12.0,2013.0,164.0,6.0,35.0,523.0,...,e5ba7672,675c9258,-1,-1,2e01979f,-1,bcdee96c,6d5d1302,-1,-1
3,0,0.0,13,1.0,4.0,16836.0,200.0,5.0,4.0,29.0,...,e5ba7672,52e44668,-1,-1,e587c466,-1,32c7478e,3b183c5c,-1,-1
4,0,0.0,0,104.0,27.0,1990.0,142.0,4.0,32.0,37.0,...,e5ba7672,25c88e42,21ddcdc9,b1252a9d,0e8585d2,-1,32c7478e,0d4a6d1a,001f3601,92c878de


# 特征列使用

【TF】tf.feature_column特征处理方法汇总：https://blog.csdn.net/pearl8899/article/details/107936792

使用tf.feature_column函数创建特征列返回：CategoricalColumn 对象或者 DenseColumn 对象。

In [6]:
# -*- coding: utf-8 -*-
import tensorflow as tf
sess=tf.Session()
 
if __name__:
    # 特征数据
    features = {
        'birthplace': [[0], [1], [1], [3], [4], [5], [6], [11]]
    }
    print ("———————1———————")
    print (features)
    # 特征列
    birthplace = tf.feature_column.categorical_column_with_identity("birthplace", num_buckets=10, default_value=0)
    print ("———————2———————")
    print (birthplace)
    birthplace = tf.feature_column.indicator_column(birthplace)
    print ("——————3————————")
    print (birthplace)
    # 组合特征列
    columns = [birthplace]
    # 输入层（数据，特征列）
    inputs = tf.feature_column.input_layer(features, columns)
    print ("———————4———————"  )  
    print (inputs)
    # 初始化并运行
    init = tf.global_variables_initializer()
    sess.run(tf.tables_initializer())
    sess.run(init)
    v = sess.run(inputs)
    print ("———————5———————")
    print(v)

———————1———————
{'birthplace': [[0], [1], [1], [3], [4], [5], [6], [11]]}
———————2———————
IdentityCategoricalColumn(key='birthplace', number_buckets=10, default_value=0)
——————3————————
IndicatorColumn(categorical_column=IdentityCategoricalColumn(key='birthplace', number_buckets=10, default_value=0))
———————4———————
Tensor("input_layer_1/concat:0", shape=(8, 10), dtype=float32)
———————5———————
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [14]:
import tensorflow as tf
sess=tf.Session()
#特征数据
features = {
    'department': ['sport', 'sport', 'drawing', 'gardening', 'travelling'],
}
#特征列
department = tf.feature_column.categorical_column_with_hash_bucket('department', 10, dtype=tf.string)
print ("—————1—————")
# print columns
columns = tf.feature_column.embedding_column(department, dimension=20)
print(columns)
#输入层（数据，特征列）
inputs = tf.feature_column.input_layer(features, columns)
#初始化并运行
init = tf.global_variables_initializer()
sess.run(tf.tables_initializer())
sess.run(init)
 
v=sess.run(inputs)
# p=sess.run(department)
print(v)


TypeError: categorical_column_with_identity() got an unexpected keyword argument 'dtype'

In [33]:
import tensorflow as tf
features = {'item_id': [1, 2]}
item_id = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_hash_bucket('item_id', hash_bucket_size=4,
                                                             dtype=tf.int8))
item_id_hash=tf.feature_column.categorical_column_with_hash_bucket('item_id', hash_bucket_size=4,
                                                             dtype=tf.int8)
columns = [item_id_hash]
inputs = tf.feature_column.input_layer(features, columns)
#初始化并运行
init = tf.global_variables_initializer()
sess.run(tf.tables_initializer())
sess.run(init)
 
v=sess.run(inputs)
# p=sess.run(department)
print(v)

ValueError: Items of feature_columns must be a _DenseColumn. You can wrap a categorical column with an embedding_column or indicator_column. Given: HashedCategoricalColumn(key='item_id', hash_bucket_size=4, dtype=tf.int8)

In [None]:
import os
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
import pandas as pd
import itertools
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


def input_fn_pandas(df, features, label=None, batch_size=256, num_epochs=5, shuffle=False, queue_capacity_factor=10,
                    num_threads=1):
    if label is not None:
        y = df[label]
    else:
        y = None
    if tf.__version__ >= "2.0.0":
        return tf.compat.v1.estimator.inputs.pandas_input_fn(df[features], y, batch_size=batch_size,
                                                             num_epochs=num_epochs,
                                                             shuffle=shuffle,
                                                             queue_capacity=batch_size * queue_capacity_factor,
                                                             num_threads=num_threads)

    return tf.estimator.inputs.pandas_input_fn(df[features], y, batch_size=batch_size, num_epochs=num_epochs,
                                               shuffle=shuffle, queue_capacity=batch_size * queue_capacity_factor,
                                               num_threads=num_threads)