In [5]:
import os
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
import pandas as pd
import itertools
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


def input_fn_pandas(df, features, label=None, batch_size=256, num_epochs=5, shuffle=False, queue_capacity_factor=10,
                    num_threads=1):
    if label is not None:
        y = df[label]
    else:
        y = None
    if tf.__version__ >= "2.0.0":
        return tf.compat.v1.estimator.inputs.pandas_input_fn(df[features], y, batch_size=batch_size,
                                                             num_epochs=num_epochs,
                                                             shuffle=shuffle,
                                                             queue_capacity=batch_size * queue_capacity_factor,
                                                             num_threads=num_threads)

    return tf.estimator.inputs.pandas_input_fn(df[features], y, batch_size=batch_size, num_epochs=num_epochs,
                                               shuffle=shuffle, queue_capacity=batch_size * queue_capacity_factor,
                                               num_threads=num_threads)

In [3]:
if __name__ == "__main__":
    # 读取Criteo数据集
    data = pd.read_csv('./dataset/criteo_sample.txt')

    # Criteo数据集的sparse特征和dense特征字段名
    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]

    # 缺失值填充：sparse_features使用-1填充；dense_features使用0填充;label数据赋值target
    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # sparse特征采用LabelEncoder；dense特征采用最大最小化缩放。
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 列特征处理
    dnn_feature_columns = []
    linear_feature_columns = []

    for i, feat in enumerate(sparse_features):
        # tf.feature_column.categorical_column_with_identity(key, num_buckets, default_value=None):将数据转为OneHot数据
        # tf.feature_column.embedding_column()：OneHot数据转为embedding数据
        # tf.feature_column.numeric_column():实值或数值特征。
        dnn_feature_columns.append(tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity(feat, data[feat].max() + 1), 4))
        linear_feature_columns.append(tf.feature_column.categorical_column_with_identity(feat, data[feat].max() + 1))
    for feat in dense_features:
        dnn_feature_columns.append(tf.feature_column.numeric_column(feat))
        linear_feature_columns.append(tf.feature_column.numeric_column(feat))

    # 3.generate input data for model
    train, test = train_test_split(data, test_size=0.2, random_state=2021)

    # 构建input_fn函数，本示例使用input_fn_pandas函数
    train_model_input = input_fn_pandas(train, sparse_features + dense_features, 'label', shuffle=True)
    test_model_input = input_fn_pandas(test, sparse_features + dense_features, 'label', shuffle=False)

    # 4.Define Model,train,predict and evaluate
    model = tf.estimator.DNNClassifier(hidden_units=[117, 64],feature_columns=dnn_feature_columns,activation_fn=tf.nn.sigmoid)

    model.train(train_model_input)
    # model.evaluate(test_model_input)
    pred_ans_iter = model.predict(test_model_input)
    ev = model.evaluate(test_model_input)
    print("Loss: {0:f}".format(ev["loss"]))
    # expected=test[target].values
    # template = ('\nPrediction is "{}" ({:.1f}%), expected "{}"')
    # for pre_dict, expec in zip(pred_ans_iter, expected):
    #     class_id = pre_dict['class_ids'][0]
    #     probability = pre_dict['probabilities'][class_id]
    #     print(template.format(0,
    #                           100 * probability, expec))
    pred_ans = list(map(lambda x: x['probabilities'], pred_ans_iter))
    # predictions = list(itertools.islice(pred_ans_iter,1))
    # print('PREDICTIONS',predictions)

    # print("test LogLoss", round(log_loss(train[target].values, pred_ans), 4))
    # print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmppue8s09l', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc781181128>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
To construct input pipelines, use the `tf.

In [13]:
len(pred_ans)

200

In [26]:
pred_ans

[array([0.9458044], dtype=float32),
 array([0.9427527], dtype=float32),
 array([0.9486576], dtype=float32),
 array([0.9478439], dtype=float32),
 array([0.9432604], dtype=float32),
 array([0.94361943], dtype=float32),
 array([0.94342005], dtype=float32),
 array([0.9431041], dtype=float32),
 array([0.9417645], dtype=float32),
 array([0.94717264], dtype=float32),
 array([0.94159955], dtype=float32),
 array([0.93793714], dtype=float32),
 array([0.9458681], dtype=float32),
 array([0.9492917], dtype=float32),
 array([0.94691885], dtype=float32),
 array([0.94520605], dtype=float32),
 array([0.9415462], dtype=float32),
 array([0.9427545], dtype=float32),
 array([0.9483095], dtype=float32),
 array([0.94683874], dtype=float32),
 array([0.94278324], dtype=float32),
 array([0.943353], dtype=float32),
 array([0.9460221], dtype=float32),
 array([0.9442728], dtype=float32),
 array([0.94532], dtype=float32),
 array([0.9469573], dtype=float32),
 array([0.9437739], dtype=float32),
 array([0.9394867], dt

In [5]:
from sklearn.metrics import log_loss
from math import log # 自然对数为底
 
# 二分类的交叉熵损失函数的计算
 
# y_true为一维，y_pred为二维
# 用sklearn的log_loss函数计算损失函数
y_true = [0,1]
y_pred = [[0.1,0.9], [0.2,0.8]]
sk_log_loss = log_loss(y_true,y_pred)
print('Loss by sklearn: %s.'%sk_log_loss)

Loss by sklearn: 1.2628643221541276.


In [10]:
# import numpy as np
# from sklearn.preprocessing import LabelEncoder, MinMaxScaler,OneHotEncoder
# # ohe = OneHotEncoder()
# # yht = ohe.fit_transform(y_true)
# # -(np.log(0.1)+np.log(0.8))/2

ValueError: Expected 2D array, got 1D array instead:
array=[0 1].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.