In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt 
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from deepctr.models import xDeepFM
from deepctr.inputs import SparseFeat,get_feature_names


DeepCTR version 0.7.5 detected. Your version is 0.7.4.
Use `pip install -U deepctr` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v0.7.5


In [3]:
##==================== 设置文件路径File-Path (fp) ====================##
file_path = ''
fp_train_f = file_path + "train_sample.csv" #使用小样本进行训练

##==================== xDeepFM 训练 ====================##
data = pd.read_csv(fp_train_f, dtype={'id':str}, index_col=None)
print('data loaded')

#数据加载
sparse_features = ['C1', 'banner_pos', 'site_domain', 'site_id','site_category','app_id','app_category', 'device_type', 'device_conn_type','C14', 'C15','C16']
target = ['click']

data loaded


In [7]:
data.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1000009418151094273,0,14102100,2,0,43,301,2,293,7801e8d9,...,1,1,67,2,1,1722,0,35,-1,79
1,10000169349117863715,0,14102100,2,0,43,301,2,293,7801e8d9,...,1,0,65,2,1,1722,0,35,100084,79
2,10000371904215119486,0,14102100,2,0,43,301,2,293,7801e8d9,...,1,0,65,2,1,1722,0,35,100084,79
3,10000640724480838376,0,14102100,2,0,43,301,2,293,7801e8d9,...,1,0,67,2,1,1722,0,35,100084,79
4,10000679056417042096,0,14102100,2,1,374,169,0,293,7801e8d9,...,1,0,135,2,1,2161,0,35,-1,157


In [5]:
data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [6]:
# 对特征标签进行编码
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])

In [8]:
# 计算每个特征中的 不同特征值的个数
fixlen_feature_columns = [SparseFeat(feature, data[feature].nunique()) for feature in sparse_features]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

print(fixlen_feature_columns)
print(feature_names)

[SparseFeat(name='C1', vocabulary_size=6, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C1', group_name='default_group'), SparseFeat(name='banner_pos', vocabulary_size=4, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='banner_pos', group_name='default_group'), SparseFeat(name='site_domain', vocabulary_size=317, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='site_domain', group_name='default_group'), SparseFeat(name='site_id', vocabulary_size=381, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='site_id', group_name='default_group'), SparseFeat(name='site_category', vocabulary_size=14, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='site_category', group_name='default_group'), SparseFeat(name='app_id', vocabulary_size=313, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='app_id', group_name='default_group'), SparseFeat(name='app_category', vocabulary_size=14, embedding_dim=4, use_hash=False,

In [11]:
# 将数据集切分成训练集和测试集
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

In [14]:
# 使用xDeepFM进行训练
model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')

# 计算logloss
model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'], )

In [16]:
# fit model
history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=10, verbose=True, validation_split=0.2, )

# 使用xDeepFM进行预测
pred_ans = model.predict(test_model_input, batch_size=256)

# 输出RMSE或MSE
mse = round(mean_squared_error(test[target].values, pred_ans), 4)
rmse = mse ** 0.5
print("test RMSE", rmse)

Train on 6400 samples, validate on 1600 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
test RMSE 0.3655133376499413


In [17]:
# 输出LogLoss
from sklearn.metrics import log_loss
score = log_loss(test[target].values, pred_ans)
print("LogLoss", score)

LogLoss 0.437828815815883
