In [2]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from deepctr.models import DeepFM
from deepctr.inputs import SparseFeat,get_feature_names

#数据加载，设置稀疏特征和测试对象
data = pd.read_csv("movielens_sample.txt")
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
target = ['rating']

data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,3299,235,4,968035345,Ed Wood (1994),Comedy|Drama,F,25,4,19119
1,3630,3256,3,966536874,Patriot Games (1992),Action|Thriller,M,18,4,77005
2,517,105,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,F,25,14,55408
3,785,2115,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,M,18,19,29307
4,5848,909,5,957782527,"Apartment, The (1960)",Comedy|Drama,M,50,20,20009


In [3]:
# 对特征标签进行编码
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])
    
# 计算每个特征中的 不同特征值的个数
fixlen_feature_columns = [SparseFeat(feature, data[feature].nunique()) for feature in sparse_features]

# DNN 和 FM 共享输入 fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [4]:
fixlen_feature_columns

[SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='movie_id', group_name='default_group'),
 SparseFeat(name='user_id', vocabulary_size=193, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='user_id', group_name='default_group'),
 SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='gender', group_name='default_group'),
 SparseFeat(name='age', vocabulary_size=7, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='age', group_name='default_group'),
 SparseFeat(name='occupation', vocabulary_size=20, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='occupation', group_name='default_group'),
 SparseFeat(name='zip', vocabulary_size=188, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='zip', group_name='default_group')]

In [7]:
feature_names

['movie_id', 'user_id', 'gender', 'age', 'occupation', 'zip']

In [8]:
# 将数据集切分成训练集和测试集
train, test = train_test_split(data, test_size=0.2)

# 训练集和测试集的输入格式为 dict
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

In [10]:
test_model_input

{'movie_id': array([ 18,  74,  97,   6, 156, 173,  76, 124, 170, 116,  29,  34, 163,
         85,  70, 168,  65, 174,  91,  72, 111, 123, 169, 103, 130, 115,
        162, 112,  67, 177, 166,  41,  59, 144,  50,  19, 136, 113,  92,
        173], dtype=int64),
 'user_id': array([158, 101, 173,  12,  24,  10,  31, 159, 178, 144, 108, 157, 141,
         84,  72, 192,  53,  68,  77,  86, 160,  41, 123,  18, 146, 128,
         44, 182,  69, 115, 136,  34, 169,  64, 173,  27,  66, 188, 162,
          7], dtype=int64),
 'gender': array([1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1]),
 'age': array([3, 2, 1, 2, 5, 3, 4, 3, 2, 2, 1, 1, 5, 0, 3, 2, 5, 5, 1, 5, 5, 2,
        1, 2, 6, 3, 3, 2, 4, 1, 2, 2, 1, 2, 1, 5, 5, 2, 3, 2], dtype=int64),
 'occupation': array([16,  3, 11, 13,  1,  7,  8,  0,  6,  4,  4,  4, 11,  9, 16,  6, 15,
        10,  4,  1, 13, 18,  4,  0,  1,  6, 14, 10, 13,  4,  0,  3,  9,  0,
        11,

In [11]:
# 使用DeepFM进行训练
# 使用 adam 优化器，损失函数是mse，评分矩阵为mse
# 模型参数：每次喂 256 batch_size的数据，跑 10个 epoch
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=10, verbose=True, validation_split=0.2, )


# 使用DeepFM进行预测
pred_ans = model.predict(test_model_input, batch_size=256)
# 输出RMSE或MSE
mse = round(mean_squared_error(test[target].values, pred_ans), 4)

# 打印整体均方根差
rmse = mse ** 0.5
print("test RMSE", rmse)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Train on 128 samples, validate on 32 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
test RMSE 3.4940663989111598


In [14]:
print("预测结果：\n", pred_ans)

预测结果：
 [[0.21043625]
 [0.20929465]
 [0.2101976 ]
 [0.22802433]
 [0.20880629]
 [0.20918243]
 [0.2081047 ]
 [0.21028034]
 [0.20920505]
 [0.21002479]
 [0.21047027]
 [0.22811396]
 [0.20875746]
 [0.20844439]
 [0.21092242]
 [0.20969236]
 [0.20879379]
 [0.20826463]
 [0.21049587]
 [0.22594738]
 [0.20832177]
 [0.22636545]
 [0.21063592]
 [0.20934561]
 [0.20987874]
 [0.21057247]
 [0.24240315]
 [0.22583348]
 [0.20955668]
 [0.21053374]
 [0.2094241 ]
 [0.2097049 ]
 [0.2098247 ]
 [0.20982923]
 [0.21045828]
 [0.20865898]
 [0.20876163]
 [0.20845261]
 [0.21056964]
 [0.21016021]]
