In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from deepctr.models import DeepFM
from deepctr.models import WDL
from deepctr.feature_column import SparseFeat,get_feature_names
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("movielens_sample.csv")
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,3299,235,4,968035345,Ed Wood (1994),Comedy|Drama,F,25,4,19119
1,3630,3256,3,966536874,Patriot Games (1992),Action|Thriller,M,18,4,77005
2,517,105,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,F,25,14,55408
3,785,2115,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,M,18,19,29307
4,5848,909,5,957782527,"Apartment, The (1960)",Comedy|Drama,M,50,20,20009


In [3]:
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
target = ['rating']

In [4]:
# 对特征标签进行编码
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])
# 计算每个特征中的 不同特征值的个数
fixlen_feature_columns = [SparseFeat(feature, data[feature].nunique()) for feature in sparse_features]  #特征稀疏化
print(fixlen_feature_columns)
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

[SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000001C29406EBC8>, embedding_name='movie_id', group_name='default_group', trainable=True), SparseFeat(name='user_id', vocabulary_size=193, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000001C29406EFC8>, embedding_name='user_id', group_name='default_group', trainable=True), SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000001C294075148>, embedding_name='gender', group_name='default_group', trainable=True), SparseFeat(name='age', vocabulary_size=7, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras

In [5]:
# 将数据集切分成训练集和测试集
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

In [6]:
train_model_input

{'movie_id': array([126,  31,  87,   1, 170,  13,  67,  27, 133,  80, 172,  44,  22,
        130,  11,   9, 165,  88, 122,  16, 174, 176,  29,  15,  43, 136,
          6,  68,   0, 158, 121,  85, 140,  86, 112,  73, 156,  33,  32,
         66, 145,  55,  72, 164, 115,  79, 124, 184, 100,  53,  34,  17,
         61, 149, 185,  23, 141, 111, 137, 149, 148, 181, 106, 150, 134,
        108,  64, 123,  81,   5,  28, 128,  75, 116,   9,  95,  21, 157,
         60, 183,  70,  50,  72,   7,  24, 178,  66,  20, 138,  97,  91,
        113, 135,  39, 163, 169,  30,  99, 151, 173,  48, 142, 107,  59,
         45,  84, 119,  27,  40,  62,  52, 146,  54,  35, 153,  35,  71,
         26, 101, 114,  63, 127,  10, 173,  65, 103, 170, 179, 177,  93,
        161,  58, 112,  83,  38, 169, 110,  92,  76, 168,  34,  18, 159,
         12, 105,  89,  96,   8, 104,  94,  25, 123,  82, 180,  98, 120,
         49, 139, 160,  47], dtype=int64),
 'user_id': array([ 37, 124,  44, 180,  11,  15,  69,  39, 116, 171, 

In [7]:
# 使用WDL进行训练
model = WDL(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=10, verbose=True, validation_split=0.2, )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
# 使用WDL进行预测
pred_ans = model.predict(test_model_input, batch_size=256)

In [13]:
# 输出RMSE或MSE
from sklearn import metrics
import numpy as np
mse=metrics.mean_squared_error(test[target].values, pred_ans)
rmse = np.sqrt(metrics.mean_squared_error(test[target].values, pred_ans))
print("test RMSE", rmse)
print("test MSE", mse)

test RMSE 3.4981872389648734
test MSE 12.237313958856685
