In [20]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from deepctr.models import WDL
from deepctr.inputs import SparseFeat, get_feature_names

In [21]:
data = pd.read_csv(r'./movielens_sample.txt')
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,3299,235,4,968035345,Ed Wood (1994),Comedy|Drama,F,25,4,19119
1,3630,3256,3,966536874,Patriot Games (1992),Action|Thriller,M,18,4,77005
2,517,105,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,F,25,14,55408
3,785,2115,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,M,18,19,29307
4,5848,909,5,957782527,"Apartment, The (1960)",Comedy|Drama,M,50,20,20009


In [22]:
# 确定稀疏数据和连续数据
sparse = ['movie_id', 'user_id', 'gender', 'age', 'occupation', 'zip'] 
target = ['rating']

In [23]:
# 给sparse features转码
for label in sparse:
    lbe = LabelEncoder()
    data[label] = lbe.fit_transform(data[label])

In [24]:
# 计算每个特征中的 不同特征值的个数
fixlen_feature_columns = [SparseFeat(feature, data[label].nunique()) for feature in sparse]

# 共享输入
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [25]:
print(feature_names)

['movie_id', 'user_id', 'gender', 'age', 'occupation', 'zip']


In [26]:
# 将数据集切分成训练集和测试集
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

print(train_model_input)

{'movie_id': array([112,  83, 142,  96,  34,  46,  29, 161,  94,  35,  88, 124,  54,
        25,  82,  13,  19,  57, 140,  87,  27, 141, 105, 147, 176,  81,
       169, 158, 126, 175,  31, 186, 127, 118,  11, 160, 130, 102,  77,
       155,  32, 154, 104, 133,  52,   1,  12, 123, 170, 152, 116, 181,
       129,  45,   9, 151, 139, 149,  21, 103, 169, 132,  26, 183,   7,
        56,  20,  79, 113,  30,  70,  16,  78, 125, 165, 174, 109,  75,
        65, 122, 178,  85, 162,  67,  74, 180, 173, 111, 156, 108,  33,
        66,  41,  24, 106,  48, 168,  40,  50, 137,  76, 112,  15,  51,
       157,  17, 107, 119,  98,   8,  80,  69,  49,  89, 138,  66, 136,
         9,   6,  92, 163, 166, 134, 167, 121,   5, 150, 123,  28,  53,
        38,  42, 145,  71, 149,  58,  59,  61,  60,  44, 171,  35, 114,
       170,  22, 159,  73, 172,  68, 177,  10,  23,   0,  34,  43,  95,
       153,   3,  36, 117], dtype=int64), 'user_id': array([ 21,  52,   5, 138, 157,  81, 108,  89, 177,   0, 118, 159,  33

In [32]:
# 使用WDL训练
model = WDL(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile('adam', 'mse', metrics=['mse'])

history = model.fit(train_model_input, train[target].values, 
                    batch_size=256, epochs=10, verbose=True, validation_split=0.2,)

pred_ans = model.predict(test_model_input, batch_size=256)

mse = round(mean_squared_error(test[target].values, pred_ans), 4)
rmse = mse ** 0.5
print(f'test RMSE = {rmse}')

Train on 128 samples, validate on 32 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
test RMSE = 3.5085467076839665
