In [2]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from deepctr.inputs import SparseFeat, get_feature_names
from deepctr.models import NFM, WDL

In [3]:
# Categorical input columns and the regression target used by the cells below.
sparse_features = ['user_id', 'movie_id', 'gender',
                   'age', 'occupation', 'zip']
target = ['rating']
# Load the MovieLens sample; the path is relative to the working directory.
data = pd.read_csv('movielens_sample.txt')

In [5]:
# Label-encode every categorical column in place: each distinct value is
# replaced by an integer id, using a fresh encoder per column.
for feature in sparse_features:
    lbe = LabelEncoder()
    lbe.fit(data[feature])
    data[feature] = lbe.transform(data[feature])

In [6]:
# Preview the first five rows after label encoding.
data.head(5)

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,107,12,4,968035345,Ed Wood (1994),Comedy|Drama,0,2,4,35
1,123,169,3,966536874,Patriot Games (1992),Action|Thriller,1,1,4,118
2,12,6,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,0,2,13,99
3,21,112,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,1,1,18,55
4,187,45,5,957782527,"Apartment, The (1960)",Comedy|Drama,1,5,19,41


In [14]:
# Build one SparseFeat per categorical column; its vocabulary size is the
# number of distinct values in that (label-encoded) column.
fixlen_feature_columns = [SparseFeat(feature, data[feature].nunique())
                          for feature in sparse_features]
# The linear (wide) and DNN (deep) parts share the same feature columns.
# NOTE: this name must be `linear_feature_columns` -- it is read below and by
# the model cells; the previous `linearlen_feature_columns` spelling left it
# undefined and raised NameError on a fresh kernel.
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns +
                                  dnn_feature_columns)

In [16]:
# Split the dataset: 80% train / 20% test.
# A fixed random_state makes the split (and therefore the reported metrics)
# reproducible across Restart & Run All.
train, test = train_test_split(data, test_size=0.2, random_state=42)
# DeepCTR models take a dict mapping feature name -> 1-D array of values.
train_model_input = {name: train[name].values for name in feature_names}
test_model_input = {name: test[name].values for name in feature_names}

In [20]:
# Train a Wide & Deep (WDL) model as a regressor on the rating target.
model = WDL(linear_feature_columns, dnn_feature_columns,
            task='regression')
# The model has to be compiled and fitted before the prediction cell uses it.
# One epoch with a 20% validation split matches the recorded output of this
# cell ("Train on 128 samples, validate on 32 samples", "Epoch 1/1").
model.compile('adam', 'mse', metrics=['mse'])
history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=1, verbose=True,
                    validation_split=0.2)


Train on 128 samples, validate on 32 samples
Epoch 1/1


In [21]:
# Predict ratings for the held-out rows with the current `model`.
pred_ans = model.predict(test_model_input, batch_size=256)
# Report the error: MSE is rounded to 4 decimals first, and RMSE is the
# square root of that rounded value.
y_true = test[target].values
mse = round(mean_squared_error(y_true, pred_ans), 4)
rmse = pow(mse, 0.5)
print('test RMSE', rmse)

test RMSE 3.979334115150423


In [22]:
# Comparison model: NFM trained on the same inputs and target.
# (NFM is imported together with WDL in the notebook's import cell.)
# Note: this rebinds `model` and `history`, replacing the WDL objects.
model = NFM(linear_feature_columns, dnn_feature_columns,
            task='regression')
model.compile('adam', 'mse', metrics=['mse'])
history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=10, verbose=True,
                    validation_split=0.2)

Train on 128 samples, validate on 32 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
