In [3]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepctr.inputs import SparseFeat, VarLenSparseFeat, get_feature_names
from deepctr.models import DeepFM

In [2]:
def split(x):
    """Encode a '|'-separated string as a list of integer indices.

    Each token is looked up in the module-level ``key2index`` dict; tokens
    that have not been seen before are registered on the fly with the next
    free index. Indices start at 1 because 0 is reserved as the padding
    value for sequence inputs, so it must never encode a valid token.

    Args:
        x: String of tokens joined by '|'.

    Returns:
        List of ints, one per token, in the order the tokens appear in ``x``.
    """
    # setdefault returns the stored index for a known token; for a new token
    # it stores and returns len(key2index) + 1 (computed before the insert).
    return [key2index.setdefault(token, len(key2index) + 1) for token in x.split('|')]

In [10]:
# Show the `data` DataFrame as this cell's output.
# NOTE(review): in the visible notebook `data` is only assigned by the
# `pd.read_csv` cell further down, so on a fresh kernel this cell needs that
# cell to have run first.
data

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,107,12,4,968035345,Ed Wood (1994),Comedy|Drama,0,2,4,35
1,123,169,3,966536874,Patriot Games (1992),Action|Thriller,1,1,4,118
2,12,6,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,0,2,13,99
3,21,112,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,1,1,18,55
4,187,45,5,957782527,"Apartment, The (1960)",Comedy|Drama,1,5,19,41
...,...,...,...,...,...,...,...,...,...,...
195,46,176,3,974840560,Screwed (2000),Comedy,1,2,11,48
196,131,89,3,965855033,Fire Down Below (1997),Action|Drama|Thriller,1,1,11,113
197,4,125,3,976730191,Desperately Seeking Susan (1985),Comedy|Romance,0,1,13,83
198,181,15,4,958503395,Clear and Present Danger (1994),Action|Adventure|Thriller,1,2,0,106


In [20]:
# Load the MovieLens sample file and name the categorical input columns and
# the target column.
data = pd.read_csv("./movielens_sample.txt")
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip", ]
target = ['rating']

# 1. Label-encode the sparse features: each column listed in sparse_features
# is overwritten in place with the output of a fresh LabelEncoder fitted on
# that column. The sequence feature 'genres' is not processed in this cell.
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

In [33]:
# Step 1: encode every row's '|'-separated genres string with `split`, which
#         fills key2index (genre -> index, starting at 1) as a side effect and
#         returns that row's genres as a list of indices.
# Step 2: record how many genres each row has; the largest count becomes the
#         common sequence length.
key2index = {}
genres_list = [split(genres) for genres in data['genres'].values]
genres_length = np.array([len(row) for row in genres_list])
max_len = genres_length.max()


# The lengths above exist for this call: pad every row at the end
# (padding='post') so that all rows have length max_len.
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post')

In [35]:
# Fixed-length features: one SparseFeat per label-encoded column, with the
# vocabulary size taken from the column's number of distinct values and a
# 4-dimensional embedding.
fixlen_feature_columns = [
    SparseFeat(name, vocabulary_size=data[name].nunique(), embedding_dim=4)
    for name in sparse_features
]

# Variable-length feature for the padded genres sequence, also embedded in 4
# dimensions. The vocabulary is len(key2index) + 1 because index 0 is the
# padding value. weight_name=None means no weighted sequence is used.
varlen_feature_columns = [
    VarLenSparseFeat(
        SparseFeat('genres', vocabulary_size=len(key2index) + 1, embedding_dim=4),
        maxlen=max_len,
        combiner='mean',
        weight_name=None,
    )
]

# The linear part and the DNN part receive the same set of feature columns.
linear_feature_columns = [*fixlen_feature_columns, *varlen_feature_columns]
dnn_feature_columns = [*fixlen_feature_columns, *varlen_feature_columns]

feature_names = get_feature_names([*linear_feature_columns, *dnn_feature_columns])

In [36]:
# Model inputs keyed by feature name: the label-encoded columns of `data`
# plus the padded genres index matrix.
model_input = {
    **{name: data[name] for name in sparse_features},
    "genres": genres_list,
}
# No weight input is supplied for genres; the original cell kept this line
# commented out:
# model_input["genres_weight"] = np.random.randn(data.shape[0], max_len, 1)


# DeepFM configured for regression with two 256-unit hidden DNN layers.
model = DeepFM(
    linear_feature_columns,
    dnn_feature_columns,
    task='regression',
    dnn_hidden_units=(256, 256),
)

# Adam optimizer with MSE as both the loss and the reported metric; the fit
# call passes validation_split=0.2.
model.compile(optimizer="adam", loss="mse", metrics=['mse'])
history = model.fit(
    x=model_input,
    y=data[target].values,
    batch_size=256,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)

Train on 160 samples, validate on 40 samples
Epoch 1/10
160/160 - 2s - loss: 14.3000 - mean_squared_error: 14.3000 - val_loss: 13.3483 - val_mean_squared_error: 13.3483
Epoch 2/10
160/160 - 0s - loss: 14.1239 - mean_squared_error: 14.1239 - val_loss: 13.1793 - val_mean_squared_error: 13.1793
Epoch 3/10
160/160 - 0s - loss: 13.9287 - mean_squared_error: 13.9287 - val_loss: 12.9898 - val_mean_squared_error: 12.9898
Epoch 4/10
160/160 - 0s - loss: 13.7112 - mean_squared_error: 13.7112 - val_loss: 12.7798 - val_mean_squared_error: 12.7798
Epoch 5/10
160/160 - 0s - loss: 13.4708 - mean_squared_error: 13.4708 - val_loss: 12.5472 - val_mean_squared_error: 12.5472
Epoch 6/10
160/160 - 0s - loss: 13.2051 - mean_squared_error: 13.2051 - val_loss: 12.2906 - val_mean_squared_error: 12.2906
Epoch 7/10
160/160 - 0s - loss: 12.9119 - mean_squared_error: 12.9119 - val_loss: 12.0082 - val_mean_squared_error: 12.0082
Epoch 8/10
160/160 - 0s - loss: 12.5897 - mean_squared_error: 12.5897 - val_loss: 11.69