In [1]:
import warnings
warnings.filterwarnings("ignore")
import itertools
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import namedtuple

import tensorflow as tf
from tensorflow.keras.models import *
from tensorflow.keras.layers import *

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

from utils.utils import SparseFeat, DenseFeat, VarLenSparseFeat

In [2]:
def build_input_layers(feature_columns):
    """Create one Keras ``Input`` placeholder per feature column.

    A ``SparseFeat`` column gets an input of width 1. Every other column is
    treated as a dense feature whose input width is read from
    ``feat.dimension``. Each input is named after its feature.

    Args:
        feature_columns: Iterable of feature descriptors exposing ``name``
            (and ``dimension`` for the non-sparse ones).

    Returns:
        A ``(dense_input_dict, sparse_input_dict)`` tuple; each dict maps
        feature name -> ``Input`` tensor.
    """
    dense_input_dict = {}
    sparse_input_dict = {}

    for feat in feature_columns:
        is_sparse = isinstance(feat, SparseFeat)
        # Sparse features are fed as a single id; dense ones carry their own width.
        width = 1 if is_sparse else feat.dimension
        bucket = sparse_input_dict if is_sparse else dense_input_dict
        bucket[feat.name] = Input(shape=(width,), name=feat.name)

    return dense_input_dict, sparse_input_dict

In [3]:
def build_embedding_layers(feature_columns, input_layers_dict, is_linear, prefix=''):
    """Create one ``Embedding`` layer per sparse feature column.

    Only ``SparseFeat`` entries of ``feature_columns`` are used; a falsy
    ``feature_columns`` yields an empty dict. The layers are only constructed
    here, not applied to any input.

    Args:
        feature_columns: Collection of feature descriptors (may be None/empty).
        input_layers_dict: Accepted for interface compatibility; not read here.
        is_linear: If truthy, each embedding has output width 1 and is named
            ``prefix + '1d_emb_' + feat.name``; otherwise the width is
            ``feat.embed_dim`` and the name is ``prefix + 'kd_emb_' + feat.name``.
        prefix: String prepended to every layer name, so several embedding
            sets can be built for the same features without name clashes.

    Returns:
        Dict mapping feature name -> ``Embedding`` layer.
    """
    sparse_feature_columns = [
        feat for feat in (feature_columns or []) if isinstance(feat, SparseFeat)
    ]
    name_tag = '1d_emb_' if is_linear else 'kd_emb_'

    embedding_layers_dict = {}
    for feat in sparse_feature_columns:
        out_dim = 1 if is_linear else feat.embed_dim
        # The table is allocated with vocab_size + 1 rows.
        embedding_layers_dict[feat.name] = Embedding(
            feat.vocab_size + 1, out_dim, name=prefix + name_tag + feat.name
        )
    return embedding_layers_dict

In [4]:
def get_dnn_out(dnn_inputs, units=(32, 16), activation='relu'):
    """Pass ``dnn_inputs`` through a stack of fully connected layers.

    Args:
        dnn_inputs: Tensor fed to the first ``Dense`` layer.
        units: Output width of each successive ``Dense`` layer. An empty
            sequence returns ``dnn_inputs`` unchanged.
        activation: Activation used by every layer. Defaults to ``'relu'``;
            pass ``None`` to get purely linear layers.

    Returns:
        The output of the last ``Dense`` layer.
    """
    dnn_out = dnn_inputs
    for out_dim in units:
        # Without a non-linearity, stacked Dense layers collapse into a single
        # linear map, so extra layers would add no modelling capacity.
        dnn_out = Dense(out_dim, activation=activation)(dnn_out)
    return dnn_out

In [13]:
def NCF(dnn_feature_columns):
    """Build a Neural Collaborative Filtering model with two branches.

    One branch multiplies the user and item embeddings element-wise; the
    other concatenates a second, independent pair of user/item embeddings and
    passes them through ``get_dnn_out`` with widths ``(32, 16)``. Both branch
    outputs are concatenated and mapped to a single linear output unit.

    Args:
        dnn_feature_columns: Feature descriptors. Must contain sparse features
            named ``'user_id'`` and ``'movie_id'``; a ``KeyError`` is raised
            otherwise.

    Returns:
        An uncompiled Keras ``Model`` whose inputs are the sparse ``Input``
        layers (in feature-column order) and whose output has width 1.
    """
    # One Input() per feature, keyed by feature name. Dense inputs are ignored
    # because this model only uses the sparse id features.
    _, sparse_input_dict = build_input_layers(dnn_feature_columns)

    # Model() is given the inputs as a list. Each Input is named after its
    # feature, which lets data be fed as a dict keyed by feature name.
    input_layers = list(sparse_input_dict.values())

    # Two separate sets of embedding tables, one per branch. Layer names must
    # be unique within a model, hence the distinct prefixes.
    gmf_embedding_dict = build_embedding_layers(dnn_feature_columns, sparse_input_dict, is_linear=False, prefix='GML')
    mlp_embedding_dict = build_embedding_layers(dnn_feature_columns, sparse_input_dict, is_linear=False, prefix='MLP')

    user_input = sparse_input_dict['user_id']
    item_input = sparse_input_dict['movie_id']

    # Element-wise product branch. Flatten turns (B, 1, embed_dim) into
    # (B, embed_dim). A Keras Multiply layer is used instead of a raw TF op,
    # because raw TF ops on symbolic Keras tensors are not accepted by every
    # Keras version.
    gmf_user_emb = Flatten()(gmf_embedding_dict['user_id'](user_input))
    gmf_item_emb = Flatten()(gmf_embedding_dict['movie_id'](item_input))
    gmf_out = Multiply()([gmf_user_emb, gmf_item_emb])

    # MLP branch: concatenate the second pair of embeddings and feed the DNN.
    mlp_user_emb = Flatten()(mlp_embedding_dict['user_id'](user_input))
    mlp_item_emb = Flatten()(mlp_embedding_dict['movie_id'](item_input))
    mlp_dnn_input = Concatenate(axis=1)([mlp_user_emb, mlp_item_emb])
    mlp_dnn_out = get_dnn_out(mlp_dnn_input, (32, 16))

    # Fuse the two branches and project to one unbounded score (no
    # activation), suitable for regression on the rating.
    concat_out = Concatenate(axis=1)([gmf_out, mlp_dnn_out])
    output_layer = Dense(1)(concat_out)

    return Model(input_layers, output_layer)

In [6]:
# Load the ratings file; NCF only uses user_id and the item id (movie_id) as features.
# Column names are supplied explicitly via `names`, so pandas treats every line as data.
rnames = ['user_id','movie_id','rating','timestamp']
# "::" is a multi-character separator, which is why engine='python' is requested.
data= pd.read_csv('./data/ml-1m/ratings.dat', sep="::", engine='python', names=rnames)

In [7]:
# Preview the first rows of the raw ratings table.
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [8]:
# Re-index the raw user and movie ids as contiguous integers starting at 0.
# Each column gets its own fitted encoder, kept in `id_encoders`, so encoded
# ids can later be mapped back with `inverse_transform`. Refitting a single
# encoder on the second column would discard the user_id mapping.
id_encoders = {}
for id_col in ('user_id', 'movie_id'):
    lbe = LabelEncoder()
    data[id_col] = lbe.fit_transform(data[id_col])
    id_encoders[id_col] = lbe
# `lbe` ends up bound to the movie_id encoder (the last one fitted).

In [9]:
# Assemble the training frame: the two id features plus the rating as `label`.
# The explicit .copy() makes `train_data` an independent frame, so adding a
# column does not write to a selection of `data` (SettingWithCopy warning).
train_data = data[['user_id', 'movie_id']].copy()
train_data['label'] = data['rating']

In [10]:
# Preview the encoded ids together with the label column.
train_data.head()

Unnamed: 0,user_id,movie_id,label
0,0,1104,5
1,0,639,3
2,0,853,3
3,0,3177,4
4,0,2162,5


In [11]:
# Describe the two id features for the model. The second argument is the
# number of distinct ids in the training frame and the third is 8.
# NOTE(review): positional order is presumably (name, vocab_size, embed_dim) —
# confirm against utils.utils.SparseFeat.
dnn_feature_columns = [
    SparseFeat(id_col, train_data[id_col].nunique(), 8)
    for id_col in ('user_id', 'movie_id')
]

In [14]:
# Build the NCF model and print its layer summary.
# NOTE(review): despite its name, `history` holds the Keras Model returned by NCF().
history = NCF(dnn_feature_columns)
history.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
movie_id (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
MLPkd_emb_user_id (Embedding)   (None, 1, 8)         48328       user_id[0][0]                    
__________________________________________________________________________________________________
MLPkd_emb_movie_id (Embedding)  (None, 1, 8)         29656       movie_id[0][0]                   
_______________________________________________________________________________________

In [15]:
# Ratings are fitted as a regression target: Adam optimizer, mean squared
# error loss, with mean absolute error reported as an extra metric.
history.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [17]:
# Feed the features as a dict keyed by column name, so each entry is matched
# to the Input() layer of the same name.
train_model_input = {
    'user_id': train_data['user_id'],
    'movie_id': train_data['movie_id'],
}

In [18]:
# Train the model for 2 epochs with batch size 32; validation_split=0.2 holds
# out 20% of the samples for validation. fit() returns a History object,
# which is displayed as this cell's output.
history.fit(train_model_input, train_data['label'].values, batch_size=32, epochs=2, validation_split=0.2,)

Epoch 1/2


 3491/25006 [===>..........................] - ETA: 0s - loss: 15.0516 - mae: 3.72 - ETA: 1:24 - loss: 13.5493 - mae: 3.50 - ETA: 1:17 - loss: 12.5801 - mae: 3.36 - ETA: 1:14 - loss: 11.1399 - mae: 3.12 - ETA: 1:10 - loss: 9.4693 - mae: 2.8213 - ETA: 1:07 - loss: 7.7455 - mae: 2.441 - ETA: 1:08 - loss: 6.6903 - mae: 2.189 - ETA: 1:06 - loss: 5.8150 - mae: 1.978 - ETA: 1:05 - loss: 5.1849 - mae: 1.832 - ETA: 1:06 - loss: 4.7561 - mae: 1.729 - ETA: 1:06 - loss: 4.4327 - mae: 1.655 - ETA: 1:07 - loss: 4.1702 - mae: 1.590 - ETA: 1:07 - loss: 3.9288 - mae: 1.535 - ETA: 1:08 - loss: 3.7431 - mae: 1.490 - ETA: 1:08 - loss: 3.5585 - mae: 1.446 - ETA: 1:08 - loss: 3.4169 - mae: 1.412 - ETA: 1:09 - loss: 3.2858 - mae: 1.379 - ETA: 1:09 - loss: 3.1702 - mae: 1.351 - ETA: 1:09 - loss: 3.0484 - mae: 1.321 - ETA: 1:08 - loss: 2.9309 - mae: 1.292 - ETA: 1:08 - loss: 2.8419 - mae: 1.271 - ETA: 1:09 - loss: 2.7685 - mae: 1.253 - ETA: 1:08 - loss: 2.6903 - mae: 1.234 - ETA: 1:08 - loss: 2.6272 - mae: 1.











Epoch 2/2


 4533/25006 [====>.........................] - ETA: 0s - loss: 0.7261 - mae: 0.644 - ETA: 1:05 - loss: 0.8208 - mae: 0.701 - ETA: 1:03 - loss: 0.7774 - mae: 0.681 - ETA: 1:02 - loss: 0.7638 - mae: 0.681 - ETA: 1:01 - loss: 0.7564 - mae: 0.684 - ETA: 1:00 - loss: 0.7706 - mae: 0.691 - ETA: 1:00 - loss: 0.7702 - mae: 0.690 - ETA: 59s - loss: 0.7632 - mae: 0.687 - ETA: 58s - loss: 0.7603 - mae: 0.68 - ETA: 58s - loss: 0.7600 - mae: 0.68 - ETA: 58s - loss: 0.7644 - mae: 0.68 - ETA: 58s - loss: 0.7681 - mae: 0.68 - ETA: 57s - loss: 0.7623 - mae: 0.68 - ETA: 57s - loss: 0.7596 - mae: 0.68 - ETA: 57s - loss: 0.7612 - mae: 0.68 - ETA: 57s - loss: 0.7581 - mae: 0.68 - ETA: 57s - loss: 0.7575 - mae: 0.68 - ETA: 57s - loss: 0.7587 - mae: 0.68 - ETA: 57s - loss: 0.7572 - mae: 0.68 - ETA: 56s - loss: 0.7557 - mae: 0.68 - ETA: 56s - loss: 0.7545 - mae: 0.68 - ETA: 56s - loss: 0.7552 - mae: 0.68 - ETA: 56s - loss: 0.7545 - mae: 0.68 - ETA: 56s - loss: 0.7556 - mae: 0.68 - ETA: 56s - loss: 0.7533 - ma











<tensorflow.python.keras.callbacks.History at 0x51c36a0>