In [1]:
import warnings
warnings.filterwarnings("ignore")
import itertools
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import namedtuple

import tensorflow as tf
from tensorflow.keras.models import *
from tensorflow.keras.layers import *

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

from utils.utils import SparseFeat, DenseFeat, VarLenSparseFeat

In [2]:
def build_input_layers(feature_columns):
    # 构建Input层字典，并以dense和sparse两类字典的形式返回
    
    dense_input_dict, sparse_input_dict = {}, {}
    
    for feat in feature_columns:
        if isinstance(feat, SparseFeat):
            sparse_input_dict[feat.name] = Input(shape=(1,), name=feat.name)
        else:
            dense_input_dict[feat.name] = Input(shape=(feat.dimension,), name=feat.name)
    
    return dense_input_dict, sparse_input_dict

In [3]:
def build_embedding_layers(feature_columns, input_layers_dict, is_linear, prefix=''):
    # 定义一个embedding层对应的字典
    embedding_layers_dict = {}
    
    # 将特征中的sparse特征筛选出来
    sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if feature_columns else []
    
    # 如果是线性部分的embedding层，其维度是1， 否则维度是自己定义的embedding维度
    if is_linear:
        for feat in sparse_feature_columns:
            embedding_layers_dict[feat.name] = Embedding(feat.vocab_size+1, 1, name=prefix + '1d_emb_' + feat.name)
    else:
        for feat in sparse_feature_columns:
            embedding_layers_dict[feat.name] = Embedding(feat.vocab_size+1, feat.embed_dim, name=prefix + 'kd_emb_' + feat.name)
    return embedding_layers_dict

In [4]:
def get_dnn_out(dnn_inputs, units=(32, 16)):
    dnn_out = dnn_inputs
    for out_dim in units:
        dnn_out = Dense(out_dim)(dnn_out)
    return dnn_out

In [5]:
def NCF(dnn_feature_columns):
    # 构建输入层，即所有特征对应的Input()层，这里使用字典的形式返回， 方便后续构建模型
    _, sparse_input_dict = build_input_layers(dnn_feature_columns)  # 没有dense特征
    
    # 构建模型的输入层，模型的输入层不能是字典的形式，应该将字典的形式转换成列表的形式
    # 注意： 这里实际的输入与Input（）层的对应，是通过模型的输入时候的字典数据的key与对应name的Input层
    input_layers = list(sparse_input_dict.values())
    
    # 创建两份embedding向量，由于embedding层的name不能相同，所以这里加入一个prefix参数
    GML_embedding_dict = build_embedding_layers(dnn_feature_columns, sparse_input_dict, is_linear=False, prefix='GML')
    MLP_embedding_dict = build_embedding_layers(dnn_feature_columns, sparse_input_dict, is_linear=False, prefix='MLP')
    print(GML_embedding_dict)
    
    # 构建GML的输出
    GML_user_emb = Flatten()(GML_embedding_dict['user_id'](sparse_input_dict['user_id']))   # B X embed_dim
    print(GML_user_emb)
    GML_item_emb = Flatten()(GML_embedding_dict['movie_id'](sparse_input_dict['movie_id'])) # B X embed_dim
    GML_out = tf.multiply(GML_user_emb, GML_item_emb) # 按照元素相乘
    
    # 构建MLP的输出
    MLP_user_emb = Flatten()(MLP_embedding_dict['user_id'](sparse_input_dict['user_id']))  # B X embed_dim
    MLP_item_emb = Flatten()(MLP_embedding_dict['movie_id'](sparse_input_dict['movie_id']))  # B X embed_dim
    MLP_dnn_input = Concatenate(axis=1)([MLP_user_emb, MLP_item_emb])
    MLP_dnn_out = get_dnn_out(MLP_dnn_input, (32, 16))
    
    # 将dense特征和Sparse特征拼接在一起
    concat_dnn = Concatenate(axis=1)(GML_out, MLP_dnn_out)
    
    # 输入到dnn中，需要提前定义需要几个残差块
    # output_layer = Dense(1, 'sigmoid')(concat_out)
    output_layer = Dense(1)(concat_out)
    
    model = Model(input_layers, output_layer)
    
    return model

In [6]:
# 读取数据，NCF使用的特征只有user_id和item_id
rnames = ['user_id','movie_id','rating','timestamp']
data= pd.read_csv('./data/ml-1m/ratings.dat', sep="::", engine='python', names=rnames)

In [7]:
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [8]:
lbe = LabelEncoder()
data['user_id'] = lbe.fit_transform(data['user_id'])
data['movie_id'] = lbe.fit_transform(data['movie_id'])

In [9]:
train_data = data[['user_id', 'movie_id']]
train_data['label'] = data['rating']

In [10]:
train_data.head()

Unnamed: 0,user_id,movie_id,label
0,0,1104,5
1,0,639,3
2,0,853,3
3,0,3177,4
4,0,2162,5


In [11]:
dnn_feature_columns = [
    SparseFeat('user_id', train_data['user_id'].nunique(), 8),
    SparseFeat('movie_id', train_data['movie_id'].nunique(), 8)
]

In [12]:
# 构建FM模型
history = NCF(dnn_feature_columns)
history.summary()

{'user_id': <tf.Tensor 'user_id:0' shape=(None, 1) dtype=float32>, 'movie_id': <tf.Tensor 'movie_id:0' shape=(None, 1) dtype=float32>}


KeyError: 'item_id'