In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras import layers

In [7]:
# 1. 读取并预处理数据
data = pd.read_csv(r"D:\software\pycharm_repository\StarMaker\MultiRecSys\data_files\movielens_sample.txt")
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,3299,235,4,968035345,Ed Wood (1994),Comedy|Drama,F,25,4,19119
1,3630,3256,3,966536874,Patriot Games (1992),Action|Thriller,M,18,4,77005
2,517,105,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,F,25,14,55408
3,785,2115,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,M,18,19,29307
4,5848,909,5,957782527,"Apartment, The (1960)",Comedy|Drama,M,50,20,20009


In [8]:
data['genres_2'] = data['genres']

In [10]:
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip,genres_2
0,3299,235,4,968035345,Ed Wood (1994),Comedy|Drama,F,25,4,19119,Comedy|Drama
1,3630,3256,3,966536874,Patriot Games (1992),Action|Thriller,M,18,4,77005,Action|Thriller
2,517,105,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,F,25,14,55408,Drama|Romance
3,785,2115,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,M,18,19,29307,Action|Adventure
4,5848,909,5,957782527,"Apartment, The (1960)",Comedy|Drama,M,50,20,20009,Comedy|Drama


In [12]:
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
sequence_features = ['genres', 'genres_2']  # 变长序列特征列表
target = ["rating"]

In [13]:
# 对稀疏特征进行标签编码（Label Encoding）
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

In [25]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

pad_sequences_dict = {}

# 每个变长特征用一个独立的 Tokenizer，可以支持多个变长特征
tokenizers = {}

for feature in sequence_features:
    # 文本格式准备：['a|b|c'] → ['a b c']
    texts = data[feature].apply(lambda x: x.replace('|', ' ')).tolist()

    # 创建并拟合 Tokenizer
    tokenizer = Tokenizer(oov_token='OOV')  # OOV 用于处理未知词
    tokenizer.fit_on_texts(texts)

    # 转为索引序列
    sequences = tokenizer.texts_to_sequences(texts)

    # 序列填充（右侧填充）
    padded = pad_sequences(sequences, padding='post')

    # 存储结果与 tokenizer
    pad_sequences_dict[feature] = padded
    tokenizers[feature] = tokenizer

In [32]:
tokenizers['genres'].index_word

{1: 'OOV',
 2: 'comedy',
 3: 'drama',
 4: 'action',
 5: 'thriller',
 6: 'romance',
 7: 'sci',
 8: 'fi',
 9: 'adventure',
 10: 'horror',
 11: 'crime',
 12: "children's",
 13: 'fantasy',
 14: 'war',
 15: 'western',
 16: 'mystery',
 17: 'musical',
 18: 'animation',
 19: 'film',
 20: 'noir'}

In [15]:
# 对所有变长序列特征进行处理
pad_sequences_dict = {}
for feature in sequence_features:
    feature_list = list(map(lambda x: split(x, key2index), data[feature].values))
    pad_sequences_dict[feature] = pad_sequences(feature_list, padding='post')  # 填充序列，默认就是按照最大长度的序列填充

In [23]:
for key,value in pad_sequences_dict.items():
    print(key,value[0:5])

genres [[1 2 0 0 0]
 [3 4 0 0 0]
 [2 5 0 0 0]
 [3 6 0 0 0]
 [1 2 0 0 0]]
genres_2 [[1 2 0 0 0]
 [3 4 0 0 0]
 [2 5 0 0 0]
 [3 6 0 0 0]
 [1 2 0 0 0]]


In [21]:
key2index

{'Comedy': 1,
 'Drama': 2,
 'Action': 3,
 'Thriller': 4,
 'Romance': 5,
 'Adventure': 6,
 "Children's": 7,
 'Western': 8,
 'Horror': 9,
 'Fantasy': 10,
 'Sci-Fi': 11,
 'Animation': 12,
 'Crime': 13,
 'Film-Noir': 14,
 'Musical': 15,
 'War': 16,
 'Mystery': 17}

In [107]:
# 2. 创建每个特征的嵌入层
embedding_dim = 4
vocab_sizes = {feat: data[feat].nunique() for feat in sparse_features}
vocab_sizes.update({feat: len(key2index) + 1 for feat in sequence_features})

# 创建嵌入层字典
all_sparse_like_features = sparse_features + sequence_features

embed_layers = {
    feat: layers.Embedding(input_dim=vocab_sizes[feat] + 1, output_dim=embedding_dim, mask_zero=True)
    for feat in all_sparse_like_features
}



embed_layers = {
    feat: layers.Embedding(input_dim=vocab_sizes[feat] + 1, output_dim=embedding_dim, mask_zero=True)
    for feat in sparse_features + sequence_features
}
all_sparse_like_features = sparse_features + sequence_features

embed_layers = {
    feat: layers.Embedding(input_dim=vocab_sizes[feat] + 1, output_dim=embedding_dim, mask_zero=True)
    for feat in all_sparse_like_features
}



# 3. 构建模型输入并提取嵌入
inputs = {feat: tf.keras.Input(shape=(1,), name=feat, dtype=tf.int32) for feat in sparse_features}

# 对变长序列特征进行输入处理
for feat in sequence_features:
    inputs[feat] = tf.keras.Input(shape=(pad_sequences_dict[feat].shape[1],), name=feat, dtype=tf.int32)

# 固定长度稀疏特征的嵌入
embeds = [embed_layers[feat](inputs[feat]) for feat in sparse_features]  # shape: (batch_size, 1, embedding_dim)

# 变长序列特征的嵌入（按均值池化）
pooled_embeds = []
for feat in sequence_features:
    feature_embed = embed_layers[feat](inputs[feat])  # shape: (batch_size, max_len, embedding_dim)
    pooled_embeds.append(tf.reduce_mean(feature_embed, axis=1, keepdims=True))  # shape: (batch_size, 1, embedding_dim)

# 将所有嵌入拼接在一起
total_embeds = tf.concat(embeds + pooled_embeds, axis=1)  # shape: (batch_size, num_fields, embedding_dim)

# 4. FM 二阶交互计算
sum_square = tf.square(tf.reduce_sum(total_embeds, axis=1))  # shape: (batch_size, embedding_dim)
square_sum = tf.reduce_sum(tf.square(total_embeds), axis=1)  # shape: (batch_size, embedding_dim)
fm_second_order = 0.5 * tf.reduce_sum(sum_square - square_sum, axis=1, keepdims=True)  # shape: (batch_size, 1)

# 5. DNN 部分 - 扁平化并通过全连接层
dnn_input = tf.reshape(total_embeds, shape=(-1, total_embeds.shape[1] * embedding_dim))
x = layers.Dense(64, activation='relu')(dnn_input)
x = layers.Dense(32, activation='relu')(x)
dnn_output = layers.Dense(1)(x)  # shape: (batch_size, 1)

# 6. 结合 FM 和 DNN 输出
output = layers.Add()([fm_second_order, dnn_output])
model = tf.keras.Model(inputs=list(inputs.values()), outputs=output)

# 7. 编译并训练模型
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

model_input = {feat: data[feat].values for feat in sparse_features}

# 将填充后的变长序列特征加入模型输入
for feat in sequence_features:
    model_input[feat] = pad_sequences_dict[feat]

# 训练模型
model.fit(model_input, data[target].values, batch_size=256, epochs=10, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2c60c3c0a90>

In [117]:
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
target = ["rating"]

# 对稀疏特征进行标签编码（Label Encoding）
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

# 变长序列特征列表
sequence_features = ['genres']  # 可以任意增加更多的变长序列特征

# 创建字典来存储每个变长特征的索引
key2index = {}

# 定义一个通用的函数来处理所有变长序列特征
def split(x, key2index):
    keys = x.split('|')
    for k in keys:
        if k not in key2index:
            key2index[k] = len(key2index) + 1
    return [key2index[k] for k in keys]

# 对所有变长序列特征进行处理
pad_sequences_dict = {}
for feature in sequence_features:
    feature_list = list(map(lambda x: split(x, key2index), data[feature].values))
    pad_sequences_dict[feature] = pad_sequences(feature_list, padding='post')  # 填充序列

In [119]:
pad_sequences_dict

{'genres': array([[ 1,  2,  0,  0,  0],
        [ 3,  4,  0,  0,  0],
        [ 2,  5,  0,  0,  0],
        [ 3,  6,  0,  0,  0],
        [ 1,  2,  0,  0,  0],
        [ 1,  0,  0,  0,  0],
        [ 7,  1,  0,  0,  0],
        [ 2,  0,  0,  0,  0],
        [ 2,  0,  0,  0,  0],
        [ 8,  0,  0,  0,  0],
        [ 3,  9,  0,  0,  0],
        [ 1,  0,  0,  0,  0],
        [ 1,  2,  0,  0,  0],
        [ 7,  2,  0,  0,  0],
        [ 2,  0,  0,  0,  0],
        [ 1, 10,  5, 11,  0],
        [12,  7,  0,  0,  0],
        [ 1, 13,  9,  0,  0],
        [ 2, 14,  0,  0,  0],
        [ 3, 13,  4,  0,  0],
        [ 1, 13,  0,  0,  0],
        [ 1,  2,  5,  0,  0],
        [ 1,  9, 15, 11,  0],
        [ 2,  5,  0,  0,  0],
        [ 1, 11,  0,  0,  0],
        [ 2,  0,  0,  0,  0],
        [ 9,  0,  0,  0,  0],
        [13,  2,  0,  0,  0],
        [ 1,  0,  0,  0,  0],
        [ 1,  5,  0,  0,  0],
        [ 1,  2,  0,  0,  0],
        [ 2,  0,  0,  0,  0],
        [ 9,  4,  0,  0,  0],


In [118]:
feature_list

[[1, 2],
 [3, 4],
 [2, 5],
 [3, 6],
 [1, 2],
 [1],
 [7, 1],
 [2],
 [2],
 [8],
 [3, 9],
 [1],
 [1, 2],
 [7, 2],
 [2],
 [1, 10, 5, 11],
 [12, 7],
 [1, 13, 9],
 [2, 14],
 [3, 13, 4],
 [1, 13],
 [1, 2, 5],
 [1, 9, 15, 11],
 [2, 5],
 [1, 11],
 [2],
 [9],
 [13, 2],
 [1],
 [1, 5],
 [1, 2],
 [2],
 [9, 4],
 [1, 5],
 [1, 9],
 [1, 10],
 [1],
 [4],
 [2],
 [2, 11],
 [1, 11],
 [1, 13, 10],
 [2, 16],
 [3, 2, 5],
 [3, 6, 11],
 [3, 11, 4],
 [3, 6, 13],
 [1],
 [3, 4],
 [13, 2, 11],
 [3, 5],
 [1, 5],
 [1],
 [3, 4],
 [3, 9, 11, 4],
 [1, 9],
 [12, 7, 1, 15],
 [1, 2],
 [1, 5],
 [1, 5],
 [7, 1],
 [2, 5, 11],
 [1, 5],
 [7, 9],
 [13, 4],
 [2],
 [2, 16],
 [1, 9],
 [3, 13, 11],
 [6, 2, 8],
 [1],
 [1, 5],
 [2, 4],
 [1, 2],
 [2, 5],
 [3, 6],
 [1, 8],
 [3, 11, 4],
 [3, 2, 17],
 [1, 5],
 [2, 17, 5],
 [1, 5],
 [2],
 [1],
 [1],
 [13, 2],
 [1, 2],
 [1],
 [3, 2, 16],
 [1],
 [14, 17, 4],
 [2, 4],
 [6, 7, 10, 11],
 [2, 5],
 [2, 4],
 [1, 2],
 [1, 2, 8],
 [2],
 [11],
 [6, 1, 11],
 [7, 2, 10, 11],
 [2],
 [1, 9],
 [1],
 [3, 4

In [115]:
pad_sequences_dict

{'genres': array([[ 1,  2,  0,  0,  0],
        [ 3,  4,  0,  0,  0],
        [ 2,  5,  0,  0,  0],
        [ 3,  6,  0,  0,  0],
        [ 1,  2,  0,  0,  0],
        [ 1,  0,  0,  0,  0],
        [ 7,  1,  0,  0,  0],
        [ 2,  0,  0,  0,  0],
        [ 2,  0,  0,  0,  0],
        [ 8,  0,  0,  0,  0],
        [ 3,  9,  0,  0,  0],
        [ 1,  0,  0,  0,  0],
        [ 1,  2,  0,  0,  0],
        [ 7,  2,  0,  0,  0],
        [ 2,  0,  0,  0,  0],
        [ 1, 10,  5, 11,  0],
        [12,  7,  0,  0,  0],
        [ 1, 13,  9,  0,  0],
        [ 2, 14,  0,  0,  0],
        [ 3, 13,  4,  0,  0],
        [ 1, 13,  0,  0,  0],
        [ 1,  2,  5,  0,  0],
        [ 1,  9, 15, 11,  0],
        [ 2,  5,  0,  0,  0],
        [ 1, 11,  0,  0,  0],
        [ 2,  0,  0,  0,  0],
        [ 9,  0,  0,  0,  0],
        [13,  2,  0,  0,  0],
        [ 1,  0,  0,  0,  0],
        [ 1,  5,  0,  0,  0],
        [ 1,  2,  0,  0,  0],
        [ 2,  0,  0,  0,  0],
        [ 9,  4,  0,  0,  0],


In [87]:
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
target = ["rating"]

In [88]:
# 对稀疏特征做标签编码（Label Encoding）
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

In [89]:
# 处理变长序列特征 'genres'（使用 VarLenSparseFeat 的思想）
key2index = {}
def split(x):
    keys = x.split('|')
    for k in keys:
        if k not in key2index:
            key2index[k] = len(key2index) + 1
    return [key2index[k] for k in keys]

In [90]:
genres_list = list(map(split, data['genres'].values))
genres_pad = pad_sequences(genres_list, padding='post')  # shape: (样本数, max_len)
max_len = genres_pad.shape[1]

In [93]:
max_len

5

In [94]:
# 2. 为每个特征构建嵌入层（Embedding Layer）
embedding_dim = 4
vocab_sizes = {feat: data[feat].nunique() for feat in sparse_features}
vocab_sizes['genres'] = len(key2index)

In [97]:
vocab_sizes

{'movie_id': 187,
 'user_id': 193,
 'gender': 2,
 'age': 7,
 'occupation': 20,
 'zip': 188,
 'genres': 17}

In [98]:
# 普通稀疏特征不使用 mask_zero
for feat in sparse_features:
    embed_layers[feat] = layers.Embedding(input_dim=vocab_sizes[feat] + 1,
                                          output_dim=embedding_dim,
                                          mask_zero=False)

# 变长序列特征使用 mask_zero=True，忽略 padding 位置
embed_layers['genres'] = layers.Embedding(input_dim=vocab_sizes['genres'] + 1,
                                          output_dim=embedding_dim,
                                          mask_zero=True)

In [99]:
embed_layers

{'movie_id': <keras.layers.embeddings.Embedding at 0x2c60c33c280>,
 'user_id': <keras.layers.embeddings.Embedding at 0x2c60c33c4f0>,
 'gender': <keras.layers.embeddings.Embedding at 0x2c60c33c760>,
 'age': <keras.layers.embeddings.Embedding at 0x2c60c33c9d0>,
 'occupation': <keras.layers.embeddings.Embedding at 0x2c60c33cc40>,
 'zip': <keras.layers.embeddings.Embedding at 0x2c60c33ceb0>,
 'genres': <keras.layers.embeddings.Embedding at 0x2c60c33c1f0>}

In [100]:
# 3. 构建模型输入并提取嵌入
inputs = {feat: tf.keras.Input(shape=(1,), name=feat, dtype=tf.int32) for feat in sparse_features}
inputs['genres'] = tf.keras.Input(shape=(max_len,), name='genres', dtype=tf.int32)

In [101]:
inputs

{'movie_id': <KerasTensor: shape=(None, 1) dtype=int32 (created by layer 'movie_id')>,
 'user_id': <KerasTensor: shape=(None, 1) dtype=int32 (created by layer 'user_id')>,
 'gender': <KerasTensor: shape=(None, 1) dtype=int32 (created by layer 'gender')>,
 'age': <KerasTensor: shape=(None, 1) dtype=int32 (created by layer 'age')>,
 'occupation': <KerasTensor: shape=(None, 1) dtype=int32 (created by layer 'occupation')>,
 'zip': <KerasTensor: shape=(None, 1) dtype=int32 (created by layer 'zip')>,
 'genres': <KerasTensor: shape=(None, 5) dtype=int32 (created by layer 'genres')>}

In [103]:
# 固定长度稀疏特征的嵌入 shape: (batch_size, 1, embedding_dim)
embeds = [embed_layers[feat](inputs[feat]) for feat in sparse_features]
embeds

[<KerasTensor: shape=(None, 1, 4) dtype=float32 (created by layer 'embedding_70')>,
 <KerasTensor: shape=(None, 1, 4) dtype=float32 (created by layer 'embedding_71')>,
 <KerasTensor: shape=(None, 1, 4) dtype=float32 (created by layer 'embedding_72')>,
 <KerasTensor: shape=(None, 1, 4) dtype=float32 (created by layer 'embedding_73')>,
 <KerasTensor: shape=(None, 1, 4) dtype=float32 (created by layer 'embedding_74')>,
 <KerasTensor: shape=(None, 1, 4) dtype=float32 (created by layer 'embedding_75')>]

In [104]:
# 变长序列特征 'genres' 的嵌入，并通过 mean pooling 汇聚为 shape: (batch_size, 1, embedding_dim)
genres_embed = embed_layers['genres'](inputs['genres'])
genres_embed_pooled = tf.reduce_mean(genres_embed, axis=1, keepdims=True)

In [106]:
genres_embed

<KerasTensor: shape=(None, 5, 4) dtype=float32 (created by layer 'embedding_76')>

In [38]:
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
target = ["rating"]

In [39]:
# 对稀疏特征做标签编码（Label Encoding）
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

In [60]:
data['user_id'].min()

0

In [40]:
# 处理 genres 字段为变长序列特征（VarLenSparseFeat）
key2index = {}
def split(x):
    keys = x.split('|')
    for k in keys:
        if k not in key2index:
            key2index[k] = len(key2index) + 1
    return [key2index[k] for k in keys]

In [41]:
genres_list = list(map(split, data['genres'].values))
genres_list

[[1, 2],
 [3, 4],
 [2, 5],
 [3, 6],
 [1, 2],
 [1],
 [7, 1],
 [2],
 [2],
 [8],
 [3, 9],
 [1],
 [1, 2],
 [7, 2],
 [2],
 [1, 10, 5, 11],
 [12, 7],
 [1, 13, 9],
 [2, 14],
 [3, 13, 4],
 [1, 13],
 [1, 2, 5],
 [1, 9, 15, 11],
 [2, 5],
 [1, 11],
 [2],
 [9],
 [13, 2],
 [1],
 [1, 5],
 [1, 2],
 [2],
 [9, 4],
 [1, 5],
 [1, 9],
 [1, 10],
 [1],
 [4],
 [2],
 [2, 11],
 [1, 11],
 [1, 13, 10],
 [2, 16],
 [3, 2, 5],
 [3, 6, 11],
 [3, 11, 4],
 [3, 6, 13],
 [1],
 [3, 4],
 [13, 2, 11],
 [3, 5],
 [1, 5],
 [1],
 [3, 4],
 [3, 9, 11, 4],
 [1, 9],
 [12, 7, 1, 15],
 [1, 2],
 [1, 5],
 [1, 5],
 [7, 1],
 [2, 5, 11],
 [1, 5],
 [7, 9],
 [13, 4],
 [2],
 [2, 16],
 [1, 9],
 [3, 13, 11],
 [6, 2, 8],
 [1],
 [1, 5],
 [2, 4],
 [1, 2],
 [2, 5],
 [3, 6],
 [1, 8],
 [3, 11, 4],
 [3, 2, 17],
 [1, 5],
 [2, 17, 5],
 [1, 5],
 [2],
 [1],
 [1],
 [13, 2],
 [1, 2],
 [1],
 [3, 2, 16],
 [1],
 [14, 17, 4],
 [2, 4],
 [6, 7, 10, 11],
 [2, 5],
 [2, 4],
 [1, 2],
 [1, 2, 8],
 [2],
 [11],
 [6, 1, 11],
 [7, 2, 10, 11],
 [2],
 [1, 9],
 [1],
 [3, 4

In [34]:
key2index

{'Comedy': 1,
 'Drama': 2,
 'Action': 3,
 'Thriller': 4,
 'Romance': 5,
 'Adventure': 6,
 "Children's": 7,
 'Western': 8,
 'Horror': 9,
 'Fantasy': 10,
 'Sci-Fi': 11,
 'Animation': 12,
 'Crime': 13,
 'Film-Noir': 14,
 'Musical': 15,
 'War': 16,
 'Mystery': 17}

In [35]:
genres_pad

array([[ 1,  2,  0,  0,  0],
       [ 3,  4,  0,  0,  0],
       [ 2,  5,  0,  0,  0],
       [ 3,  6,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 7,  1,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 8,  0,  0,  0,  0],
       [ 3,  9,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 7,  2,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 1, 10,  5, 11,  0],
       [12,  7,  0,  0,  0],
       [ 1, 13,  9,  0,  0],
       [ 2, 14,  0,  0,  0],
       [ 3, 13,  4,  0,  0],
       [ 1, 13,  0,  0,  0],
       [ 1,  2,  5,  0,  0],
       [ 1,  9, 15, 11,  0],
       [ 2,  5,  0,  0,  0],
       [ 1, 11,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 9,  0,  0,  0,  0],
       [13,  2,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 9,  4,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  9

In [36]:
max_len = genres_pad.shape[1]
max_len

5

In [None]:
genres_pad

In [44]:
genres_pad.shape

(200, 5)

In [45]:
genres_pad = pad_sequences(genres_list, padding='post')  # shape: (样本数, max_le)
genres_pad

array([[ 1,  2,  0,  0,  0],
       [ 3,  4,  0,  0,  0],
       [ 2,  5,  0,  0,  0],
       [ 3,  6,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 7,  1,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 8,  0,  0,  0,  0],
       [ 3,  9,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 7,  2,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 1, 10,  5, 11,  0],
       [12,  7,  0,  0,  0],
       [ 1, 13,  9,  0,  0],
       [ 2, 14,  0,  0,  0],
       [ 3, 13,  4,  0,  0],
       [ 1, 13,  0,  0,  0],
       [ 1,  2,  5,  0,  0],
       [ 1,  9, 15, 11,  0],
       [ 2,  5,  0,  0,  0],
       [ 1, 11,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 9,  0,  0,  0,  0],
       [13,  2,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 9,  4,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  9

In [46]:
max_len = genres_pad.shape[1]
max_len

5

In [48]:
# 2. 为每个特征创建 embedding 层
embedding_dim = 4
vocab_sizes = {feat: data[feat].nunique() for feat in sparse_features}
vocab_sizes

{'movie_id': 187,
 'user_id': 193,
 'gender': 2,
 'age': 7,
 'occupation': 20,
 'zip': 188}

In [49]:
vocab_sizes['genres'] = len(key2index) + 1

In [50]:
vocab_sizes

{'movie_id': 187,
 'user_id': 193,
 'gender': 2,
 'age': 7,
 'occupation': 20,
 'zip': 188,
 'genres': 18}

In [61]:
# 普通稀疏特征不使用 mask_zero
for feat in sparse_features:
    embed_layers[feat] = layers.Embedding(input_dim=vocab_sizes[feat] + 1,
                                          output_dim=embedding_dim,
                                          mask_zero=False)

# 变长序列特征使用 mask_zero=True，忽略 padding 位置
embed_layers['genres'] = layers.Embedding(input_dim=vocab_sizes['genres'] + 1,
                                          output_dim=embedding_dim,
                                          mask_zero=True)

In [62]:
embed_layers

{'movie_id': <keras.layers.embeddings.Embedding at 0x2c60c1e92b0>,
 'user_id': <keras.layers.embeddings.Embedding at 0x2c608e26970>,
 'gender': <keras.layers.embeddings.Embedding at 0x2c60c1ea970>,
 'age': <keras.layers.embeddings.Embedding at 0x2c60c1ea490>,
 'occupation': <keras.layers.embeddings.Embedding at 0x2c60c1eaaf0>,
 'zip': <keras.layers.embeddings.Embedding at 0x2c60c1eac70>,
 'genres': <keras.layers.embeddings.Embedding at 0x2c60c1ea250>}

In [63]:
# 3. 构建模型输入并提取嵌入
inputs = {feat: tf.keras.Input(shape=(1,), name=feat, dtype=tf.int32) for feat in sparse_features}
inputs['genres'] = tf.keras.Input(shape=(max_len,), name='genres', dtype=tf.int32)

In [64]:
inputs

{'movie_id': <KerasTensor: shape=(None, 1) dtype=int32 (created by layer 'movie_id')>,
 'user_id': <KerasTensor: shape=(None, 1) dtype=int32 (created by layer 'user_id')>,
 'gender': <KerasTensor: shape=(None, 1) dtype=int32 (created by layer 'gender')>,
 'age': <KerasTensor: shape=(None, 1) dtype=int32 (created by layer 'age')>,
 'occupation': <KerasTensor: shape=(None, 1) dtype=int32 (created by layer 'occupation')>,
 'zip': <KerasTensor: shape=(None, 1) dtype=int32 (created by layer 'zip')>,
 'genres': <KerasTensor: shape=(None, 5) dtype=int32 (created by layer 'genres')>}

In [65]:
# 固定长度稀疏特征的嵌入 shape: (batch_size, 1, embedding_dim)
embeds = [embed_layers[feat](inputs[feat]) for feat in sparse_features]

In [66]:
embeds

[<KerasTensor: shape=(None, 1, 4) dtype=float32 (created by layer 'embedding_63')>,
 <KerasTensor: shape=(None, 1, 4) dtype=float32 (created by layer 'embedding_64')>,
 <KerasTensor: shape=(None, 1, 4) dtype=float32 (created by layer 'embedding_65')>,
 <KerasTensor: shape=(None, 1, 4) dtype=float32 (created by layer 'embedding_66')>,
 <KerasTensor: shape=(None, 1, 4) dtype=float32 (created by layer 'embedding_67')>,
 <KerasTensor: shape=(None, 1, 4) dtype=float32 (created by layer 'embedding_68')>]

In [67]:
# 变长序列特征 'genres' 的嵌入，并通过 mean pooling 汇聚为 shape: (batch_size, 1, embedding_dim)
genres_embed = embed_layers['genres'](inputs['genres'])
genres_embed_pooled = tf.reduce_mean(genres_embed, axis=1, keepdims=True)

In [68]:
genres_embed

<KerasTensor: shape=(None, 5, 4) dtype=float32 (created by layer 'embedding_69')>

In [69]:
genres_embed_pooled

<KerasTensor: shape=(None, 1, 4) dtype=float32 (created by layer 'tf.math.reduce_mean_2')>

In [71]:
# 拼接所有特征嵌入，形成 shape: (batch_size, num_fields, embedding_dim)
total_embeds = tf.concat(embeds + [genres_embed_pooled], axis=1)

In [72]:
total_embeds

<KerasTensor: shape=(None, 7, 4) dtype=float32 (created by layer 'tf.concat_2')>

In [73]:
# 4. FM 的二阶交叉项计算
sum_square = tf.square(tf.reduce_sum(total_embeds, axis=1))       # shape: (batch_size, embedding_dim)
square_sum = tf.reduce_sum(tf.square(total_embeds), axis=1)      # shape: (batch_size, embedding_dim)
fm_second_order = 0.5 * tf.reduce_sum(sum_square - square_sum, axis=1, keepdims=True)  # shape: (batch_size, 1)

In [74]:
fm_second_order

<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'tf.math.multiply_2')>

In [75]:
# 5. DNN 部分 - Flatten 后接多个全连接层
dnn_input = tf.reshape(total_embeds, shape=(-1, total_embeds.shape[1] * embedding_dim))
x = layers.Dense(64, activation='relu')(dnn_input)
x = layers.Dense(32, activation='relu')(x)
dnn_output = layers.Dense(1)(x)  # shape: (batch_size, 1)

In [76]:
dnn_output

<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'dense_8')>

In [77]:
# 6. 合并 FM 输出和 DNN 输出
output = layers.Add()([fm_second_order, dnn_output])
model = tf.keras.Model(inputs=list(inputs.values()), outputs=output)

In [78]:
output

<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'add_2')>

In [80]:
inputs

{'movie_id': <KerasTensor: shape=(None, 1) dtype=int32 (created by layer 'movie_id')>,
 'user_id': <KerasTensor: shape=(None, 1) dtype=int32 (created by layer 'user_id')>,
 'gender': <KerasTensor: shape=(None, 1) dtype=int32 (created by layer 'gender')>,
 'age': <KerasTensor: shape=(None, 1) dtype=int32 (created by layer 'age')>,
 'occupation': <KerasTensor: shape=(None, 1) dtype=int32 (created by layer 'occupation')>,
 'zip': <KerasTensor: shape=(None, 1) dtype=int32 (created by layer 'zip')>,
 'genres': <KerasTensor: shape=(None, 5) dtype=int32 (created by layer 'genres')>}

In [81]:
model_input = {feat: data[feat].values for feat in sparse_features}

In [82]:
model_input

{'movie_id': array([ 12, 169,   6, 112,  45, 146,  43, 156,  30, 174,  82, 173,  91,
        108, 132,  40, 109,  31, 180, 183, 129,  67, 137,  87, 127,   8,
        104, 100, 140,  25, 122, 124, 116, 126,  72, 117,  42, 145, 131,
          2,  52,  17, 101,  94, 136,  65,  20, 144,  26,  83,  55, 126,
        184,  23, 121, 142,  33,   0,  46, 139, 150, 135,  36, 110,  79,
        162,  70, 147,   9,  34,   7,  76,   4, 185,  73, 112, 130,  95,
         28,  24, 148, 119, 168, 149, 181,  13, 154,  56,  66, 172,  69,
         35,  49, 106,  35,  11, 152, 166,  37, 164,  54, 167,  72,  29,
         92, 114,  88, 170,  64,  60,  38,  22,  62, 178, 134, 157,  99,
         34, 111,  96,  50,  75,  47,  14,  21,  77, 118, 182, 113, 143,
        149, 141,  10,  58,  81,  44,  27, 151, 165,  98, 163,  80, 158,
        161,  27, 155, 171,  78,  57, 123,  84,  93, 170, 120,   1, 153,
         39,  61,  51,  71,  19, 107,   9,  66, 102,  74, 177, 103, 133,
        160,  53,  90,   5, 173,  41,  

In [83]:
model_input['genres'] = genres_pad

In [84]:
model_input['genres']

array([[ 1,  2,  0,  0,  0],
       [ 3,  4,  0,  0,  0],
       [ 2,  5,  0,  0,  0],
       [ 3,  6,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 7,  1,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 8,  0,  0,  0,  0],
       [ 3,  9,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 7,  2,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 1, 10,  5, 11,  0],
       [12,  7,  0,  0,  0],
       [ 1, 13,  9,  0,  0],
       [ 2, 14,  0,  0,  0],
       [ 3, 13,  4,  0,  0],
       [ 1, 13,  0,  0,  0],
       [ 1,  2,  5,  0,  0],
       [ 1,  9, 15, 11,  0],
       [ 2,  5,  0,  0,  0],
       [ 1, 11,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 9,  0,  0,  0,  0],
       [13,  2,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 9,  4,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  9

In [79]:
model

<keras.engine.functional.Functional at 0x2c60c2fd6d0>

In [2]:


# 3. 构建模型输入并获取 embedding 向量
inputs = {feat: tf.keras.Inputbbbb(shape=(1,), name=feat, dtype=tf.int32) for feat in sparse_features}
inputs['genres'] = tf.keras.Input(shape=(max_len,), name='genres', dtype=tf.int32)

# 获取固定长度稀疏特征的 embedding，输出 shape: (batch_size, 1, embedding_dim)
embeds = [embed_layers[feat](inputs[feat]) for feat in sparse_features]

# 获取 genres 序列特征的 embedding，做 mean pooling，输出 shape: (batch_size, 1, embedding_dim)
genres_embed = embed_layers['genres'](inputs['genres'])
genres_embed_pooled = tf.reduce_mean(genres_embed, axis=1, keepdims=True)

# 将所有特征 embedding 拼接在一起，shape: (batch_size, num_fields, embedding_dim)
total_embeds = tf.concat(embeds + [genres_embed_pooled], axis=1)

# 4. FM 二阶交叉项计算
sum_square = tf.square(tf.reduce_sum(total_embeds, axis=1))       # shape: (batch_size, embedding_dim)
square_sum = tf.reduce_sum(tf.square(total_embeds), axis=1)      # shape: (batch_size, embedding_dim)
fm_second_order = 0.5 * tf.reduce_sum(sum_square - square_sum, axis=1, keepdims=True)  # shape: (batch_size, 1)

# 5. DNN 部分，将 embedding 展平后输入多层全连接层
# 输入 shape: (batch_size, num_fields * embedding_dim)
dnn_input = tf.reshape(total_embeds, shape=(-1, total_embeds.shape[1] * embedding_dim))
x = layers.Dense(64, activation='relu')(dnn_input)
x = layers.Dense(32, activation='relu')(x)
dnn_output = layers.Dense(1)(x)  # 输出 shape: (batch_size, 1)

# 6. FM输出和DNN输出相加作为最终输出
output = layers.Add()([fm_second_order, dnn_output])
model = tf.keras.Model(inputs=list(inputs.values()), outputs=output)

# 7. 编译并训练模型
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

model_input = {feat: data[feat].values for feat in sparse_features}
model_input['genres'] = genres_pad

# 训练模型
model.fit(model_input, data[target].values, batch_size=256, epochs=10, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2c607ade280>

In [240]:
data = pd.read_csv(r"D:\software\pycharm_repository\StarMaker\MultiRecSys\data_files\movielens_sample.txt")

In [241]:
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,3299,235,4,968035345,Ed Wood (1994),Comedy|Drama,F,25,4,19119
1,3630,3256,3,966536874,Patriot Games (1992),Action|Thriller,M,18,4,77005
2,517,105,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,F,25,14,55408
3,785,2115,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,M,18,19,29307
4,5848,909,5,957782527,"Apartment, The (1960)",Comedy|Drama,M,50,20,20009


In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras import layers

# 1. 读取并预处理数据
data = pd.read_csv(r"D:\software\pycharm_repository\StarMaker\MultiRecSys\data_files\movielens_sample.txt")

sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
target = ["rating"]

# 对稀疏特征做标签编码（Label Encoding）
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

# 处理 genres 字段为变长序列特征（VarLenSparseFeat）
key2index = {}
def split(x):
    keys = x.split('|')
    for k in keys:
        if k not in key2index:
            key2index[k] = len(key2index) + 1
    return [key2index[k] for k in keys]

genres_list = list(map(split, data['genres'].values))
genres_pad = pad_sequences(genres_list, padding='post')  # shape: (样本数, max_len)
max_len = genres_pad.shape[1]

# 2. 为每个特征创建 embedding 层
embedding_dim = 4
vocab_sizes = {feat: data[feat].nunique() for feat in sparse_features}
vocab_sizes['genres'] = len(key2index) + 1

embed_layers = {
    feat: layers.Embedding(input_dim=vocab_sizes[feat]+1, output_dim=embedding_dim, mask_zero=True)
    for feat in sparse_features + ['genres']
}

# 3. 构建模型输入并获取 embedding 向量
inputs = {feat: tf.keras.Input(shape=(1,), name=feat, dtype=tf.int32) for feat in sparse_features}
inputs['genres'] = tf.keras.Input(shape=(max_len,), name='genres', dtype=tf.int32)

# 获取固定长度稀疏特征的 embedding，输出 shape: (batch_size, 1, embedding_dim)
embeds = [embed_layers[feat](inputs[feat]) for feat in sparse_features]

# 获取 genres 序列特征的 embedding，做 mean pooling，输出 shape: (batch_size, 1, embedding_dim)
genres_embed = embed_layers['genres'](inputs['genres'])
genres_embed_pooled = tf.reduce_mean(genres_embed, axis=1, keepdims=True)

# 将所有特征 embedding 拼接在一起，shape: (batch_size, num_fields, embedding_dim)
total_embeds = tf.concat(embeds + [genres_embed_pooled], axis=1)

# 4. FM 二阶交叉项计算
sum_square = tf.square(tf.reduce_sum(total_embeds, axis=1))       # shape: (batch_size, embedding_dim)
square_sum = tf.reduce_sum(tf.square(total_embeds), axis=1)      # shape: (batch_size, embedding_dim)
fm_second_order = 0.5 * tf.reduce_sum(sum_square - square_sum, axis=1, keepdims=True)  # shape: (batch_size, 1)

# 5. DNN 部分，将 embedding 展平后输入多层全连接层
# 输入 shape: (batch_size, num_fields * embedding_dim)
dnn_input = tf.reshape(total_embeds, shape=(-1, total_embeds.shape[1] * embedding_dim))
x = layers.Dense(64, activation='relu')(dnn_input)
x = layers.Dense(32, activation='relu')(x)
dnn_output = layers.Dense(1)(x)  # 输出 shape: (batch_size, 1)

# 6. FM输出和DNN输出相加作为最终输出
output = layers.Add()([fm_second_order, dnn_output])
model = tf.keras.Model(inputs=list(inputs.values()), outputs=output)

# 7. 编译并训练模型
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

model_input = {feat: data[feat].values for feat in sparse_features}
model_input['genres'] = genres_pad

# 训练模型
model.fit(model_input, data[target].values, batch_size=256, epochs=10, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2c609ed0be0>

In [196]:
data = get_user_feature(data)
data

Unnamed: 0,user_id,user_mean_rating_x,user_hist_x,user_mean_rating_y,user_hist_y,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,182,1.0,1089,1.0,1089,1089,1,977085647,Reservoir Dogs (1992),Crime|Thriller,M,18,4,03052
1,192,1.0,761,1.0,761,761,1,977028390,"Phantom, The (1996)",Adventure,M,18,1,10977
2,506,1.0,3354,1.0,3354,3354,1,976208080,Mission to Mars (2000),Sci-Fi,M,25,16,55103-1006
3,621,1.0,833,1.0,833,833,1,975799925,High School High (1996),Comedy,M,18,4,93560
4,1010,1.5,2953,1.5,2953,2953,1,975222613,Home Alone 2: Lost in New York (1992),Children's|Comedy,M,25,0,10310
5,1010,1.5,2953,1.5,2953,1379,2,975220259,Young Guns II (1990),Action|Comedy|Western,M,25,0,10310
6,2744,1.0,588,1.0,588,588,1,973215985,Aladdin (1992),Animation|Children's|Comedy|Musical,M,18,17,53818
7,2996,2.0,2799,2.0,2799,2799,1,972769867,Problem Child 2 (1991),Comedy,M,18,0,63011
8,2996,2.0,2799,2.0,2799,3763,3,972413564,F/X (1986),Action|Crime|Thriller,M,18,0,63011
9,3679,1.0,2557,1.0,2557,2557,1,976298130,I Stand Alone (Seul contre tous) (1998),Drama,M,25,4,68108


In [198]:
def get_item_feature(data):
    data_group = data[['movie_id', 'rating']].groupby('movie_id').agg('mean').reset_index()
    data_group.rename(columns={'rating': 'item_mean_rating'}, inplace=True)
    data = pd.merge(data_group, data, on='movie_id')
    return data

In [199]:
data = get_item_feature(data)
data

Unnamed: 0,movie_id,item_mean_rating,user_id,user_mean_rating_x,user_hist_x,user_mean_rating_y,user_hist_y,rating,timestamp,title,genres,gender,age,occupation,zip
0,588,1.0,2744,1.0,588,1.0,588,1,973215985,Aladdin (1992),Animation|Children's|Comedy|Musical,M,18,17,53818
1,761,1.0,192,1.0,761,1.0,761,1,977028390,"Phantom, The (1996)",Adventure,M,18,1,10977
2,833,1.0,621,1.0,833,1.0,833,1,975799925,High School High (1996),Comedy,M,18,4,93560
3,1089,1.0,182,1.0,1089,1.0,1089,1,977085647,Reservoir Dogs (1992),Crime|Thriller,M,18,4,03052
4,1379,2.0,1010,1.5,2953,1.5,2953,2,975220259,Young Guns II (1990),Action|Comedy|Western,M,25,0,10310
5,1792,1.0,5039,1.0,1792,1.0,1792,1,962513044,U.S. Marshalls (1998),Action|Thriller,F,35,4,97068
6,2458,1.0,5831,1.0,2458,1.0,2458,1,957898337,Armed and Dangerous (1986),Comedy|Crime,M,25,1,92120
7,2557,1.0,3679,1.0,2557,1.0,2557,1,976298130,I Stand Alone (Seul contre tous) (1998),Drama,M,25,4,68108
8,2799,1.0,2996,2.0,2799,2.0,2799,1,972769867,Problem Child 2 (1991),Comedy,M,18,0,63011
9,2953,1.0,1010,1.5,2953,1.5,2953,1,975222613,Home Alone 2: Lost in New York (1992),Children's|Comedy,M,25,0,10310


In [242]:
sparse_features = ['user_id', 'movie_id', 'gender', 'age', 'occupation']
dense_features = ['age']
target = ['rating']

In [243]:
sparse_features

['user_id', 'movie_id', 'gender', 'age', 'occupation']

In [244]:
dense_features

['age']

In [245]:
target

['rating']

In [204]:
user_sparse_features, user_dense_features = ['user_id', 'gender', 'age', 'occupation'], ['user_mean_rating']
item_sparse_features, item_dense_features = ['movie_id', ], ['item_mean_rating']

In [246]:
# 1.Label Encoding for sparse features,and process sequence features
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

In [247]:
data

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,107,12,4,968035345,Ed Wood (1994),Comedy|Drama,0,2,4,19119
1,123,169,3,966536874,Patriot Games (1992),Action|Thriller,1,1,4,77005
2,12,6,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,0,2,13,55408
3,21,112,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,1,1,18,29307
4,187,45,5,957782527,"Apartment, The (1960)",Comedy|Drama,1,5,19,20009
...,...,...,...,...,...,...,...,...,...,...
195,46,176,3,974840560,Screwed (2000),Comedy,1,2,11,21401
196,131,89,3,965855033,Fire Down Below (1997),Action|Drama|Thriller,1,1,11,73112
197,4,125,3,976730191,Desperately Seeking Susan (1985),Comedy|Romance,0,1,13,48126
198,181,15,4,958503395,Clear and Present Danger (1994),Action|Adventure|Thriller,1,2,0,60610


In [248]:
def get_var_feature(data, col):
    key2index = {}

    def split(x):
        key_ans = x.split('|')
        for key in key_ans:
            if key not in key2index:
                # Notice : input value 0 is a special "padding",\
                # so we do not use 0 to encode valid feature for sequence input
                key2index[key] = len(key2index) + 1
        return list(map(lambda x: key2index[x], key_ans))

    var_feature = list(map(split, data[col].values))
    var_feature_length = np.array(list(map(len, var_feature)))
    max_len = max(var_feature_length)
    var_feature = pad_sequences(var_feature, maxlen=max_len, padding='post', )
    return key2index, var_feature, max_len

In [249]:
# 2.preprocess the sequence feature
genres_key2index, data_genres_list, genres_maxlen = get_var_feature(data, 'genres')

In [250]:
genres_key2index

{'Comedy': 1,
 'Drama': 2,
 'Action': 3,
 'Thriller': 4,
 'Romance': 5,
 'Adventure': 6,
 "Children's": 7,
 'Western': 8,
 'Horror': 9,
 'Fantasy': 10,
 'Sci-Fi': 11,
 'Animation': 12,
 'Crime': 13,
 'Film-Noir': 14,
 'Musical': 15,
 'War': 16,
 'Mystery': 17}

In [252]:
data_genres_list

array([[ 1,  2,  0,  0,  0],
       [ 3,  4,  0,  0,  0],
       [ 2,  5,  0,  0,  0],
       [ 3,  6,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 7,  1,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 8,  0,  0,  0,  0],
       [ 3,  9,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 7,  2,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 1, 10,  5, 11,  0],
       [12,  7,  0,  0,  0],
       [ 1, 13,  9,  0,  0],
       [ 2, 14,  0,  0,  0],
       [ 3, 13,  4,  0,  0],
       [ 1, 13,  0,  0,  0],
       [ 1,  2,  5,  0,  0],
       [ 1,  9, 15, 11,  0],
       [ 2,  5,  0,  0,  0],
       [ 1, 11,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 9,  0,  0,  0,  0],
       [13,  2,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 9,  4,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  9

In [253]:
genres_maxlen

5

In [254]:
fixlen_feature_columns = [SparseFeat(feat, data[feat].max() + 1, embedding_dim=4) for feat in sparse_features] + [DenseFeat(feat, 1, ) for feat in dense_features]

In [255]:
fixlen_feature_columns

[SparseFeat(name='user_id', vocabulary_size=193, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E9624FAA00>, embedding_name='user_id', group_name='default_group', trainable=True),
 SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E9624FAF10>, embedding_name='movie_id', group_name='default_group', trainable=True),
 SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E9624FA610>, embedding_name='gender', group_name='default_group', trainable=True),
 SparseFeat(name='age', vocabulary_size=7, embedding_dim=4, use_hash=

In [256]:
item_varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=1000, embedding_dim=4),
                                                    maxlen=genres_maxlen, combiner='mean', length_name=None)]

In [257]:
item_varlen_feature_columns

[VarLenSparseFeat(sparsefeat=SparseFeat(name='genres', vocabulary_size=1000, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E9624FAAC0>, embedding_name='genres', group_name='default_group', trainable=True), maxlen=5, combiner='mean', length_name=None, weight_name=None, weight_norm=True)]

In [258]:
fixlen_feature_columns += item_varlen_feature_columns

In [259]:
fixlen_feature_columns

[SparseFeat(name='user_id', vocabulary_size=193, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E9624FAA00>, embedding_name='user_id', group_name='default_group', trainable=True),
 SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E9624FAF10>, embedding_name='movie_id', group_name='default_group', trainable=True),
 SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E9624FA610>, embedding_name='gender', group_name='default_group', trainable=True),
 SparseFeat(name='age', vocabulary_size=7, embedding_dim=4, use_hash=

In [260]:
feature_names = get_feature_names(fixlen_feature_columns)
feature_names

['user_id', 'movie_id', 'gender', 'age', 'occupation', 'genres']

In [261]:
model_input = {name: data[name] for name in sparse_features + dense_features}  #

In [263]:
model_input = {name: data[name] for name in sparse_features + dense_features}  #

In [264]:
model_input

{'user_id': 0      107
 1      123
 2       12
 3       21
 4      187
       ... 
 195     46
 196    131
 197      4
 198    181
 199     25
 Name: user_id, Length: 200, dtype: int64,
 'movie_id': 0       12
 1      169
 2        6
 3      112
 4       45
       ... 
 195    176
 196     89
 197    125
 198     15
 199     86
 Name: movie_id, Length: 200, dtype: int64,
 'gender': 0      0
 1      1
 2      0
 3      1
 4      1
       ..
 195    1
 196    1
 197    0
 198    1
 199    1
 Name: gender, Length: 200, dtype: int32,
 'age': 0      2
 1      1
 2      2
 3      1
 4      5
       ..
 195    2
 196    1
 197    1
 198    2
 199    2
 Name: age, Length: 200, dtype: int64,
 'occupation': 0       4
 1       4
 2      13
 3      18
 4      19
        ..
 195    11
 196    11
 197    13
 198     0
 199     0
 Name: occupation, Length: 200, dtype: int64}

In [265]:
model_input["genres"] = data_genres_list

In [266]:
model_input

{'user_id': 0      107
 1      123
 2       12
 3       21
 4      187
       ... 
 195     46
 196    131
 197      4
 198    181
 199     25
 Name: user_id, Length: 200, dtype: int64,
 'movie_id': 0       12
 1      169
 2        6
 3      112
 4       45
       ... 
 195    176
 196     89
 197    125
 198     15
 199     86
 Name: movie_id, Length: 200, dtype: int64,
 'gender': 0      0
 1      1
 2      0
 3      1
 4      1
       ..
 195    1
 196    1
 197    0
 198    1
 199    1
 Name: gender, Length: 200, dtype: int32,
 'age': 0      2
 1      1
 2      2
 3      1
 4      5
       ..
 195    2
 196    1
 197    1
 198    2
 199    2
 Name: age, Length: 200, dtype: int64,
 'occupation': 0       4
 1       4
 2      13
 3      18
 4      19
        ..
 195    11
 196    11
 197    13
 198     0
 199     0
 Name: occupation, Length: 200, dtype: int64,
 'genres': array([[ 1,  2,  0,  0,  0],
        [ 3,  4,  0,  0,  0],
        [ 2,  5,  0,  0,  0],
        [ 3,  6,  0,  0,  0

In [272]:
print("Target shape:", data[target].values.shape)
for k, v in model_input.items():
    print(f"{k}: {np.array(v).shape}")


Target shape: (200, 1)
user_id: (200,)
movie_id: (200,)
gender: (200,)
age: (200,)
occupation: (200,)
genres: (200, 5)


In [268]:
# 4.Define Model,compile and train
model = DeepFM(fixlen_feature_columns, fixlen_feature_columns, task='regression')
model

<tensorflow.python.keras.engine.functional.Functional at 0x1e96257aa00>

In [270]:
model.compile("adam", "mse", metrics=['mse'], )

In [271]:
history = model.fit(model_input, data[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

Epoch 1/10


StagingError: in user code:


    IndexError: pop from empty list


In [None]:
  # 3.generate input data for model
    model_input = {name: data[name] for name in feature_names}  #
    model_input["genres"] = genres_list
    model_input["genres_weight"] = np.random.randn(data.shape[0], max_len, 1)

    # 4.Define Model,compile and train
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')

    model.compile("adam", "mse", metrics=['mse'], )
    history = model.fit(model_input, data[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

In [273]:
import numpy as np

from deepctr.models import DIN
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names


def get_xy_fd():
    feature_columns = [SparseFeat('user', 3, embedding_dim=10), SparseFeat(
        'gender', 2, embedding_dim=4), SparseFeat('item_id', 3 + 1, embedding_dim=8),
                       SparseFeat('cate_id', 2 + 1, embedding_dim=4), DenseFeat('pay_score', 1)]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id', vocabulary_size=3 + 1, embedding_dim=8, embedding_name='item_id'),
                         maxlen=4, length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4, embedding_name='cate_id'), maxlen=4,
                         length_name="seq_length")]
    # Notice: History behavior sequence feature name must start with "hist_".
    behavior_feature_list = ["item_id", "cate_id"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    cate_id = np.array([1, 2, 2])  # 0 is mask value
    pay_score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [3, 2, 1, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 2, 2, 0], [2, 2, 1, 0], [1, 2, 0, 0]])
    seq_length = np.array([3, 3, 2])  # the actual length of the behavior sequence

    feature_dict = {'user': uid, 'gender': ugender, 'item_id': iid, 'cate_id': cate_id,
                    'hist_item_id': hist_iid, 'hist_cate_id': hist_cate_id,
                    'pay_score': pay_score, 'seq_length': seq_length}
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list


if __name__ == "__main__":
    x, y, feature_columns, behavior_feature_list = get_xy_fd()
    model = DIN(feature_columns, behavior_feature_list)
    # model = BST(feature_columns, behavior_feature_list,att_head_num=4)
    model.compile('adam', 'binary_crossentropy',
                  metrics=['binary_crossentropy'])
    history = model.fit(x, y, verbose=1, epochs=10, validation_split=0.5)

Epoch 1/10


TypeError: in user code:


    TypeError: Cannot convert a symbolic Keras input/output to a numpy array. This error may indicate that you're trying to pass a symbolic value to a NumPy call, which is not supported. Or, you may be trying to pass Keras symbolic inputs/outputs to a TF API that does not register dispatching, preventing Keras from automatically converting the API call to a lambda layer in the Functional Model.


In [274]:
import numpy as np
from sklearn.model_selection import train_test_split
from deepctr.models import DIN
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names


def get_xy_fd():
    feature_columns = [SparseFeat('user', 3, embedding_dim=10),
                       SparseFeat('gender', 2, embedding_dim=4),
                       SparseFeat('item_id', 4, embedding_dim=8),
                       SparseFeat('cate_id', 3, embedding_dim=4),
                       DenseFeat('pay_score', 1)]

    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id', vocabulary_size=4, embedding_dim=8, embedding_name='item_id'),
                         maxlen=4, length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id', vocabulary_size=3, embedding_dim=4, embedding_name='cate_id'),
                         maxlen=4, length_name="seq_length")
    ]
    behavior_feature_list = ["item_id", "cate_id"]

    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])
    cate_id = np.array([1, 2, 2])
    pay_score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [3, 2, 1, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 2, 2, 0], [2, 2, 1, 0], [1, 2, 0, 0]])
    seq_length = np.array([3, 3, 2])

    feature_dict = {
        'user': uid, 'gender': ugender, 'item_id': iid, 'cate_id': cate_id,
        'hist_item_id': hist_iid, 'hist_cate_id': hist_cate_id,
        'pay_score': pay_score, 'seq_length': seq_length
    }

    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list


if __name__ == "__main__":
    x, y, feature_columns, behavior_feature_list = get_xy_fd()

    # 显式划分训练/验证集（这里只是演示：3条样本取前2训练，最后1个验证）
    train_idx = [0, 1]
    val_idx = [2]
    x_train = {k: v[train_idx] for k, v in x.items()}
    x_val = {k: v[val_idx] for k, v in x.items()}
    y_train = y[train_idx]
    y_val = y[val_idx]

    # 构建模型
    model = DIN(feature_columns, behavior_feature_list)
    model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])

    # 训练模型
    model.fit(
        x_train, y_train,
        validation_data=(x_val, y_val),
        batch_size=2,
        epochs=10,
        verbose=1
    )

Epoch 1/10


TypeError: in user code:


    TypeError: Cannot convert a symbolic Keras input/output to a numpy array. This error may indicate that you're trying to pass a symbolic value to a NumPy call, which is not supported. Or, you may be trying to pass Keras symbolic inputs/outputs to a TF API that does not register dispatching, preventing Keras from automatically converting the API call to a lambda layer in the Functional Model.


In [275]:
import numpy as np
from sklearn.model_selection import train_test_split
from deepctr.models import DIN
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names


def get_xy_fd():
    feature_columns = [SparseFeat('user', 3, embedding_dim=10),
                       SparseFeat('gender', 2, embedding_dim=4),
                       SparseFeat('item_id', 4, embedding_dim=8),
                       SparseFeat('cate_id', 3, embedding_dim=4),
                       DenseFeat('pay_score', 1)]

    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id', vocabulary_size=4, embedding_dim=8, embedding_name='item_id'),
                         maxlen=4, length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id', vocabulary_size=3, embedding_dim=4, embedding_name='cate_id'),
                         maxlen=4, length_name="seq_length")
    ]
    behavior_feature_list = ["item_id", "cate_id"]

    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])
    cate_id = np.array([1, 2, 2])
    pay_score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [3, 2, 1, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 2, 2, 0], [2, 2, 1, 0], [1, 2, 0, 0]])
    seq_length = np.array([3, 3, 2])

    feature_dict = {
        'user': uid, 'gender': ugender, 'item_id': iid, 'cate_id': cate_id,
        'hist_item_id': hist_iid, 'hist_cate_id': hist_cate_id,
        'pay_score': pay_score, 'seq_length': seq_length
    }

    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list


if __name__ == "__main__":
    x, y, feature_columns, behavior_feature_list = get_xy_fd()

    # 显式划分训练/验证集（这里只是演示：3条样本取前2训练，最后1个验证）
    train_idx = [0, 1]
    val_idx = [2]
    x_train = {k: v[train_idx] for k, v in x.items()}
    x_val = {k: v[val_idx] for k, v in x.items()}
    y_train = y[train_idx]
    y_val = y[val_idx]

    # 构建模型
    model = DIN(feature_columns, behavior_feature_list)
    model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])

    # 训练模型
    history = model.fit(
        x_train, y_train,
        batch_size=2,
        epochs=10,
        verbose=1,
        validation_data=(x_val, y_val)  # 显式传入验证数据，不再用 validation_split
    )


Epoch 1/10


TypeError: in user code:


    TypeError: Cannot convert a symbolic Keras input/output to a numpy array. This error may indicate that you're trying to pass a symbolic value to a NumPy call, which is not supported. Or, you may be trying to pass Keras symbolic inputs/outputs to a TF API that does not register dispatching, preventing Keras from automatically converting the API call to a lambda layer in the Functional Model.


In [183]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names

def split(x):
    key_ans = x.split('|')
    for key in key_ans:
        if key not in key2index:
            key2index[key] = len(key2index) + 1
    return [key2index[k] for k in key_ans]

if __name__ == "__main__":
    data = pd.read_csv(r"D:\software\pycharm_repository\StarMaker\MultiRecSys\data_files\movielens_sample.txt")
    sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
    target = ['rating']

    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    key2index = {}
    genres_list = list(map(split, data['genres'].values))
    genres_length = np.array(list(map(len, genres_list)))
    max_len = max(genres_length)
    genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', truncating='post')

    # 构建特征列
    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique(), embedding_dim=4)
                              for feat in sparse_features]

    # 👇 不使用权重的序列特征（推荐稳定方案）
    varlen_feature_columns = [VarLenSparseFeat(
        SparseFeat('genres', vocabulary_size=len(key2index) + 1, embedding_dim=4),
        maxlen=max_len, combiner='mean'
    )]

    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 构造模型输入
    model_input = {name: data[name] for name in sparse_features}
    model_input["genres"] = genres_list  # 注意：sequence 类型字段必须填 numpy 数组

    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
    model.compile("adam", "mse", metrics=['mse'])
    history = model.fit(model_input, data[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2)


Epoch 1/10


StagingError: in user code:


    IndexError: pop from empty list


In [184]:
model_input

{'movie_id': 0       12
 1      169
 2        6
 3      112
 4       45
       ... 
 195    176
 196     89
 197    125
 198     15
 199     86
 Name: movie_id, Length: 200, dtype: int64,
 'user_id': 0      107
 1      123
 2       12
 3       21
 4      187
       ... 
 195     46
 196    131
 197      4
 198    181
 199     25
 Name: user_id, Length: 200, dtype: int64,
 'gender': 0      0
 1      1
 2      0
 3      1
 4      1
       ..
 195    1
 196    1
 197    0
 198    1
 199    1
 Name: gender, Length: 200, dtype: int32,
 'age': 0      2
 1      1
 2      2
 3      1
 4      5
       ..
 195    2
 196    1
 197    1
 198    2
 199    2
 Name: age, Length: 200, dtype: int64,
 'occupation': 0       4
 1       4
 2      13
 3      18
 4      19
        ..
 195    11
 196    11
 197    13
 198     0
 199     0
 Name: occupation, Length: 200, dtype: int64,
 'zip': 0       35
 1      118
 2       99
 3       55
 4       41
       ... 
 195     48
 196    113
 197     83
 198    106


In [186]:
data[target]

Unnamed: 0,rating
0,4
1,3
2,4
3,3
4,5
...,...
195,3
196,3
197,3
198,4


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names


def split(x):
    key_ans = x.split('|')
    for key in key_ans:
        if key not in key2index:
            # Notice : input value 0 is a special "padding",so we do not use 0 to encode valid feature for sequence input
            key2index[key] = len(key2index) + 1
    return list(map(lambda x: key2index[x], key_ans))


if __name__ == "__main__":
    data = pd.read_csv("./movielens_sample.txt")
    sparse_features = ["movie_id", "user_id",
                       "gender", "age", "occupation", "zip", ]
    target = ['rating']

    # 1.Label Encoding for sparse features,and process sequence features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    # preprocess the sequence feature

    key2index = {}
    genres_list = list(map(split, data['genres'].values))
    genres_length = np.array(list(map(len, genres_list)))
    max_len = max(genres_length)
    # Notice : padding=`post`
    genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )

    # 2.count #unique features for each sparse field and generate feature config for sequence feature

    fixlen_feature_columns = [SparseFeat(feat, data[feat].max() + 1, embedding_dim=4)
                              for feat in sparse_features]

    use_weighted_sequence = False
    if use_weighted_sequence:
        varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
            key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
                                                   weight_name='genres_weight')]  # Notice : value 0 is for padding for sequence input feature
    else:
        varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
            key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
                                                   weight_name=None)]  # Notice : value 0 is for padding for sequence input feature

    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3.generate input data for model
    model_input = {name: data[name] for name in feature_names}  #
    model_input["genres"] = genres_list
    model_input["genres_weight"] = np.random.randn(data.shape[0], max_len, 1)

    # 4.Define Model,compile and train
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')

    model.compile("adam", "mse", metrics=['mse'], )
    history = model.fit(model_input, data[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

In [151]:
genres_list

array([[ 1,  2,  0,  0,  0],
       [ 3,  4,  0,  0,  0],
       [ 2,  5,  0,  0,  0],
       [ 3,  6,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 7,  1,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 8,  0,  0,  0,  0],
       [ 3,  9,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 7,  2,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 1, 10,  5, 11,  0],
       [12,  7,  0,  0,  0],
       [ 1, 13,  9,  0,  0],
       [ 2, 14,  0,  0,  0],
       [ 3, 13,  4,  0,  0],
       [ 1, 13,  0,  0,  0],
       [ 1,  2,  5,  0,  0],
       [ 1,  9, 15, 11,  0],
       [ 2,  5,  0,  0,  0],
       [ 1, 11,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 9,  0,  0,  0,  0],
       [13,  2,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 9,  4,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  9

In [152]:
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat,get_feature_names
import numpy as np

feature_columns = [SparseFeat('user_id',120,),SparseFeat('item_id',60,),DenseFeat("pic_vec",5)]
fixlen_feature_names = get_feature_names(feature_columns)

user_id = np.array([[1],[0],[1]])
item_id = np.array([[30],[20],[10]])
pic_vec = np.array([[0.1,0.5,0.4,0.3,0.2],[0.1,0.5,0.4,0.3,0.2],[0.1,0.5,0.4,0.3,0.2]])
label = np.array([1,0,1])

model_input = {'user_id':user_id,'item_id':item_id,'pic_vec':pic_vec}

model = DeepFM(feature_columns,feature_columns)
model.compile('adagrad','binary_crossentropy')
model.fit(model_input,label)



<tensorflow.python.keras.callbacks.History at 0x1e961755070>

In [153]:
feature_columns

[SparseFeat(name='user_id', vocabulary_size=120, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E95DEBD820>, embedding_name='user_id', group_name='default_group', trainable=True),
 SparseFeat(name='item_id', vocabulary_size=60, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E9616E0880>, embedding_name='item_id', group_name='default_group', trainable=True),
 DenseFeat(name='pic_vec', dimension=5, dtype='float32', transform_fn=None)]

In [157]:
pic_vec

array([[0.1, 0.5, 0.4, 0.3, 0.2],
       [0.1, 0.5, 0.4, 0.3, 0.2],
       [0.1, 0.5, 0.4, 0.3, 0.2]])

In [158]:
data["movie_id"]

0       12
1      169
2        6
3      112
4       45
      ... 
195    176
196     89
197    125
198     15
199     86
Name: movie_id, Length: 200, dtype: int64

In [150]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names


def split(x):
    key_ans = x.split('|')
    for key in key_ans:
        if key not in key2index:
            # Notice: input value 0 is a special "padding", so we do not use 0 to encode valid feature for sequence input
            key2index[key] = len(key2index) + 1
    return list(map(lambda x: key2index[x], key_ans))


if __name__ == "__main__":
    # 1. Load and preprocess data
    data = pd.read_csv(r"D:\software\pycharm_repository\StarMaker\MultiRecSys\data_files\movielens_sample.txt")
    sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
    target = ['rating']

    # Label Encoding for sparse features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    # Process sequence feature 'genres'
    key2index = {}
    genres_list = list(map(split, data['genres'].values))
    genres_length = np.array(list(map(len, genres_list)))
    max_len = max(genres_length)
    genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post')

    # 2. Define feature columns
    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique(), embedding_dim=4) 
                             for feat in sparse_features]

    varlen_feature_columns = [VarLenSparseFeat(
        SparseFeat('genres', vocabulary_size=len(key2index)+1, embedding_dim=4),
        maxlen=max_len, combiner='mean', weight_name=None)]

    # Key Fix: VarLen features only in DNN part
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3. Generate input data
    model_input = {name: data[name] for name in feature_names if name != 'genres'}  # Exclude genres from sparse features
    model_input["genres"] = genres_list

    # 4. Build and train model
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
    model.compile("adam", "mse", metrics=['mse'])
    
    history = model.fit(model_input, data[target].values,
                       batch_size=256, epochs=10, verbose=2, validation_split=0.2)

Epoch 1/10


StagingError: in user code:


    IndexError: pop from empty list


In [159]:
def split(x):
    key_ans = x.split('|')
    for key in key_ans:
        if key not in key2index:
            # Notice : input value 0 is a special "padding",so we do not use 0 to encode valid feature for sequence input
            key2index[key] = len(key2index) + 1
    return list(map(lambda x: key2index[x], key_ans))


key2index = {}
genres_list = list(map(split, data['genres'].values))
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)
# Notice : padding=`post`
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )

In [160]:
genres_list

array([[ 1,  2,  0,  0,  0],
       [ 3,  4,  0,  0,  0],
       [ 2,  5,  0,  0,  0],
       [ 3,  6,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 7,  1,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 8,  0,  0,  0,  0],
       [ 3,  9,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 7,  2,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 1, 10,  5, 11,  0],
       [12,  7,  0,  0,  0],
       [ 1, 13,  9,  0,  0],
       [ 2, 14,  0,  0,  0],
       [ 3, 13,  4,  0,  0],
       [ 1, 13,  0,  0,  0],
       [ 1,  2,  5,  0,  0],
       [ 1,  9, 15, 11,  0],
       [ 2,  5,  0,  0,  0],
       [ 1, 11,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 9,  0,  0,  0,  0],
       [13,  2,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 9,  4,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  9

In [161]:
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique(), embedding_dim=4)
                          for feat in sparse_features]

use_weighted_sequence = False
if use_weighted_sequence:
    varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
        key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
                                               weight_name='genres_weight')]  # Notice : value 0 is for padding for sequence input feature
else:
    varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
        key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
                                               weight_name=None)]  # Notice : value 0 is for padding for sequence input feature

linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [163]:
linear_feature_columns

[SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E9616E0E80>, embedding_name='movie_id', group_name='default_group', trainable=True),
 SparseFeat(name='user_id', vocabulary_size=193, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E961821610>, embedding_name='user_id', group_name='default_group', trainable=True),
 SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E96181DFA0>, embedding_name='gender', group_name='default_group', trainable=True),
 SparseFeat(name='age', vocabulary_size=7, embedding_dim=4, use_hash=

In [166]:
import pandas
import pandas as pd
import sklearn
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tqdm import tqdm

from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
import numpy as np

from tensorflow.keras.preprocessing.sequence import pad_sequences


In [167]:
data = sklearn.utils.shuffle(data)
train_model_input = {name: data[name] for name in sparse_features}  #
train_model_input["genres"] = genres_list

In [144]:
print(data[sparse_features].isnull().sum())  # 检查稀疏特征是否有缺失值


movie_id      0
user_id       0
gender        0
age           0
occupation    0
zip           0
dtype: int64


In [169]:
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')

model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['AUC', 'Precision', 'Recall'])
model.summary()

Model: "model_26"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
tf.cast_98 (TFOpLambda)         (None, 5)            0                                            
__________________________________________________________________________________________________
genres (InputLayer)             [(None, 5)]          0                                            
__________________________________________________________________________________________________
tf.cast_94 (TFOpLambda)         (None, 5)            0                                            
__________________________________________________________________________________________________
tf.expand_dims_98 (TFOpLambda)  (None, 5, 1)         0           tf.cast_98[0][0]                 
___________________________________________________________________________________________

In [172]:
model.fit(train_model_input, data[target].values,
						batch_size=256, epochs=20, verbose=2,
						validation_split=0.2
				)

Epoch 1/20


StagingError: in user code:


    IndexError: pop from empty list


In [173]:
pred_ans = model.predict(train_model_input, batch_size=256)
count = 0

StagingError: in user code:


    IndexError: pop from empty list


In [171]:

for (i, j) in zip(pred_ans, data['rating'].values):
    print(i, j)
    count += 1
    if count > 10:
        break

IndentationError: unexpected indent (4287855165.py, line 2)

In [97]:
data = pd.read_csv(r"D:\software\pycharm_repository\StarMaker\MultiRecSys\data_files\movielens_sample.txt")
sparse_features = ["movie_id", "user_id","gender", "age", "occupation", "zip", ]
target = ['rating']

In [94]:
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,3299,235,4,968035345,Ed Wood (1994),Comedy|Drama,F,25,4,19119
1,3630,3256,3,966536874,Patriot Games (1992),Action|Thriller,M,18,4,77005
2,517,105,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,F,25,14,55408
3,785,2115,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,M,18,19,29307
4,5848,909,5,957782527,"Apartment, The (1960)",Comedy|Drama,M,50,20,20009


In [99]:
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,107,12,4,968035345,Ed Wood (1994),Comedy|Drama,0,2,4,35
1,123,169,3,966536874,Patriot Games (1992),Action|Thriller,1,1,4,118
2,12,6,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,0,2,13,99
3,21,112,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,1,1,18,55
4,187,45,5,957782527,"Apartment, The (1960)",Comedy|Drama,1,5,19,41


In [98]:
# 1.Label Encoding for sparse features,and process sequence features
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
# preprocess the sequence feature

In [101]:
key2index = {}

In [103]:
genres_list = list(map(split, data['genres'].values))
genres_list

[[1, 2],
 [3, 4],
 [2, 5],
 [3, 6],
 [1, 2],
 [1],
 [7, 1],
 [2],
 [2],
 [8],
 [3, 9],
 [1],
 [1, 2],
 [7, 2],
 [2],
 [1, 10, 5, 11],
 [12, 7],
 [1, 13, 9],
 [2, 14],
 [3, 13, 4],
 [1, 13],
 [1, 2, 5],
 [1, 9, 15, 11],
 [2, 5],
 [1, 11],
 [2],
 [9],
 [13, 2],
 [1],
 [1, 5],
 [1, 2],
 [2],
 [9, 4],
 [1, 5],
 [1, 9],
 [1, 10],
 [1],
 [4],
 [2],
 [2, 11],
 [1, 11],
 [1, 13, 10],
 [2, 16],
 [3, 2, 5],
 [3, 6, 11],
 [3, 11, 4],
 [3, 6, 13],
 [1],
 [3, 4],
 [13, 2, 11],
 [3, 5],
 [1, 5],
 [1],
 [3, 4],
 [3, 9, 11, 4],
 [1, 9],
 [12, 7, 1, 15],
 [1, 2],
 [1, 5],
 [1, 5],
 [7, 1],
 [2, 5, 11],
 [1, 5],
 [7, 9],
 [13, 4],
 [2],
 [2, 16],
 [1, 9],
 [3, 13, 11],
 [6, 2, 8],
 [1],
 [1, 5],
 [2, 4],
 [1, 2],
 [2, 5],
 [3, 6],
 [1, 8],
 [3, 11, 4],
 [3, 2, 17],
 [1, 5],
 [2, 17, 5],
 [1, 5],
 [2],
 [1],
 [1],
 [13, 2],
 [1, 2],
 [1],
 [3, 2, 16],
 [1],
 [14, 17, 4],
 [2, 4],
 [6, 7, 10, 11],
 [2, 5],
 [2, 4],
 [1, 2],
 [1, 2, 8],
 [2],
 [11],
 [6, 1, 11],
 [7, 2, 10, 11],
 [2],
 [1, 9],
 [1],
 [3, 4

In [104]:
genres_length = np.array(list(map(len, genres_list)))
genres_length

array([2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 4, 2, 3, 2, 3, 2, 3,
       4, 2, 2, 1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 3, 2, 3,
       3, 3, 3, 1, 2, 3, 2, 2, 1, 2, 4, 2, 4, 2, 2, 2, 2, 3, 2, 2, 2, 1,
       2, 2, 3, 3, 1, 2, 2, 2, 2, 2, 2, 3, 3, 2, 3, 2, 1, 1, 1, 2, 2, 1,
       3, 1, 3, 2, 4, 2, 2, 2, 3, 1, 1, 3, 4, 1, 2, 1, 2, 2, 4, 1, 2, 4,
       2, 2, 2, 1, 1, 1, 3, 3, 3, 2, 1, 1, 3, 2, 2, 2, 1, 3, 1, 2, 1, 2,
       2, 1, 3, 2, 3, 1, 1, 3, 1, 3, 1, 2, 3, 3, 1, 1, 2, 4, 1, 2, 1, 3,
       3, 2, 1, 2, 2, 2, 1, 2, 3, 3, 1, 2, 2, 1, 2, 3, 2, 3, 5, 1, 1, 5,
       4, 1, 2, 1, 2, 3, 3, 1, 1, 2, 1, 1, 2, 4, 2, 1, 1, 4, 2, 1, 3, 2,
       3, 1])

In [106]:
max_len = max(genres_length)
max_len

5

In [108]:
# Notice : padding=`post`
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )
genres_list

array([[ 1,  2,  0,  0,  0],
       [ 3,  4,  0,  0,  0],
       [ 2,  5,  0,  0,  0],
       [ 3,  6,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 7,  1,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 8,  0,  0,  0,  0],
       [ 3,  9,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 7,  2,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 1, 10,  5, 11,  0],
       [12,  7,  0,  0,  0],
       [ 1, 13,  9,  0,  0],
       [ 2, 14,  0,  0,  0],
       [ 3, 13,  4,  0,  0],
       [ 1, 13,  0,  0,  0],
       [ 1,  2,  5,  0,  0],
       [ 1,  9, 15, 11,  0],
       [ 2,  5,  0,  0,  0],
       [ 1, 11,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 9,  0,  0,  0,  0],
       [13,  2,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 9,  4,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  9

In [100]:
key2index = {}
genres_list = list(map(split, data['genres'].values))
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)
# Notice : padding=`post`
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )

In [112]:
# 2.count #unique features for each sparse field and generate feature config for sequence feature

fixlen_feature_columns = [SparseFeat(feat, data[feat].max() + 1, embedding_dim=4) for feat in sparse_features]
fixlen_feature_columns

[SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E956C6F550>, embedding_name='movie_id', group_name='default_group', trainable=True),
 SparseFeat(name='user_id', vocabulary_size=193, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E956C6F040>, embedding_name='user_id', group_name='default_group', trainable=True),
 SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E95A3F1190>, embedding_name='gender', group_name='default_group', trainable=True),
 SparseFeat(name='age', vocabulary_size=7, embedding_dim=4, use_hash=

In [113]:
use_weighted_sequence = False
if use_weighted_sequence:
    varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
        key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
                                               weight_name='genres_weight')]  # Notice : value 0 is for padding for sequence input feature
else:
    varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
        key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
                                               weight_name=None)]  # Notice : value 0 is for padding for sequence input feature

In [115]:
varlen_feature_columns

[VarLenSparseFeat(sparsefeat=SparseFeat(name='genres', vocabulary_size=18, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E95A602B20>, embedding_name='genres', group_name='default_group', trainable=True), maxlen=5, combiner='mean', length_name=None, weight_name=None, weight_norm=True)]

In [116]:
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

In [117]:
linear_feature_columns

[SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E956C6F550>, embedding_name='movie_id', group_name='default_group', trainable=True),
 SparseFeat(name='user_id', vocabulary_size=193, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E956C6F040>, embedding_name='user_id', group_name='default_group', trainable=True),
 SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E95A3F1190>, embedding_name='gender', group_name='default_group', trainable=True),
 SparseFeat(name='age', vocabulary_size=7, embedding_dim=4, use_hash=

In [129]:
np.random.randn(data.shape[0], max_len, 1)

array([[[-8.76809725e-01],
        [ 5.42572851e-01],
        [ 2.17293022e+00],
        [ 1.01302475e+00],
        [ 5.79730097e-01]],

       [[-4.15299410e-01],
        [-1.34013424e+00],
        [-1.86301551e+00],
        [-3.64452273e-01],
        [-8.52888302e-01]],

       [[ 2.62087995e-01],
        [ 7.97594950e-01],
        [ 5.75844267e-02],
        [ 7.51891955e-01],
        [ 5.00650441e-02]],

       [[-1.10197480e-01],
        [ 1.24948741e+00],
        [-6.27117722e-01],
        [-1.24792101e-01],
        [-2.70288404e-01]],

       [[ 1.28325223e+00],
        [-1.85986312e+00],
        [ 1.26415362e+00],
        [ 4.16979001e-01],
        [-1.73432311e+00]],

       [[-7.52858444e-01],
        [-3.64031579e-01],
        [-4.31425683e-01],
        [ 2.06466521e+00],
        [-8.14837766e-01]],

       [[-8.75194329e-02],
        [-8.53184996e-01],
        [ 1.96543335e+00],
        [ 1.03801647e+00],
        [ 6.72609647e-02]],

       [[ 6.97226182e-01],
        [ 1.46

In [118]:
dnn_feature_columns

[SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E956C6F550>, embedding_name='movie_id', group_name='default_group', trainable=True),
 SparseFeat(name='user_id', vocabulary_size=193, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E956C6F040>, embedding_name='user_id', group_name='default_group', trainable=True),
 SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v2.RandomNormal object at 0x000001E95A3F1190>, embedding_name='gender', group_name='default_group', trainable=True),
 SparseFeat(name='age', vocabulary_size=7, embedding_dim=4, use_hash=

In [123]:
model_input["genres"] = genres_list

In [124]:
model_input

{'movie_id': 0       12
 1      169
 2        6
 3      112
 4       45
       ... 
 195    176
 196     89
 197    125
 198     15
 199     86
 Name: movie_id, Length: 200, dtype: int64,
 'user_id': 0      107
 1      123
 2       12
 3       21
 4      187
       ... 
 195     46
 196    131
 197      4
 198    181
 199     25
 Name: user_id, Length: 200, dtype: int64,
 'gender': 0      0
 1      1
 2      0
 3      1
 4      1
       ..
 195    1
 196    1
 197    0
 198    1
 199    1
 Name: gender, Length: 200, dtype: int32,
 'age': 0      2
 1      1
 2      2
 3      1
 4      5
       ..
 195    2
 196    1
 197    1
 198    2
 199    2
 Name: age, Length: 200, dtype: int64,
 'occupation': 0       4
 1       4
 2      13
 3      18
 4      19
        ..
 195    11
 196    11
 197    13
 198     0
 199     0
 Name: occupation, Length: 200, dtype: int64,
 'zip': 0       35
 1      118
 2       99
 3       55
 4       41
       ... 
 195     48
 196    113
 197     83
 198    106


In [131]:
# 4.Define Model,compile and train
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model

<tensorflow.python.keras.engine.functional.Functional at 0x1e95dd6e520>

In [133]:
model.compile("adam", "mse", metrics=['mse'], )

In [135]:
history = model.fit(model_input, data[target].values,batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

Epoch 1/10


StagingError: in user code:


    IndexError: pop from empty list


In [132]:
model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(`model_input, data[target].values,batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

Epoch 1/10


StagingError: in user code:


    IndexError: pop from empty list


In [119]:
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [80]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names


def split(x):
    key_ans = x.split('|')
    for key in key_ans:
        if key not in key2index:
            # Notice : input value 0 is a special "padding",so we do not use 0 to encode valid feature for sequence input
            key2index[key] = len(key2index) + 1
    return list(map(lambda x: key2index[x], key_ans))


if __name__ == "__main__":
    data = pd.read_csv(r"D:\software\pycharm_repository\StarMaker\MultiRecSys\data_files\movielens_sample.txt")
    sparse_features = ["movie_id", "user_id","gender", "age", "occupation", "zip", ]
    target = ['rating']

    # 1.Label Encoding for sparse features,and process sequence features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    # preprocess the sequence feature

    key2index = {}
    genres_list = list(map(split, data['genres'].values))
    genres_length = np.array(list(map(len, genres_list)))
    max_len = max(genres_length)
    # Notice : padding=`post`
    genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )

    # 2.count #unique features for each sparse field and generate feature config for sequence feature

    fixlen_feature_columns = [SparseFeat(feat, data[feat].max() + 1, embedding_dim=4)
                              for feat in sparse_features]

    use_weighted_sequence = False
    if use_weighted_sequence:
        varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
            key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
                                                   weight_name='genres_weight')]  # Notice : value 0 is for padding for sequence input feature
    else:
        varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
            key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
                                                   weight_name=None)]  # Notice : value 0 is for padding for sequence input feature

    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3.generate input data for model
    model_input = {name: data[name] for name in feature_names}  #
    model_input["genres"] = genres_list
    model_input["genres_weight"] = np.random.randn(data.shape[0], max_len, 1)

    # 4.Define Model,compile and train
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')

    model.compile("adam", "mse", metrics=['mse'], )
    history = model.fit(model_input, data[target].values,batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
						
						

Epoch 1/10




StagingError: in user code:


    IndexError: pop from empty list
