In [22]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names

In [23]:
def split(x):
    key_ans = x.split('|')
    for key in key_ans:
        if key not in key2index:
            # Notice : input value 0 is a special "padding",so we do not use 0 to encode valid feature for sequence input
            key2index[key] = len(key2index) + 1
    return list(map(lambda x: key2index[x], key_ans))

In [29]:
data = pd.read_csv(r"D:\software\pycharm_repository\StarMaker\MultiRecSys\data_files\movielens_sample.txt")
sparse_features = ["movie_id", "user_id","gender", "age", "occupation", "zip", ]
target = ['rating']

In [30]:
data

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,3299,235,4,968035345,Ed Wood (1994),Comedy|Drama,F,25,4,19119
1,3630,3256,3,966536874,Patriot Games (1992),Action|Thriller,M,18,4,77005
2,517,105,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,F,25,14,55408
3,785,2115,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,M,18,19,29307
4,5848,909,5,957782527,"Apartment, The (1960)",Comedy|Drama,M,50,20,20009
...,...,...,...,...,...,...,...,...,...,...
195,1427,3596,3,974840560,Screwed (2000),Comedy,M,25,12,21401
196,3868,1626,3,965855033,Fire Down Below (1997),Action|Drama|Thriller,M,18,12,73112
197,249,2369,3,976730191,Desperately Seeking Susan (1985),Comedy|Romance,F,18,14,48126
198,5720,349,4,958503395,Clear and Present Danger (1994),Action|Adventure|Thriller,M,25,0,60610


In [31]:
# 1.Label Encoding for sparse features,and process sequence features
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

In [32]:
data

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,107,12,4,968035345,Ed Wood (1994),Comedy|Drama,0,2,4,35
1,123,169,3,966536874,Patriot Games (1992),Action|Thriller,1,1,4,118
2,12,6,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,0,2,13,99
3,21,112,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,1,1,18,55
4,187,45,5,957782527,"Apartment, The (1960)",Comedy|Drama,1,5,19,41
...,...,...,...,...,...,...,...,...,...,...
195,46,176,3,974840560,Screwed (2000),Comedy,1,2,11,48
196,131,89,3,965855033,Fire Down Below (1997),Action|Drama|Thriller,1,1,11,113
197,4,125,3,976730191,Desperately Seeking Susan (1985),Comedy|Romance,0,1,13,83
198,181,15,4,958503395,Clear and Present Danger (1994),Action|Adventure|Thriller,1,2,0,106


In [33]:
# preprocess the sequence feature

key2index = {}
genres_list = list(map(split, data['genres'].values))
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)
# Notice : padding=`post`
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )

In [34]:
genres_list

array([[ 1,  2,  0,  0,  0],
       [ 3,  4,  0,  0,  0],
       [ 2,  5,  0,  0,  0],
       [ 3,  6,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 7,  1,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 8,  0,  0,  0,  0],
       [ 3,  9,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 7,  2,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 1, 10,  5, 11,  0],
       [12,  7,  0,  0,  0],
       [ 1, 13,  9,  0,  0],
       [ 2, 14,  0,  0,  0],
       [ 3, 13,  4,  0,  0],
       [ 1, 13,  0,  0,  0],
       [ 1,  2,  5,  0,  0],
       [ 1,  9, 15, 11,  0],
       [ 2,  5,  0,  0,  0],
       [ 1, 11,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 9,  0,  0,  0,  0],
       [13,  2,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 9,  4,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  9

In [35]:
genres_list.shape

(200, 5)

In [37]:
# 2.count #unique features for each sparse field and generate feature config for sequence feature

fixlen_feature_columns = [SparseFeat(feat, data[feat].max() + 1, embedding_dim=4)  for feat in sparse_features]

In [38]:
fixlen_feature_columns

[SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x0000016709971220>, embedding_name='movie_id', group_name='default_group', trainable=True),
 SparseFeat(name='user_id', vocabulary_size=193, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x0000016709971490>, embedding_name='user_id', group_name='default_group', trainable=True),
 SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x0000016709971BE0>, embedding_name='gender', group_name='default_group', trainable=True),
 SparseFeat(name='age', vocabulary_size=7, embedding_dim=4, use_hash=

In [39]:
key2index

{'Comedy': 1,
 'Drama': 2,
 'Action': 3,
 'Thriller': 4,
 'Romance': 5,
 'Adventure': 6,
 "Children's": 7,
 'Western': 8,
 'Horror': 9,
 'Fantasy': 10,
 'Sci-Fi': 11,
 'Animation': 12,
 'Crime': 13,
 'Film-Noir': 14,
 'Musical': 15,
 'War': 16,
 'Mystery': 17}

In [40]:
use_weighted_sequence = False
if use_weighted_sequence:
    varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
        key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
                                               weight_name='genres_weight')]  # Notice : value 0 is for padding for sequence input feature
else:
    varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
        key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
                                               weight_name=None)]  # Notice : value 0 is for padding for sequence input feature

In [41]:
varlen_feature_columns

[VarLenSparseFeat(sparsefeat=SparseFeat(name='genres', vocabulary_size=18, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000001670D394100>, embedding_name='genres', group_name='default_group', trainable=True), maxlen=5, combiner='mean', length_name=None, weight_name=None, weight_norm=True)]

In [42]:
fixlen_feature_columns

[SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x0000016709971220>, embedding_name='movie_id', group_name='default_group', trainable=True),
 SparseFeat(name='user_id', vocabulary_size=193, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x0000016709971490>, embedding_name='user_id', group_name='default_group', trainable=True),
 SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x0000016709971BE0>, embedding_name='gender', group_name='default_group', trainable=True),
 SparseFeat(name='age', vocabulary_size=7, embedding_dim=4, use_hash=

In [43]:
varlen_feature_columns

[VarLenSparseFeat(sparsefeat=SparseFeat(name='genres', vocabulary_size=18, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000001670D394100>, embedding_name='genres', group_name='default_group', trainable=True), maxlen=5, combiner='mean', length_name=None, weight_name=None, weight_norm=True)]

In [44]:
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

In [45]:
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names

['movie_id', 'user_id', 'gender', 'age', 'occupation', 'zip', 'genres']

In [46]:
data

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,107,12,4,968035345,Ed Wood (1994),Comedy|Drama,0,2,4,35
1,123,169,3,966536874,Patriot Games (1992),Action|Thriller,1,1,4,118
2,12,6,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,0,2,13,99
3,21,112,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,1,1,18,55
4,187,45,5,957782527,"Apartment, The (1960)",Comedy|Drama,1,5,19,41
...,...,...,...,...,...,...,...,...,...,...
195,46,176,3,974840560,Screwed (2000),Comedy,1,2,11,48
196,131,89,3,965855033,Fire Down Below (1997),Action|Drama|Thriller,1,1,11,113
197,4,125,3,976730191,Desperately Seeking Susan (1985),Comedy|Romance,0,1,13,83
198,181,15,4,958503395,Clear and Present Danger (1994),Action|Adventure|Thriller,1,2,0,106


In [47]:
# 3.generate input data for model
model_input = {name: data[name] for name in feature_names}  #

In [48]:
model_input

{'movie_id': 0       12
 1      169
 2        6
 3      112
 4       45
       ... 
 195    176
 196     89
 197    125
 198     15
 199     86
 Name: movie_id, Length: 200, dtype: int64,
 'user_id': 0      107
 1      123
 2       12
 3       21
 4      187
       ... 
 195     46
 196    131
 197      4
 198    181
 199     25
 Name: user_id, Length: 200, dtype: int64,
 'gender': 0      0
 1      1
 2      0
 3      1
 4      1
       ..
 195    1
 196    1
 197    0
 198    1
 199    1
 Name: gender, Length: 200, dtype: int32,
 'age': 0      2
 1      1
 2      2
 3      1
 4      5
       ..
 195    2
 196    1
 197    1
 198    2
 199    2
 Name: age, Length: 200, dtype: int64,
 'occupation': 0       4
 1       4
 2      13
 3      18
 4      19
        ..
 195    11
 196    11
 197    13
 198     0
 199     0
 Name: occupation, Length: 200, dtype: int64,
 'zip': 0       35
 1      118
 2       99
 3       55
 4       41
       ... 
 195     48
 196    113
 197     83
 198    106


In [49]:
model_input["genres"] = genres_list

In [51]:
for key,value in model_input.items():
    print(key,value.shape)

movie_id (200,)
user_id (200,)
gender (200,)
age (200,)
occupation (200,)
zip (200,)
genres (200, 5)


In [52]:
model_input["genres_weight"] = np.random.randn(data.shape[0], max_len, 1)

In [53]:
for key,value in model_input.items():
    print(key,value.shape)

movie_id (200,)
user_id (200,)
gender (200,)
age (200,)
occupation (200,)
zip (200,)
genres (200, 5)
genres_weight (200, 5, 1)


In [54]:
# 4.Define Model,compile and train
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')

In [55]:
model

<tensorflow.python.keras.engine.functional.Functional at 0x1670f436280>

In [20]:






if __name__ == "__main__":










    


    
    

    

    model.compile("adam", "mse", metrics=['mse'], )
    history = model.fit(model_input, data[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

Please check the latest version manually on https://pypi.org/project/deepctr/#history
Epoch 1/10




1/1 - 3s - loss: 14.3002 - mse: 14.3002 - val_loss: 13.4051 - val_mse: 13.4051
Epoch 2/10
1/1 - 0s - loss: 14.1830 - mse: 14.1830 - val_loss: 13.2915 - val_mse: 13.2915
Epoch 3/10
1/1 - 0s - loss: 14.0464 - mse: 14.0464 - val_loss: 13.1667 - val_mse: 13.1667
Epoch 4/10
1/1 - 0s - loss: 13.8974 - mse: 13.8974 - val_loss: 13.0334 - val_mse: 13.0334
Epoch 5/10
1/1 - 0s - loss: 13.7392 - mse: 13.7392 - val_loss: 12.8894 - val_mse: 12.8894
Epoch 6/10
1/1 - 0s - loss: 13.5695 - mse: 13.5695 - val_loss: 12.7341 - val_mse: 12.7341
Epoch 7/10
1/1 - 0s - loss: 13.3872 - mse: 13.3872 - val_loss: 12.5638 - val_mse: 12.5638
Epoch 8/10
1/1 - 0s - loss: 13.1890 - mse: 13.1890 - val_loss: 12.3763 - val_mse: 12.3763
Epoch 9/10
1/1 - 0s - loss: 12.9727 - mse: 12.9727 - val_loss: 12.1722 - val_mse: 12.1722
Epoch 10/10
1/1 - 0s - loss: 12.7367 - mse: 12.7367 - val_loss: 11.9488 - val_mse: 11.9488


In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer

# 1. 加载并预处理数据，这里模拟出2个变长序列数据
data = pd.read_csv(r"D:\software\pycharm_repository\StarMaker\MultiRecSys\data_files\movielens_sample.txt")
data['genres_bak'] = data['genres']
data.head()

sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
target = ["rating"]
sequence_features = ["genres", "genres_bak"]

# 对稀疏特征做标签编码（Label Encoding）
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

# 用于存储每个变长特征处理后的 padding 序列
pad_sequences_dict = {}

# 每个变长特征对应一个独立的 Tokenizer，用于后续文本转索引
tokenizers = {}

# 用于记录每个变长特征的 padding 长度（即序列被填充后的最大长度）
pad_len_dict = {}

# 遍历所有变长序列特征
for feature in sequence_features:
    texts = data[feature].apply(lambda x: x.replace('|', ' ')).tolist()
    tokenizer = Tokenizer(oov_token='OOV')
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, padding='post')  # shape: (num_samples, max_seq_len)
    pad_sequences_dict[feature] = padded
    tokenizers[feature] = tokenizer
    pad_len_dict[feature] = padded.shape[1]  # 保存每个特征的序列长度（max_seq_len）

# 2. 创建所有特征的Embedding层
embedding_dim = 4
vocab_sizes = {feat: data[feat].nunique() for feat in sparse_features}

for feature in sequence_features:
    feat_num = len(tokenizers[feature].word_index) + 1
    vocab_sizes[feature] = feat_num

# 创建嵌入层字典
embed_layers = {}
for feat in sparse_features:
    embed_layers[feat] = layers.Embedding(input_dim=vocab_sizes[feat] + 1, output_dim=embedding_dim, mask_zero=False)
for feat in sequence_features:
    embed_layers[feat] = layers.Embedding(input_dim=vocab_sizes[feat] + 1, output_dim=embedding_dim, mask_zero=True)

# 1. 初始化模型输入字典
inputs = {}
for feat in sparse_features:
    inputs[feat] = tf.keras.Input(shape=(1,), name=feat, dtype=tf.int32)  # shape: (batch_size, 1)
for feat in sequence_features:
    max_len = pad_len_dict[feat]
    inputs[feat] = tf.keras.Input(shape=(max_len,), name=feat, dtype=tf.int32)  # shape: (batch_size, max_len)

# 2. 构建嵌入列表
embeds = []
for feat in sparse_features:
    embed = embed_layers[feat](inputs[feat])  # shape: (batch_size, 1, embedding_dim)
    embeds.append(embed)

for feat in sequence_features:
    seq_embed = embed_layers[feat](inputs[feat])  # shape: (batch_size, seq_len, embedding_dim)
    pooled_embed = tf.reduce_mean(seq_embed, axis=1, keepdims=True)  # shape: (batch_size, 1, embedding_dim) 从这可以看出边长序列的字段数据最终整体也是当成一个字段处理
    embeds.append(pooled_embed)

# 拼接所有嵌入特征
total_embeds = tf.concat(embeds, axis=1)  # shape: (batch_size, num_fields, embedding_dim)

# 4. FM 二阶交叉项计算
sum_square = tf.square(tf.reduce_sum(total_embeds, axis=1))  # shape: (batch_size, embedding_dim)
square_sum = tf.reduce_sum(tf.square(total_embeds), axis=1)  # shape: (batch_size, embedding_dim)
fm_second_order = 0.5 * tf.reduce_sum(sum_square - square_sum, axis=1, keepdims=True)  # shape: (batch_size, 1)

# 5. DNN 部分
flatten_input = tf.reshape(total_embeds, shape=(-1, total_embeds.shape[1] * embedding_dim))  # shape: (batch_size, num_fields * embedding_dim)
x = layers.Dense(64, activation='relu')(flatten_input)  # shape: (batch_size, 64)
x = layers.Dense(32, activation='relu')(x)              # shape: (batch_size, 32)
dnn_output = layers.Dense(1)(x)                         # shape: (batch_size, 1)

# 6. 合并 FM 和 DNN 输出结果
output = layers.Add()([fm_second_order, dnn_output])    # shape: (batch_size, 1)
model = tf.keras.Model(inputs=list(inputs.values()), outputs=output)

# 7. 编译并训练模型
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

# 构建模型输入字典
model_input = {}
for feat in sparse_features:
    model_input[feat] = data[feat].values  # shape: (num_samples,)
for feat in sequence_features:
    model_input[feat] = pad_sequences_dict[feat]  # shape: (num_samples, max_seq_len)

# 模型训练
model.fit(model_input, data[target].values, batch_size=256, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x167065a1a30>

In [19]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers, models
import tensorflow as tf


class FeatureProcessor:
    def __init__(self, sparse_features, sequence_features, embedding_dim=4):
        self.sparse_features = sparse_features
        self.sequence_features = sequence_features
        self.embedding_dim = embedding_dim

        self.label_encoders = {}
        self.tokenizers = {}
        self.pad_sequences_dict = {}
        self.pad_len_dict = {}
        self.vocab_sizes = {}

    def fit_transform(self, df):
        # 编码稀疏特征
        for feat in self.sparse_features:
            lbe = LabelEncoder()
            df[feat] = lbe.fit_transform(df[feat])
            self.label_encoders[feat] = lbe
            self.vocab_sizes[feat] = df[feat].nunique()

        # 编码变长序列特征
        for feat in self.sequence_features:
            texts = df[feat].apply(lambda x: x.replace('|', ' ')).tolist()
            tokenizer = Tokenizer(oov_token='OOV')
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            padded = pad_sequences(sequences, padding='post')
            self.tokenizers[feat] = tokenizer
            self.pad_sequences_dict[feat] = padded
            self.pad_len_dict[feat] = padded.shape[1]
            self.vocab_sizes[feat] = len(tokenizer.word_index) + 1

        return df

    def get_model_input(self, df):
        model_input = {}
        for feat in self.sparse_features:
            model_input[feat] = df[feat].values
        for feat in self.sequence_features:
            model_input[feat] = self.pad_sequences_dict[feat]
        return model_input


class FM_DNN_Model(tf.keras.Model):
    def __init__(self, sparse_features, sequence_features, vocab_sizes, pad_len_dict, embedding_dim=4):
        super(FM_DNN_Model, self).__init__()
        self.embedding_dim = embedding_dim
        self.sparse_features = sparse_features
        self.sequence_features = sequence_features
        self.vocab_sizes = vocab_sizes
        self.pad_len_dict = pad_len_dict

        self.embed_layers = {}
        for feat in sparse_features:
            self.embed_layers[feat] = layers.Embedding(input_dim=vocab_sizes[feat] + 1,
                                                       output_dim=embedding_dim,
                                                       mask_zero=False)

        for feat in sequence_features:
            self.embed_layers[feat] = layers.Embedding(input_dim=vocab_sizes[feat] + 1,
                                                       output_dim=embedding_dim,
                                                       mask_zero=True)

        # DNN 部分
        self.dnn = tf.keras.Sequential([
            layers.Dense(64, activation='relu'),
            layers.Dense(32, activation='relu'),
            layers.Dense(1)
        ])

    def call(self, inputs):
        embeds = []

        # 稀疏特征嵌入 shape: (batch, 1, embed_dim)
        for feat in self.sparse_features:
            embed = self.embed_layers[feat](inputs[feat])
            embeds.append(embed)

        # 序列特征嵌入并池化 shape: (batch, 1, embed_dim)
        for feat in self.sequence_features:
            embed = self.embed_layers[feat](inputs[feat])
            pooled = tf.reduce_mean(embed, axis=1, keepdims=True)
            embeds.append(pooled)

        print(embeds)
        
        total_embeds = tf.concat(embeds, axis=1)  # shape: (batch, field_num, embed_dim)

        # FM 二阶交叉项
        sum_square = tf.square(tf.reduce_sum(total_embeds, axis=1))
        square_sum = tf.reduce_sum(tf.square(total_embeds), axis=1)
        fm_output = 0.5 * tf.reduce_sum(sum_square - square_sum, axis=1, keepdims=True)

        # DNN 输入
        flatten_input = tf.reshape(total_embeds, shape=(-1, total_embeds.shape[1] * self.embedding_dim))
        dnn_output = self.dnn(flatten_input)

        # FM + DNN
        return fm_output + dnn_output


# =================== 示例使用 ===================
if __name__ == '__main__':
    # 读取数据
    data = pd.read_csv(r"D:\software\pycharm_repository\StarMaker\MultiRecSys\data_files\movielens_sample.txt")
    data['genres_bak'] = data['genres']

    sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
    sequence_features = ["genres", "genres_bak"]
    target = ["rating"]

    # 特征处理
    processor = FeatureProcessor(sparse_features, sequence_features, embedding_dim=4)
    data = processor.fit_transform(data)
    model_input = processor.get_model_input(data)

    # 模型构建
    model = FM_DNN_Model(sparse_features, sequence_features,
                         vocab_sizes=processor.vocab_sizes,
                         pad_len_dict=processor.pad_len_dict,
                         embedding_dim=4)

    # 模型编译 & 训练
    model.compile(optimizer='adam', loss='mse', metrics=['mse'])
    model.fit(model_input, data[target].values, batch_size=256, epochs=10, validation_split=0.2)

Epoch 1/10
[<tf.Tensor 'fm_dnn__model_2/embedding_24/embedding_lookup/Identity_1:0' shape=(None, 1, 4) dtype=float32>, <tf.Tensor 'fm_dnn__model_2/embedding_25/embedding_lookup/Identity_1:0' shape=(None, 1, 4) dtype=float32>, <tf.Tensor 'fm_dnn__model_2/embedding_26/embedding_lookup/Identity_1:0' shape=(None, 1, 4) dtype=float32>, <tf.Tensor 'fm_dnn__model_2/embedding_27/embedding_lookup/Identity_1:0' shape=(None, 1, 4) dtype=float32>, <tf.Tensor 'fm_dnn__model_2/embedding_28/embedding_lookup/Identity_1:0' shape=(None, 1, 4) dtype=float32>, <tf.Tensor 'fm_dnn__model_2/embedding_29/embedding_lookup/Identity_1:0' shape=(None, 1, 4) dtype=float32>, <tf.Tensor 'fm_dnn__model_2/Mean:0' shape=(None, 1, 4) dtype=float32>, <tf.Tensor 'fm_dnn__model_2/Mean_1:0' shape=(None, 1, 4) dtype=float32>]
[<tf.Tensor 'fm_dnn__model_2/embedding_24/embedding_lookup/Identity_1:0' shape=(None, 1, 4) dtype=float32>, <tf.Tensor 'fm_dnn__model_2/embedding_25/embedding_lookup/Identity_1:0' shape=(None, 1, 4) dty

In [4]:
data = pd.read_csv(r"D:\software\pycharm_repository\StarMaker\MultiRecSys\data_files\movielens_sample.txt")
data['genres_bak'] = data['genres']
data

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip,genres_bak
0,3299,235,4,968035345,Ed Wood (1994),Comedy|Drama,F,25,4,19119,Comedy|Drama
1,3630,3256,3,966536874,Patriot Games (1992),Action|Thriller,M,18,4,77005,Action|Thriller
2,517,105,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,F,25,14,55408,Drama|Romance
3,785,2115,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,M,18,19,29307,Action|Adventure
4,5848,909,5,957782527,"Apartment, The (1960)",Comedy|Drama,M,50,20,20009,Comedy|Drama
...,...,...,...,...,...,...,...,...,...,...,...
195,1427,3596,3,974840560,Screwed (2000),Comedy,M,25,12,21401,Comedy
196,3868,1626,3,965855033,Fire Down Below (1997),Action|Drama|Thriller,M,18,12,73112,Action|Drama|Thriller
197,249,2369,3,976730191,Desperately Seeking Susan (1985),Comedy|Romance,F,18,14,48126,Comedy|Romance
198,5720,349,4,958503395,Clear and Present Danger (1994),Action|Adventure|Thriller,M,25,0,60610,Action|Adventure|Thriller


In [5]:
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
sequence_features = ["genres", "genres_bak"]
target = ["rating"]

In [6]:
# 特征处理
processor = FeatureProcessor(sparse_features, sequence_features, embedding_dim=4)
data = processor.fit_transform(data)

In [7]:
data

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip,genres_bak
0,107,12,4,968035345,Ed Wood (1994),Comedy|Drama,0,2,4,35,Comedy|Drama
1,123,169,3,966536874,Patriot Games (1992),Action|Thriller,1,1,4,118,Action|Thriller
2,12,6,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,0,2,13,99,Drama|Romance
3,21,112,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,1,1,18,55,Action|Adventure
4,187,45,5,957782527,"Apartment, The (1960)",Comedy|Drama,1,5,19,41,Comedy|Drama
...,...,...,...,...,...,...,...,...,...,...,...
195,46,176,3,974840560,Screwed (2000),Comedy,1,2,11,48,Comedy
196,131,89,3,965855033,Fire Down Below (1997),Action|Drama|Thriller,1,1,11,113,Action|Drama|Thriller
197,4,125,3,976730191,Desperately Seeking Susan (1985),Comedy|Romance,0,1,13,83,Comedy|Romance
198,181,15,4,958503395,Clear and Present Danger (1994),Action|Adventure|Thriller,1,2,0,106,Action|Adventure|Thriller


In [10]:
model_input = processor.get_model_input(data)
model_input

{'movie_id': array([ 12, 169,   6, 112,  45, 146,  43, 156,  30, 174,  82, 173,  91,
        108, 132,  40, 109,  31, 180, 183, 129,  67, 137,  87, 127,   8,
        104, 100, 140,  25, 122, 124, 116, 126,  72, 117,  42, 145, 131,
          2,  52,  17, 101,  94, 136,  65,  20, 144,  26,  83,  55, 126,
        184,  23, 121, 142,  33,   0,  46, 139, 150, 135,  36, 110,  79,
        162,  70, 147,   9,  34,   7,  76,   4, 185,  73, 112, 130,  95,
         28,  24, 148, 119, 168, 149, 181,  13, 154,  56,  66, 172,  69,
         35,  49, 106,  35,  11, 152, 166,  37, 164,  54, 167,  72,  29,
         92, 114,  88, 170,  64,  60,  38,  22,  62, 178, 134, 157,  99,
         34, 111,  96,  50,  75,  47,  14,  21,  77, 118, 182, 113, 143,
        149, 141,  10,  58,  81,  44,  27, 151, 165,  98, 163,  80, 158,
        161,  27, 155, 171,  78,  57, 123,  84,  93, 170, 120,   1, 153,
         39,  61,  51,  71,  19, 107,   9,  66, 102,  74, 177, 103, 133,
        160,  53,  90,   5, 173,  41,  

In [11]:
for item,value in model_input.items():
    print(item,value.shape)

movie_id (200,)
user_id (200,)
gender (200,)
age (200,)
occupation (200,)
zip (200,)
genres (200, 6)
genres_bak (200, 6)


In [12]:
# 模型构建
model = FM_DNN_Model(sparse_features, sequence_features,
                     vocab_sizes=processor.vocab_sizes,
                     pad_len_dict=processor.pad_len_dict,
                     embedding_dim=4)

In [13]:
model

<__main__.FM_DNN_Model at 0x1670ac92130>

In [14]:
sparse_features

['movie_id', 'user_id', 'gender', 'age', 'occupation', 'zip']

In [15]:
sequence_features

['genres', 'genres_bak']

In [17]:
processor.vocab_sizes

{'movie_id': 187,
 'user_id': 193,
 'gender': 2,
 'age': 7,
 'occupation': 20,
 'zip': 188,
 'genres': 21,
 'genres_bak': 21}

In [18]:
processor.pad_len_dict

{'genres': 6, 'genres_bak': 6}