### 如何结合 SparseFeat 和 VarLenSparseFeat 构建 DeepFM 模型，并完成一个回归任务（评分预测）

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names


def split(x):
    """Encode a '|'-separated string as a list of integer ids.

    Tokens that are not yet in the module-level ``key2index`` dict are added
    to it with the next free id. Ids start at 1, so 0 is never assigned to a
    token and stays available as the padding value for sequence input.

    Args:
        x: String whose tokens are separated by '|'.

    Returns:
        A list with one integer id per token, in the order the tokens appear.
    """
    encoded = []
    for token in x.split('|'):
        if token not in key2index:
            # First occurrence of this token: give it the next id (1-based).
            key2index[token] = len(key2index) + 1
        encoded.append(key2index[token])
    return encoded



if __name__ == "__main__":
    # NOTE(review): absolute, machine-specific path; point it at your own copy
    # of movielens_sample.txt before running on another machine.
    data = pd.read_csv(r"D:\software\pycharm_repository\StarMaker\MultiRecSys\data_files\movielens_sample.txt")
    sparse_features = ["movie_id", "user_id",
                       "gender", "age", "occupation", "zip", ]
    target = ['rating']

    # 1. Label-encode each sparse feature in place; the encoded integers are
    #    what the SparseFeat columns below are sized from.
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    # Preprocess the sequence feature: split() turns each '|'-separated genres
    # string into a list of ids and fills key2index as a side effect.
    key2index = {}
    genres_list = list(map(split, data['genres'].values))
    genres_length = np.array(list(map(len, genres_list)))
    max_len = max(genres_length)
    # Notice : padding=`post`, every id list is padded at the end up to max_len
    genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )

    # 2. Count unique values per sparse field and build the feature configs.
    #    After label encoding, max() + 1 is used as the vocabulary size.
    fixlen_feature_columns = [SparseFeat(feat, data[feat].max() + 1, embedding_dim=4)
                              for feat in sparse_features]

    # Toggle for the weighted variant of the genres sequence feature. The two
    # variants differ only in weight_name, so the column is built once.
    use_weighted_sequence = False
    # Notice : value 0 is for padding for sequence input feature, hence the
    # vocabulary size of len(key2index) + 1.
    varlen_feature_columns = [VarLenSparseFeat(
        SparseFeat('genres', vocabulary_size=len(key2index) + 1, embedding_dim=4),
        maxlen=max_len, combiner='mean',
        weight_name='genres_weight' if use_weighted_sequence else None)]

    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3. Generate input data for the model.
    # Only the sparse columns are read from `data`; the sequence inputs are
    # attached explicitly below. `data` has no 'genres_weight' column, so
    # looking up every entry of feature_names in `data` would fail if the
    # weight input is listed there (presumably the case when weight_name is
    # set - confirm against the DeepCTR documentation).
    model_input = {name: data[name] for name in sparse_features}
    model_input["genres"] = genres_list
    if use_weighted_sequence:
        # Placeholder random weights, one per sequence position. They are only
        # supplied when a feature column actually declares 'genres_weight';
        # otherwise the entry would be an input the model never asked for.
        model_input["genres_weight"] = np.random.randn(data.shape[0], max_len, 1)

    # 4. Define the model, compile and train.
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')

    model.compile("adam", "mse", metrics=['mse'], )
    history = model.fit(model_input, data[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

Epoch 1/10




1/1 - 3s - loss: 14.2999 - mse: 14.2999 - val_loss: 13.3738 - val_mse: 13.3738
Epoch 2/10
1/1 - 0s - loss: 14.1494 - mse: 14.1494 - val_loss: 13.2324 - val_mse: 13.2324
Epoch 3/10
1/1 - 0s - loss: 13.9823 - mse: 13.9823 - val_loss: 13.0764 - val_mse: 13.0764
Epoch 4/10
1/1 - 0s - loss: 13.7998 - mse: 13.7998 - val_loss: 12.9062 - val_mse: 12.9062
Epoch 5/10
1/1 - 0s - loss: 13.6013 - mse: 13.6013 - val_loss: 12.7185 - val_mse: 12.7185
Epoch 6/10
1/1 - 0s - loss: 13.3836 - mse: 13.3836 - val_loss: 12.5112 - val_mse: 12.5112
Epoch 7/10
1/1 - 0s - loss: 13.1434 - mse: 13.1434 - val_loss: 12.2819 - val_mse: 12.2819
Epoch 8/10
1/1 - 0s - loss: 12.8788 - mse: 12.8788 - val_loss: 12.0289 - val_mse: 12.0289
Epoch 9/10
1/1 - 0s - loss: 12.5874 - mse: 12.5874 - val_loss: 11.7503 - val_mse: 11.7503
Epoch 10/10
1/1 - 0s - loss: 12.2665 - mse: 12.2665 - val_loss: 11.4438 - val_mse: 11.4438


### 步骤详解

In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names


def split(x):
    """Encode a '|'-separated string as a list of integer ids.

    Tokens that are not yet in the module-level ``key2index`` dict are added
    to it with the next free id. Ids start at 1, so 0 is never assigned to a
    token and stays available as the padding value for sequence input.

    Args:
        x: String whose tokens are separated by '|'.

    Returns:
        A list with one integer id per token, in the order the tokens appear.
    """
    encoded = []
    for token in x.split('|'):
        if token not in key2index:
            # First occurrence of this token: give it the next id (1-based).
            key2index[token] = len(key2index) + 1
        encoded.append(key2index[token])
    return encoded

In [4]:
# Load the MovieLens sample file into a DataFrame.
# NOTE(review): absolute, machine-specific path; adjust it before running on
# another machine.
data = pd.read_csv(r"D:\software\pycharm_repository\StarMaker\MultiRecSys\data_files\movielens_sample.txt")
# Display the raw frame to inspect its columns.
data

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,3299,235,4,968035345,Ed Wood (1994),Comedy|Drama,F,25,4,19119
1,3630,3256,3,966536874,Patriot Games (1992),Action|Thriller,M,18,4,77005
2,517,105,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,F,25,14,55408
3,785,2115,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,M,18,19,29307
4,5848,909,5,957782527,"Apartment, The (1960)",Comedy|Drama,M,50,20,20009
...,...,...,...,...,...,...,...,...,...,...
195,1427,3596,3,974840560,Screwed (2000),Comedy,M,25,12,21401
196,3868,1626,3,965855033,Fire Down Below (1997),Action|Drama|Thriller,M,18,12,73112
197,249,2369,3,976730191,Desperately Seeking Susan (1985),Comedy|Romance,F,18,14,48126
198,5720,349,4,958503395,Clear and Present Danger (1994),Action|Adventure|Thriller,M,25,0,60610


In [5]:
# Names of the columns treated as sparse (categorical) features; these are the
# columns that get label-encoded and turned into SparseFeat configs later on.
sparse_features = ["movie_id", "user_id","gender", "age", "occupation", "zip", ]
sparse_features

['movie_id', 'user_id', 'gender', 'age', 'occupation', 'zip']

In [6]:
# Column(s) used as the regression label; data[target].values is passed to model.fit.
target = ['rating']

In [7]:
# 1.Label Encoding for sparse features,and process sequence features
# A fresh LabelEncoder is fitted per column and the column in `data` is
# overwritten with its integer codes, so `data` no longer holds the raw values
# after this cell has run.
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

In [8]:
# Display the frame again to see the sparse columns after label encoding.
data

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,107,12,4,968035345,Ed Wood (1994),Comedy|Drama,0,2,4,35
1,123,169,3,966536874,Patriot Games (1992),Action|Thriller,1,1,4,118
2,12,6,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,0,2,13,99
3,21,112,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,1,1,18,55
4,187,45,5,957782527,"Apartment, The (1960)",Comedy|Drama,1,5,19,41
...,...,...,...,...,...,...,...,...,...,...
195,46,176,3,974840560,Screwed (2000),Comedy,1,2,11,48
196,131,89,3,965855033,Fire Down Below (1997),Action|Drama|Thriller,1,1,11,113
197,4,125,3,976730191,Desperately Seeking Susan (1985),Comedy|Romance,0,1,13,83
198,181,15,4,958503395,Clear and Present Danger (1994),Action|Adventure|Thriller,1,2,0,106


In [9]:
# Preprocess the sequence feature: encode every genres string as a list of ids.
# split() fills key2index while encoding, so the dict is reset first.
key2index = {}
genres_list = [split(genres) for genres in data['genres'].values]
# Number of genres per row; the longest row fixes the padded sequence length.
genres_length = np.array([len(ids) for ids in genres_list])
max_len = max(genres_length)
# Notice : padding=`post`
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post')

In [10]:
# Display the padded id matrix (one row per sample, max_len columns).
genres_list

array([[ 1,  2,  0,  0,  0],
       [ 3,  4,  0,  0,  0],
       [ 2,  5,  0,  0,  0],
       [ 3,  6,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 7,  1,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 8,  0,  0,  0,  0],
       [ 3,  9,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 7,  2,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 1, 10,  5, 11,  0],
       [12,  7,  0,  0,  0],
       [ 1, 13,  9,  0,  0],
       [ 2, 14,  0,  0,  0],
       [ 3, 13,  4,  0,  0],
       [ 1, 13,  0,  0,  0],
       [ 1,  2,  5,  0,  0],
       [ 1,  9, 15, 11,  0],
       [ 2,  5,  0,  0,  0],
       [ 1, 11,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 9,  0,  0,  0,  0],
       [13,  2,  0,  0,  0],
       [ 1,  0,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 2,  0,  0,  0,  0],
       [ 9,  4,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  9

In [11]:
# 2.count #unique features for each sparse field and generate feature config for sequence feature
# One SparseFeat per sparse column: the vocabulary size is the largest encoded
# value + 1 and every embedding has 4 dimensions.
fixlen_feature_columns = [SparseFeat(feat, data[feat].max() + 1, embedding_dim=4)  for feat in sparse_features]

In [12]:
# Display the SparseFeat configs built for the fixed-length features.
fixlen_feature_columns

[SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000002A9067619D0>, embedding_name='movie_id', group_name='default_group', trainable=True),
 SparseFeat(name='user_id', vocabulary_size=193, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000002A8E235E250>, embedding_name='user_id', group_name='default_group', trainable=True),
 SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000002A8E235E6D0>, embedding_name='gender', group_name='default_group', trainable=True),
 SparseFeat(name='age', vocabulary_size=7, embedding_dim=4, use_hash=

In [13]:
# Toggle between the weighted and the plain variant of the genres sequence
# feature; the two variants differ only in weight_name.
use_weighted_sequence = False
# Notice : value 0 is for padding for sequence input feature, hence the
# vocabulary size of len(key2index) + 1.
varlen_feature_columns = [VarLenSparseFeat(
    SparseFeat('genres', vocabulary_size=len(key2index) + 1, embedding_dim=4),
    maxlen=max_len, combiner='mean',
    weight_name='genres_weight' if use_weighted_sequence else None)]

In [14]:
# Display the VarLenSparseFeat config built for the genres sequence feature.
varlen_feature_columns

[VarLenSparseFeat(sparsefeat=SparseFeat(name='genres', vocabulary_size=18, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000002A906765490>, embedding_name='genres', group_name='default_group', trainable=True), maxlen=5, combiner='mean', length_name=None, weight_name=None, weight_norm=True)]

In [15]:
# The linear part and the DNN part are given the same feature columns: all
# fixed-length sparse features followed by the genres sequence feature.
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns    = fixlen_feature_columns + varlen_feature_columns

In [16]:
# Display the feature columns handed to the linear part.
linear_feature_columns

[SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000002A9067619D0>, embedding_name='movie_id', group_name='default_group', trainable=True),
 SparseFeat(name='user_id', vocabulary_size=193, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000002A8E235E250>, embedding_name='user_id', group_name='default_group', trainable=True),
 SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000002A8E235E6D0>, embedding_name='gender', group_name='default_group', trainable=True),
 SparseFeat(name='age', vocabulary_size=7, embedding_dim=4, use_hash=

In [17]:
# Display the feature columns handed to the DNN part.
dnn_feature_columns

[SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000002A9067619D0>, embedding_name='movie_id', group_name='default_group', trainable=True),
 SparseFeat(name='user_id', vocabulary_size=193, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000002A8E235E250>, embedding_name='user_id', group_name='default_group', trainable=True),
 SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000002A8E235E6D0>, embedding_name='gender', group_name='default_group', trainable=True),
 SparseFeat(name='age', vocabulary_size=7, embedding_dim=4, use_hash=

In [19]:
# Collect the input names for both column lists; the displayed result lists
# each name once even though the two lists contain the same columns.
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names

['movie_id', 'user_id', 'gender', 'age', 'occupation', 'zip', 'genres']

In [20]:
# 3.generate input data for model
# Each feature name is looked up as a column of `data`. At this point the
# 'genres' entry is still the raw '|'-separated string column; a later cell
# overwrites it with the padded id matrix.
# NOTE(review): every name must exist as a column in `data`; if
# use_weighted_sequence is switched on and feature_names then contains
# 'genres_weight', this lookup would presumably raise a KeyError - confirm.
model_input = {name: data[name] for name in feature_names}  

In [21]:
# Display the input dict built from the DataFrame columns.
model_input

{'movie_id': 0       12
 1      169
 2        6
 3      112
 4       45
       ... 
 195    176
 196     89
 197    125
 198     15
 199     86
 Name: movie_id, Length: 200, dtype: int64,
 'user_id': 0      107
 1      123
 2       12
 3       21
 4      187
       ... 
 195     46
 196    131
 197      4
 198    181
 199     25
 Name: user_id, Length: 200, dtype: int64,
 'gender': 0      0
 1      1
 2      0
 3      1
 4      1
       ..
 195    1
 196    1
 197    0
 198    1
 199    1
 Name: gender, Length: 200, dtype: int32,
 'age': 0      2
 1      1
 2      2
 3      1
 4      5
       ..
 195    2
 196    1
 197    1
 198    2
 199    2
 Name: age, Length: 200, dtype: int64,
 'occupation': 0       4
 1       4
 2      13
 3      18
 4      19
        ..
 195    11
 196    11
 197    13
 198     0
 199     0
 Name: occupation, Length: 200, dtype: int64,
 'zip': 0       35
 1      118
 2       99
 3       55
 4       41
       ... 
 195     48
 196    113
 197     83
 198    106


In [22]:
# Replace the raw genres strings with the padded integer id matrix.
model_input["genres"] = genres_list

In [23]:
# Display the input dict after 'genres' was replaced by the padded id matrix.
model_input

{'movie_id': 0       12
 1      169
 2        6
 3      112
 4       45
       ... 
 195    176
 196     89
 197    125
 198     15
 199     86
 Name: movie_id, Length: 200, dtype: int64,
 'user_id': 0      107
 1      123
 2       12
 3       21
 4      187
       ... 
 195     46
 196    131
 197      4
 198    181
 199     25
 Name: user_id, Length: 200, dtype: int64,
 'gender': 0      0
 1      1
 2      0
 3      1
 4      1
       ..
 195    1
 196    1
 197    0
 198    1
 199    1
 Name: gender, Length: 200, dtype: int32,
 'age': 0      2
 1      1
 2      2
 3      1
 4      5
       ..
 195    2
 196    1
 197    1
 198    2
 199    2
 Name: age, Length: 200, dtype: int64,
 'occupation': 0       4
 1       4
 2      13
 3      18
 4      19
        ..
 195    11
 196    11
 197    13
 198     0
 199     0
 Name: occupation, Length: 200, dtype: int64,
 'zip': 0       35
 1      118
 2       99
 3       55
 4       41
       ... 
 195     48
 196    113
 197     83
 198    106


In [24]:
# Attach random placeholder weights of shape (rows, max_len, 1) for the genres
# sequence. The values are unseeded, so they differ on every run.
# NOTE(review): with use_weighted_sequence = False the genres column is built
# with weight_name=None and feature_names does not list 'genres_weight', so
# this entry is presumably ignored by the model - confirm.
model_input["genres_weight"] = np.random.randn(data.shape[0], max_len, 1)
model_input

{'movie_id': 0       12
 1      169
 2        6
 3      112
 4       45
       ... 
 195    176
 196     89
 197    125
 198     15
 199     86
 Name: movie_id, Length: 200, dtype: int64,
 'user_id': 0      107
 1      123
 2       12
 3       21
 4      187
       ... 
 195     46
 196    131
 197      4
 198    181
 199     25
 Name: user_id, Length: 200, dtype: int64,
 'gender': 0      0
 1      1
 2      0
 3      1
 4      1
       ..
 195    1
 196    1
 197    0
 198    1
 199    1
 Name: gender, Length: 200, dtype: int32,
 'age': 0      2
 1      1
 2      2
 3      1
 4      5
       ..
 195    2
 196    1
 197    1
 198    2
 199    2
 Name: age, Length: 200, dtype: int64,
 'occupation': 0       4
 1       4
 2      13
 3      18
 4      19
        ..
 195    11
 196    11
 197    13
 198     0
 199     0
 Name: occupation, Length: 200, dtype: int64,
 'zip': 0       35
 1      118
 2       99
 3       55
 4       41
       ... 
 195     48
 196    113
 197     83
 198    106


In [25]:
# Print the shape of every model input as a quick sanity check: one value per
# row for the sparse features, max_len values per row for the sequence inputs.
for key,value in model_input.items():
    print(key,value.shape)

movie_id (200,)
user_id (200,)
gender (200,)
age (200,)
occupation (200,)
zip (200,)
genres (200, 5)
genres_weight (200, 5, 1)


In [26]:
# 4. Define the model, compile it and train.
# The same feature columns feed the linear and the DNN part; task='regression'
# matches the rating target.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')

# MSE serves both as the training loss and as the reported metric.
model.compile(optimizer="adam", loss="mse", metrics=['mse'])

# validation_split=0.2 holds out part of the data for the val_* figures in the log.
history = model.fit(
    x=model_input,
    y=data[target].values,
    batch_size=256,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)

Epoch 1/10




1/1 - 3s - loss: 14.3005 - mse: 14.3005 - val_loss: 13.4220 - val_mse: 13.4220
Epoch 2/10
1/1 - 0s - loss: 14.2004 - mse: 14.2004 - val_loss: 13.3142 - val_mse: 13.3142
Epoch 3/10
1/1 - 0s - loss: 14.0697 - mse: 14.0697 - val_loss: 13.2023 - val_mse: 13.2023
Epoch 4/10
1/1 - 0s - loss: 13.9342 - mse: 13.9342 - val_loss: 13.0831 - val_mse: 13.0831
Epoch 5/10
1/1 - 0s - loss: 13.7912 - mse: 13.7912 - val_loss: 12.9552 - val_mse: 12.9552
Epoch 6/10
1/1 - 0s - loss: 13.6387 - mse: 13.6387 - val_loss: 12.8178 - val_mse: 12.8178
Epoch 7/10
1/1 - 0s - loss: 13.4758 - mse: 13.4758 - val_loss: 12.6699 - val_mse: 12.6699
Epoch 8/10
1/1 - 0s - loss: 13.3006 - mse: 13.3006 - val_loss: 12.5084 - val_mse: 12.5084
Epoch 9/10
1/1 - 0s - loss: 13.1101 - mse: 13.1101 - val_loss: 12.3325 - val_mse: 12.3325
Epoch 10/10
1/1 - 0s - loss: 12.9038 - mse: 12.9038 - val_loss: 12.1406 - val_mse: 12.1406
