In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepctr_torch.models import DeepFM
from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat,get_feature_names#, get_varlen_feature_names

In [5]:
def split(x):
    key_ans = x.split('|')
    for key in key_ans:
        if key not in key2index:
            # Notice : input value 0 is a special "padding",so we do not use 0 to encode valid feature for sequence input
            key2index[key] = len(key2index) + 1
    return list(map(lambda x: key2index[x], key_ans))

In [7]:
df_ratings = pd.read_csv("../data/ml-1m/ratings.dat",sep="::", header=None)
df_ratings.columns = ['user_id', 'item_id', 'rating', 'timestamp']
df_ratings.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [8]:
df_users = pd.read_csv("../data/ml-1m/users.dat",sep="::", header=None)
df_users.columns = ['user_id', 'gender', 'age', 'occupation','zip']
df_users.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [6]:
df_items = pd.read_csv("../data/ml-1m/movies.dat",sep="::", header=None)
df_items.columns = ['item_id', 'title', 'genres']
df_items.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
df = pd.merge(df_ratings, df_users, on='user_id')
df = pd.merge(df, df_items, on='item_id')
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama


In [15]:
sparse_features = ["item_id", "user_id", "gender", "age", "occupation", "zip", ]
target = ['rating']

In [16]:
# 1.Label Encoding for sparse features,and process sequence features
for feat in sparse_features:
    lbe = LabelEncoder()
    df[feat] = lbe.fit_transform(df[feat])

In [10]:
# preprocess the sequence feature
key2index = {}
genres_list = list(map(split, df['genres'].values))
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)
# Notice : padding=`post`
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )

In [18]:
# 2.count #unique features for each sparse field and generate feature config for sequence feature

fixlen_feature_columns = [SparseFeat(feat, df[feat].nunique())
                    for feat in sparse_features]
varlen_feature_columns = [VarLenSparseFeat('genres', len(
    key2index) + 1, max_len, 'mean')]  # Notice : value 0 is for padding for sequence input feature

linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [19]:
# 3.generate input data for model
model_input = {name:df[name] for name in feature_names}
model_input['genres'] = genres_list

In [None]:
# 4.Define Model,compile and train
model = DeepFM(linear_feature_columns,dnn_feature_columns,task='regression')

model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(model_input, df[target].values,
                    batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

cpu
Train on 800167 samples, validate on 200042 samples, 3126 steps per epoch
Epoch 1/10
131s - loss:  0.9567 - mse:  0.9567 - val_mse:  1.2218
Epoch 2/10
106s - loss:  0.8266 - mse:  0.8266 - val_mse:  1.2220
Epoch 3/10
111s - loss:  0.8136 - mse:  0.8136 - val_mse:  1.2125
Epoch 4/10
110s - loss:  0.8003 - mse:  0.8003 - val_mse:  1.2244
Epoch 5/10
104s - loss:  0.7749 - mse:  0.7749 - val_mse:  1.2128
Epoch 6/10
3155s - loss:  0.7342 - mse:  0.7342 - val_mse:  1.1970
Epoch 7/10
355s - loss:  0.7093 - mse:  0.7093 - val_mse:  1.2430
Epoch 8/10
104s - loss:  0.6905 - mse:  0.6905 - val_mse:  1.2181
Epoch 9/10
163s - loss:  0.6751 - mse:  0.6751 - val_mse:  1.2046
Epoch 10/10
104s - loss:  0.6641 - mse:  0.6641 - val_mse:  1.2164
