In [1]:
from keras.layers import Input, Embedding, dot, Lambda
from keras.models import Model
from keras.regularizers import l2
import keras.backend as K
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# 读入文件
df = pd.read_csv('../input/movielens-1m-dataset/ratings.dat',sep='::',names=['uid','mid','ratings','time'])

user_ids = set(df.uid)
item_ids = set(df.mid)
n_user=len(user_ids)
n_item=len(item_ids)
print('Number of users = %d | Number of items = %d' % (n_user, n_item))

u_id2idx = dict(zip(user_ids, range(n_user)))
i_id2idx = dict(zip(item_ids, range(n_item)))

# 替换ID为Index
df = df.assign(uid=[u_id2idx[uid] for uid in df.uid])
df = df.assign(mid=[i_id2idx[iid] for iid in df.mid])

  return func(*args, **kwargs)


Number of users = 6040 | Number of items = 3706


In [2]:
y=df['ratings'].values
x=df[['uid','mid']].values
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=5)

In [3]:
def x_split(x):
    n=len(x)
    x_uid=x_mid=np.zeros(n)
    for i in range(n):
        x_uid[i]=x[i][0]
        x_mid[i]=x[i][1]
    return x_uid,x_mid

In [4]:
x_train_uid,x_train_mid=x_split(x_train)
x_test_uid,x_test_mid=x_split(x_test)
x_train_uid.shape

(800167,)

In [5]:
# 构造MF模型
reg = l2(1e-5)
n_latent_factor = 100

user_idx = Input(shape=[1], name='user_idx')
item_idx = Input(shape=[1], name='item_idx')

user_emb = Embedding(input_dim=n_user, output_dim=n_latent_factor, input_length=1, embeddings_regularizer=reg, name='user_embedding')
item_emb = Embedding(input_dim=n_item, output_dim=n_latent_factor, input_length=1, embeddings_regularizer=reg, name='item_embedding')

u_lf = user_emb(user_idx)
i_lf = item_emb(item_idx)

SqueezeEmbed = Lambda(lambda x: K.squeeze(x, 1))
u_lf = SqueezeEmbed(u_lf)
i_lf = SqueezeEmbed(i_lf)

pred_rating = dot([u_lf, i_lf], axes=-1)

mf = Model(inputs=[user_idx, item_idx], outputs=pred_rating, name='mf_model')
mf.compile(optimizer='adam', loss='mse', metrics=['mse'])
mf.summary()

Model: "mf_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_idx (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
item_idx (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 100)       604000      user_idx[0][0]                   
__________________________________________________________________________________________________
item_embedding (Embedding)      (None, 1, 100)       370600      item_idx[0][0]                   
___________________________________________________________________________________________

2022-06-30 17:02:40.722428: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [6]:
# 模型训练
mf.fit(x=[x_train_uid,x_train_mid], y=y_train, epochs=3, batch_size=64)

2022-06-30 17:02:40.934566: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f67b14fdb90>

In [7]:
# 模型预测
p_ratings = mf.predict(x=[x_test_uid,x_test_mid])

# MAE
N = len(p_ratings)
e=np.abs(p_ratings.flatten() - y_test)
MAE = np.sum(e) / N
RMSE = np.sqrt(np.dot(e.T,e) / N)
print('MF Model MAE: %g' % MAE)
print('MF Model RMSE: %g' % RMSE)

MF Model MAE: 0.886668
MF Model RMSE: 1.16211
