<a href="https://colab.research.google.com/github/zhuowenzheng/nlptest/blob/master/PaddleMovieRecommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
#!pip uninstall paddlepaddle
!python -m pip install paddlepaddle-gpu==2.2.2
#!pip uninstall paddlepaddle

Collecting paddlepaddle-gpu==2.2.2
  Downloading paddlepaddle_gpu-2.2.2-cp37-cp37m-manylinux1_x86_64.whl (435.4 MB)
[K     |████████████████████████████████| 435.4 MB 28 kB/s 
Installing collected packages: paddlepaddle-gpu
Successfully installed paddlepaddle-gpu-2.2.2


In [6]:
import pandas as pd
import numpy as np
import paddle
import paddle.nn as nn
from paddle.io import Dataset

print(paddle.__version__)



2.2.2


In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
df = pd.read_csv('/content/drive/My Drive/ml-20m/ratings.csv')
user_ids = df["userId"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
movie_ids = df["movieId"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
df["user"] = df["userId"].map(user2user_encoded)
df["movie"] = df["movieId"].map(movie2movie_encoded)

num_users = len(user2user_encoded)
num_movies = len(movie_encoded2movie)
df["rating"] = df["rating"].values.astype(np.float32)
# 最小和最大额定值将在以后用于标准化额定值
min_rating = min(df["rating"])
max_rating = max(df["rating"])

print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)



Number of users: 138493, Number of Movies: 26744, Min rating: 0.5, Max rating: 5.0


In [11]:
df = df.sample(frac=1, random_state=42)
x = df[["user", "movie"]].values
# 规范化0和1之间的目标。使训练更容易。
y = df["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values
# 假设对90%的数据进行训练，对10%的数据进行验证。
train_indices = int(0.9 * df.shape[0])
x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:],
)
y_train = y_train[: ,np.newaxis]
y_val = y_val[: ,np.newaxis]
y_train = y_train.astype(np.float32)
y_val = y_val.astype(np.float32)

# 自定义数据集
#映射式(map-style)数据集需要继承paddle.io.Dataset
class SelfDefinedDataset(Dataset):
    def __init__(self, data_x, data_y, mode = 'train'):
        super(SelfDefinedDataset, self).__init__()
        self.data_x = data_x
        self.data_y = data_y
        self.mode = mode

    def __getitem__(self, idx):
        if self.mode == 'predict':
           return self.data_x[idx]
        else:
           return self.data_x[idx], self.data_y[idx]

    def __len__(self):
        return len(self.data_x)
        
traindataset = SelfDefinedDataset(x_train, y_train)
for data, label in traindataset:
    print(data.shape, label.shape)
    print(data, label)
    break
train_loader = paddle.io.DataLoader(traindataset, batch_size = 128, shuffle = True)
for batch_id, data in enumerate(train_loader()):
    x_data = data[0]
    y_data = data[1]

    print(x_data.shape)
    print(y_data.shape)
    break

testdataset = SelfDefinedDataset(x_val, y_val)
test_loader = paddle.io.DataLoader(testdataset, batch_size = 128, shuffle = True)        
for batch_id, data in enumerate(test_loader()):
    x_data = data[0]
    y_data = data[1]

    print(x_data.shape)
    print(y_data.shape)
    break



(2,) (1,)
[122269   1154] [0.6666667]
[128, 2]
[128, 1]
[128, 2]
[128, 1]


In [12]:
EMBEDDING_SIZE = 50

class RecommenderNet(nn.Layer):
    def __init__(self, num_users, num_movies, embedding_size):
        super(RecommenderNet, self).__init__()
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        weight_attr_user = paddle.ParamAttr(
            regularizer = paddle.regularizer.L2Decay(1e-6),
            initializer = nn.initializer.KaimingNormal()
            )
        self.user_embedding = nn.Embedding(
            num_users,
            embedding_size,
            weight_attr=weight_attr_user
        )
        self.user_bias = nn.Embedding(num_users, 1)
        weight_attr_movie = paddle.ParamAttr(
            regularizer = paddle.regularizer.L2Decay(1e-6),
            initializer = nn.initializer.KaimingNormal()
            )
        self.movie_embedding = nn.Embedding(
            num_movies,
            embedding_size,
            weight_attr=weight_attr_movie
        )
        self.movie_bias = nn.Embedding(num_movies, 1)

    def forward(self, inputs):
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])
        movie_vector = self.movie_embedding(inputs[:, 1])
        movie_bias = self.movie_bias(inputs[:, 1])
        dot_user_movie = paddle.dot(user_vector, movie_vector)
        x = dot_user_movie + user_bias + movie_bias
        x = nn.functional.sigmoid(x)

        return x



In [13]:
paddle.device.get_device()
#paddle.device.set_device("gpu")
#paddle.fluid.is_compiled_with_cuda()

'gpu:0'

In [17]:
!pip install visualdl

Collecting visualdl
  Downloading visualdl-2.2.3-py3-none-any.whl (2.7 MB)
[K     |████████████████████████████████| 2.7 MB 3.8 MB/s 
Collecting bce-python-sdk
  Downloading bce-python-sdk-0.8.64.tar.gz (127 kB)
[K     |████████████████████████████████| 127 kB 46.8 MB/s 
[?25hCollecting Flask-Babel>=1.0.0
  Downloading Flask_Babel-2.0.0-py3-none-any.whl (9.3 kB)
Collecting pre-commit
  Downloading pre_commit-2.19.0-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 46.0 MB/s 
Collecting flake8>=3.7.9
  Downloading flake8-4.0.1-py2.py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 2.2 MB/s 
Collecting shellcheck-py
  Downloading shellcheck_py-0.8.0.4-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 27.7 MB/s 
[?25hCollecting pyflakes<2.5.0,>=2.4.0
  Downloading pyflakes-2.4.0-py2.py3-none-any.whl (69 kB)
[K     |██████████

In [15]:
import matplotlib.pyplot as plt


In [19]:
model = RecommenderNet(num_users, num_movies, EMBEDDING_SIZE)
model = paddle.Model(model)

optimizer = paddle.optimizer.Adam(parameters=model.parameters(), learning_rate=0.0003)
loss = nn.BCELoss()
metric = paddle.metric.Accuracy()

# 设置visualdl路径
log_dir = './visualdl'
callback = paddle.callbacks.VisualDL(log_dir=log_dir)

model.prepare(optimizer, loss, metric)
model.fit(train_loader, epochs=5, save_dir='/content/drive/My Drive/recommender/model/checkpoints', verbose=1, callbacks=callback)



The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/5


  format(lhs_dtype, rhs_dtype, lhs_dtype))


save checkpoint at /content/drive/My Drive/recommender/model/checkpoints/0
Epoch 2/5
save checkpoint at /content/drive/My Drive/recommender/model/checkpoints/1
Epoch 3/5
save checkpoint at /content/drive/My Drive/recommender/model/checkpoints/2
Epoch 4/5
save checkpoint at /content/drive/My Drive/recommender/model/checkpoints/3
Epoch 5/5
save checkpoint at /content/drive/My Drive/recommender/model/checkpoints/4
save checkpoint at /content/drive/My Drive/recommender/model/checkpoints/final


In [20]:
model.evaluate(test_loader, batch_size=64, verbose=1)



Eval begin...
step    50/15626 [..............................] - loss: 0.5787 - acc: 0.8525 - ETA: 1:45 - 7ms/step

  format(lhs_dtype, rhs_dtype, lhs_dtype))


Eval samples: 2000027


{'acc': 0.8548319597685431, 'loss': [0.5587272]}

In [27]:
movie_df = pd.read_csv('/content/drive/My Drive/ml-20m/movies.csv')

# 获取一个用户，查看他的推荐电影
user_id = df.userId.sample(1).iloc[0]
movies_watched_by_user = df[df.userId == user_id]
movies_not_watched = movie_df[
    ~movie_df["movieId"].isin(movies_watched_by_user.movieId.values)
]["movieId"]
movies_not_watched = list(
    set(movies_not_watched).intersection(set(movie2movie_encoded.keys()))
)
movies_not_watched = [[movie2movie_encoded.get(x)] for x in movies_not_watched]
user_encoder = user2user_encoded.get(user_id)
user_movie_array = np.hstack(
    ([[user_encoder]] * len(movies_not_watched), movies_not_watched)
)
testdataset = SelfDefinedDataset(user_movie_array, user_movie_array, mode = 'predict')
test_loader = paddle.io.DataLoader(testdataset, batch_size = 26743, shuffle = False, return_list=True,)   

ratings = model.predict(test_loader)
ratings = np.array(ratings)
ratings = np.squeeze(ratings, 0)
ratings = np.squeeze(ratings, 2)
ratings = np.squeeze(ratings, 0)
top_ratings_indices = ratings.argsort()[::-1][0:10]

print(top_ratings_indices)
recommended_movie_ids = [
    movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_ratings_indices
]

print("用户的ID为: {}".format(user_id))
print("====" * 8)
print("用户评分较高的电影：")
print("----" * 8)
top_movies_user = (
    movies_watched_by_user.sort_values(by="rating", ascending=False)
    .head(5)
    .movieId.values
)
movie_df_rows = movie_df[movie_df["movieId"].isin(top_movies_user)]
for row in movie_df_rows.itertuples():
    print(row.title, ":", row.genres)

print("----" * 8)
print("为用户推荐的10部电影：")
print("----" * 8)
recommended_movies = movie_df[movie_df["movieId"].isin(recommended_movie_ids)]
for row in recommended_movies.itertuples():
    print(row.title, ":", row.genres)



Predict begin...
Predict samples: 26606
[  312 22500  8874 18868 15117 18571  8534   634   633  9924]
用户的ID为: 63322
用户评分较高的电影：
--------------------------------
Taxi Driver (1976) : Crime|Drama|Thriller
Rock, The (1996) : Action|Adventure|Thriller
Star Wars: Episode V - The Empire Strikes Back (1980) : Action|Adventure|Sci-Fi
Lawrence of Arabia (1962) : Adventure|Drama|War
Apocalypse Now (1979) : Action|Drama|War
--------------------------------
为用户推荐的10部电影：
--------------------------------
Shawshank Redemption, The (1994) : Crime|Drama
Song of the Little Road (Pather Panchali) (1955) : Drama
World of Apu, The (Apur Sansar) (1959) : Drama
Harakiri (Seppuku) (1962) : Drama
Decalogue, The (Dekalog) (1989) : Crime|Drama|Romance
Sorrow and the Pity, The (Le chagrin et la pitié) (1969) : Documentary|War
Cosmos (1980) : Documentary
Civil War, The (1990) : Documentary|War
Black Mirror (2011) : Drama|Sci-Fi
Fawlty Towers (1975-1979) : Comedy


In [18]:
mkdir /content/drive/My Drive/recommender

mkdir: cannot create directory ‘/content/drive/My’: Operation not supported
mkdir: cannot create directory ‘Drive/recommender’: No such file or directory
