## 数据处理与读取

In [1]:
import numpy as np
import random
from PIL import Image

class DataProcessor():
    # 初始化
    def __init__(self, use_poster=False):
        self.use_poster = use_poster
        
        # 声明数据文件路径
        user_info_path = '../datasets/ml-1m/users.dat'
        movie_info_path = '../datasets/ml-1m/movies.dat'
        if use_poster:
            rating_info_path = '../datasets/ml-1m/new_rating.txt'
        else:
            rating_info_path = '../datasets/ml-1m/ratings.dat'
        self.post_path = '../datasets/ml-1m/posters/'
        
        # 记录用户数据的最大ID
        self.max_user_id = 0
        self.max_user_age = 0
        self.max_user_job = 0
        
        # 获取用户数据
        self.user_info = self.get_user_info(user_info_path)
        
        # 获取电影数据
        self.movie_info, self.movie_titles, self.movie_cats = self.get_movie_info(movie_info_path)
        
        # 记录电影的最大ID
        self.max_movie_id = np.max(list(map(int, self.movie_info.keys())))
        self.max_movie_title = np.max([self.movie_titles[k] for k in self.movie_titles])
        self.max_movie_cat = np.max([self.movie_cats[k] for k in self.movie_cats])
        
        # 获取评分数据
        self.rating_info = self.get_rating_info(rating_info_path)
        
        # 构建数据集
        self.dataset = self.get_dataset(user_info=self.user_info, 
                                       movie_info=self.movie_info, rating_info=self.rating_info)
        
        # 划分数据集
        self.train_dataset = self.dataset[:int(len(self.dataset) * 0.9)]
        self.test_dataset = self.dataset[int(len(self.dataset) * 0.9):]
        
        # 打印测试
        print('用户数据量：{}，电影数据量：{}'.format(len(self.user_info), len(self.movie_info)))
        print('构建的数据集总量：{}，其中训练集：{}，测试集：{}'.format(len(self.dataset), 
                                                   len(self.train_dataset), len(self.test_dataset)))
        
    # 获取用户数据
    def get_user_info(self, path):
        def gender2num(gender):
            return 1 if gender == 'F' else 0

        with open(path, 'r') as f:
            data = f.readlines()

        user_info = {}
        
        for item in data:
            item = item.strip().split('::')
            user_id = item[0]
            user_info[user_id] = {
                'user_id': int(user_id),
                'gender': gender2num(item[1]),
                'age': int(item[2]),
                'job': int(item[3])
            }
            self.max_user_id = max(self.max_user_id, int(user_id))
            self.max_user_age = max(self.max_user_age, int(item[2]))
            self.max_user_job = max(self.max_user_job, int(item[3]))

        return user_info
    
    # 获取电影数据
    def get_movie_info(self, path):
        with open(path, 'r', encoding='ISO-8859-1') as f:
            data = f.readlines()

        # 建立3个字典，分别存放电影的所有、名称、类别信息
        movie_info, movie_titles, movie_cats = {}, {}, {}

        # 对电影名称、类别中不同的单词计数
        t_count, c_count = 1, 1

        # 按行读取数据并处理
        for item in data:
            item = item.strip().split('::')
            v_id = item[0]
            v_title = item[1][:-7]  # 去除title里的上映年份
            v_year = item[1][-5:-1] # 获取上映年份
            v_cat = item[2].split('|')

            # 统计电影名称包含的单词，并给每个单词一个序号，存放在movie_titles中
            titles = v_title.split()
            for t in titles:
                if t not in movie_titles:
                    movie_titles[t] = t_count
                    t_count += 1

            # 统计电影类别包含的单词，并给每个单词一个序号，存放在movie_cat中
            for c in v_cat:
                if c not in movie_cats:
                    movie_cats[c] = c_count
                    c_count += 1

            # 补0使电影名称对应的列表长度为15（最长的电影名称长度为15）
            title = [movie_titles[k] for k in titles]
            while len(title) < 15:
                title.append(0)

            # 补0使电影类别对应的列表长度为6（最多类别为6）
            cat = [movie_cats[k] for k in v_cat]
            while len(cat) < 6:
                cat.append(0)

            # 保存电影完整信息
            movie_info[v_id] = {
                'movie_id': int(v_id),
                'title': title,
                'cat': cat,
                'year': int(v_year)
            }

        return movie_info, movie_titles, movie_cats
    
    # 获取评分数据
    def get_rating_info(self, path):
        with open(path, 'r') as f:
            data = f.readlines()

        rating_info = {}

        for item in data:
            item = item.strip().split('::')
            user_id, movie_id, score = item[0], item[1], item[2]
            if user_id not in rating_info.keys():
                rating_info[user_id] = {movie_id: float(score)}
            else:
                rating_info[user_id][movie_id] = float(score)

        return rating_info
    
    # 构建数据集
    def get_dataset(self, user_info, movie_info, rating_info):
        dataset = []

        # 按照评分数据的key值索引数据
        for user_id in rating_info.keys():
            user_ratings = rating_info[user_id]
            for movie_id in user_ratings:
                dataset.append({
                    'user_info': user_info[user_id],
                    'movie_info': movie_info[movie_id],
                    'score': user_ratings[movie_id]
                })

        return dataset
    
    # 数据加载器
    def load_data(self, dataset=None, mode='train'):
        BATCHSIZE = 256  # 定义批次大小
        data_length = len(dataset)
        index_list = list(range(data_length))

        # 定义数据迭代加载器
        def data_generator():
            # 训练模式下，打乱训练数据
            if mode == 'train':
                random.shuffle(index_list)

            # 声明每个特征的列表
            user_id_list, user_gender_list, user_age_list, user_job_list = [], [], [], []
            movie_id_list, movie_title_list, movie_cat_list, movie_poster_list = [], [], [], []
            score_list = []

            # 按索引遍历输入数据集
            for idx, i in enumerate(index_list):
                # 获取特征数据并保存到对应特征列表中
                user_id_list.append(dataset[i]['user_info']['user_id'])
                user_gender_list.append(dataset[i]['user_info']['gender'])
                user_age_list.append(dataset[i]['user_info']['age'])
                user_job_list.append(dataset[i]['user_info']['job'])

                movie_id_list.append(dataset[i]['movie_info']['movie_id'])
                movie_title_list.append(dataset[i]['movie_info']['title'])
                movie_cat_list.append(dataset[i]['movie_info']['cat'])

                # 如果使用电影海报数据
                if self.use_poster:
                    movie_id = dataset[i]['movie_info']['movie_id']
                    poster = Image.open(poster_path + 'mov_id{}.jpg'.format(str(movie_id)))
                    poster = poster.resize([64, 64])
                    if len(poster.size) <= 2:
                        poster = poster.convert('RGB')

                    movie_poster_list.append(np.array(poster))

                score_list.append(int(dataset[i]['score']))

                # 如果读取到数据量达到定义的批次大小，则返回当前批次
                if len(user_id_list) == BATCHSIZE:
                    # 转换list到ndarray，并reshape到固定形状
                    user_id_arr = np.array(user_id_list)
                    user_gender_arr = np.array(user_gender_list)
                    user_age_arr = np.array(user_age_list)
                    user_job_arr = np.array(user_job_list)

                    movie_id_arr = np.array(movie_id_list)
                    movie_title_arr = np.reshape(np.array(movie_title_list), [BATCHSIZE, 1, 15]).astype(np.int64)
                    movie_cat_arr = np.reshape(np.array(movie_cat_list), [BATCHSIZE, 6]).astype(np.int64)

                    if self.use_poster:
                        movie_poster_arr = np.reshape(np.array(movie_poster_list)/127.5 - 1, 
                                                      [BATCHSIZE, 3, 64, 64]).astype(np.float32)
                    else:
                        movie_poster_arr = np.array([0.])

                    score_arr = np.reshape(np.array(score_list), [-1, 1]).astype(np.float32)

                    # 返回当前批次数据
                    yield [user_id_arr, user_gender_arr, user_age_arr, user_job_arr], \
                            [movie_id_arr, movie_title_arr, movie_cat_arr, movie_poster_arr], score_arr

                    # 清空数据
                    user_id_list, user_gender_list, user_age_list, user_job_list = [], [], [], []
                    movie_id_list, movie_title_list, movie_cat_list, movie_poster_list = [], [], [], []
                    score_list = []

        return data_generator

## 构建神经网络

In [24]:
import paddle
import paddle.nn as nn
from paddle.nn import Linear, Embedding, Conv2D
import paddle.nn.functional as F

import numpy as np
import random
import math

class Model(nn.Layer):
    def __init__(self, use_poster, use_mov_title, use_mov_cat, use_age_job, fc_sizes):
        super(Model, self).__init__()
        
        # 将传入的name信息和bool型参数添加到模型类中
        self.use_mov_poster = use_poster
        self.use_mov_title = use_mov_title
        self.use_usr_age_job = use_age_job
        self.use_mov_cat = use_mov_cat
        self.fc_sizes = fc_sizes
        
        # 获取数据集的信息，并构建训练和验证集的数据迭代器
        Dataset = DataProcessor(self.use_mov_poster)
        self.Dataset = Dataset
        self.trainset = self.Dataset.train_dataset
        self.valset = self.Dataset.test_dataset
        self.train_loader = self.Dataset.load_data(dataset=self.trainset, mode='train')
        self.valid_loader = self.Dataset.load_data(dataset=self.valset, mode='valid')

        """ define network layer for embedding usr info """
        USR_ID_NUM = Dataset.max_user_id + 1
        # 对用户ID做映射，并紧接着一个Linear层
        self.usr_emb = Embedding(num_embeddings=USR_ID_NUM, embedding_dim=32, sparse=False)
        self.usr_fc = Linear(in_features=32, out_features=32)
        
        # 对用户性别信息做映射，并紧接着一个Linear层
        USR_GENDER_DICT_SIZE = 2
        self.usr_gender_emb = Embedding(num_embeddings=USR_GENDER_DICT_SIZE, embedding_dim=16)
        self.usr_gender_fc = Linear(in_features=16, out_features=16)
        
        # 对用户年龄信息做映射，并紧接着一个Linear层
        USR_AGE_DICT_SIZE = Dataset.max_user_age + 1
        self.usr_age_emb = Embedding(num_embeddings=USR_AGE_DICT_SIZE, embedding_dim=16)
        self.usr_age_fc = Linear(in_features=16, out_features=16)
        
        # 对用户职业信息做映射，并紧接着一个Linear层
        USR_JOB_DICT_SIZE = Dataset.max_user_job + 1
        self.usr_job_emb = Embedding(num_embeddings=USR_JOB_DICT_SIZE, embedding_dim=16)
        self.usr_job_fc = Linear(in_features=16, out_features=16)
        
        # 新建一个Linear层，用于整合用户数据信息
        self.usr_combined = Linear(in_features=80, out_features=200)
        
        """ define network layer for embedding usr info """
        # 对电影ID信息做映射，并紧接着一个Linear层
        MOV_DICT_SIZE = Dataset.max_movie_id + 1
        self.mov_emb = Embedding(num_embeddings=MOV_DICT_SIZE, embedding_dim=32)
        self.mov_fc = Linear(in_features=32, out_features=32)
        
        # 对电影类别做映射
        CATEGORY_DICT_SIZE = len(Dataset.movie_cats) + 1
        self.mov_cat_emb = Embedding(num_embeddings=CATEGORY_DICT_SIZE, embedding_dim=32, sparse=False)
        self.mov_cat_fc = Linear(in_features=32, out_features=32)
        
        # 对电影名称做映射
        MOV_TITLE_DICT_SIZE = len(Dataset.movie_titles) + 1
        self.mov_title_emb = Embedding(num_embeddings=MOV_TITLE_DICT_SIZE, embedding_dim=32, sparse=False)
        self.mov_title_conv = Conv2D(in_channels=1, out_channels=1, kernel_size=(3, 1), stride=(2,1), padding=0)
        self.mov_title_conv2 = Conv2D(in_channels=1, out_channels=1, kernel_size=(3, 1), stride=1, padding=0)
        
        # 新建一个FC层，用于整合电影特征
        self.mov_concat_embed = Linear(in_features=96, out_features=200)

        user_sizes = [200] + self.fc_sizes
        acts = ["relu" for _ in range(len(self.fc_sizes))]
        self._user_layers = []
        for i in range(len(self.fc_sizes)):
            linear = Linear(
                in_features=user_sizes[i],
                out_features=user_sizes[i + 1],
                weight_attr=paddle.ParamAttr(
                    initializer=nn.initializer.Normal(
                        std=1.0 / math.sqrt(user_sizes[i]))))
            self.add_sublayer('linear_user_%d' % i, linear)
            self._user_layers.append(linear)
            if acts[i] == 'relu':
                act = nn.ReLU()
                self.add_sublayer('user_act_%d' % i, act)
                self._user_layers.append(act)
                
        #电影特征和用户特征使用了不同的全连接层，不共享参数
        movie_sizes = [200] + self.fc_sizes
        acts = ["relu" for _ in range(len(self.fc_sizes))]
        self._movie_layers = []
        for i in range(len(self.fc_sizes)):
            linear = nn.Linear(
                in_features=movie_sizes[i],
                out_features=movie_sizes[i + 1],
                weight_attr=paddle.ParamAttr(
                    initializer=nn.initializer.Normal(
                        std=1.0 / math.sqrt(movie_sizes[i]))))
            self.add_sublayer('linear_movie_%d' % i, linear)
            self._movie_layers.append(linear)
            if acts[i] == 'relu':
                act = nn.ReLU()
                self.add_sublayer('movie_act_%d' % i, act)
                self._movie_layers.append(act)
                
    # 定义计算用户特征的前向运算过程
    def get_usr_feat(self, usr_var):
        """ get usr features"""
        # 获取到用户数据
        usr_id, usr_gender, usr_age, usr_job = usr_var
        # 将用户的ID数据经过embedding和Linear计算，得到的特征保存在feats_collect中
        feats_collect = []
        usr_id = self.usr_emb(usr_id)
        usr_id = self.usr_fc(usr_id)
        usr_id = F.relu(usr_id)
        feats_collect.append(usr_id)
        
        # 计算用户的性别特征，并保存在feats_collect中
        usr_gender = self.usr_gender_emb(usr_gender)
        usr_gender = self.usr_gender_fc(usr_gender)
        usr_gender = F.relu(usr_gender)
        feats_collect.append(usr_gender)
        # 选择是否使用用户的年龄-职业特征
        if self.use_usr_age_job:
            # 计算用户的年龄特征，并保存在feats_collect中
            usr_age = self.usr_age_emb(usr_age)
            usr_age = self.usr_age_fc(usr_age)
            usr_age = F.relu(usr_age)
            feats_collect.append(usr_age)
            # 计算用户的职业特征，并保存在feats_collect中
            usr_job = self.usr_job_emb(usr_job)
            usr_job = self.usr_job_fc(usr_job)
            usr_job = F.relu(usr_job)
            feats_collect.append(usr_job)
        
        # 将用户的特征级联，并通过Linear层得到最终的用户特征
        usr_feat = paddle.concat(feats_collect, axis=1)
        user_features = F.tanh(self.usr_combined(usr_feat))
        #通过3层全链接层，获得用于计算相似度的用户特征和电影特征
        for n_layer in self._user_layers:
            user_features = n_layer(user_features)

        return user_features
    
    # 定义电影特征的前向计算过程
    def get_mov_feat(self, mov_var):
        """ get movie features"""
        # 获得电影数据
        mov_id, mov_title, mov_cat, mov_poster = mov_var
        feats_collect = []
        # 获得batchsize的大小
        batch_size = mov_id.shape[0]
        # 计算电影ID的特征，并存在feats_collect中
        mov_id = self.mov_emb(mov_id)
        mov_id = self.mov_fc(mov_id)
        mov_id = F.relu(mov_id)
        feats_collect.append(mov_id)
        
        # 如果使用电影的种类数据，计算电影种类特征的映射
        if self.use_mov_cat:
            # 计算电影种类的特征映射，对多个种类的特征求和得到最终特征
            mov_cat = self.mov_cat_emb(mov_cat)
            mov_cat = paddle.sum(mov_cat, axis=1, keepdim=False)

            mov_cat = self.mov_cat_fc(mov_cat)
            feats_collect.append(mov_cat)

        if self.use_mov_title:
            # 计算电影名字的特征映射，对特征映射使用卷积计算最终的特征
            mov_title = self.mov_title_emb(mov_title)
            mov_title = F.relu(self.mov_title_conv2(F.relu(self.mov_title_conv(mov_title))))
            mov_title = paddle.sum(mov_title, axis=2, keepdim=False)
            mov_title = F.relu(mov_title)
            mov_title = paddle.reshape(mov_title, [batch_size, -1])
            feats_collect.append(mov_title)
            
        # 使用一个全连接层，整合所有电影特征，映射为一个200维的特征向量
        mov_feat = paddle.concat(feats_collect, axis=1)
        mov_features = F.tanh(self.mov_concat_embed(mov_feat))

        for n_layer in self._movie_layers:
            mov_features = n_layer(mov_features)

        return mov_features
    
    # 定义个性化推荐算法的前向计算
    def forward(self, usr_var, mov_var):
        # 计算用户特征和电影特征
        usr_feat = self.get_usr_feat(usr_var)
        mov_feat = self.get_mov_feat(mov_var)

        #通过3层全连接层，获得用于计算相似度的用户特征和电影特征
#         for n_layer in self._user_layers:
#             user_features = n_layer(user_features)

#         for n_layer in self._movie_layers:
#             mov_features = n_layer(mov_features)

        # 根据计算的特征计算相似度
        res = F.cosine_similarity(usr_feat, mov_feat)
        
        # 将相似度扩大范围到和电影评分相同数据范围
        res = paddle.scale(res, scale=5)
        
        return usr_feat, mov_feat, res

## 模型训练

In [25]:
def train(model):
    # 配置训练参数
    paddle.set_device('cpu')
    lr = 0.001  # 学习率
    epoches = 10  # 训练轮次
    
    # 启动训练
    model.train()
    
    # 加载数据读取器
    data_loader = model.train_loader
    
    # 使用adam优化器
    opt = paddle.optimizer.Adam(learning_rate=lr, parameters=model.parameters())
    
    for epoch in range(0, epoches):
        for idx, data in enumerate(data_loader()):
            # 获得数据，并转为tensor格式
            usr, mov, score = data
            usr_v = [paddle.to_tensor(var) for var in usr]
            mov_v = [paddle.to_tensor(var) for var in mov]
            scores_label = paddle.to_tensor(score)
            # 计算出算法的前向计算结果
            usr_feat, mov_feat, scores_predict = model(usr_v, mov_v)
            # 计算loss
            loss = F.square_error_cost(scores_predict, scores_label)
            avg_loss = paddle.mean(loss)

            if idx % 500 == 0:
                print("epoch: {}, batch_id: {}, loss is: {}".format(epoch, idx, avg_loss.numpy()))
                
            # 损失函数下降，并清除梯度
            avg_loss.backward()
            opt.step()
            opt.clear_grad()

        # 每个epoch 保存一次模型
        paddle.save(model.state_dict(), '../models/epoch'+str(epoch)+'.pdparams')

In [26]:
# 启动训练
fc_sizes = [128, 64, 32]
model = Model(use_poster=False, use_mov_title=True, use_mov_cat=True, use_age_job=True, fc_sizes=fc_sizes)
train(model)

用户数据量：6040，电影数据量：3883
构建的数据集总量：1000209，其中训练集：900188，测试集：100021
epoch: 0, batch_id: 0, loss is: [4.898512]
epoch: 0, batch_id: 500, loss is: [1.1242074]
epoch: 0, batch_id: 1000, loss is: [1.1168172]
epoch: 0, batch_id: 1500, loss is: [1.3267958]
epoch: 0, batch_id: 2000, loss is: [1.4226235]
epoch: 0, batch_id: 2500, loss is: [1.2343428]
epoch: 0, batch_id: 3000, loss is: [1.2633283]
epoch: 0, batch_id: 3500, loss is: [1.3778837]
epoch: 1, batch_id: 0, loss is: [1.2331593]
epoch: 1, batch_id: 500, loss is: [1.2399149]
epoch: 1, batch_id: 1000, loss is: [1.1923554]
epoch: 1, batch_id: 1500, loss is: [1.2333798]
epoch: 1, batch_id: 2000, loss is: [1.3206844]
epoch: 1, batch_id: 2500, loss is: [1.3114982]
epoch: 1, batch_id: 3000, loss is: [1.0118706]
epoch: 1, batch_id: 3500, loss is: [1.1784894]
epoch: 2, batch_id: 0, loss is: [1.2836294]
epoch: 2, batch_id: 500, loss is: [1.2054243]
epoch: 2, batch_id: 1000, loss is: [1.1820899]
epoch: 2, batch_id: 1500, loss is: [1.1381238]
epoch: 2, 

## 模型评估

对训练的模型在验证集上做评估：
- 评分预测精度ACC(Accuracy)：将预测的float数字转成整数，计算预测评分和真实评分的匹配度。评分误差在0.5分以内的算正确，否则算错误。
- 评分预测误差（Mean Absolut Error）MAE：计算预测评分和真实评分之间的平均绝对误差。
- 均方根误差 （Root Mean Squard Error）RMSE：计算预测评分和真实值之间的平均平方误差。

In [29]:
from math import sqrt

def evaluation(model, params_file_path):
    model_state_dict = paddle.load(params_file_path)
    model.load_dict(model_state_dict)
    model.eval()

    acc_set = []
    avg_loss_set = []
    squaredError=[]
    for idx, data in enumerate(model.valid_loader()):
        usr, mov, score_label = data
        usr_v = [paddle.to_tensor(var) for var in usr]
        mov_v = [paddle.to_tensor(var) for var in mov]

        _, _, scores_predict = model(usr_v, mov_v)

        pred_scores = scores_predict.numpy()
        
        avg_loss_set.append(np.mean(np.abs(pred_scores - score_label)))
        squaredError.extend(np.abs(pred_scores - score_label)**2)

        diff = np.abs(pred_scores - score_label)
        diff[diff>0.5] = 1
        acc = 1 - np.mean(diff)
        acc_set.append(acc)
    RMSE=sqrt(np.sum(squaredError) / len(squaredError))
   
    return np.mean(acc_set), np.mean(avg_loss_set), RMSE

In [30]:
param_path = "../models/epoch"
for i in range(10):
    acc, mae, RMSE = evaluation(model, param_path+str(i)+'.pdparams')
    print("ACC:", acc, "MAE:", mae,'RMSE:',RMSE)

ACC: 0.20668767812924507 MAE: 0.9399596 RMSE: 18.014302766598174
ACC: 0.18804715810677944 MAE: 0.94934756 RMSE: 18.04095623194127
ACC: 0.20986424195460784 MAE: 0.9393253 RMSE: 18.028467517391515
ACC: 0.19535081569965068 MAE: 0.9456856 RMSE: 18.025422196236182
ACC: 0.19717271343255655 MAE: 0.945309 RMSE: 18.029201962300935
ACC: 0.19697357477285923 MAE: 0.9451507 RMSE: 18.024269162608338
ACC: 0.20950672320830516 MAE: 0.9392087 RMSE: 18.018675207961323
ACC: 0.20295047668310312 MAE: 0.94234097 RMSE: 18.019419501378692
ACC: 0.19949561479764108 MAE: 0.94398254 RMSE: 18.021888401010077
ACC: 0.1986533215412727 MAE: 0.9442707 RMSE: 18.02125425623724


## 保存特征

模型训练完成后，得到每个用户、电影对应的特征向量，接下来将这些特征向量保存到本地，这样在进行推荐时，不需要使用神经网络重新提取特征，节约时间成本。

保存特征的基本流程：
1. 加载预训练好的模型参数。
2. 输入数据集的数据，提取整个数据集的用户特征和电影特征。注意数据输入到模型前，要先转成内置的Tensor类型并保证尺寸正确。
3. 分别得到用户特征向量和电影特征向量，使用Pickle库保存字典形式的特征向量。

In [40]:
# 以ID为索引，以字典格式存储数据
from PIL import Image
import pickle

def get_usr_mov_features(model, params_file_path, poster_path):
    paddle.set_device('cpu') 
    usr_pkl = {}
    mov_pkl = {}
    
    # 定义将list中每个元素转成tensor的函数
    def list2tensor(inputs, shape):
        inputs = np.reshape(np.array(inputs).astype(np.int64), shape)
        return paddle.to_tensor(inputs)

    # 加载模型参数到模型中，设置为验证模式eval（）
    model_state_dict = paddle.load(params_file_path)
    model.load_dict(model_state_dict)
    model.eval()
    
    # 获得整个数据集的数据
    dataset = model.Dataset.dataset
    
    for i in range(len(dataset)):
        # 获得用户数据，电影数据，评分数据  
        # 本案例只转换所有在样本中出现过的user和movie，实际中可以使用业务系统中的全量数据
        usr_info, mov_info, score = dataset[i]['user_info'], dataset[i]['movie_info'],dataset[i]['score']
        usrid = str(usr_info['user_id'])
        movid = str(mov_info['movie_id'])

        # 获得用户数据，计算得到用户特征，保存在usr_pkl字典中
        if usrid not in usr_pkl.keys():
            usr_id_v = list2tensor(usr_info['user_id'], [1])
            usr_age_v = list2tensor(usr_info['age'], [1])
            usr_gender_v = list2tensor(usr_info['gender'], [1])
            usr_job_v = list2tensor(usr_info['job'], [1])

            usr_in = [usr_id_v, usr_gender_v, usr_age_v, usr_job_v]
            usr_feat = model.get_usr_feat(usr_in)

            usr_pkl[usrid] = usr_feat.numpy()
        
        # 获得电影数据，计算得到电影特征，保存在mov_pkl字典中
        if movid not in mov_pkl.keys():
            mov_id_v = list2tensor(mov_info['movie_id'], [1])
            mov_tit_v = list2tensor(mov_info['title'], [1, 1, 15])
            mov_cat_v = list2tensor(mov_info['cat'], [1, 6])

            mov_in = [mov_id_v, mov_tit_v, mov_cat_v, None]
            mov_feat = model.get_mov_feat(mov_in)

            mov_pkl[movid] = mov_feat.numpy() 
            
    print(len(usr_pkl.keys()))
    print(len(mov_pkl.keys()))
    
    # 保存特征到本地
    pickle.dump(usr_pkl, open('../models/usr_feat.pkl', 'wb'))
    pickle.dump(mov_pkl, open('../models/mov_feat.pkl', 'wb'))

In [41]:
param_path = '../models/epoch9.pdparams'
poster_path = '../datasets/ml-1m/posters/'

get_usr_mov_features(model, param_path, poster_path)

6040
3706
