# 用TensorFlow实现矩阵分解
## 数据导入与数据观察

In [None]:
import numpy as np
import pandas as pd

col_names = ['user', 'item', 'rate', 'timestamp']
df = pd.read_csv('./movielens/ml-1m/ratings.dat', sep = '::', header = None, names = col_names, engine = 'python')
df.info()

In [None]:
df.sort_values(by = ['user']).tail()

In [None]:
df.sort_values(by = ['user']).head()

In [None]:
df.sort_values(by = ['item']).tail()

In [None]:
df.sort_values(by = ['item']).head()

In [None]:
df['user'] -= 1
df['item'] -= 1

In [None]:
df['rate'] = df['rate'].astype('float64')

In [None]:
len(df['user'].unique())

In [None]:
len(df['item'].unique())
##item的id没有完全由1到3952

## 生成batch数据

In [1]:
import numpy as np
import pandas as pd

def read_and_process(filename, sep = '::'):
    col_names = ['user', 'item', 'rate', 'timestamp']
    df = pd.read_csv(filename, sep = sep, header = None, names = col_names, engine = 'python')
    df['user'] -= 1
    df['item'] -= 1
    for col in ('user', 'item'):
        df[col] = df[col].astype(np.float32)
    df['rate'] = df['rate'].astype(np.float32)
    return df

def get_data():
    df = read_and_process("./movielens/ml-1m/ratings.dat", sep = '::')
    rows = len(df)
    df = df.iloc[np.random.permutation(rows)].reset_index(drop = True)##打乱数据
    split_index = int(rows * 0.9)
    df_train = df[0: split_index]
    df_test = df[split_index:].reset_index(drop = True)
    print(df_train.shape, df_test.shape)
    return df_train, df_test
    
class ShuffleDataIterator(object):
    def __init__(self, inputs, batch_size = 10):
        ##注意这里的输入
        self.inputs = inputs
        self.batch_size = batch_size
        self.num_cols = len(self.inputs)
        self.len = len(self.inputs[0])
        self.inputs = np.transpose(np.vstack([np.array(self.inputs[i]) for i in range(self.num_cols)]))
        
    def __iter__(self):
        return self
    
    def __len__(self):
        return self.len
    
    def __next__(self):
        return self.next()
    
    def next(self):
        ids = np.random.randint(0, self.len, (self.batch_size,))
        out = self.inputs[ids, :]
        return [out[:, i] for i in range(self.num_cols)]
       
class OneEpochDataIterator(ShuffleDataIterator):
    def __init__(self, inputs, batch_size=10):
        super(OneEpochDataIterator, self).__init__(inputs, batch_size=batch_size)
        if batch_size > 0:
            self.idx_group = np.array_split(np.arange(self.len), np.ceil(self.len / batch_size))
        else:
            self.idx_group = [np.arange(self.len)]
        self.group_id = 0
    ##next函数不能写在__init__下面
    def next(self):
        if self.group_id >= len(self.idx_group):
            self.group_id = 0
            raise StopIteration
        out = self.inputs[self.idx_group[self.group_id], :]
        self.group_id += 1
        return [out[:, i] for i in range(self.num_cols)]

## 构建神经网络和迭代优化部分

In [2]:
import tensorflow as tf

In [3]:
def inference_svd(user_batch, item_batch, user_num, item_num, dim = 5, device = '/cpu:0'):
    with tf.device('/cpu:0'):
        global_bias = tf.get_variable('global_bias', shape = [])
        w_bias_user = tf.get_variable('embd_bias_user', shape = [user_num])
        w_bias_item = tf.get_variable('embd_bias_item', shape = [item_num])
        
        bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name = 'bias_user')
        bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name = 'bias_item')
        
        w_user = tf.get_variable('embd_user', shape = [user_num, dim], initializer = tf.truncated_normal_initializer(stddev = 0.02))
        w_item = tf.get_variable('embd_item', shape = [item_num, dim], initializer = tf.truncated_normal_initializer(stddev = 0.02))
        
        embd_user = tf.nn.embedding_lookup(w_user, user_batch, name = 'embedding_user')
        embd_item = tf.nn.embedding_lookup(w_item, item_batch, name = 'embedding_item')
    
    with tf.device(device):
        infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)##tf.multiply是元素点乘
        infer = tf.add(infer, global_bias)
        infer = tf.add(infer, bias_user)
        infer = tf.add(infer, bias_item, name = 'svd_inference')
        
        regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item), name = 'svd_regularization')
    return infer, regularizer

def optimizer(infer, regularizer, rate_batch, learning_rate = 0.001, reg = 0.1, device = '/cpu:0'):
    global_step = tf.train.get_global_step()
    assert global_step is not None
    with tf.device(device):
        cost_l2 = tf.nn.l2_loss(tf.subtract(infer, rate_batch))
        penalty = tf.constant(reg, dtype = tf.float32, shape = [], name = 'l2')
        cost = tf.add(cost_l2, tf.multiply(regularizer, penalty))
        train_op = tf.train.AdamOptimizer(learning_rate).minimize(cost, global_step = global_step)
    return cost, train_op

## 定义训练函数

In [4]:
import time 
from collections import deque
import numpy as np
import pandas as pd
from six import next
import tensorflow as tf
from tensorflow.core.framework import summary_pb2

np.random.seed(12321)

batch_size = 2000
user_num = 6040
item_num = 3952
dim = 15
epoch_max = 200
device = '/cpu:0'

def make_scalar_summary(name, val):
    return summary_pb2.Summary(value = [summary_pb2.Summary.Value(tag = name, simple_value = val)])

def svd(train, test):
    samples_per_batch = len(train) // batch_size
    
    iter_train = ShuffleDataIterator([train['user'], train['item'], train['rate']], batch_size = batch_size)##注意iuputs
    iter_test = OneEpochDataIterator([test['user'], test['item'], test['rate']], batch_size = -1)
    user_batch = tf.placeholder(tf.int32, shape = [None], name = 'id_user')
    item_batch = tf.placeholder(tf.int32, shape = [None], name = 'id_item')
    rate_batch = tf.placeholder(tf.float32, shape = [None])
    
    infer, regularizer = inference_svd(user_batch, item_batch, user_num = user_num, item_num = item_num, dim = dim, device = device)
    global_step = tf.train.get_or_create_global_step()
    cost, train_op = optimizer(infer, regularizer, rate_batch, learning_rate = 0.001, reg = 0.05, device = device)
    
    init_op = tf.global_variables_initializer()
    
    with tf.Session() as sess:
        sess.run(init_op)
        summary_writer = tf.summary.FileWriter(logdir = './data', graph = sess.graph)
        print('{} {} {} {}'.format('epoch', 'train_error', 'val_error', 'elapsed_time'))
        errors = deque(maxlen = samples_per_batch)
        start = time.time()
        for i in range(epoch_max * samples_per_batch):
            users, items, rates = next(iter_train)
            _, pred_batch = sess.run([train_op, infer], feed_dict = {user_batch: users, item_batch: items, rate_batch: rates})
            pred_batch = np.clip(pred_batch, 1.0, 5.0)
            errors.append(np.power(pred_batch - rates, 2))
            if i % samples_per_batch == 0:
                train_err = np.sqrt(np.mean(errors))
                test_err2 = np.array([])
                for users, items, rates in iter_test:
                    pred_batch = sess.run(infer, feed_dict = {user_batch: users, item_batch: items})
                    pred_batch = np.clip(pred_batch, 1.0, 5.0)
                    test_err2 = np.append(test_err2, np.power((pred_batch - rates), 2))
                end = time.time()
                test_err = np.sqrt(np.mean(test_err2))
                print('{:3d} {:f} {:f} {:f}(s)'.format(i // samples_per_batch, train_err, test_err, end - start))
                train_err_summary = make_scalar_summary('training_error', train_err)
                test_err_summary = make_scalar_summary('testing_error', test_err)
                summary_writer.add_summary(train_err_summary, i)
                summary_writer.add_summary(test_err_summary, i)
                start = end

## 读取数据并训练

In [5]:
df_train, df_test = get_data()
svd(df_train, df_test)

(900188, 4) (100021, 4)
epoch train_error val_error elapsed_time
  0 2.453269 2.434387 0.152168(s)
  1 1.862537 1.114424 1.675086(s)
  2 0.987516 0.945208 1.737937(s)
  3 0.926431 0.925056 2.171259(s)
  4 0.913882 0.919216 2.222362(s)
  5 0.908909 0.915572 2.104740(s)
  6 0.904973 0.912864 1.676088(s)
  7 0.901452 0.910505 1.739690(s)
  8 0.897963 0.906348 1.887902(s)
  9 0.893615 0.902586 1.869329(s)
 10 0.889465 0.898687 1.874177(s)
 11 0.884281 0.894511 2.018069(s)
 12 0.879608 0.891036 1.931045(s)
 13 0.873678 0.887575 1.757693(s)
 14 0.871024 0.884662 1.577150(s)
 15 0.866396 0.882199 1.604204(s)
 16 0.861624 0.879178 1.608652(s)
 17 0.858588 0.876365 1.662186(s)
 18 0.854816 0.873721 1.606498(s)
 19 0.848769 0.870985 1.645293(s)
 20 0.846123 0.868464 1.677950(s)
 21 0.839872 0.866077 1.624702(s)
 22 0.836173 0.863375 1.619513(s)
 23 0.832099 0.861184 1.620034(s)
 24 0.826820 0.858698 1.647357(s)
 25 0.822007 0.856891 1.605642(s)
 26 0.818050 0.855212 1.646852(s)
 27 0.813763 0.85

# 用surprise库实现电影推荐

In [6]:
from surprise import KNNBaseline
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate

reader = Reader(line_format = 'user item rating timestamp', sep = '::')
data = Dataset.load_from_file('./movielens/ml-1m/ratings.dat', reader = reader)
algo = KNNBaseline()
perf = cross_validate(algo, data, measures = ['RMSE', 'MAE'], cv = 3, verbose = True)
##verbose = True是指打印出过程

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
MAE (testset)     0.7107  0.7092  0.7102  0.7100  0.0006  
RMSE (testset)    0.9009  0.8981  0.8989  0.8993  0.0012  
Fit time          26.08   27.80   26.41   26.76   0.74    
Test time         284.04  295.03  297.63  292.24  5.89    


In [7]:
with open('./movielens/ml-1m/movies.dat', 'r', encoding = 'ISO-8859-1') as f:
    movies_id_dic = {}
    id_movies_dic = {}
    for line in f.readlines():
        movies = line.strip().split('::')
        id_movies_dic[int(movies[0]) - 1] = movies[1]
        movies_id_dic[movies[1]] = int(movies[0]) - 1

In [8]:
movie_id = int(movies_id_dic['Toy Story (1995)'])
print(movie_id)

0


In [9]:
toy_story_neighbors = algo.get_neighbors(movie_id, k = 10)
print(toy_story_neighbors)

[543, 556, 887, 938, 970, 1240, 1302, 1530, 1664, 1668]


In [10]:
print('最接近《Toy Story (1995)》的10部电影是：')
for i in toy_story_neighbors:
    print(id_movies_dic[i])

最接近《Toy Story (1995)》的10部电影是：
Striking Distance (1993)
Mamma Roma (1962)
Land Before Time III: The Time of the Great Giving (1995)
Reluctant Debutante, The (1958)
Cat on a Hot Tin Roof (1958)
Braindead (1992)
Man Who Would Be King, The (1975)
Losing Chase (1996)
Bean (1997)
Tango Lesson, The (1997)


# 用pyspark实现矩阵分解与预测

## 配置spark的运行环境

In [11]:
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel

conf = SparkConf().setMaster('local').setAppName('movielenALS').set('spark.excutor.memory', '2g')
sc = SparkContext.getOrCreate(conf)
print(sc)

<SparkContext master=local appName=movielenALS>


## 载入数据和将数据转换为RDD格式

In [12]:
ratings_data = sc.textFile('./movielens/ml-1m/ratings.dat')
print(ratings_data.first())

1::1193::5::978300760


In [13]:
ratings_int = ratings_data.map(lambda x: x.split('::')[0:3])
print(ratings_int.first())

['1', '1193', '5']


In [14]:
from pyspark.mllib.recommendation import Rating

rates_data = ratings_int.map(lambda x: Rating(int(x[0]), int(x[1]), int(x[2])))
print(rates_data.first())

Rating(user=1, product=1193, rating=5.0)


## 预测
### 预测user14对item25的评分

In [15]:
sc.setCheckpointDir('checkpoint/')
ALS.checkpointInterval = 2
model = ALS.train(ratings = rates_data, rank = 20, iterations = 5, lambda_ = 0.02)
print(model.predict(14, 25))

2.870867324270309


### 预测item25的最值得推荐的10个user

In [16]:
print(model.recommendUsers(25, 10))

[Rating(user=3915, product=25, rating=6.347806214632151), Rating(user=1459, product=25, rating=6.279613647532278), Rating(user=512, product=25, rating=5.974529079431231), Rating(user=842, product=25, rating=5.825610793856784), Rating(user=4558, product=25, rating=5.800107840522846), Rating(user=3352, product=25, rating=5.758816556787024), Rating(user=4315, product=25, rating=5.7355643830448315), Rating(user=3149, product=25, rating=5.681583926008349), Rating(user=1583, product=25, rating=5.679878368589941), Rating(user=2687, product=25, rating=5.638816062611307)]


### 预测user14的最值得推荐的10个item

In [17]:
print(model.recommendProducts(14, 10))

[Rating(user=14, product=202, rating=7.603132457441072), Rating(user=14, product=2813, rating=7.337182882853998), Rating(user=14, product=2192, rating=7.252910718255526), Rating(user=14, product=2843, rating=7.218281540447825), Rating(user=14, product=263, rating=7.015929591785064), Rating(user=14, product=3486, rating=6.839844434537472), Rating(user=14, product=2931, rating=6.813029575511133), Rating(user=14, product=811, rating=6.775292346221384), Rating(user=14, product=2904, rating=6.763012772568462), Rating(user=14, product=1857, rating=6.647223383730378)]


### 预测出每个user最值得被推荐的3个item

In [18]:
print(model.recommendProductsForUsers(3).collect())

[(4551, (Rating(user=4551, product=2931, rating=6.702551600582929), Rating(user=4551, product=263, rating=6.110408128732663), Rating(user=4551, product=2932, rating=5.969799865949127))), (667, (Rating(user=667, product=2964, rating=5.985525908193226), Rating(user=667, product=2847, rating=5.612322211175831), Rating(user=667, product=1585, rating=5.425964838445017))), (5618, (Rating(user=5618, product=2493, rating=6.009643412940337), Rating(user=5618, product=3092, rating=5.994983455581075), Rating(user=5618, product=2964, rating=5.97109676684356))), (5354, (Rating(user=5354, product=128, rating=6.619294990310245), Rating(user=5354, product=3711, rating=6.188408466760133), Rating(user=5354, product=2466, rating=6.1691683400863))), (1894, (Rating(user=1894, product=3808, rating=5.3152813717261855), Rating(user=1894, product=53, rating=5.212014732703803), Rating(user=1894, product=974, rating=5.157308514481416))), (2493, (Rating(user=2493, product=3847, rating=7.353577987986553), Rating(u

### 预测出每个item最值得被推荐的3个user

In [19]:
print(model.recommendUsersForProducts(3).collect())

[(3586, (Rating(user=527, product=3586, rating=8.133818861879984), Rating(user=1989, product=3586, rating=7.4650017718018145), Rating(user=573, product=3586, rating=7.305048810668571))), (1084, (Rating(user=1588, product=1084, rating=5.932738279245949), Rating(user=2783, product=1084, rating=5.779389024234484), Rating(user=3723, product=1084, rating=5.738664544256398))), (3702, (Rating(user=527, product=3702, rating=6.421472662653028), Rating(user=1989, product=3702, rating=5.890798714546796), Rating(user=1574, product=3702, rating=5.841956593290477))), (3007, (Rating(user=1459, product=3007, rating=6.925120950119085), Rating(user=1927, product=3007, rating=6.019426815219136), Rating(user=407, product=3007, rating=6.017298175883218))), (667, (Rating(user=467, product=667, rating=6.550516750186947), Rating(user=4028, product=667, rating=5.80373110815198), Rating(user=4264, product=667, rating=5.479054227388586))), (1053, (Rating(user=41, product=1053, rating=9.00219005532764), Rating(us