# 用TensorFlow实现矩阵分解
## 数据导入与数据观察

In [1]:
import numpy as np
import pandas as pd

col_names = ['user', 'item', 'rate', 'timestamp']
df = pd.read_csv('./movielens/ml-1m/ratings.dat', sep = '::', header = None, names = col_names, engine = 'python')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
user         1000209 non-null int64
item         1000209 non-null int64
rate         1000209 non-null int64
timestamp    1000209 non-null int64
dtypes: int64(4)
memory usage: 30.5 MB


In [2]:
df.sort_values(by = ['user']).tail()

Unnamed: 0,user,item,rate,timestamp
999977,6040,1594,3,964828599
999976,6040,1587,1,956716374
999975,6040,3182,5,984195682
999983,6040,300,2,956704716
1000208,6040,1097,4,956715569


In [3]:
df.sort_values(by = ['user']).head()

Unnamed: 0,user,item,rate,timestamp
0,1,1193,5,978300760
29,1,745,3,978824268
30,1,2294,4,978824291
31,1,3186,4,978300019
32,1,1566,4,978824330


In [4]:
df.sort_values(by = ['item']).tail()

Unnamed: 0,user,item,rate,timestamp
84701,551,3952,4,976067330
253845,1544,3952,4,974742620
180689,1130,3952,3,975593522
35180,238,3952,4,976760112
372755,2177,3952,5,974609645


In [5]:
df.sort_values(by = ['item']).head()

Unnamed: 0,user,item,rate,timestamp
427702,2599,1,4,973796689
1966,18,1,4,978154768
683688,4089,1,5,965428947
596207,3626,1,4,966594018
465902,2873,1,5,972784317


In [6]:
df['user'] -= 1
df['item'] -= 1

In [7]:
df['rate'] = df['rate'].astype('float64')

In [8]:
len(df['user'].unique())

6040

In [9]:
len(df['item'].unique())
##item的id没有完全由1到3952

3706

## 生成batch数据

In [10]:
import numpy as np
import pandas as pd

def read_and_process(filename, sep = '::'):
    col_names = ['user', 'item', 'rate', 'timestamp']
    df = pd.read_csv(filename, sep = sep, header = None, names = col_names, engine = 'python')
    df['user'] -= 1
    df['item'] -= 1
    for col in ('user', 'item'):
        df[col] = df[col].astype(np.float32)
    df['rate'] = df['rate'].astype(np.float32)
    return df

def get_data():
    df = read_and_process("./movielens/ml-1m/ratings.dat", sep = '::')
    rows = len(df)
    df = df.iloc[np.random.permutation(rows)].reset_index(drop = True)##打乱数据
    split_index = int(rows * 0.9)
    df_train = df[0: split_index]
    df_test = df[split_index:].reset_index(drop = True)
    print(df_train.shape, df_test.shape)
    return df_train, df_test
    
class ShuffleDataIterator(object):
    def __init__(self, inputs, batch_size = 10):
        ##注意这里的输入
        self.inputs = inputs
        self.batch_size = batch_size
        self.num_cols = len(self.inputs)
        self.len = len(self.inputs[0])
        self.inputs = np.transpose(np.vstack([np.array(self.inputs[i]) for i in range(self.num_cols)]))
        
    def __iter__(self):
        return self
    
    def __len__(self):
        return self.len
    
    def __next__(self):
        return self.next()
    
    def next(self):
        ids = np.random.randint(0, self.len, (self.batch_size,))
        out = self.inputs[ids, :]
        return [out[:, i] for i in range(self.num_cols)]
       
class OneEpochDataIterator(ShuffleDataIterator):
    def __init__(self, inputs, batch_size=10):
        super(OneEpochDataIterator, self).__init__(inputs, batch_size=batch_size)
        if batch_size > 0:
            self.idx_group = np.array_split(np.arange(self.len), np.ceil(self.len / batch_size))
        else:
            self.idx_group = [np.arange(self.len)]
        self.group_id = 0
    ##next函数不能写在__init__下面
    def next(self):
        if self.group_id >= len(self.idx_group):
            self.group_id = 0
            raise StopIteration
        out = self.inputs[self.idx_group[self.group_id], :]
        self.group_id += 1
        return [out[:, i] for i in range(self.num_cols)]

## 构建神经网络和迭代优化部分

In [11]:
import tensorflow as tf

In [12]:
def inference_svd(user_batch, item_batch, user_num, item_num, dim = 5, device = '/cpu:0'):
    with tf.device('/cpu:0'):
        global_bias = tf.get_variable('global_bias', shape = [])
        w_bias_user = tf.get_variable('embd_bias_user', shape = [user_num])
        w_bias_item = tf.get_variable('embd_bias_item', shape = [item_num])
        
        bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name = 'bias_user')
        bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name = 'bias_item')
        
        w_user = tf.get_variable('embd_user', shape = [user_num, dim], initializer = tf.truncated_normal_initializer(stddev = 0.02))
        w_item = tf.get_variable('embd_item', shape = [item_num, dim], initializer = tf.truncated_normal_initializer(stddev = 0.02))
        
        embd_user = tf.nn.embedding_lookup(w_user, user_batch, name = 'embedding_user')
        embd_item = tf.nn.embedding_lookup(w_item, item_batch, name = 'embedding_item')
    
    with tf.device(device):
        infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)##tf.multiply是元素点乘
        infer = tf.add(infer, global_bias)
        infer = tf.add(infer, bias_user)
        infer = tf.add(infer, bias_item, name = 'svd_inference')
        
        regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item), name = 'svd_regularization')
    return infer, regularizer

def optimizer(infer, regularizer, rate_batch, learning_rate = 0.001, reg = 0.1, device = '/cpu:0'):
    global_step = tf.train.get_global_step()
    assert global_step is not None
    with tf.device(device):
        cost_l2 = tf.nn.l2_loss(tf.subtract(infer, rate_batch))
        penalty = tf.constant(reg, dtype = tf.float32, shape = [], name = 'l2')
        cost = tf.add(cost_l2, tf.multiply(regularizer, penalty))
        train_op = tf.train.AdamOptimizer(learning_rate).minimize(cost, global_step = global_step)
    return cost, train_op

## 定义训练函数

In [13]:
import time 
from collections import deque
import numpy as np
import pandas as pd
from six import next
import tensorflow as tf
from tensorflow.core.framework import summary_pb2

np.random.seed(12321)

batch_size = 2000
user_num = 6040
item_num = 3952
dim = 15
epoch_max = 200
device = '/cpu:0'

def make_scalar_summary(name, val):
    return summary_pb2.Summary(value = [summary_pb2.Summary.Value(tag = name, simple_value = val)])

def svd(train, test):
    samples_per_batch = len(train) // batch_size
    
    iter_train = ShuffleDataIterator([train['user'], train['item'], train['rate']], batch_size = batch_size)##注意iuputs
    iter_test = OneEpochDataIterator([test['user'], test['item'], test['rate']], batch_size = -1)
    user_batch = tf.placeholder(tf.int32, shape = [None], name = 'id_user')
    item_batch = tf.placeholder(tf.int32, shape = [None], name = 'id_item')
    rate_batch = tf.placeholder(tf.float32, shape = [None])
    
    infer, regularizer = inference_svd(user_batch, item_batch, user_num = user_num, item_num = item_num, dim = dim, device = device)
    global_step = tf.train.get_or_create_global_step()
    cost, train_op = optimizer(infer, regularizer, rate_batch, learning_rate = 0.001, reg = 0.05, device = device)
    
    init_op = tf.global_variables_initializer()
    
    with tf.Session() as sess:
        sess.run(init_op)
        summary_writer = tf.summary.FileWriter(logdir = './data', graph = sess.graph)
        print('{} {} {} {}'.format('epoch', 'train_error', 'val_error', 'elapsed_time'))
        errors = deque(maxlen = samples_per_batch)
        start = time.time()
        for i in range(epoch_max * samples_per_batch):
            users, items, rates = next(iter_train)
            _, pred_batch = sess.run([train_op, infer], feed_dict = {user_batch: users, item_batch: items, rate_batch: rates})
            pred_batch = np.clip(pred_batch, 1.0, 5.0)
            errors.append(np.power(pred_batch - rates, 2))
            if i % samples_per_batch == 0:
                train_err = np.sqrt(np.mean(errors))
                test_err2 = np.array([])
                for users, items, rates in iter_test:
                    pred_batch = sess.run(infer, feed_dict = {user_batch: users, item_batch: items})
                    pred_batch = np.clip(pred_batch, 1.0, 5.0)
                    test_err2 = np.append(test_err2, np.power((pred_batch - rates), 2))
                end = time.time()
                test_err = np.sqrt(np.mean(test_err2))
                print('{:3d} {:f} {:f} {:f}(s)'.format(i // samples_per_batch, train_err, test_err, end - start))
                train_err_summary = make_scalar_summary('training_error', train_err)
                test_err_summary = make_scalar_summary('testing_error', test_err)
                summary_writer.add_summary(train_err_summary, i)
                summary_writer.add_summary(test_err_summary, i)
                start = end

## 读取数据并训练

In [14]:
df_train, df_test = get_data()
svd(df_train, df_test)

(900188, 4) (100021, 4)
epoch train_error val_error elapsed_time
  0 2.264596 2.247155 0.949369(s)
  1 1.733866 1.083049 1.637083(s)
  2 0.973648 0.939389 1.621075(s)
  3 0.921301 0.920139 1.627079(s)
  4 0.906705 0.911216 1.623076(s)
  5 0.897401 0.903684 1.604064(s)
  6 0.889131 0.897997 1.617072(s)
  7 0.882971 0.894352 1.651095(s)
  8 0.878475 0.890581 1.651095(s)
  9 0.874920 0.888186 1.649096(s)
 10 0.872439 0.886456 1.616070(s)
 11 0.869852 0.884717 1.654097(s)
 12 0.867301 0.883477 1.633082(s)
 13 0.863772 0.881777 1.655098(s)
 14 0.863058 0.880250 1.616072(s)
 15 0.859637 0.878805 1.625077(s)
 16 0.855847 0.876470 1.729148(s)
 17 0.853328 0.873999 1.904800(s)
 18 0.849782 0.871398 1.732571(s)
 19 0.843692 0.868679 1.920236(s)
 20 0.840729 0.866077 1.518883(s)
 21 0.834058 0.863526 1.591707(s)
 22 0.830021 0.860664 1.566000(s)
 23 0.825493 0.858318 1.605350(s)
 24 0.819790 0.855831 1.576938(s)
 25 0.814666 0.853995 1.610972(s)
 26 0.810550 0.852447 1.822249(s)
 27 0.806285 0.85

# 用surprise库实现电影推荐

In [12]:
from surprise import KNNBaseline
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate

reader = Reader(line_format = 'user item rating timestamp', sep = '::')
data = Dataset.load_from_file('./movielens/ml-1m/ratings.dat', reader = reader)
algo = KNNBaseline()
perf = cross_validate(algo, data, measures = ['RMSE', 'MAE'], cv = 3, verbose = True)
##verbose = True是指打印出过程

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8986  0.8995  0.8999  0.8993  0.0005  
MAE (testset)     0.7094  0.7106  0.7103  0.7101  0.0005  
Fit time          37.38   37.22   40.55   38.38   1.53    
Test time         242.62  256.01  256.72  251.78  6.49    


In [22]:
with open('./movielens/ml-1m/movies.dat', 'r', encoding = 'ISO-8859-1') as f:
    movies_id_dic = {}
    id_movies_dic = {}
    for line in f.readlines():
        movies = line.strip().split('::')
        id_movies_dic[int(movies[0]) - 1] = movies[1]
        movies_id_dic[movies[1]] = int(movies[0]) - 1

In [23]:
movie_id = int(movies_id_dic['Toy Story (1995)'])
print(movie_id)

0


In [24]:
toy_story_neighbors = algo.get_neighbors(movie_id, k = 5)
print(toy_story_neighbors)

[1209, 2084, 2239, 2433, 2702]


In [25]:
print('最接近《Toy Story (1995)》的5部电影是：')
for i in toy_story_neighbors:
    print(id_movies_dic[i])

最接近《Toy Story (1995)》的5部电影是：
Star Wars: Episode VI - Return of the Jedi (1983)
101 Dalmatians (1961)
My Bodyguard (1980)
Down in the Delta (1998)
Broken Vessels (1998)


# 用pyspark实现矩阵分解与预测

## 配置spark的运行环境

In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel

conf = SparkConf().setMaster('local').setAppName('movielenALS').set('spark.excutor.memory', '2g')
sc = SparkContext.getOrCreate(conf)
print(sc)

<SparkContext master=local appName=movielenALS>


## 载入数据和将数据转换为RDD格式

In [2]:
ratings_data = sc.textFile('./movielens/ml-1m/ratings.dat')
print(ratings_data.first())

1::1193::5::978300760


In [3]:
ratings_int = ratings_data.map(lambda x: x.split('::')[0:3])
print(ratings_int.first())

['1', '1193', '5']


In [4]:
from pyspark.mllib.recommendation import Rating

rates_data = ratings_int.map(lambda x: Rating(int(x[0]), int(x[1]), int(x[2])))
print(rates_data.first())

Rating(user=1, product=1193, rating=5.0)


## 预测
### 预测user14对item25的评分

In [5]:
sc.setCheckpointDir('checkpoint/')
ALS.checkpointInterval = 2
model = ALS.train(ratings = rates_data, rank = 20, iterations = 5, lambda_ = 0.02)
print(model.predict(14, 25))

3.1378864967051903


### 预测item25的最值得推荐的10个user

In [6]:
print(model.recommendUsers(25, 10))

[Rating(user=784, product=25, rating=5.651756988737681), Rating(user=2534, product=25, rating=5.621451107580374), Rating(user=5597, product=25, rating=5.472435292382994), Rating(user=5147, product=25, rating=5.468984895916398), Rating(user=4424, product=25, rating=5.456845326714397), Rating(user=988, product=25, rating=5.442160283772315), Rating(user=3352, product=25, rating=5.431763828667746), Rating(user=4358, product=25, rating=5.431637084472695), Rating(user=2163, product=25, rating=5.416137601562427), Rating(user=703, product=25, rating=5.388911180008825)]


### 预测user14的最值得推荐的10个item

In [7]:
print(model.recommendProducts(14, 10))

[Rating(user=14, product=1412, rating=7.0135970908290055), Rating(user=14, product=3867, rating=6.90917693185623), Rating(user=14, product=219, rating=6.472137650068161), Rating(user=14, product=681, rating=6.369985037630551), Rating(user=14, product=2227, rating=6.366542862304887), Rating(user=14, product=3787, rating=6.319148004863857), Rating(user=14, product=2211, rating=6.237190807972827), Rating(user=14, product=335, rating=6.189816143876498), Rating(user=14, product=1669, rating=6.16972117832563), Rating(user=14, product=3951, rating=6.1462996493506505)]


### 预测出每个user最值得被推荐的3个item

In [8]:
print(model.recommendProductsForUsers(3).collect())

[(4551, (Rating(user=4551, product=681, rating=6.437673420253655), Rating(user=4551, product=3446, rating=6.400271364592988), Rating(user=4551, product=97, rating=6.333734839496007))), (667, (Rating(user=667, product=572, rating=5.339938918623279), Rating(user=667, product=2129, rating=5.080693677198974), Rating(user=667, product=2624, rating=4.954774703767761))), (5618, (Rating(user=5618, product=2175, rating=5.932296432569597), Rating(user=5618, product=715, rating=5.527905009740621), Rating(user=5618, product=572, rating=5.397451464018237))), (5354, (Rating(user=5354, product=1539, rating=6.667739961477263), Rating(user=5354, product=2482, rating=6.385325282672407), Rating(user=5354, product=1624, rating=6.311942548586201))), (1894, (Rating(user=1894, product=1178, rating=5.241462926054287), Rating(user=1894, product=3415, rating=5.216145643194673), Rating(user=1894, product=3307, rating=5.194890741680945))), (2493, (Rating(user=2493, product=718, rating=6.833165155142537), Rating(u

### 预测出每个item最值得被推荐的3个user

In [9]:
print(model.recommendUsersForProducts(3).collect())

[(3586, (Rating(user=5760, product=3586, rating=8.48948024212648), Rating(user=491, product=3586, rating=7.690863050541767), Rating(user=3, product=3586, rating=6.725953281361106))), (1084, (Rating(user=2313, product=1084, rating=5.765263888442956), Rating(user=5416, product=1084, rating=5.638787886544602), Rating(user=2269, product=1084, rating=5.582180528741271))), (3702, (Rating(user=3395, product=3702, rating=5.871346747673625), Rating(user=5203, product=3702, rating=5.786369976520234), Rating(user=2694, product=3702, rating=5.735921412884776))), (3007, (Rating(user=5416, product=3007, rating=6.653046624920215), Rating(user=3714, product=3007, rating=6.343810499179996), Rating(user=1989, product=3007, rating=6.281707325450128))), (667, (Rating(user=2992, product=667, rating=5.209155745547151), Rating(user=745, product=667, rating=5.005144815451128), Rating(user=2155, product=667, rating=4.562556927656612))), (1053, (Rating(user=5328, product=1053, rating=10.236335654071116), Rating