In [7]:
% matplotlib inline
import pandas as pd
import numpy as np
from scipy.io import loadmat
import matplotlib.pyplot as plt
import scipy.optimize as opt




In [8]:
def cofiCostFunc(param, Y, R, nm, nu, nf, l=0):
    """
    Regularized squared-error cost for collaborative filtering.

    param : flat parameter vector; `deserialize` splits it into
            X (movie features, (nm, nf)) and theta (user parameters, (nu, nf))
    Y : rating matrix (nm, nu)
    R : 0-1 matrix marking whether a user has rated a given movie
    nm : number of movies
    nu : number of users
    nf : dimension of the (user-chosen) feature space
    l : lambda for regularization

    Returns the scalar cost: half the sum of squared masked residuals plus
    half of l times the squared entries of theta and of X.
    """
    X, theta = deserialize(param, nm, nu, nf)

    # X @ theta.T holds a predicted rating for every (movie, user) pair, but
    # many pairs were never rated. Multiplying the residual by R zeroes those
    # entries, so only ratings that actually exist contribute to the cost.
    masked_residual = (X @ theta.T - Y) * R
    squared_error = 0.5 * np.square(masked_residual).sum()

    theta_penalty = .5 * l * np.square(theta).sum()
    feature_penalty = .5 * l * np.square(X).sum()

    return squared_error + theta_penalty + feature_penalty
def cofiGradient(param, Y, R, nm, nu, nf, l=0):
    """
    Gradient of the regularized collaborative-filtering cost.

    Takes the same arguments as cofiCostFunc. The flat `param` vector is
    unpacked into X (nm, nf) and theta (nu, nf); the gradients with respect to
    both are computed and packed back into a single flat vector with
    `serialize`, in the same (X, theta) order.
    """
    X, theta = deserialize(param, nm, nu, nf)

    # Residual on rated entries only, shape (nm, nu). Computed once and reused
    # for both gradients instead of repeating the (nm, nf) @ (nf, nu) product.
    masked_error = (X @ theta.T - Y) * R

    X_grad = masked_error @ theta + l * X            # (nm, nu) @ (nu, nf) -> (nm, nf)
    theta_grad = masked_error.T @ X + l * theta      # (nu, nm) @ (nm, nf) -> (nu, nf)

    return serialize(X_grad, theta_grad)
def serialize(X, Theta):
    """Pack X and Theta into one flat 1-D vector (X's entries first, then Theta's)."""
    flat_x = X.flatten()
    flat_theta = Theta.flatten()
    return np.concatenate((flat_x, flat_theta))
def deserialize(seq, nm, nu, nf):
    """
    Unpack a flat parameter vector produced by `serialize`.

    The first nm*nf entries are reshaped to (nm, nf) and the remaining
    entries to (nu, nf); both are returned as a tuple in that order.
    """
    split_at = nm * nf
    first_part = seq[:split_at]
    second_part = seq[split_at:]
    return first_part.reshape(nm, nf), second_part.reshape(nu, nf)



In [9]:
# List of all movie titles, one entry per line of movie_ids.txt.
movies = []
with open('movie_ids.txt', 'r', encoding='latin-1') as f:
    # Drop the first space-separated token of each line (presumably the
    # numeric movie id) and keep the rest of the line as the title.
    movies.extend(line.strip().partition(' ')[2] for line in f)

# Rating data: Y holds the ratings, R flags which entries are actually rated.
mat = loadmat('ex8_movies.mat')
Y = mat['Y']
R = mat['R']


In [10]:
# Ratings of one additional user, stored as a column with one row per movie.
# A value of 0 means "not rated". The height is taken from the loaded data
# instead of being hard-coded (the original used 1682).
my_ratings = np.zeros((mat['Y'].shape[0], 1))

# movie row index -> rating given by the new user
for movie_idx, rating in {
    0: 4,
    97: 2,
    6: 3,
    11: 5,
    53: 4,
    63: 5,
    65: 3,
    68: 5,
    182: 4,
    225: 5,
    354: 5,
}.items():
    my_ratings[movie_idx] = rating

# Append the new user as an extra column. The matrices are rebuilt from the
# freshly loaded `mat` arrays rather than from the current Y / R, so running
# this cell more than once does not keep appending duplicate columns.
Y = np.c_[mat['Y'], my_ratings]        # (1682, 944) per the original notes
R = np.c_[mat['R'], my_ratings != 0]   # same shape as Y; nonzero where a rating exists
nm, nu = Y.shape


In [11]:
# Hyperparameters. These must be assigned before the random initialization
# below, which reads nf; otherwise a fresh kernel raises NameError.
nf = 100   # number of features per movie / per user
l = 0.3    # regularization strength (lambda)

# Random starting point for the optimizer, packed into one flat vector.
X = np.random.random((nm, nf))
theta = np.random.random((nu, nf))
param = serialize(X, theta)

# Minimize the collaborative-filtering cost with the analytic gradient.
res = opt.minimize(fun=cofiCostFunc,
                   x0=param,
                   args=(Y, R, nm, nu, nf, l),
                   method='TNC',
                   jac=cofiGradient,
                   options={'maxiter': 100})

# Flat vector of fitted parameters (X entries first, then theta entries).
ret = res.x


In [12]:
# Unpack the optimizer's flat result into the fitted movie-feature matrix
# (nm, nf) and the fitted user-parameter matrix (nu, nf).
fit_X, fit_Theta = deserialize(ret, nm, nu, nf)

In [13]:
# Predicted rating matrix for all users: one row per movie, one column per
# user, i.e. (nm, nf) @ (nf, nu) -> (nm, nu).
pred_mat = fit_X @ fit_Theta.T
pred_mat

array([[4.82962461, 3.91685134, 3.12450707, ..., 4.2623316 , 3.5601146 ,
        3.7675776 ],
       [1.90442107, 3.08937695, 2.03682028, ..., 3.43500391, 4.69510986,
        5.15849467],
       [4.45686389, 3.39821284, 2.52173006, ..., 3.06068587, 1.78509392,
        4.78901859],
       ...,
       [3.90958057, 5.68034152, 4.84676597, ..., 5.84371227, 5.25202303,
        8.4066665 ],
       [4.32335656, 6.48007137, 3.11908841, ..., 6.50026939, 4.21424328,
        7.73647403],
       [4.89222967, 6.03094126, 4.88977997, ..., 5.72773016, 4.27654788,
        7.21104202]])

In [17]:
# Show the rating matrix Y (entries without a rating are stored as 0).
Y

array([[5., 4., 0., ..., 0., 0., 4.],
       [3., 0., 0., ..., 0., 5., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])