In [1]:
users = {"小明": {"中国合伙人": 5.0, "太平轮": 3.0, "荒野猎人": 4.5, "老炮儿": 5.0, "我的少女时代": 3.0, "肖洛特烦恼": 4.5, "火星救援": 5.0},
         "小红":{"小时代4": 4.0, "荒野猎人": 3.0, "我的少女时代": 5.0, "肖洛特烦恼": 5.0, "火星救援": 3.0, "后会无期": 3.0},
         "小阳": {"小时代4": 2.0, "中国合伙人": 5.0, "我的少女时代": 3.0, "老炮儿": 5.0, "肖洛特烦恼": 4.5, "速度与激情7": 5.0},
         "小四": {"小时代4": 5.0, "中国合伙人": 3.0, "我的少女时代": 4.0, "匆匆那年": 4.0, "速度与激情7": 3.5, "火星救援": 3.5, "后会无期": 4.5},
         "六爷": {"小时代4": 2.0, "中国合伙人": 4.0, "荒野猎人": 4.5, "老炮儿": 5.0, "我的少女时代": 2.0},
         "小李":  {"荒野猎人": 5.0, "盗梦空间": 5.0, "我的少女时代": 3.0, "速度与激情7": 5.0, "蚁人": 4.5, "老炮儿": 4.0, "后会无期": 3.5},
         "隔壁老王": {"荒野猎人": 5.0, "中国合伙人": 4.0, "我的少女时代": 1.0, "Phoenix": 5.0, "甄嬛传": 4.0, "The Strokes": 5.0},
         "邻村小芳": {"小时代4": 4.0, "我的少女时代": 4.5, "匆匆那年": 4.5, "甄嬛传": 2.5, "The Strokes": 3.0}
        }

In [3]:
#定义欧式距离
def enclidean_dis(rating1,rating2):  #输入rating1和2都是打分dict
    distance=0
    commonRatings = False
    for key in rating1:
        if key in rating2:
            distance += (rating1[key]-rating2[key])**2
            commonRatings = True
        
    #两个打分序列有共同电影：
    if commonRatings:
        return distance
    
    #没有共同电影
    else:
        return -1

In [4]:
#定义曼哈顿距离
def manhattan_dis(rating1,rating2):
    distance=0
    commonRating=False
    for key in rating1:
        if key in rating2:
            distance += abs(rating1[key]-rating2[key])
            commonRating = True
            
    if commonRatings:
        return distance
    else:
        return -1


#### 余弦相似度公式：
![余弦相似度](./余弦相似度.png)

In [7]:
from math import sqrt
#定义余弦距离
def cos_dis(rating1,rating2):
    distance=0
    commonRatings=False
    dot_product_1=0
    dot_product_2=0
    for score in rating1.values():
        dot_product_1 += score**2
    for score in rating2.values():
        dot_product_2 += score**2
    for key in rating1:
        if key in rating2:
            distance += rating1[key]*rating2[key]
            commonRatings=True
            
    if commonRatings:
        return 1-distance/sqrt(dot_product_1*dot_product_2)  #公式如图
    else:
        return -1

#### 皮尔逊相似度
![pearson](./皮尔逊相似度.png)

In [9]:
#定义皮尔逊相似度
def pearson_dis(rating1,rating2):
    sum_xy = 0
    sum_x = 0
    sum_y = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x*y
            sum_x += x
            sum_y += y
            sum_x2 += pow(x,2)
            sum_y2 += pow(y,2)  #y的2次方
    denominator = sqrt(sum_x2-pow(sum_x,2)/n)*sqrt(sum_y2-pow(sum_y,2)/n) #分母
    if denominator ==0:
        return 0
    else:
        return (sum_xy-(sum_x*sum_y)/n)/denominator  #皮尔逊相似度公式如图

In [10]:
#查找最近邻
def computeNearestNeighbor(username,users):
    distances = []
    for user in users:
        if user != username:
            distance = pearson_dis(users[user],users[username])
            distances.append((distance,user))
    distances.sort()
    return distances

#推荐
def recommend(username,users):
    nearest = computeNearestNeighbor(username,users)[0][1]  #得到最近排序第一的username
    recommendations = []
    neighborRatings = users[nearest] #得到最近排序第一的username的电影列表
    userRatings = users[username]  #得到输入username的电影列表
    for artist in neighborRatings:
        if not artist in userRatings:
            recommendations.append((artist,neighborRatings[artist]))
    results = sorted(recommendations,key=lambda artistTuple:artist[1],reverse=True) #对电影评分进行排序，最高分在最前
    for result in results:
        print(result[0],result[1])

In [13]:
recommend('隔壁老王',users)

小时代4 5.0
匆匆那年 4.0
速度与激情7 3.5
火星救援 3.5
后会无期 4.5


#### 隐语义模型
![LFM](./隐语义模型.png)

In [38]:
#隐语义模型,矩阵分解
import numpy as np
def matrix_factorization(R,P,Q,K,steps=5000,alpha=0.0002,beta=0.02):
    Q=Q.T
    for step in range(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j]>0:
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])  #损失函数公式
                    for k in range(K):
                        P[i][k] += alpha*(2*eij*Q[k][j]-beta*P[i][k])
                        Q[k][j] += alpha*(2*eij*P[i][k]-beta*Q[k][j])  #加正则化的梯度公式
        eR = np.dot(P,Q)
        e = 0
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j]>0:
                    e += pow(R[i][j]-np.dot(P[i,:],Q[:,j]),2)
                    for k in range(K):
                        e += (beta/2)*(pow(P[i][k],2)+pow(Q[k][j],2)) #更新迭代e
        if e<0.001:
            break
    return P,Q.T

In [39]:
R = [[5,3,0,1],
     [4,0,3,1],
     [1,1,0,5],
     [1,0,0,4],
     [0,1,5,4]]
R = np.array(R)
N = len(R)
M = len(R[0])
K = 2

P = np.random.rand(N,K)
Q = np.random.rand(M,K)

nP,nQ = matrix_factorization(R,P,Q,K)
nR = np.dot(nP,nQ.T)

In [40]:
nP,nQ

(array([[ 0.81130492,  2.1260465 ],
        [ 0.63640411,  1.55505294],
        [ 2.08595526, -0.12911803],
        [ 1.67386748, -0.08477618],
        [ 1.87839318,  0.70002641]]), array([[ 0.67531656,  2.14743732],
        [ 0.37777081,  1.14636003],
        [ 2.06050739,  1.24536421],
        [ 2.34650968, -0.41330348]]))

In [41]:
nR,R

(array([[5.11343925, 2.74370205, 4.31940199, 1.02503241],
        [3.76915294, 2.02306543, 3.24792263, 0.8506196 ],
        [1.13140725, 0.63999726, 4.13732725, 4.94807916],
        [0.9483389 , 0.53515426, 3.34343909, 3.96278455],
        [2.77177287, 1.51208442, 4.74223086, 4.11834443]]),
 array([[5, 3, 0, 1],
        [4, 0, 3, 1],
        [1, 1, 0, 5],
        [1, 0, 0, 4],
        [0, 1, 5, 4]]))