In [1]:
import numpy as np
from numpy import linalg as la

In [2]:
from numpy import *
U,Sigma,VT=np.linalg.svd([[1, 1],[7, 7]])

In [3]:
U

array([[-0.14142136, -0.98994949],
       [-0.98994949,  0.14142136]])

In [4]:
Sigma

array([1.00000000e+01, 2.82797782e-16])

In [5]:
VT

array([[-0.70710678, -0.70710678],
       [ 0.70710678, -0.70710678]])

In [6]:
def loadExData():
    return[[1, 1, 1, 0, 0],
            [2, 2, 2, 0, 0],
            [1, 1, 1, 0, 0],
            [5, 5, 5, 0, 0],
            [1, 1, 0, 2, 2],
            [0, 0, 0, 3, 3],
            [0, 0, 0, 1, 1]]

In [7]:
Data=loadExData()
U,Sigma,VT=linalg.svd(Data)
Sigma

array([9.72140007e+00, 5.29397912e+00, 6.84226362e-01, 4.11502614e-16,
       1.36030206e-16])

In [8]:
Sig3=mat([[Sigma[0], 0, 0],[0, Sigma[1], 0], [0, 0, Sigma[2]]])
U[:,:3]*Sig3*VT[:3,:]

matrix([[ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
          7.75989921e-16,  7.71587483e-16],
        [ 2.00000000e+00,  2.00000000e+00,  2.00000000e+00,
          3.00514919e-16,  2.77832253e-16],
        [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
          2.18975112e-16,  2.07633779e-16],
        [ 5.00000000e+00,  5.00000000e+00,  5.00000000e+00,
          3.00675663e-17, -1.28697294e-17],
        [ 1.00000000e+00,  1.00000000e+00, -5.48397422e-16,
          2.00000000e+00,  2.00000000e+00],
        [ 3.21319929e-16,  4.43562065e-16, -3.48967188e-16,
          3.00000000e+00,  3.00000000e+00],
        [ 9.71445147e-17,  1.45716772e-16, -1.52655666e-16,
          1.00000000e+00,  1.00000000e+00]])

In [9]:
# 欧式距离函数 -> 相似度
def ecludSim(inA,inB):
    # la.form求A和B的欧式距离
    # 距离与相似度程相反的关系，因此取倒数
    # 为防止分母为0，求得的距离再加1
    # 计算结果为(0, 1]
    # 缺点：一个特征不一样就会导致距离很大
    #print(inA, inB)
    return 1.0/(1.0 + la.norm(inA - inB))

# pearson相似度
def pearsSim(inA,inB):
    # 判断两组数据与某一直线的拟合程度
    # 优点：数据不规范时效果好
    if len(inA) < 3 : return 1.0
    # 结果为[-1,1]，需要转到[0,1]区间
    return 0.5+0.5*corrcoef(inA, inB, rowvar = 0)[0][1]

# 余弦相似度：只考虑两组数据之间的夹角
def cosSim(inA,inB):
    # 不会因为文章的长度不同导致结果偏差太大
    # 结果为[-1,1]，需要转到[0,1]区间
    num = float(inA.T.dot(inB))
    denom = la.norm(inA)*la.norm(inB)
    return 0.5+0.5*(num/denom)

In [10]:
myMat=np.array(loadExData())
ecludSim(myMat[:,0],myMat[:,4])  # 三种距离的计算结果都和书上不一样，难道是用的数据不同？

0.13367660240019172

In [11]:
ecludSim(myMat[:,0],myMat[:,0])

1.0

In [12]:
cosSim(myMat[:,0],myMat[:,4])

0.5472455591261534

In [13]:
cosSim(myMat[:,0],myMat[:,0])

0.9999999999999999

In [14]:
pearsSim(myMat[:,0],myMat[:,4])

0.23768619407595815

In [15]:
pearsSim(myMat[:,0],myMat[:,0])

1.0

In [16]:
def standEst(dataMat, user, simMeas, dishId):
    totalSim, totalScore = 0, 0
    # 找出user对其它dish的点评
    for j in range(dataMat.shape[1]):
        if dataMat[user, j] == 0:continue
        # 假如user对j有点评，找出对dishID和j都有点评的人
        overlap = np.array(dataMat[:,j]>0) & np.array(dataMat[:,dishId]>0)
        # 根据这些人对j的评价和对dishId的评价，计算j和dishId的相似度
        if dataMat[overlap].shape[0] == 0:sim = 0
        else: sim = simMeas(dataMat[overlap, j], dataMat[overlap, dishId])
        # 以相似度为权值，根据user对j的评价来估计user对dishId的评价
        totalScore += sim * dataMat[user, j]
        totalSim += sim
    if totalScore == 0:return 0
    return totalScore/totalSim
    
# dataMat:一行代表一个User，一列代表一个菜
def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
    # 1 Look for things the user hasn’t yet rated: look for values with 0 in the user-item matrix.
    notRatedDishId = np.arange(dataMat.shape[1])[dataMat[user,:]==0]
    # 2 Of all the items this user hasn’t yet rated, find a projected rating for each item:
    # that is, what score do we think the user will give to this item? 
    notRatedScorePredict = []
    for dishId in notRatedDishId:
        # print (dishId)
        notRatedScorePredict.append((dishId, estMethod(dataMat, user, simMeas, dishId)))
    # 3 Sort the list in descending order and return the first N items.
    notRatedScorePredict.sort(key=lambda p:p[1],reverse=True)
    return notRatedScorePredict

In [17]:
def loadExData_2():
    return[[1, 1, 0, 2, 2],
            [2, 0, 0, 3, 3],
            [1, 0, 0, 1, 1],
            [1, 1, 1, 2, 0],
            [2, 2, 2, 0, 0],
            [1, 1, 1, 0, 0],
            [5, 5, 5, 0, 0]]

In [18]:
myMat=np.array(loadExData_2())
myMat[0,1]=myMat[0,0]=myMat[1,0]=myMat[2,0]=4
myMat[3,3]=2
myMat

array([[4, 4, 0, 2, 2],
       [4, 0, 0, 3, 3],
       [4, 0, 0, 1, 1],
       [1, 1, 1, 2, 0],
       [2, 2, 2, 0, 0],
       [1, 1, 1, 0, 0],
       [5, 5, 5, 0, 0]])

In [19]:
recommend(myMat, 2)  # 跟书上结果不一样，不知道哪里搞错了

[(2, 2.5), (1, 2.0243290220056256)]

In [20]:
recommend(myMat, 2, simMeas=ecludSim)

[(2, 3.0), (1, 2.8266504712098603)]

In [21]:
recommend(myMat, 2, simMeas=pearsSim)

[(2, 2.5), (1, 2.0)]

In [22]:
def loadExData2():
    return[[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
           [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
           [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
           [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
           [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
           [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
           [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
           [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
           [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
           [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
           [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0]]

In [34]:
from numpy import linalg as la
myMat = np.array(loadExData2())
U,Sigma,VT=la.svd(myMat)   # 从文件夹里找到的函数
Sigma   # 跟书上不一样？

array([15.77075346, 11.40670395, 11.03044558,  4.84639758,  3.09292055,
        2.58097379,  1.00413543,  0.72817072,  0.43800353,  0.22082113,
        0.07367823])

In [35]:
Sig2=Sigma**2
sum(Sig2)

541.9999999999995

In [36]:
sum(Sig2)*0.9

487.7999999999996

In [37]:
sum(Sig2[:2])

378.8295595113579

In [38]:
sum(Sig2[:3])

500.5002891275793

In [71]:
def svdEst(dataMat, user, simMeas, dishId):
    # 先对dataMat做SVD分解
    U,Sigma,VT = la.svd(dataMat)
    # 把Sigma转成对角矩阵
    Sig4 = np.eye(4) * Sigma[:4]
    # 对dataMat变形，仅包含前4个奇异特征
    # 不知道书上为什么是U*sigma*data.T，但前面的介绍说是U*sigma*VT
    # 计算结果与书上不太一样，但总体上差不多
    xformedItems = U[:,:4].dot(Sig4).dot(VT[:4,:])
    #xformedItems = (dataMat.T.dot(U[:,:4]).dot(np.mat(Sig4).I)).T
    totalSim, totalScore = 0, 0
    # 找出user对其它dish的点评
    for j in range(dataMat.shape[1]):
        if dataMat[user, j] == 0:continue
        # 为什么要把overlap那一段去掉了呢？
        sim = simMeas(xformedItems[:, j], xformedItems[:, dishId]) # 计算相似度时使用去噪之后的data
        print ('the %d and %d similarity is: %f' % (dishId, j, sim))
        # 以相似度为权值，根据user对j的评价来估计user对dishId的评价
        totalScore += sim * dataMat[user, j]
        totalSim += sim
    if totalScore == 0:return 0
    return totalScore/totalSim

In [72]:
recommend(myMat, 1, estMethod=svdEst)

the 0 and 3 similarity is: 0.498297
the 0 and 5 similarity is: 0.498702
the 0 and 10 similarity is: 0.540399
the 1 and 3 similarity is: 0.497432
the 1 and 5 similarity is: 0.497414
the 1 and 10 similarity is: 0.539074
the 2 and 3 similarity is: 0.497835
the 2 and 5 similarity is: 0.497894
the 2 and 10 similarity is: 0.539558
the 4 and 3 similarity is: 0.495847
the 4 and 5 similarity is: 0.500120
the 4 and 10 similarity is: 0.501394
the 6 and 3 similarity is: 0.750797
the 6 and 5 similarity is: 0.636231
the 6 and 10 similarity is: 0.623186
the 7 and 3 similarity is: 0.500010
the 7 and 5 similarity is: 0.502570
the 7 and 10 similarity is: 0.544420
the 8 and 3 similarity is: 0.500932
the 8 and 5 similarity is: 0.502238
the 8 and 10 similarity is: 0.544006
the 9 and 3 similarity is: 0.530070
the 9 and 5 similarity is: 0.518088
the 9 and 10 similarity is: 0.517555


[(4, 3.334001055799355),
 (9, 3.330895617349481),
 (7, 3.3248674803258678),
 (8, 3.324616235400013),
 (0, 3.32438068443843),
 (2, 3.3243000943728127),
 (1, 3.324276526003641),
 (6, 3.3164990076214846)]

In [73]:
recommend(myMat, 1, estMethod=svdEst,simMeas=pearsSim)

the 0 and 3 similarity is: 0.260009
the 0 and 5 similarity is: 0.267319
the 0 and 10 similarity is: 0.304773
the 1 and 3 similarity is: 0.259671
the 1 and 5 similarity is: 0.266348
the 1 and 10 similarity is: 0.303700
the 2 and 3 similarity is: 0.259959
the 2 and 5 similarity is: 0.266751
the 2 and 10 similarity is: 0.304128
the 4 and 3 similarity is: 0.269958
the 4 and 5 similarity is: 0.282481
the 4 and 10 similarity is: 0.259976
the 6 and 3 similarity is: 0.693294
the 6 and 5 similarity is: 0.543264
the 6 and 10 similarity is: 0.517233
the 7 and 3 similarity is: 0.237709
the 7 and 5 similarity is: 0.248966
the 7 and 10 similarity is: 0.286199
the 8 and 3 similarity is: 0.261446
the 8 and 5 similarity is: 0.270112
the 8 and 10 similarity is: 0.307803
the 9 and 3 similarity is: 0.299239
the 9 and 5 similarity is: 0.288007
the 9 and 10 similarity is: 0.261465


[(4, 3.3477049430350485),
 (9, 3.3393469031787215),
 (7, 3.3221301186012315),
 (8, 3.321806644564644),
 (0, 3.3212579735685965),
 (2, 3.3210621579633552),
 (1, 3.321009939503826),
 (6, 3.309765500739301)]

In [78]:
def printMat(inMat, thresh=0.8):
    for i in range(32):
        for k in range(32):
            if float(inMat[i,k]) > thresh:
                print ('1,')
            else: print ('0,')
        print ('')

def imgCompress(numSV=3, thresh=0.8):
    myl = []
    for line in open('0_5.txt').readlines():
        myl.append(line[:32])
    myMat = np.array(myl)
    print ("****original matrix******")
    printMat(myMat, thresh)
    U,Sigma,VT = la.svd(myMat)
    SigN = np.eye(numSV) * Sigma[:numSV]
    reconMat = U[:,:numSV]*SigN*VT[:numSV,:]
    print ("****reconstructed matrix using %d singular values******" % numSV)
    printMat(reconMat, thresh)

In [79]:
imgCompress(2)

FileNotFoundError: [Errno 2] No such file or directory: '0_5.txt'