# 针对电影的推荐系统

## 首先实现矩阵分解算法（与ALS求解同一目标函数），这里沿用之前用梯度下降实现的版本

[原始代码](../4BasicKnowledgePoints/6GradientDescentMethod.ipynb)

In [50]:
import numpy as np
import pandas as pd
import random

In [47]:
# Module-level hyperparameter values, named after the parameters of grad() below.
K=2  # number of latent (hidden) dimensions
max_iter = 5000 # A large number of iterations goes together with a relatively small step size.
alpha = 0.0002  # gradient-descent step size
lamda = 0.004  # weight of the regularization term

def _regularized_cost(P, Q, rows, cols, observed, lamda):
    """Return the squared error plus L2 penalty summed over the observed ratings."""
    p_obs = P[rows]          # one row of P per observed rating
    q_obs = Q[:, cols].T     # one column of Q per observed rating
    errors = np.sum(p_obs * q_obs, axis=1) - observed
    penalty = lamda * (np.sum(p_obs ** 2) + np.sum(q_obs ** 2))
    return float(np.sum(errors ** 2) + penalty)


def grad(R, K=1000, max_iter= 5000, alpha=0.001, lamda= 0.002, cost_threshold = 0.0001):
    """Factorize the rating matrix R into P (m x K) and Q (K x n) by gradient descent.

    Only entries of R that are strictly greater than 0 are treated as observed
    ratings; all other entries are ignored by both the updates and the cost.

    Args:
        R: 2-D rating matrix (NumPy array or nested sequence), users by items.
        K: number of latent dimensions.
        max_iter: maximum number of passes over the observed ratings.
        alpha: step size of each gradient update.
        lamda: weight of the L2 regularization term.
        cost_threshold: training stops early once the cost drops below this.

    Returns:
        A tuple (P, Q, cost) where cost is the regularized squared error of the
        returned factors on the observed ratings.

    Raises:
        ValueError: if R is not two-dimensional.
    """
    R = np.asarray(R, dtype=float)
    if R.ndim != 2:
        raise ValueError("R must be a 2-D rating matrix")
    m, n = R.shape

    P = np.random.rand(m, K)
    Q = np.random.rand(K, n)

    # Visit only the observed ratings, in the same row-major order as a
    # nested "for u / for i" scan of the full matrix.
    rows, cols = np.nonzero(R > 0)
    observed = R[rows, cols]

    # Defined up front so that max_iter == 0 still returns a meaningful cost.
    cost = _regularized_cost(P, Q, rows, cols, observed, lamda)

    for _ in range(max_iter):
        for u, i, rating in zip(rows, cols, observed):
            # Snapshot Pu so that the Qi update uses the value from before
            # this step rather than the freshly updated one.
            p_u = P[u, :].copy()
            q_i = Q[:, i]
            eui = np.dot(p_u, q_i) - rating

            # Gradient of eui^2 + lamda * (|Pu|^2 + |Qi|^2): the regularization
            # term is added, so the step shrinks the weights. All K latent
            # dimensions are updated at once.
            P[u, :] -= alpha * 2 * (eui * q_i + lamda * p_u)
            Q[:, i] -= alpha * 2 * (eui * p_u + lamda * q_i)

        # All feature vectors are updated for this pass; evaluate the loss.
        cost = _regularized_cost(P, Q, rows, cols, observed, lamda)
        # Stop as soon as the loss falls below the threshold.
        if cost < cost_threshold:
            break
    return P, Q, cost

In [16]:
# Column names applied to the movies table; header=0 makes them replace the
# header row found in the file.
movies_columns = [
    'mid', 'title', 'descri', 'duration', 'issueTime',
    'shootTime', 'language', 'category', 'actors', 'director',
]
# movies.csv is read with '^' as the separator; the other two files use the
# default separator.
movies = pd.read_csv(
    "../../data/MovieRecommendationSystem/movies.csv",
    sep='^',
    header=0,
    names=movies_columns,
)
ratings = pd.read_csv(
    "../../data/MovieRecommendationSystem/ratings.csv",
    header=0,
    names=['uid', 'mid', 'rating', 'timestamp'],
)
tags = pd.read_csv(
    "../../data/MovieRecommendationSystem/tags.csv",
    header=0,
    names=['uid', 'mid', 'tag', 'timestamp'],
)

In [18]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,mid,title,descri,duration,issueTime,shootTime,language,category,actors,director
0,2,Jumanji (1995),,104 minutes,"April 30, 1997",1995,English|Français,Adventure|Children|Fantasy,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Joe Johnston
1,3,Grumpier Old Men (1995),,101 minutes,"September 5, 2000",1995,English,Comedy|Romance,Walter Matthau|Jack Lemmon|Ann-Margret|Sophia ...,Howard Deutch
2,10,GoldenEye (1995),,130 minutes,"October 22, 2002",1995,English|Pусский|Español,Action|Adventure|Thriller,Pierce Brosnan|Sean Bean|Izabella Scorupco|Fam...,Martin Campbell
3,11,"American President, The (1995)",,106 minutes,"August 31, 1999",1995,English,Comedy|Drama|Romance,Michael Douglas|Annette Bening|Michael J. Fox|...,Rob Reiner
4,12,Dracula: Dead and Loving It (1995),,88 minutes,"June 29, 2004",1995,English|Deutsch,Comedy|Horror,Leslie Nielsen|Mel Brooks|Amy Yasbeck|Peter Ma...,Mel Brooks


In [19]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,uid,mid,rating,timestamp
0,1,1029,3.0,1260759179
1,1,1061,3.0,1260759182
2,1,1129,2.0,1260759185
3,1,1172,4.0,1260759205
4,1,1263,2.0,1260759151


In [30]:
# Summary statistics of the ratings table (count, mean, std, min, quartiles, max).
ratings.describe()

Unnamed: 0,uid,mid,rating,timestamp
count,44269.0,44269.0,44269.0,44269.0
mean,349.443448,6650.060494,3.580542,1090807000.0
std,195.014306,21711.578435,1.061401,182975500.0
min,1.0,1.0,0.5,789652000.0
25%,185.0,1196.0,3.0,952583800.0
50%,367.0,1681.0,4.0,1032859000.0
75%,523.0,2501.0,4.0,1218399000.0
max,671.0,131168.0,5.0,1476641000.0


In [20]:
# Preview the first rows of the tags table.
tags.head()

Unnamed: 0,uid,mid,tag,timestamp
0,15,100365,activist,1425876220
1,15,100365,documentary,1425876220
2,15,100365,uganda,1425876220
3,23,150,Ron Howard,1148672905
4,68,2174,music,1249808064


In [25]:
# Build the user x movie rating matrix R.

# Number of rows (distinct users) and columns (distinct movies).
uid_number = ratings['uid'].nunique()
mid_number = ratings['mid'].nunique()
print(uid_number, mid_number)


671 2776


In [37]:
# Collect the row labels (user ids) and column labels (movie ids) for R,
# in the sorted order produced by groupby.
R_index = ratings.groupby('uid').size().index.tolist()
R_column = ratings.groupby('mid').size().index.tolist()

In [38]:
# Start from an all-zero users x movies grid, labelled by user id and movie id.
R_list = [[0] * mid_number for _ in range(uid_number)]
R = pd.DataFrame(R_list, index=R_index, columns=R_column)
R.shape

(671, 2776)

In [42]:
# Preview the first rows of the still all-zero rating matrix.
R.head()

Unnamed: 0,1,2,3,10,11,12,13,14,15,16,...,130522,130580,130628,130642,130682,130960,130970,130980,131013,131168
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
# Fill the observed ratings into the rating matrix.

# Translate every rating's uid/mid label into its row/column position in R and
# write all ratings with a single vectorized assignment. This replaces a
# per-row iterrows()/.loc loop, which is slow and hands the integer ids back
# as floats because iterrows() does not preserve dtypes across a row.
row_positions = R.index.get_indexer(ratings['uid'])
col_positions = R.columns.get_indexer(ratings['mid'])
# get_indexer marks unknown labels with -1; fail loudly instead of silently
# writing into the last row/column.
if (row_positions < 0).any() or (col_positions < 0).any():
    raise KeyError("ratings contains a uid or mid that is not a label of R")
R_values = R.to_numpy(dtype=float, copy=True)
# For a repeated (uid, mid) pair the last rating wins, as with sequential writes.
R_values[row_positions, col_positions] = ratings['rating'].to_numpy()
R = pd.DataFrame(R_values, index=R.index, columns=R.columns)


In [45]:
# Preview the first rows of the rating matrix after the ratings were filled in.
R.head()

Unnamed: 0,1,2,3,10,11,12,13,14,15,16,...,130522,130580,130628,130642,130682,130960,130970,130980,131013,131168
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0
2,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0
4,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0


In [51]:
# Train with the module-level hyperparameters (K, max_iter, alpha, lamda)
# instead of grad()'s defaults; the default K=1000 latent dimensions is
# impractically slow for a matrix of this size.
P, Q, cost = grad(R.values, K=K, max_iter=max_iter, alpha=alpha, lamda=lamda)

KeyboardInterrupt: 