# 1.	資料準備

In [3]:
# Display the `sc` handle to confirm the kernel has one available.
# NOTE(review): presumably the SparkContext injected by the PySpark kernel --
# it is never created in this notebook, so a plain Python kernel would fail here.
sc

In [2]:
# Data source: MovieLens, http://grouplens.org/datasets/movielens/
#
# Load the raw ratings file as an RDD of text lines.
# NOTE(review): "u.data" is a relative path, so where it resolves depends on
# the Spark default filesystem / working directory of the kernel -- confirm.

rawUserData = sc.textFile("u.data")
# Number of lines read from the file.
rawUserData.count()

100000

In [3]:
# Peek at the first line; the cell output is the repr, so tabs show as "\t".
rawUserData.first()

'196\t242\t3\t881250949'

In [4]:
# Same line via print(), which renders the tab separators instead of escaping them.
print(rawUserData.first())

196	242	3	881250949


In [5]:
# First five raw lines, returned to the driver as a list of strings.
rawUserData.take(5)

['196\t242\t3\t881250949',
 '186\t302\t3\t891717742',
 '22\t377\t1\t878887116',
 '244\t51\t2\t880606923',
 '166\t346\t1\t886397596']

In [6]:
# Print the first five raw lines one per row so the tab-separated columns are readable.
for sample_line in rawUserData.take(5):
    print(sample_line)

196	242	3	881250949
186	302	3	891717742
22	377	1	878887116
244	51	2	880606923
166	346	1	886397596


In [7]:
#Import Rating模組
from pyspark.mllib.recommendation import Rating

In [8]:
# Split every line on tabs and keep only the first three fields; any further
# field (presumably the timestamp) is dropped.
rawRatings = rawUserData.map(
    lambda record: record.split("\t")[0:3]
)
rawRatings.take(5)

[['196', '242', '3'],
 ['186', '302', '3'],
 ['22', '377', '1'],
 ['244', '51', '2'],
 ['166', '346', '1']]

In [9]:
# Turn each 3-field list into a 3-tuple. The fields are still strings here;
# no numeric conversion is done in this cell.
ratingsRDD = rawRatings.map(
    lambda fields: (fields[0], fields[1], fields[2])
)
ratingsRDD.take(5)

[('196', '242', '3'),
 ('186', '302', '3'),
 ('22', '377', '1'),
 ('244', '51', '2'),
 ('166', '346', '1')]

In [10]:
# Total number of rating tuples.
numRatings = ratingsRDD.count()
numRatings

100000

In [11]:
# Count unique values of the first tuple field (used as the user id for ALS);
# distinct() removes duplicates before counting.
numUsers = (
    ratingsRDD
    .map(lambda rating: rating[0])
    .distinct()
    .count()
)
numUsers

943

In [12]:
# Count unique values of the second tuple field (used as the product/movie id for ALS).
numMovies = (
    ratingsRDD
    .map(lambda rating: rating[1])
    .distinct()
    .count()
)
numMovies

1682

In [13]:
# Mark ratingsRDD for caching so later actions can reuse its computed
# partitions instead of re-reading and re-splitting the input file.
# NOTE(review): this runs after the three count() cells above, so those
# actions did not benefit; only work done after this cell can reuse the cache.
ratingsRDD.persist()

PythonRDD[20] at RDD at PythonRDD.scala:48

# 2.	如何訓練模型? 

In [14]:
#####################################
#API：mllib.recommendation.ALS
#http://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.recommendation.ALS
#ALS 推薦演算法 alternating least squares
#####################################

from pyspark.mllib.recommendation import ALS

In [19]:
# Train an explicit-feedback ALS matrix-factorization model on the ratings.
#
# PySpark signatures, for reference:
#   ALS.train(ratings, rank, iterations=5, lambda_=0.01, blocks=-1,
#             nonnegative=False, seed=None)
#   ALS.trainImplicit(ratings, rank, iterations=5, lambda_=0.01, blocks=-1,
#                     alpha=0.01, nonnegative=False, seed=None)
#
# Hyper-parameters are passed by keyword so each value is self-describing:
# 100 latent factors, 10 iterations, regularization 0.01. The fixed seed
# makes the run reproducible.
model = ALS.train(ratingsRDD, rank=100, iterations=10, lambda_=0.01, seed=123)
print(model)

<pyspark.mllib.recommendation.MatrixFactorizationModel object at 0x000002CD2B0484E0>


# 3.	如何使用模型進行推薦?

In [20]:
# Top 10 products recommended for user 201, returned as Rating records.
# The scores are raw model predictions and are not clamped to the rating
# scale (the first results below exceed 5.0).
model.recommendProducts(201,10)

[Rating(user=201, product=1131, rating=5.030768654209071),
 Rating(user=201, product=1174, rating=5.010514347886925),
 Rating(user=201, product=1070, rating=5.000556798675801),
 Rating(user=201, product=324, rating=4.942672793769788),
 Rating(user=201, product=340, rating=4.928025131008385),
 Rating(user=201, product=772, rating=4.926153524165296),
 Rating(user=201, product=203, rating=4.921233384170009),
 Rating(user=201, product=447, rating=4.8995665922438025),
 Rating(user=201, product=205, rating=4.842412096945696),
 Rating(user=201, product=276, rating=4.837484022863789)]

In [17]:
# Predicted rating of user 100 for product 20.
model.predict(100, 20)

3.212898414006477

In [18]:
# Top 10 users with the highest predicted rating for product 200,
# returned as Rating records.
model.recommendUsers(product=200,num=10)

[Rating(user=550, product=200, rating=5.674708657679709),
 Rating(user=541, product=200, rating=5.405353685462823),
 Rating(user=89, product=200, rating=5.353820255237684),
 Rating(user=9, product=200, rating=5.339473144637408),
 Rating(user=489, product=200, rating=5.249388724985),
 Rating(user=581, product=200, rating=5.24469489009846),
 Rating(user=276, product=200, rating=5.228748830020084),
 Rating(user=511, product=200, rating=5.209897153563752),
 Rating(user=8, product=200, rating=5.172314294089483),
 Rating(user=696, product=200, rating=5.1695498616611)]

# 3.1	顯示推薦的電影的名稱

In [21]:
# Load the item (movie) metadata file as an RDD of text lines.
# NOTE(review): relative path, same working-directory assumption as "u.data".
itemRDD = sc.textFile("u.item")
# Number of lines read from the item file.
itemRDD.count()

1682

In [24]:
# Build a driver-side dict mapping movie id (int) -> title.
# Each line is "|"-delimited; field 0 is the id and field 1 is the title.
# collectAsMap() pulls the pairs back to the driver as a plain dict.
movieTitle = (
    itemRDD
    .map(lambda record: record.split("|"))
    .map(lambda fields: (int(fields[0]), fields[1]))
    .collectAsMap()
)
# Confirm the result is an ordinary Python dict.
type(movieTitle)


dict

In [25]:
# Look up a title by its integer movie id.
movieTitle[1]

'Toy Story (1995)'

In [26]:
# Show the titles for movie ids 1 through 5 as "id:title".
for movie_id in range(1, 6):
    print("{}:{}".format(movie_id, movieTitle[movie_id]))

1:Toy Story (1995)
2:GoldenEye (1995)
3:Four Rooms (1995)
4:Get Shorty (1995)
5:Copycat (1995)


In [27]:
# Top 5 recommendations for user 100, printed with the movie title looked up
# from movieTitle instead of the bare product id.
recommendP = model.recommendProducts(100, 5)
for rec in recommendP:
    # Each entry is a Rating record, so the fields are read by name.
    print("對使用者{}推薦電影{}推薦評分{}".format(
        rec.user, movieTitle[rec.product], rec.rating))

對使用者100推薦電影As Good As It Gets (1997)推薦評分4.908594090808872
對使用者100推薦電影Mary Poppins (1964)推薦評分4.875713789160742
對使用者100推薦電影Apt Pupil (1998)推薦評分4.867919472967934
對使用者100推薦電影Schindler's List (1993)推薦評分4.856180015301664
對使用者100推薦電影Titanic (1997)推薦評分4.853923220517173
