# Environment Preparation

In [4]:
import os
import sys

# Point findspark at the local Spark installation. `setdefault` keeps a
# SPARK_HOME that is already exported in the environment, so the notebook also
# runs on machines where Spark lives somewhere else; the path below is only
# the fallback.
os.environ.setdefault('SPARK_HOME', "/home/bin_yin/tmp/spark-2.0.2-bin-hadoop2.7/")
# Run Spark in local mode with two worker threads.
os.environ['PYSPARK_SUBMIT_ARGS'] = "--master local[2] pyspark-shell"
# The values can be checked with os.environ.get("SPARK_HOME").

# Initialise the SparkContext.
import findspark
findspark.init()  # must run before `import pyspark`
import pyspark

# getOrCreate() hands back the running context when this cell is executed
# again, instead of failing because a SparkContext already exists.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("myAppName"))

# 1. Data Description

**u.user**

u'1|24|M|technician|85711'
* [0]: user id
* [1]: age
* [2]: gender
* [3]: occupation
* [4]: zipcode


**u.item**

u'1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0'

* [0]: movie id
* [1]: movie title
* [2]: release date
* [3]: video release date
* [4]: IMDb URL
* [5]–[23]: genre flags (1 = the movie belongs to the genre, 0 = it does not), in this order: unknown | Action | Adventure | Animation | Children's | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western


**u.data**

u'196\t242\t3\t881250949'
* [0]: user id
* [1]: item id
* [2]: rating
* [3]: timestamp

# 2. Train & Predict

In [5]:
# Load the raw rating records and split every tab-separated line into fields.
raw_data = sc.textFile('ml-100k/u.data')
raw_ratings = raw_data.map(lambda record: record.split('\t'))

# Keep the first three fields as (user id, movie id, rating); the remaining
# field is not used.
ratings = raw_ratings.map(
    lambda fields: (int(fields[0]), int(fields[1]), float(fields[2]))
)
ratings.first()  # (user ID, movie ID, rating)

(196, 242, 3.0)

In [6]:
# Sanity-check the ratings data by looking at its dimensions.
import numpy as np
np.shape(ratings.collect())  # matrix dimensions: (100000, 3)

(100000, 3)

In [7]:
%%time
# train model
from  pyspark.mllib.recommendation import ALS
model = ALS.train(ratings, rank=50, iterations=10, lambda_=0.01)

# 核心原理就是把ratings矩阵分解（协同过滤），类似为每个用户训练一个线性回归模型
# rank：线性模型维度

CPU times: user 30 ms, sys: 4 ms, total: 34 ms
Wall time: 7.45 s


In [8]:
# Predict a single rating.

model.predict(user=789, product=123)  # predicted rating of user 789 for movie 123

3.6064214156362575

# 3. TOP-K

In [9]:
# Top-10 recommendations

model.recommendProducts(user=789, num=10)  # the 10 highest-scored movies for user 789

[Rating(user=789, product=56, rating=6.433282385793653),
 Rating(user=789, product=192, rating=5.826250960884829),
 Rating(user=789, product=187, rating=5.8088165421500975),
 Rating(user=789, product=47, rating=5.759490453447884),
 Rating(user=789, product=182, rating=5.649288992949691),
 Rating(user=789, product=295, rating=5.506189508660589),
 Rating(user=789, product=514, rating=5.412293938331513),
 Rating(user=789, product=482, rating=5.372076382840295),
 Rating(user=789, product=23, rating=5.366697943160618),
 Rating(user=789, product=144, rating=5.327318132527473)]

In [10]:
# Top-10 recommendations for user 789, reduced to just the movie ids.
rts = model.recommendProducts(user=789, num=10)
movie_ids = [rec.product for rec in rts]
movie_ids

[56, 192, 187, 47, 182, 295, 514, 482, 23, 144]

In [11]:
# Which titles do the top-10 movie ids correspond to?

movie_raw_data = sc.textFile('ml-100k/u.item')
# Keep only the first two '|'-separated fields of each record: (movie id, title).
movie_fields = movie_raw_data.map(lambda record: record.split('|'))
movies_by_id = movie_fields.map(lambda fields: (fields[0], fields[1])).collect()
# The ids come out of the file as strings, so convert before comparing.
for movie_id, title in movies_by_id:
    if int(movie_id) in movie_ids:
        print(title)

Taxi Driver (1976)
Ed Wood (1994)
Pulp Fiction (1994)
Die Hard (1988)
GoodFellas (1990)
Godfather: Part II, The (1974)
Raging Bull (1980)
Breakdown (1997)
Some Like It Hot (1959)
Annie Hall (1977)


# 4. Details

In [22]:
# Every rating submitted by user 789: key each rating on its user id, then
# look that key up.
ratings_by_user = ratings.keyBy(lambda rating: rating[0])
moviesForUser = ratings_by_user.lookup(789)
moviesForUser  # list of (UserID, ProductID, Rating)

[(789, 1012, 4.0),
 (789, 127, 5.0),
 (789, 475, 5.0),
 (789, 93, 4.0),
 (789, 1161, 3.0),
 (789, 286, 1.0),
 (789, 293, 4.0),
 (789, 9, 5.0),
 (789, 50, 5.0),
 (789, 294, 3.0),
 (789, 181, 4.0),
 (789, 1, 3.0),
 (789, 1008, 4.0),
 (789, 508, 4.0),
 (789, 284, 3.0),
 (789, 1017, 3.0),
 (789, 137, 2.0),
 (789, 111, 3.0),
 (789, 742, 3.0),
 (789, 248, 3.0),
 (789, 249, 3.0),
 (789, 1007, 4.0),
 (789, 591, 3.0),
 (789, 150, 5.0),
 (789, 276, 5.0),
 (789, 151, 2.0),
 (789, 129, 5.0),
 (789, 100, 5.0),
 (789, 741, 5.0),
 (789, 288, 3.0),
 (789, 762, 3.0),
 (789, 628, 3.0),
 (789, 124, 4.0)]

In [20]:
len(moviesForUser)  # number of movies this user has rated (33 in the recorded run)

33

In [24]:
# Which titles has the user rated?

movies_ids_for_user = [rated[1] for rated in moviesForUser]

movie_raw_data = sc.textFile('ml-100k/u.item')
# Keep only the first two '|'-separated fields of each record: (movie id, title).
movie_fields = movie_raw_data.map(lambda record: record.split('|'))
movies_by_id = movie_fields.map(lambda fields: (fields[0], fields[1])).collect()
# The ids come out of the file as strings, so convert before comparing.
for movie_id, title in movies_by_id:
    if int(movie_id) in movies_ids_for_user:
        print('{0}: {1}'.format(movie_id, title))

1: Toy Story (1995)
9: Dead Man Walking (1995)
50: Star Wars (1977)
93: Welcome to the Dollhouse (1995)
100: Fargo (1996)
111: Truth About Cats & Dogs, The (1996)
124: Lone Star (1996)
127: Godfather, The (1972)
129: Bound (1996)
137: Big Night (1996)
150: Swingers (1996)
151: Willy Wonka and the Chocolate Factory (1971)
181: Return of the Jedi (1983)
248: Grosse Pointe Blank (1997)
249: Austin Powers: International Man of Mystery (1997)
276: Leaving Las Vegas (1995)
284: Tin Cup (1996)
286: English Patient, The (1996)
288: Scream (1996)
293: Donnie Brasco (1997)
294: Liar Liar (1997)
475: Trainspotting (1996)
508: People vs. Larry Flynt, The (1996)
591: Primal Fear (1996)
628: Sleepers (1996)
741: Last Supper, The (1995)
742: Ransom (1996)
762: Beautiful Girls (1996)
1007: Waiting for Guffman (1996)
1008: I Shot Andy Warhol (1996)
1012: Private Parts (1997)
1017: Trees Lounge (1996)
1161: Palookaville (1996)


In [26]:
# The user's rating for Dead Man Walking (1995) (MovieID=9) is 5.0
for user_id, movie_id, score in moviesForUser:
    if movie_id == 9:
        print(score)

5.0


**Analysis**: 这里的电影就是这个用户评价过的电影。这个用户对Dead Man Walking (1995)（MovieID=9）的评分是5.0，说明他很喜欢这部电影。而我们给他推荐的TOP-10中，也有一部电影跟这个很接近：Die Hard (1988)。说明推荐还是有效果的

In [3]:
sc.stop()  # Stop Spark: shut down the SparkContext held in `sc`