In [1]:
#-*- coding=utf-8 -*-
"""
spark_modelCF
Author: shi zheyang
Date:2020/5/27
"""
from pyspark.sql import SparkSession
from pyspark import SparkContext
from operator import itemgetter
import numpy as np

spark = SparkSession.builder.appName("MoviesLens_ItemCF").\
config('spark.driver.memory', '12g').getOrCreate() # config('spark.executor.memory','.12g').


In [2]:
ratings.show(truncate=False)

+------+-------+------+----------+
|userId|movieId|rating|timestamp |
+------+-------+------+----------+
|1     |2      |3.5   |1112486027|
|1     |29     |3.5   |1112484676|
|1     |32     |3.5   |1112484819|
|1     |47     |3.5   |1112484727|
|1     |50     |3.5   |1112484580|
|1     |112    |3.5   |1094785740|
|1     |151    |4.0   |1094785734|
|1     |223    |4.0   |1112485573|
|1     |253    |4.0   |1112484940|
|1     |260    |4.0   |1112484826|
|1     |293    |4.0   |1112484703|
|1     |296    |4.0   |1112484767|
|1     |318    |4.0   |1112484798|
|1     |337    |3.5   |1094785709|
|1     |367    |3.5   |1112485980|
|1     |541    |4.0   |1112484603|
|1     |589    |3.5   |1112485557|
|1     |593    |3.5   |1112484661|
|1     |653    |3.0   |1094785691|
|1     |919    |3.5   |1094785621|
+------+-------+------+----------+
only showing top 20 rows



In [3]:
ratings.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [4]:
# Load dataset
from pyspark.sql.functions import *
ratings_cast = ratings.select(col("userId").cast('int'), col("movieId").cast('int'), col("rating").cast('float'))
ratings_cast.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)



In [5]:
# Splite into servral datasets
train, test = ratings_cast.randomSplit([0.9, 0.1], seed = 10)
train.show()
test.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      2|   3.5|
|     1|     29|   3.5|
|     1|     47|   3.5|
|     1|     50|   3.5|
|     1|    112|   3.5|
|     1|    151|   4.0|
|     1|    223|   4.0|
|     1|    260|   4.0|
|     1|    293|   4.0|
|     1|    296|   4.0|
|     1|    318|   4.0|
|     1|    337|   3.5|
|     1|    367|   3.5|
|     1|    541|   4.0|
|     1|    589|   3.5|
|     1|    593|   3.5|
|     1|    653|   3.0|
|     1|    919|   3.5|
|     1|    924|   3.5|
|     1|   1009|   3.5|
+------+-------+------+
only showing top 20 rows

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|     32|   3.5|
|     1|    253|   4.0|
|     1|   1196|   4.5|
|     1|   1198|   4.5|
|     1|   1219|   4.0|
|     1|   1249|   4.0|
|     1|   1266|   4.0|
|     1|   1304|   3.0|
|     1|   1750|   3.5|
|     1|   2194|   3.5|
|     1|   2253|   3.5|
|     1|   2288|   4.0|
|     1|   2683|   3.5|
|     1|   276

In [6]:
print(train.count())
print(test.count())

18002672
1997591


In [7]:
train.take(5)

[Row(userId=1, movieId=2, rating=3.5),
 Row(userId=1, movieId=29, rating=3.5),
 Row(userId=1, movieId=47, rating=3.5),
 Row(userId=1, movieId=50, rating=3.5),
 Row(userId=1, movieId=112, rating=3.5)]

In [8]:
from pyspark.mllib.recommendation import Rating
numUsers = train.rdd.map(lambda x:x[0]).distinct().count()
numUsers

138493

In [9]:
numMovies = train.rdd.map(lambda x:x[1]).distinct().count()
numMovies

26307

In [13]:
import time
start = time.time()
from pyspark.mllib.recommendation import ALS
model = ALS.train(train.rdd, 10, 10, 0.01)
print(model)
print("time: ",time.time() - start,"s")

<pyspark.mllib.recommendation.MatrixFactorizationModel object at 0x7effacc0ac10>
time:  33.48362064361572 s


In [15]:
start = time.time()
print(model.recommendProducts(2,10))
print("time: ",time.time() - start,"s")

[Rating(user=2, product=105325, rating=10.161222473361937), Rating(user=2, product=71482, rating=9.688936807034604), Rating(user=2, product=61206, rating=9.441863402095091), Rating(user=2, product=113374, rating=9.024697710475424), Rating(user=2, product=110380, rating=8.994208295160893), Rating(user=2, product=76291, rating=8.960955235088754), Rating(user=2, product=26813, rating=8.740549736968376), Rating(user=2, product=99045, rating=8.451030749779932), Rating(user=2, product=89349, rating=8.390138547189189), Rating(user=2, product=95115, rating=8.369044478438536)]
time:  0.03570961952209473 s


In [22]:
# Read movies-name
movieName = spark.read.option("header", "true").csv("../data/ml-20m/movies.csv")
movieName.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [25]:
movieName = movieName.select(col("movieId").cast('int'),"title")
movieName_dict = list(map(lambda row:row.asDict(), movieName.collect()))
movieName_dict

[{'movieId': 1, 'title': 'Toy Story (1995)'},
 {'movieId': 2, 'title': 'Jumanji (1995)'},
 {'movieId': 3, 'title': 'Grumpier Old Men (1995)'},
 {'movieId': 4, 'title': 'Waiting to Exhale (1995)'},
 {'movieId': 5, 'title': 'Father of the Bride Part II (1995)'},
 {'movieId': 6, 'title': 'Heat (1995)'},
 {'movieId': 7, 'title': 'Sabrina (1995)'},
 {'movieId': 8, 'title': 'Tom and Huck (1995)'},
 {'movieId': 9, 'title': 'Sudden Death (1995)'},
 {'movieId': 10, 'title': 'GoldenEye (1995)'},
 {'movieId': 11, 'title': 'American President, The (1995)'},
 {'movieId': 12, 'title': 'Dracula: Dead and Loving It (1995)'},
 {'movieId': 13, 'title': 'Balto (1995)'},
 {'movieId': 14, 'title': 'Nixon (1995)'},
 {'movieId': 15, 'title': 'Cutthroat Island (1995)'},
 {'movieId': 16, 'title': 'Casino (1995)'},
 {'movieId': 17, 'title': 'Sense and Sensibility (1995)'},
 {'movieId': 18, 'title': 'Four Rooms (1995)'},
 {'movieId': 19, 'title': 'Ace Ventura: When Nature Calls (1995)'},
 {'movieId': 20, 'title'

In [27]:
final = {}
for dic in movieName_dict:
    movieId = dic['movieId']
    title = dic['title']
    final[movieId] = title

In [28]:
%time result = model.recommendProducts(2,10)
for p in result:
    print("对用户："+str(p[0])+"，推荐电影："+str(final[p[1]])+"，推荐评分："+str(p[2]))

CPU times: user 2.34 ms, sys: 609 µs, total: 2.95 ms
Wall time: 32 ms
对用户：2，推荐电影：Bad Milo (Bad Milo!) (2013)，推荐评分：10.161222473361937
对用户：2，推荐电影：Yatterman (Yattâman) (2009)，推荐评分：9.688936807034604
对用户：2，推荐电影：In the City of Sylvia (En la ciudad de Sylvia) (2007)，推荐评分：9.441863402095091
对用户：2，推荐电影：Old Lady and the Pigeons, The (La vieille dame et les pigeons) (1997)，推荐评分：9.024697710475424
对用户：2，推荐电影：Nitro Circus: The Movie (2012)，推荐评分：8.994208295160893
对用户：2，推荐电影：Karan Arjun (1995)，推荐评分：8.960955235088754
对用户：2，推荐电影：Calendar (1993)，推荐评分：8.740549736968376
对用户：2，推荐电影：Aftershock (Tangshan dadizhen) (2010)，推荐评分：8.451030749779932
对用户：2，推荐电影：Misérables, Les (1934)，推荐评分：8.390138547189189
对用户：2，推荐电影：Inauguration of the Pleasure Dome (1954)，推荐评分：8.369044478438536
