# 推荐系统

## 1.创建对象

In [1]:
from pyspark.sql import SparkSession

# Obtain a Spark session (get-or-create) registered under the app name 'rs'.
spark = (
    SparkSession.builder
    .appName('rs')
    .getOrCreate()
)

## 2.读取数据集

In [2]:
# Load the ratings CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('data/movie_ratings_df.csv')
)
df

DataFrame[userId: int, title: string, rating: int]

## 3.数据分析

In [3]:
# Report the dataset shape as (number of rows, number of columns).
dataset_shape = (df.count(), len(df.columns))
print(dataset_shape)

(100000, 3)


In [4]:
# Print the column names, types and nullability of the loaded ratings frame.
df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- rating: integer (nullable = true)



In [10]:
import random  # NOTE(review): not used in any visible cell - confirm before removing

# Preview the first 10 ratings without truncating long titles.
df.show(n=10, truncate=False)

+------+------------+------+
|userId|title       |rating|
+------+------------+------+
|196   |Kolya (1996)|3     |
|63    |Kolya (1996)|3     |
|226   |Kolya (1996)|5     |
|154   |Kolya (1996)|3     |
|306   |Kolya (1996)|5     |
|296   |Kolya (1996)|4     |
|34    |Kolya (1996)|5     |
|271   |Kolya (1996)|4     |
|201   |Kolya (1996)|4     |
|209   |Kolya (1996)|4     |
+------+------------+------+
only showing top 10 rows



In [11]:
# Number of ratings per user, most active users first.
(
    df.groupBy('userId')
    .count()
    .sort('count', ascending=False)
    .show(n=10, truncate=False)
)

+------+-----+
|userId|count|
+------+-----+
|405   |737  |
|655   |685  |
|13    |636  |
|450   |540  |
|276   |518  |
|416   |493  |
|537   |490  |
|303   |484  |
|234   |480  |
|393   |448  |
+------+-----+
only showing top 10 rows



In [12]:
# Number of ratings per user, least active users first.
(
    df.groupBy('userId')
    .count()
    .sort('count', ascending=True)
    .show(n=10, truncate=False)
)

+------+-----+
|userId|count|
+------+-----+
|732   |20   |
|631   |20   |
|572   |20   |
|685   |20   |
|93    |20   |
|300   |20   |
|636   |20   |
|34    |20   |
|926   |20   |
|596   |20   |
+------+-----+
only showing top 10 rows



In [13]:
# Number of ratings per movie title, most-rated titles first.
(
    df.groupBy('title')
    .count()
    .sort('count', ascending=False)
    .show(n=10, truncate=False)
)

+-----------------------------+-----+
|title                        |count|
+-----------------------------+-----+
|Star Wars (1977)             |583  |
|Contact (1997)               |509  |
|Fargo (1996)                 |508  |
|Return of the Jedi (1983)    |507  |
|Liar Liar (1997)             |485  |
|English Patient, The (1996)  |481  |
|Scream (1996)                |478  |
|Toy Story (1995)             |452  |
|Air Force One (1997)         |431  |
|Independence Day (ID4) (1996)|429  |
+-----------------------------+-----+
only showing top 10 rows



## 4.特征工程

In [14]:
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, IndexToString

In [15]:
# Encode each movie title as a numeric index stored in 'title_new'; the ALS
# cell later uses that column as its item column.
stringIndexer = StringIndexer(
    inputCol='title',
    outputCol='title_new',
)
# Keep the fitted model: its labels are needed later to map indices back to titles.
model = stringIndexer.fit(df)
indexed = model.transform(df)
indexed.show(n=10)

+------+------------+------+---------+
|userId|       title|rating|title_new|
+------+------------+------+---------+
|   196|Kolya (1996)|     3|    287.0|
|    63|Kolya (1996)|     3|    287.0|
|   226|Kolya (1996)|     5|    287.0|
|   154|Kolya (1996)|     3|    287.0|
|   306|Kolya (1996)|     5|    287.0|
|   296|Kolya (1996)|     4|    287.0|
|    34|Kolya (1996)|     5|    287.0|
|   271|Kolya (1996)|     4|    287.0|
|   201|Kolya (1996)|     4|    287.0|
|   209|Kolya (1996)|     4|    287.0|
+------+------------+------+---------+
only showing top 10 rows



In [16]:
# Number of ratings per encoded title index, most-rated indices first.
(
    indexed.groupBy('title_new')
    .count()
    .sort('count', ascending=False)
    .show(n=10, truncate=False)
)

+---------+-----+
|title_new|count|
+---------+-----+
|0.0      |583  |
|1.0      |509  |
|2.0      |508  |
|3.0      |507  |
|4.0      |485  |
|5.0      |481  |
|6.0      |478  |
|7.0      |452  |
|8.0      |431  |
|9.0      |429  |
+---------+-----+
only showing top 10 rows



## 5.划分数据集

In [17]:
# Hold out roughly 25% of the ratings for evaluation. The fixed seed makes the
# split - and therefore the model and metrics computed from it - reproducible
# on a fresh kernel; without it every run produces a different train/test split.
train, test = indexed.randomSplit([0.75, 0.25], seed=42)
train.count()

74740

In [18]:
# Number of ratings held out in the test split.
test.count()

25260

## 6.训练模型

In [19]:
from pyspark.ml.recommendation import ALS

# Collaborative-filtering recommender trained with alternating least squares
# on (userId, title_new, rating) triples from the training split.
rec = ALS(
    maxIter=10,
    regParam=0.01,
    userCol='userId',
    itemCol='title_new',
    ratingCol='rating',
    nonnegative=True,
    # Drop rows whose prediction would be NaN (user or item unseen in training)
    # so the evaluation metric is not NaN.
    coldStartStrategy='drop',
    # Fixed seed so the fitted model is reproducible from run to run.
    seed=42,
)
rec_model = rec.fit(train)

## 7.测试数据评估

In [20]:
# Score the held-out ratings; the output schema shows the 'prediction' column
# that the model appends to the test frame.
predicted_ratings = rec_model.transform(dataset=test)
predicted_ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- title_new: double (nullable = false)
 |-- prediction: float (nullable = false)



In [21]:
# Show a shuffled sample of predictions next to the actual ratings. The seed
# keeps the displayed sample stable across re-runs of this cell.
predicted_ratings.orderBy(rand(seed=42)).show()

+------+--------------------+------+---------+----------+
|userId|               title|rating|title_new|prediction|
+------+--------------------+------+---------+----------+
|   452|Jungle2Jungle (1997)|     5|    237.0| 1.8293351|
|   407| Men in Black (1997)|     4|     31.0|  3.136302|
|   385|Schindler's List ...|     2|     36.0| 3.4210434|
|    70|     Fantasia (1940)|     3|    155.0| 2.9688604|
|   782|  Ulee's Gold (1997)|     3|    133.0|   2.25551|
|   437|     Clockers (1995)|     4|    763.0| 2.2375045|
|   305|Lawrence of Arabi...|     4|    156.0| 3.6663966|
|    13|        Fargo (1996)|     5|      2.0|  4.984844|
|    59|Nightmare on Elm ...|     5|    306.0|  4.653916|
|    95|Terminal Velocity...|     2|    751.0| 2.1765442|
|   894|Tomorrow Never Di...|     3|    140.0|  3.201442|
|   616|Desperate Measure...|     3|    455.0|  3.136622|
|   768|    Rock, The (1996)|     4|     17.0| 3.5287642|
|   537|      M*A*S*H (1970)|     4|    111.0| 3.5107236|
|   592|Beauti

In [22]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root-mean-square error between the actual 'rating' and the model's
# 'prediction' on the scored test split.
evaluator = RegressionEvaluator(
    labelCol='rating',
    predictionCol='prediction',
    metricName='rmse',
)
rmse = evaluator.evaluate(predicted_ratings)
rmse

1.0243697149271438

## 8.推荐

In [23]:
# Every distinct encoded movie index present in the full dataset.
unique_movies = (
    indexed
    .select('title_new')
    .distinct()
)
unique_movies.count()

1664

In [24]:
# Target user for whom recommendations are generated.
user_id = 85

# Alias the full movie list so its column can be told apart after the join.
a = unique_movies.alias('a')

# Distinct movie indices this user has already rated.
watched_movies = (
    indexed
    .where(indexed['userId'] == user_id)
    .select('title_new')
    .distinct()
)
watched_movies.count()

287

In [25]:
# Left-join all movies (a) against the user's watched movies (b); rows where
# b.title_new is null are movies the user has not rated.
b = watched_movies.alias('b')
total_movies = a.join(
    b,
    on=a['title_new'] == b['title_new'],
    how='left',
)
total_movies.show(n=10, truncate=False)

+---------+---------+
|title_new|title_new|
+---------+---------+
|558.0    |null     |
|305.0    |305.0    |
|299.0    |null     |
|596.0    |null     |
|769.0    |null     |
|934.0    |null     |
|496.0    |496.0    |
|1051.0   |null     |
|692.0    |null     |
|810.0    |null     |
+---------+---------+
only showing top 10 rows



In [26]:
# Keep only the movies with no match on the watched side of the join, i.e.
# the ones the user has not rated yet.
remaining_movies = (
    total_movies
    .filter(col('b.title_new').isNull())
    .select(a['title_new'])
    .distinct()
)
remaining_movies.count()

1377

In [27]:
# Attach the target user's id as a constant column so each row carries the
# 'userId' and 'title_new' columns the ALS model was configured with.
user_id_column = lit(int(user_id))
remaining_movies = remaining_movies.withColumn('userId', user_id_column)
remaining_movies.show(n=10, truncate=False)

+---------+------+
|title_new|userId|
+---------+------+
|558.0    |85    |
|299.0    |85    |
|596.0    |85    |
|769.0    |85    |
|934.0    |85    |
|1051.0   |85    |
|692.0    |85    |
|810.0    |85    |
|720.0    |85    |
|782.0    |85    |
+---------+------+
only showing top 10 rows



In [28]:
# Predict a rating for every unwatched movie and rank them best-first.
recommendations = (
    rec_model
    .transform(remaining_movies)
    .sort('prediction', ascending=False)
)
recommendations.show(n=5, truncate=False)

+---------+------+----------+
|title_new|userId|prediction|
+---------+------+----------+
|1433.0   |85    |5.1811595 |
|1470.0   |85    |4.722778  |
|1195.0   |85    |4.708426  |
|1358.0   |85    |4.6781645 |
|948.0    |85    |4.6041365 |
+---------+------+----------+
only showing top 5 rows



In [29]:
# Map the numeric movie index back to its title using the labels stored on
# the fitted StringIndexer model.
movie_title = IndexToString(
    inputCol='title_new',
    outputCol='title',
    labels=model.labels,
)
final_recommendations = movie_title.transform(recommendations)
final_recommendations.show(n=10, truncate=False)

+---------+------+----------+--------------------------------------------------------+
|title_new|userId|prediction|title                                                   |
+---------+------+----------+--------------------------------------------------------+
|1433.0   |85    |5.1811595 |Boys, Les (1997)                                        |
|1470.0   |85    |4.722778  |Some Mother's Son (1996)                                |
|1195.0   |85    |4.708426  |Pather Panchali (1955)                                  |
|1358.0   |85    |4.6781645 |Angel Baby (1995)                                       |
|948.0    |85    |4.6041365 |Widows' Peak (1994)                                     |
|1271.0   |85    |4.5511937 |Whole Wide World, The (1996)                            |
|728.0    |85    |4.5249815 |Once Upon a Time in the West (1969)                     |
|1182.0   |85    |4.519919  |Kaspar Hauser (1993)                                    |
|1127.0   |85    |4.5168757 |Wonderful, Hor