##### download dataset from here 
! wget http://files.grouplens.org/datasets/movielens/ml-25m.zip

In [1]:
! ls ../datasets/movie-ml-25/ml-25m

genome-scores.csv  links.csv   ratings.csv  tags.csv
genome-tags.csv    movies.csv  README.txt


In [2]:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
sc = SparkContext('local', 'movie recommendation system')
spark = SparkSession(sc)

In [3]:
print(spark.version)

2.4.5


In [4]:
df = spark.read.csv("../datasets/movie-ml-25/ml-25m/ratings.csv", header=True, inferSchema=True)

In [5]:
df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [6]:
df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
|     1|   1260|   3.5|1147877857|
|     1|   1653|   4.0|1147868097|
|     1|   2011|   2.5|1147868079|
|     1|   2012|   2.5|1147868068|
|     1|   2068|   2.5|1147869044|
|     1|   2161|   3.5|1147868609|
|     1|   2351|   4.5|1147877957|
|     1|   2573|   4.0|1147878923|
|     1|   2632|   5.0|1147878248|
|     1|   2692|   5.0|1147869100|
+------+-------+------+----------+
only showing top 20 rows



In [10]:
random_seed = 1

In [21]:
training_data , test_data = df.randomSplit([0.7,0.3],random_seed)

In [22]:
from pyspark.ml.recommendation import ALS

In [23]:
als = ALS(
        maxIter=15,
        rank=10,
        seed=1234,
        ratingCol=('rating'),
        userCol='userId',
        itemCol='movieId'
)

In [24]:
type(training_data)

pyspark.sql.dataframe.DataFrame

In [25]:
training_data.count()

17502939

In [26]:
test_data.count()

7497156

In [27]:
model = als.fit(training_data)

In [28]:
predictions = model.transform(test_data)

In [29]:
predictions.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- prediction: float (nullable = false)



In [32]:
from pyspark.sql.functions import rand

In [33]:
predictions.orderBy(rand()).show(10)

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
| 87358| 122914|   5.0|1573218236| 4.1838374|
|124090|   4878|   5.0|1460421153|  4.232624|
| 75849|   1377|   3.0| 908042626| 3.3400426|
|142299|   1240|   4.0|1526806594| 3.5977924|
| 82527|     50|   5.0|1564196981|   4.33586|
|130080| 103249|   3.5|1530656777| 3.2026358|
|133762|  89745|   5.0|1448238684| 4.2210646|
| 47192|  35836|   2.0|1481260132| 2.9377623|
| 66277|   1019|   4.0| 994213671| 3.0548425|
| 78849|  51935|   2.5|1372027679| 3.0745847|
+------+-------+------+----------+----------+
only showing top 10 rows



In [34]:
predictions.show()

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|114572|    148|   2.0| 838460783| 2.4473324|
|159730|    148|   3.0| 842162037| 2.7568665|
| 47989|    148|   2.0| 833173771| 3.1970022|
| 72337|    148|   2.0| 944246202| 2.8859456|
|108767|    148|   3.0|1276969740|  2.621679|
| 21531|    148|   3.0| 834035555|  3.017579|
| 99684|    148|   3.0|1027645782| 2.9729304|
| 35969|    148|   2.0| 835094487| 2.9234517|
| 29943|    148|   3.0|1049216998| 2.9995558|
|117168|    148|   4.0| 835820190| 2.9894955|
|  3411|    148|   3.0| 835966104| 2.7953959|
| 28229|    148|   1.0| 833850593| 2.6176789|
|148197|    148|   2.5|1207008368| 2.8564281|
|  6491|    148|   4.0|1500217059| 2.6247742|
|147301|    148|   3.0| 951070210| 2.7431169|
|111567|    148|   3.0| 945399307| 2.9379678|
| 98520|    148|   4.0|1034547175| 2.8708107|
| 73827|    148|   4.0|1490671894| 2.8109019|
| 66440|    148|   2.5|1099143605|

## EVALUATION AND PREDICTIONS ON TEST DATA 

In [35]:
from pyspark.ml.evaluation import RegressionEvaluator

In [36]:
evaluator = RegressionEvaluator(metricName='rmse', 
                                predictionCol='prediction',
                                labelCol='rating')
rmse = evaluator.evaluate(predictions)
print(rmse)

nan


## Recommend top movies that active user might like 