In [1]:
# import libraries

from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


In [2]:
# make a SparkSession object

spark = (SparkSession
         .builder
         .appName("MoviesALS")
         .config("spark.driver.host", "localhost")
         .getOrCreate())

In [3]:
# read json file

movie_ratings = spark.read.json('data/ratings.json')

In [4]:
movie_ratings.dtypes

[('movie_id', 'bigint'),
 ('rating', 'bigint'),
 ('timestamp', 'double'),
 ('user_id', 'bigint')]

In [5]:
movie_ratings.printSchema()

root
 |-- movie_id: long (nullable = true)
 |-- rating: long (nullable = true)
 |-- timestamp: double (nullable = true)
 |-- user_id: long (nullable = true)



In [6]:
# cast to Pandas dataframe

movies_df = movie_ratings.select('*').toPandas()

In [7]:
movies_df.head()

Unnamed: 0,movie_id,rating,timestamp,user_id
0,858,4,956678732.0,6040
1,2384,4,956678754.0,6040
2,593,5,956678754.0,6040
3,1961,4,956678777.0,6040
4,1419,3,956678856.0,6040


In [8]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 719949 entries, 0 to 719948
Data columns (total 4 columns):
movie_id     719949 non-null int64
rating       719949 non-null int64
timestamp    719949 non-null float64
user_id      719949 non-null int64
dtypes: float64(1), int64(3)
memory usage: 22.0 MB


In [9]:
#check what are different years in the timstamp column

pd.to_datetime(movies_df.timestamp.astype(int)).dt.year.value_counts()

1970    719949
Name: timestamp, dtype: int64

In [10]:
#check years

import datetime
date = datetime.datetime.fromtimestamp(movies_df.timestamp[7777])
date

datetime.datetime(2000, 4, 28, 8, 8, 46)

In [11]:
movie_ratings = movie_ratings.drop('timestamp')

In [12]:
#make a split of data
(training, test) = movie_ratings.randomSplit([.8, .2])

In [13]:
#create ALS instance

als = ALS(maxIter=10,
          rank=10,
          userCol='user_id',
          itemCol='movie_id',
          ratingCol='rating')

In [14]:
#fit the model

model = als.fit(training)

In [15]:
# evaluate the model

predictions = model.transform(test)
predictions.persist()

DataFrame[movie_id: bigint, rating: bigint, user_id: bigint, prediction: float]

In [16]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',
                               predictionCol='prediction')

In [18]:
rmse = evaluator.evaluate(predictions)

In [19]:
print(rmse)

nan
