In [1]:
import pandas as pd
import numpy as np

In [2]:
movies_dir = 'ml-latest-small/movies.csv'
ratings_dir = 'ml-latest-small/ratings.csv'
links_dir = 'ml-latest-small/links.csv'

## Initial View

In [33]:
ratings = pd.read_csv(ratings_dir)
# don't need timestamp for now
ratings = ratings.drop(['timestamp'], axis=1) 
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [34]:
print(min(ratings.rating))
print(max(ratings.rating))

0.5
5.0


We will test out between Scikit Suprise and MLlib in Spark packages for recommending MovieLens in regards to accuracy and training/predict performance. 

## Scikit Suprise

In [35]:
import time
from surprise import SVD
from surprise import Dataset
from surprise import Reader 
from surprise.model_selection import cross_validate


reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(ratings, reader)

start = time.time()

results = cross_validate(SVD(), data, cv=5)

TypeError: __init__() got an unexpected keyword argument 'n_jobs'

In [39]:
print("Average Test RMSE: ", results['test_rmse'].mean())
print("Average Test MAE: ", results['test_mae'].mean())
print("Average Fit Time: ", np.asarray(results['fit_time']).mean())
print("Average Test Time: ", np.asarray(results['test_time']).mean())

Average Test RMSE:  0.8729152653950389
Average Test MAE:  0.6704516516755772
Average Fit Time:  7.296851444244385
Average Test Time:  0.27019538879394533


## Implicit

In [3]:
import implicit

model = implicit.als.AlternatingLeastSquares(factors=50)

start = timeit.timeit()
model.fit(ratings)
stop = timeit.timeit()

print("Training time: ", stop-start)

ModuleNotFoundError: No module named 'implicit'

## MLLib Spark

In [36]:
# create DataFrame
sparkDf = spark.read.format("csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .load(ratings_dir)

sparkDf = sparkDf.select("*").drop("timestamp")

In [37]:
sparkDf.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



In [38]:
sparkDf.rdd.getNumPartitions()

1

In [39]:
(training, test) = sparkDf.randomSplit([0.8, 0.2], 24)

In [40]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
import timeit

als = ALS(maxIter=5, regParam=0.01, userCol="userId", 
          itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")

start = timeit.timeit()
model = als.fit(training)
stop = timeit.timeit()

print("Training time: ", stop-start)
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

Training time:  -0.0006483120000666531
Root-mean-square error = 1.0771752671526134


In [16]:
res = userRecs.select("recommendations").collect()

In [28]:
res[0][0]

[Row(movieId=1256, rating=7.111196994781494),
 Row(movieId=26258, rating=6.775794506072998),
 Row(movieId=3334, rating=6.516416072845459),
 Row(movieId=5650, rating=6.427963733673096),
 Row(movieId=1232, rating=6.194378852844238),
 Row(movieId=5152, rating=6.054859161376953),
 Row(movieId=27611, rating=6.054802894592285),
 Row(movieId=1747, rating=6.032325267791748),
 Row(movieId=3088, rating=6.001091957092285),
 Row(movieId=1298, rating=5.926971435546875)]