# Book Recommender System with PySpark ML

## Imports

In [None]:
# core
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import sklearn
import random, os
# spark & ML
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Start (or reuse) the Spark session used by every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName('recsys')
    .getOrCreate()
)

In [None]:
# Read the ';'-separated ratings file; the first row holds the column names
# and column types are inferred from the data.
ratings_df = (
    spark.read
    .options(sep=';', inferSchema=True, header=True)
    .csv('../input/books-dataset/books_data/ratings.csv')
)
ratings_df.show()

In [None]:
# Print the column names and the types that inferSchema assigned to the
# ratings data.
ratings_df.printSchema()

In [None]:
# Read the ';'-separated book catalogue (header row, inferred types) and
# discard the three cover-image URL columns.
books_df = (
    spark.read
    .options(sep=';', inferSchema=True, header=True)
    .csv('../input/books-dataset/books_data/books.csv')
    .drop('Image-URL-S', 'Image-URL-M', 'Image-URL-L')
)
books_df.show()

In [None]:
# ALS needs numeric item ids, so map each ISBN string to a numeric index
# stored in a new 'ISBN_int' column.
isbn_indexer = StringIndexer(inputCol='ISBN', outputCol='ISBN_int')
stringToInt = isbn_indexer.fit(ratings_df)
ratings_df = stringToInt.transform(ratings_df)
ratings_df.show()

In [None]:
# split data into training and test dataset (80% / 20%)
# A fixed seed makes the split repeatable for the same input data, so the
# RMSE reported below is comparable between runs of the notebook.
train_df, test_df = ratings_df.randomSplit([0.8, 0.2], seed=42)

## Model

In [None]:
# Configure the ALS estimator on (User-ID, ISBN_int, Book-Rating) triples.
# coldStartStrategy="drop" removes rows whose prediction would be NaN.
als = ALS(
    maxIter=10,
    regParam=0.01,
    userCol='User-ID',
    itemCol='ISBN_int',
    ratingCol='Book-Rating',
    nonnegative=True,
    coldStartStrategy="drop",
)

# Fit on the training split; rec_model is the fitted model used below.
rec_model = als.fit(train_df)

In [None]:
# Score the held-out test split: transform() adds a 'prediction' column with
# the model's estimated rating for each (user, book) row.
predicted_ratings=rec_model.transform(test_df)

## Evaluation

In [None]:
# Root-mean-squared error between the predicted and the actual 'Book-Rating'
# on the test split.
rmse_evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='Book-Rating',
    predictionCol='prediction',
)
evaluator = rmse_evaluator
rmse = evaluator.evaluate(predicted_ratings)
rmse

## Recommendation

# function to recommend top-n books for a user using trained model
def recommend_for_user(user_id: int, n: int):
    """Return up to ``n`` books for ``user_id``, best predicted rating first.

    Candidates are the rows of the notebook-level ``ratings_df`` that belong
    to this user and have ``Book-Rating == 0``. They are scored with the
    fitted ``rec_model`` and joined to ``books_df`` on ``ISBN`` to attach
    the book details.

    Args:
        user_id: Value to match against the ``User-ID`` column.
        n: Maximum number of rows to return.

    Returns:
        A Spark DataFrame with the columns of ``books_df`` (the ``prediction``
        column is dropped after sorting), at most ``n`` rows.
    """
    # The column is spelled 'User-ID' everywhere else in this notebook; the
    # previous 'User-Id' only resolved because Spark's analyzer is
    # case-insensitive by default.
    # NOTE(review): a rating of 0 presumably marks a book the user has not
    # explicitly rated - confirm against the dataset description.
    candidates = ratings_df.filter(
        (col('User-ID') == user_id) & (col('Book-Rating') == 0)
    )
    scored = rec_model.transform(candidates).select('ISBN', 'prediction')
    return (
        books_df.join(scored, on='ISBN')
        .sort('prediction', ascending=False)
        .drop('prediction')
        .limit(n)
    )

In [None]:
# Example: show the five highest-scoring books for user 31987.
recs_user = recommend_for_user(user_id=31987, n=5)
recs_user.show()