In [1]:
import os
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
 
import logging
# Configure root logging at INFO so the logger.info() progress messages below are emitted.
logging.basicConfig(level=logging.INFO)
# Logger shared by every function and class defined in this notebook.
logger = logging.getLogger(__name__)

def get_counts_and_averages(ID_and_ratings_tuple):
    """Summarise the ratings attached to one ID.

    Args:
        ID_and_ratings_tuple: a pair ``(ID, ratings)`` where ``ratings`` is a
            sized, iterable collection of numbers.

    Returns:
        ``(ID, (number_of_ratings, mean_rating))`` with the mean as a float.

    Raises:
        ZeroDivisionError: if ``ratings`` is empty.
    """
    key = ID_and_ratings_tuple[0]
    values = ID_and_ratings_tuple[1]
    nratings = len(values)
    total = float(sum(values))
    return key, (nratings, total / nratings)


class RecommendationEngine:
    """A movie recommendation engine backed by a Spark ML ALS model.

    Ratings are kept in ``self.ratings_RDD`` as plain
    ``(user_id, movie_id, rating)`` tuples. Keeping one fixed layout means the
    positional indexing used for counting and the column names used for ALS
    always agree, whatever extra columns the input DataFrame carried.

    Attributes set by the constructor:
        sc: the Spark context passed in.
        movies_RDD: ``movie.rdd``.
        movies_titles_RDD: cached RDD of ``(int(row[0]), row[1])`` pairs.
        ratings_RDD: RDD of ``(user_id, movie_id, rating)`` tuples.
        movies_rating_counts_RDD: RDD of ``(movie_id, number_of_ratings)``.
        model: the fitted ALS model.
    """

    # Column names ALS is configured with, in the order used by every tuple
    # stored in ratings_RDD.
    _RATING_COLUMNS = ("userId", "movieId", "rating")

    def __init__(self, sc, movie, ratings):
        """Init the recommendation engine from a Spark context and two DataFrames.

        Args:
            sc: Spark context; stored for later use.
            movie: DataFrame of movies. Only its first two columns are read,
                positionally, as (movie id, title).
                NOTE(review): this assumes column 0 is the id and column 1 the
                title -- confirm against the column order of the data source.
            ratings: DataFrame that has at least the columns ``userId``,
                ``movieId`` and ``rating``; any other column is ignored.
        """
        logger.info("Starting up the Recommendation Engine: ")

        self.sc = sc

        self.movies_RDD = movie.rdd
        self.movies_titles_RDD = self.movies_RDD.map(lambda x: (int(x[0]), x[1])).cache()

        # Select by name, then strip the Row wrapper, so that the layout is
        # exactly (user_id, movie_id, rating) no matter how the caller ordered
        # or decorated the DataFrame.
        self.ratings_RDD = (
            ratings.select(*self._RATING_COLUMNS)
            .rdd
            .map(lambda row: (row[0], row[1], row[2]))
        )

        # Pre-calculate movies ratings counts
        self.__count_and_average_ratings()

        # Train the model
        self.__train_model()

    def __count_and_average_ratings(self):
        """Updates the movies ratings counts from
        the current data self.ratings_RDD
        """
        logger.info("Counting movie ratings...")
        # x is (user_id, movie_id, rating): key every rating by its movie id.
        movie_ID_with_ratings_RDD = self.ratings_RDD.map(lambda x: (x[1], x[2])).groupByKey()
        movie_ID_with_avg_ratings_RDD = movie_ID_with_ratings_RDD.map(get_counts_and_averages)
        # Only the count is kept; the average computed alongside it is dropped.
        self.movies_rating_counts_RDD = movie_ID_with_avg_ratings_RDD.map(lambda x: (x[0], x[1][0]))

    def __train_model(self):
        """Train the ALS model with the current dataset and log its RMSE.

        The ratings are split 50/50 at random: one half fits the model, the
        other half is used only to compute the logged root-mean-square error.
        """
        logger.info("Training the ALS model...")
        # Name the columns explicitly: the RDD holds bare tuples, and ALS
        # looks its input columns up by name.
        ratingforals = self.ratings_RDD.toDF(list(self._RATING_COLUMNS))
        (training, test) = ratingforals.randomSplit([0.5, 0.5])

        # Build the recommendation model using ALS on the training data.
        # Note we set cold start strategy to 'drop' to ensure we don't get
        # NaN evaluation metrics.
        als = ALS(maxIter=3, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
                  coldStartStrategy="drop")
        self.model = als.fit(training)

        # Evaluate the model by computing the RMSE on the test data
        predictions = self.model.transform(test)
        evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                        predictionCol="prediction")
        rmse = evaluator.evaluate(predictions)
        logger.info("Root-mean-square error = %s", rmse)
        logger.info("ALS model built!")

In [2]:
def add_ratings(self, ratings):
    """Add additional movie ratings in the format (user_id, movie_id, rating)

    The new ratings are appended to ``self.ratings_RDD``, after which the
    per-movie rating counts are recomputed and the ALS model is retrained.

    Args:
        ratings: iterable of ``(user_id, movie_id, rating)`` tuples.
            NOTE(review): the tuples must have the same layout as the records
            already held in ``self.ratings_RDD`` -- verify against how the
            engine was constructed.

    Returns:
        The added ratings as a list.
    """
    # Materialise first: a one-shot iterable (e.g. a map object) would be
    # consumed by parallelize() and come back empty to the caller.
    ratings = list(ratings)
    # Convert ratings to an RDD
    new_ratings_RDD = self.sc.parallelize(ratings)
    # Add new ratings to the existing ones
    self.ratings_RDD = self.ratings_RDD.union(new_ratings_RDD)
    # This function is defined outside the class body, so Python does not
    # name-mangle `self.__x` here; the private methods defined inside the class
    # are only reachable under their mangled names.
    # Re-compute movie ratings count
    self._RecommendationEngine__count_and_average_ratings()
    # Re-train the ALS model with the new ratings
    self._RecommendationEngine__train_model()

    return ratings

# Attach the function to a class method
# (monkey-patch: every RecommendationEngine instance gains an `add_ratings` method).
RecommendationEngine.add_ratings = add_ratings

In [3]:
def __predict_ratings(self, user_and_movie_RDD):
    """Gets predictions for a given (userID, movieID) formatted RDD

    The pairs are turned into a DataFrame with the ``userId`` / ``movieId``
    columns the ALS model was configured with, scored with
    ``self.model.transform`` and then joined with the movie titles and the
    per-movie rating counts.

    Args:
        user_and_movie_RDD: RDD of ``(user_id, movie_id)`` pairs.

    Returns:
        an RDD with format (movieTitle, movieRating, numRatings). Pairs that
        the model drops, or whose movie id is missing from
        ``self.movies_titles_RDD`` or ``self.movies_rating_counts_RDD``, do
        not appear in the result.
        NOTE(review): both of those RDDs are assumed to be keyed by integer
        movie id -- verify.
    """
    # Local import so this function does not depend on the order in which the
    # notebook cells were run.
    from pyspark.sql.types import IntegerType, StructField, StructType

    # An explicit schema avoids type inference, which fails on an empty RDD.
    pair_schema = StructType([
        StructField("userId", IntegerType(), False),
        StructField("movieId", IntegerType(), False),
    ])
    pairs_df = user_and_movie_RDD.map(lambda p: (int(p[0]), int(p[1]))).toDF(pair_schema)

    predicted_df = self.model.transform(pairs_df)
    predicted_rating_RDD = predicted_df.rdd.map(lambda row: (row["movieId"], row["prediction"]))

    # After the two joins each record is (movieId, ((rating, title), count)).
    predicted_rating_title_and_count_RDD = \
        predicted_rating_RDD.join(self.movies_titles_RDD).join(self.movies_rating_counts_RDD)
    predicted_rating_title_and_count_RDD = \
        predicted_rating_title_and_count_RDD.map(lambda r: (r[1][0][1], r[1][0][0], r[1][1]))

    return predicted_rating_title_and_count_RDD
    
    
def get_top_ratings(self, user_id, movies_count):
    """Recommends up to movies_count top movies to user_id

    Args:
        user_id: id of the user to recommend for.
        movies_count: maximum number of recommendations to return.

    Returns:
        A list of ``(movie_id, predicted_rating)`` tuples in the order the
        model produced them, or an empty list when the model has no
        recommendations for ``user_id``. The list is JSON-serialisable.

    NOTE(review): the recommendations are taken from
    ``recommendForAllUsers`` as-is; nothing here removes movies the user has
    already rated -- confirm whether that filtering is required.
    """
    # Recommendations are generated for every user and then narrowed to one.
    # NOTE(review): on Spark versions that provide
    # ALSModel.recommendForUserSubset, that call would avoid the extra work.
    userRecs = self.model.recommendForAllUsers(movies_count)
    rows = userRecs.where(userRecs["userId"] == user_id).collect()
    if not rows:
        return []

    # Each recommendation is a (movieId, rating) struct; convert it to plain
    # Python values so the result can be passed straight to json.dumps.
    return [(int(rec["movieId"]), float(rec["rating"]))
            for rec in rows[0]["recommendations"]]

# Attach the functions to class methods.
# These assignments run outside the class body, so Python applies no private-name
# mangling: the first attribute is stored literally as `__predict_ratings`.
RecommendationEngine.__predict_ratings = __predict_ratings
RecommendationEngine.get_top_ratings = get_top_ratings

In [4]:
def get_ratings_for_movie_ids(self, user_id, movie_ids):
    """Predict the ratings user_id would give to each movie in movie_ids.

    Args:
        user_id: id of the user the predictions are for.
        movie_ids: list of movie ids to score.

    Returns:
        Whatever ``self.__predict_ratings(...).collect()`` yields for the
        ``(user_id, movie_id)`` pairs built from the arguments.
    """
    def pair_with_user(movie_id):
        # One (user, movie) request per movie id.
        return (user_id, movie_id)

    user_movie_pairs_RDD = self.sc.parallelize(movie_ids).map(pair_with_user)
    predictions_RDD = self.__predict_ratings(user_movie_pairs_RDD)
    return predictions_RDD.collect()

# Attach the function to a class method
# (monkey-patch: every RecommendationEngine instance gains `get_ratings_for_movie_ids`).
RecommendationEngine.get_ratings_for_movie_ids = get_ratings_for_movie_ids

In [5]:
# Blueprint that carries the REST routes defined below; create_app() registers
# it on the Flask application.
from flask import Blueprint
main = Blueprint('main', __name__)
 
import json
#from engine import RecommendationEngine
 
# Logging is configured again here with the same settings and logger name as
# in the first cell.
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
 
from flask import Flask, request
 
@main.route("/<int:user_id>/ratings/top/<int:count>", methods=["GET"])
def top_ratings(user_id, count):
    """GET handler: return the engine's top ``count`` ratings for ``user_id`` as JSON."""
    logger.debug("User %s TOP ratings requested", user_id)
    recommendations = recommendation_engine.get_top_ratings(user_id, count)
    response_body = json.dumps(recommendations)
    return response_body
 
@main.route("/<int:user_id>/ratings/<int:movie_id>", methods=["GET"])
def movie_ratings(user_id, movie_id):
    """GET handler: return the engine's predicted rating of ``movie_id`` for ``user_id`` as JSON."""
    logger.debug("User %s rating requested for movie %s", user_id, movie_id)
    # The engine API works on a list of ids, so wrap the single id.
    requested_ids = [movie_id]
    predicted = recommendation_engine.get_ratings_for_movie_ids(user_id, requested_ids)
    return json.dumps(predicted)
 
@main.route("/<int:user_id>/ratings", methods = ["POST"])
def add_ratings(user_id):
    """POST handler: add new ratings for ``user_id`` and retrain the engine.

    The ratings text is read from the first key of the parsed form data and
    must contain one ``movie_id,rating`` pair per line. Blank lines are
    ignored.

    Returns:
        The added ratings as a JSON list of ``[user_id, movie_id, rating]``
        entries, or a JSON error object with HTTP status 400 when the payload
        is empty or a line cannot be parsed.
    """
    # get the ratings from the Flask POST request object
    # (iterating works on both a list and a keys view, unlike indexing with [0])
    raw_ratings = next(iter(request.form.keys()), "")
    lines = [line for line in raw_ratings.strip().split("\n") if line.strip()]
    if not lines:
        return json.dumps({"error": "no ratings supplied"}), 400

    # create a list with the format required by the engine (user_id, movie_id, rating)
    # A real list is needed: it is consumed twice, by the engine and by json.dumps.
    try:
        ratings = [(user_id, int(fields[0]), float(fields[1]))
                   for fields in (line.split(",") for line in lines)]
    except (IndexError, ValueError):
        return json.dumps({"error": "each line must be 'movie_id,rating'"}), 400

    # add them to the model using the engine API
    recommendation_engine.add_ratings(ratings)

    return json.dumps(ratings)
 
def create_app(spark_context, movie, ratings):
    """Build the recommendation engine and the Flask application that serves it.

    The engine is stored in the module-level ``recommendation_engine`` global,
    which is where the route handlers look it up.

    Args:
        spark_context: forwarded to ``RecommendationEngine``.
        movie: forwarded to ``RecommendationEngine``.
        ratings: forwarded to ``RecommendationEngine``.

    Returns:
        A Flask application with the ``main`` blueprint registered.
    """
    global recommendation_engine

    # The engine is constructed before the web app is assembled.
    recommendation_engine = RecommendationEngine(spark_context, movie, ratings)

    flask_app = Flask(__name__)
    flask_app.register_blueprint(main)
    return flask_app

In [6]:
import time, sys, cherrypy, os
from paste.translogger import TransLogger
#from app import create_app
from pyspark import SparkContext, SparkConf
import pyspark
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DoubleType
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, lit
from pyspark.sql.functions import udf
from pyspark.sql.types import *
 
def init_spark_context():
    """Return a SparkContext configured for this application.

    The configuration sets the application name, the Redis port, the
    spark-redis jar and the executor/driver memory settings.

    Returns:
        The active SparkContext. If one already exists in this process (for
        example because the cell was run before) it is reused, in which case
        the configuration built here is not applied to it.
    """
    MAX_MEMORY = "5g"
    conf = SparkConf().setAll([("spark.app.name", "Spark_Processor"), ("spark.redis.port", "6379"),
                               ("spark.jars", "spark-redis-branch-2.4/target/spark-redis_2.11-2.5.0-SNAPSHOT-jar-with-dependencies.jar"),
                               ("spark.executor.memory", MAX_MEMORY), ("spark.driver.memory", MAX_MEMORY),
                               ("spark.memory.fraction", "0.6")])
    # getOrCreate instead of SparkContext(conf=conf): constructing a second
    # context in the same kernel raises, which made this cell fail on re-run.
    sc = SparkContext.getOrCreate(conf=conf)

    return sc
 
def run_server(app):
    """Serve a WSGI application with CherryPy and block until the server stops.

    Args:
        app: the WSGI callable to serve; it is wrapped in Paste's TransLogger
            and mounted at ``/``.
    """
    # Web server settings: auto-reload on, log to screen, listen on all
    # interfaces on port 5432.
    server_settings = {
        'engine.autoreload.on': True,
        'log.screen': True,
        'server.socket_port': 5432,
        'server.socket_host': '0.0.0.0',
    }

    # Wrap the app for WSGI access logging and mount it on the root path.
    cherrypy.tree.graft(TransLogger(app), '/')
    cherrypy.config.update(server_settings)

    # Start the CherryPy WSGI web server; block() keeps this call from returning.
    cherrypy.engine.start()
    cherrypy.engine.block()
 
 
if __name__ == "__main__":
    # Imported at the top of the script block rather than between statements.
    import pyspark.sql.functions as f

    # Init spark context and the session that shares its configuration
    sc = init_spark_context()
    spark = SparkSession.builder.config(conf = sc.getConf()).getOrCreate()

    # Movies are read from the "movie" table of the Redis data source.
    read_moviedf = spark.read.format("org.apache.spark.sql.redis").option("table", "movie").option("key.column", "title").load()
    movieDf = read_moviedf.sort("title")

    # Ratings are read from the CSV file with an explicit schema.
    datasets_path = os.path.join('..','movie_recommendation_system', 'datasets')
    complete_ratings_file = os.path.join(datasets_path, 'ml-latest', 'ratings.csv')
    ratingschema = StructType()\
        .add("userId", IntegerType(), True)\
        .add("movieId", IntegerType(), True)\
        .add("rating", DoubleType(), True)\
        .add("timeStamp", IntegerType(), True)
    # Keep exactly (userId, movieId, rating), in that order: the engine indexes
    # rating records positionally as (user, movie, rating), so a leading extra
    # column would shift every field by one.
    ratingdf = spark.read.format("csv")\
        .option("header",True)\
        .schema(ratingschema)\
        .load(complete_ratings_file)\
        .select("userId", "movieId", "rating")

    # Restrict the ratings to the movies that are present in the movie table.
    my_list = movieDf.select(f.collect_list('movieId')).first()[0]
    newRatingDf = ratingdf.filter(ratingdf['movieId'].isin(my_list))

    app = create_app(sc, movieDf, newRatingDf)

    # start web server
    run_server(app)

INFO:__main__:Starting up the Recommendation Engine: 
INFO:__main__:Counting movie ratings...
INFO:__main__:Training the ALS model...


NameError: name 'model' is not defined