# ALS (MATRIX-FACTORIZATION) COLLABORATIVE FILTERING

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql.functions import lit

# Load up movie ID -> movie name dictionary
def loadMovieNames(path="/home/vijay/Desktop/Studies/Data world/github/Recommended System/ml-100k/ml-100k/u.item"):
    """Build a movie ID -> movie title dictionary from a pipe-delimited file.

    Each usable line is expected to look like ``<movieID>|<title>|...``; only
    the first two fields are read.

    Args:
        path: Location of the item file. Defaults to the original hard-coded
            location so existing zero-argument calls keep working.

    Returns:
        dict mapping the integer movie ID to its title string.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a non-blank line has a non-integer first field.
    """
    movieNames = {}
    # The file is decoded as ISO-8859-1 (not UTF-8), as in the original code.
    with open(path, encoding="ISO-8859-1") as f:
        for line in f:
            fields = line.split('|')
            # Skip blank / pipe-less lines (e.g. a trailing empty line)
            # instead of crashing on int('') or a missing title field.
            if len(fields) < 2 or not fields[0].strip():
                continue
            movieNames[int(fields[0])] = fields[1]
    return movieNames

# Convert u.data lines into (userID, movieID, rating) rows
def parseInput(line):
    """Turn one text record into a Row(userID, movieID, rating).

    ``line`` must expose the raw text as ``line.value``. The text is split on
    whitespace; the first two tokens are parsed as integers (user and movie
    IDs) and the third as a float (the rating). Any further tokens are
    ignored.
    """
    tokens = line.value.split()
    user_id = int(tokens[0])
    movie_id = int(tokens[1])
    score = float(tokens[2])
    return Row(userID=user_id, movieID=movie_id, rating=score)



In [2]:

if __name__ == "__main__":
    # Script entry point: train an ALS model on the full ratings file, then
    # print user 0's existing ratings and the 20 highest predicted ratings
    # among movies rated more than 100 times.

    # Create (or reuse) the SparkSession that drives the whole job
    spark = SparkSession.builder.appName("MovieRecs").getOrCreate()

    # try/finally guarantees the session is stopped even if loading,
    # training or scoring raises part-way through.
    try:
        # Load up our movie ID -> name dictionary
        movieNames = loadMovieNames()

        # Get the raw data, one Row(value=<text line>) per rating record
        lines = spark.read.text("/home/vijay/Desktop/Studies/Data world/github/Recommended System/ml-100k/ml-100k/u.data").rdd

        # Convert it to a RDD of Row objects with (userID, movieID, rating)
        ratingsRDD = lines.map(parseInput)

        # Convert to a DataFrame and cache it (reused by fit, filter and groupBy)
        ratings = spark.createDataFrame(ratingsRDD).cache()

        # Create an ALS collaborative filtering model from the complete data set.
        # coldStartStrategy="drop" removes rows whose prediction is NaN (user or
        # movie unseen during fit); otherwise NaN sorts above every real number
        # in the descending sort below and would crowd out real recommendations.
        als = ALS(maxIter=10, regParam=0.001, userCol="userID", itemCol="movieID",
                  ratingCol="rating", coldStartStrategy="drop")
        model = als.fit(ratings)

        # Print out ratings from user 0:
        print("\nRatings for user ID 0:")
        userRatings = ratings.filter("userID = 0")
        for rating in userRatings.collect():
            print(movieNames[rating['movieID']], rating['rating'])

        print("\nTop 20 recommendations:")
        # Find movies rated more than 100 times
        ratingCounts = ratings.groupBy("movieID").count().filter("count > 100")
        # Construct a "test" dataframe for user 0 with every movie rated more than 100 times
        popularMovies = ratingCounts.select("movieID").withColumn('userID', lit(0))

        # Run our model on that list of popular movies for user ID 0
        recommendations = model.transform(popularMovies)

        # Get the top 20 movies with the highest predicted rating for this user
        topRecommendations = recommendations.sort(recommendations.prediction.desc()).take(20)

        for recommendation in topRecommendations:
            print(movieNames[recommendation['movieID']], recommendation['prediction'])
    finally:
        spark.stop()


Ratings for user ID 0:
Empire Strikes Back, The (1980) 5.0
Gone with the Wind (1939) 1.0
Star Wars (1977) 5.0

Top 20 recommendations:
Army of Darkness (1993) 6.030475616455078
Mystery Science Theater 3000: The Movie (1996) 5.912016868591309
Ace Ventura: Pet Detective (1994) 5.344579696655273
Princess Bride, The (1987) 5.124341011047363
Star Wars (1977) 5.003631591796875
Empire Strikes Back, The (1980) 4.993403434753418
Blues Brothers, The (1980) 4.920345306396484
Return of the Jedi (1983) 4.721105098724365
Jackie Chan's First Strike (1996) 4.7001261711120605
Terminator, The (1984) 4.6889543533325195
Highlander (1986) 4.640594959259033
Raiders of the Lost Ark (1981) 4.6026201248168945
Beavis and Butt-head Do America (1996) 4.545560359954834
Nightmare on Elm Street, A (1984) 4.451655864715576
Star Trek: The Wrath of Khan (1982) 4.4491119384765625
Con Air (1997) 4.395421028137207
Hudsucker Proxy, The (1994) 4.2855305671691895
Austin Powers: International Man of Mystery (1997) 4.25958442