In [None]:
# pip install pyspark

In [3]:
from pyspark import SparkConf, SparkContext

In [4]:
def loadMovieNames(path="ml-100k/u.item"):
    """Build a lookup table from movie ID to movie title.

    Each line of the file is pipe-delimited; the first field is the numeric
    movie ID and the second field is the title.

    Args:
        path: Location of the ``u.item`` file. Defaults to the relative path
            this notebook has always read from.

    Returns:
        dict mapping ``int`` movie ID to ``str`` title.
    """
    movieNames = {}
    # The MovieLens 100k u.item file is Latin-1 encoded; decoding it with the
    # platform default (usually UTF-8) fails on titles with accented characters.
    with open(path, encoding="ISO-8859-1") as f:
        for line in f:
            fields = line.split("|")
            # Skip blank or truncated lines instead of raising IndexError.
            if len(fields) < 2:
                continue
            movieNames[int(fields[0])] = fields[1]
    return movieNames

def parseInput(line):
    """Convert one raw ratings line into a ``(movieID, (rating, 1.0))`` pair.

    The line is split on whitespace; field 1 is read as the movie ID and
    field 2 as the rating. The constant ``1.0`` is a per-record count so that
    a later ``reduceByKey`` can sum ratings and counts together.

    Args:
        line: One whitespace-delimited record from the ratings file.

    Returns:
        Tuple ``(int movieID, (float rating, 1.0))``.
    """
    fields = line.split()
    return (int(fields[1]), (float(fields[2]), 1.0))
    

In [None]:
if __name__ == "__main__":
    # getOrCreate reuses a context that is already running, so re-executing
    # this cell in a live kernel does not fail with "multiple SparkContexts".
    conf = SparkConf().setAppName("WorstMovies")
    sc = SparkContext.getOrCreate(conf=conf)

    # Movie ID -> title lookup, used only when printing the results.
    movieNames = loadMovieNames()

    # Load the raw u.data file.
    lines = sc.textFile("hdfs:///user/maria_dev/ml-100k/u.data")

    # Convert to (movieID, (rating, 1.0)).
    movieRatings = lines.map(parseInput)

    # Reduce to (movieID, (sumOfRatings, totalRatings)).
    ratingTotalsAndCount = movieRatings.reduceByKey(
        lambda movie1, movie2: (movie1[0] + movie2[0], movie1[1] + movie2[1])
    )

    # Map to (movieID, averageRating).
    avgRatings = ratingTotalsAndCount.mapValues(
        lambda tot_count: tot_count[0] / tot_count[1]
    )

    # Ascending sort on the average, so the lowest-rated movies come first.
    sortMovie = avgRatings.sortBy(lambda x: x[1])

    res = sortMovie.take(10)

    for r in res:
        print(movieNames[r[0]], r[1])

To run a Spark script, start the command with `spark-submit` in the SSH session.

Spark 2.0
- Extends the RDD into a "DataFrame" object

In [None]:
def parseInput2(line):
    """Convert one raw ratings line into a ``Row(movieID, rating)``.

    The line is split on whitespace; field 1 is read as the movie ID and
    field 2 as the rating. ``Row`` (from ``pyspark.sql``) must already be
    imported by the time this function is called.

    Args:
        line: One whitespace-delimited record from the ratings file.

    Returns:
        ``Row`` with an ``int`` ``movieID`` and a ``float`` ``rating``.
    """
    fields = line.split()
    return Row(movieID=int(fields[1]), rating=float(fields[2]))

In [None]:
#Spark 2.0 version

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions

if __name__ == "__main__":
    # Start a Spark session, or attach to one that is already running.
    spark = SparkSession.builder.appName("PopularMovies").getOrCreate()

    # Movie ID -> title lookup, used only when printing the results.
    movieNames = loadMovieNames()

    # Raw ratings file -> RDD of Row(movieID, rating) -> DataFrame.
    lines = spark.sparkContext.textFile("hdfs:///user/maria_dev/ml-100k/u.data")
    movies = lines.map(parseInput2)
    movieData = spark.createDataFrame(movies)

    # Per-movie aggregates: mean rating and number of ratings.
    ratingsByMovie = movieData.groupBy("movieID")
    avgRating = ratingsByMovie.avg("rating")
    counts = ratingsByMovie.count()

    # Joined columns, in order: movieID, count, avg(rating).
    avg_count = counts.join(avgRating, "movieID")

    # orderBy sorts ascending, so these are the ten lowest average ratings.
    topTen = avg_count.orderBy("avg(rating)").take(10)

    for movieID, numRatings, meanRating in topTen:
        print(movieNames[movieID], numRatings, meanRating)

Now something a bit more complicated: movie recommendations using Spark MLlib

In [None]:
def parseInput3(line):
    """Convert one raw ratings record into a ``Row(userID, movieID, rating)``.

    The record text is split on whitespace; fields 0, 1 and 2 are read as the
    user ID, movie ID and rating. ``Row`` (from ``pyspark.sql``) must already
    be imported by the time this function is called.

    Args:
        line: Either a plain string, or a record exposing the text in a
            ``value`` attribute (what ``spark.read.text(...).rdd`` yields).

    Returns:
        ``Row`` with ``int`` ``userID``, ``int`` ``movieID`` and ``float``
        ``rating``.
    """
    # Records from spark.read.text(...).rdd are Row(value=...) objects with no
    # .split(); plain strings have no .value, so they fall through unchanged.
    text = getattr(line, "value", line)
    fields = text.split()
    return Row(userID=int(fields[0]), movieID=int(fields[1]), rating=float(fields[2]))

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql.functions import lit

if __name__ == "__main__":
    # Start a Spark session, or attach to one that is already running.
    spark = SparkSession.builder.appName("PopularMovies").getOrCreate()

    # Movie ID -> title lookup, used only when printing the results.
    movieNames = loadMovieNames()

    # spark.read.text returns a DataFrame with a single "value" column, so
    # unwrap each Row to its plain string before handing it to the parser.
    lines = spark.read.text("hdfs:///user/maria_dev/ml-100k/u.data").rdd.map(
        lambda row: row.value
    )

    movies = lines.map(parseInput3)

    # Cache the DataFrame so Spark does not rebuild it for every action below.
    movieData = spark.createDataFrame(movies).cache()

    # Train the ALS collaborative-filtering model on the (user, movie, rating) data.
    als = ALS(maxIter=5, regParam=0.01, userCol='userID', itemCol="movieID", ratingCol="rating")
    model = als.fit(movieData)

    # NOTE(review): user 0 is assumed to have been added to u.data by hand with
    # a few ratings; if it is absent this loop prints nothing - confirm.
    print("\nRatings for user 0:")
    userRatings = movieData.filter("userID=0")
    for rating in userRatings.collect():
        print(movieNames[rating['movieID']], rating['rating'])

    print('\nTop 20 Recommendations:')

    # Keep only movies rated more than 100 times.
    ratingCounts = movieData.groupBy('movieID').count().filter('count>100')
    # Build a test DataFrame pairing user 0 with each of those popular movies.
    popularMovies = ratingCounts.select('movieID').withColumn('userID', lit(0))

    # Run the model on that list of popular movies for user 0.
    recommendations = model.transform(popularMovies)

    # Highest predicted rating first.
    topRecommendations = recommendations.sort(recommendations.prediction.desc()).take(20)

    for recommendation in topRecommendations:
        print(movieNames[recommendation['movieID']], recommendation['prediction'])