In [1]:
import findspark
# Run before the `pyspark` import in the next cell.
# NOTE(review): presumably this locates the local Spark installation and makes
# `pyspark` importable -- confirm against the findspark documentation.
findspark.init()

In [2]:
from pyspark import SparkContext

# One shared SparkContext for the whole notebook: master "local[1]" and
# application name "MOVIE-LENS".
sc = SparkContext(master="local[1]", appName="MOVIE-LENS")

# Limit Spark's own logging to WARN and above to keep cell output readable.
sc.setLogLevel("WARN")

---

### Source data

In [None]:
# Base HDFS location of the MovieLens CSV files. Kept in one place so the
# notebook can be pointed at another cluster/directory with a single edit.
MOVIE_LENS_HDFS_DIR = "hdfs://192.168.93.128:9000/input/movie_lens"

# RDDs of raw text lines, one element per line of each CSV file.
rdd_ratings = sc.textFile(MOVIE_LENS_HDFS_DIR + "/ratings.csv")
rdd_movies = sc.textFile(MOVIE_LENS_HDFS_DIR + "/movies.csv")

In [None]:
import csv


def _split_header_and_rows(lines_rdd):
    """Separate a CSV text RDD into its header line and parsed data rows.

    Parameters
    ----------
    lines_rdd : RDD of str
        Raw lines of a CSV file whose first line is the header.

    Returns
    -------
    (str, RDD of tuple of str)
        The unparsed header line, and an RDD with one tuple of string fields
        per remaining line.

    Notes
    -----
    Fields are parsed with ``csv.reader`` rather than ``str.split(",")`` so
    that quoted fields containing commas (e.g. a title such as
    ``"American President, The (1995)"``) stay in a single column instead of
    shifting every later column.
    """
    header = lines_rdd.first()
    rows = (
        lines_rdd
        # Drop every line equal to the header (same rule as a plain != filter).
        .filter(lambda line: line != header)
        # csv.reader expects an iterable of lines; feed it one line at a time.
        .map(lambda line: tuple(next(csv.reader([line]))))
    )
    return header, rows


ratings_header, ratings_rdd = _split_header_and_rows(rdd_ratings)
movies_header, movies_rdd = _split_header_and_rows(rdd_movies)

---

### What does the data look like?

In [None]:
# Show the two header lines side by side, leave a blank line, then show the
# first parsed row of each dataset in the same layout.
divider = ' -- '
headers = (ratings_header, divider, movies_header)
first_rows = (ratings_rdd.first(), divider, movies_rdd.first())

print(*headers, end='\n'*2)
print(*first_rows)

---

### Initial data cleaning

In [None]:
import datetime as dt

In [None]:
def _to_typed_rating(r):
    """Convert one row of string fields into (int, int, float, datetime).

    The first two fields become ints, the third a float, and the last field
    (an integer epoch timestamp) becomes a local-time ``datetime``.
    Presumably these are userId, movieId, rating and timestamp -- confirm
    against the printed ratings header.
    """
    first_id = int(r[0])
    second_id = int(r[1])
    score = float(r[2])
    rated_at = dt.datetime.fromtimestamp(int(r[-1]))
    return (first_id, second_id, score, rated_at)


# NOTE: this rebinds `ratings_rdd`, so executing the cell a second time would
# apply the conversion to rows that are already converted.
ratings_rdd = ratings_rdd.map(_to_typed_rating)
ratings_rdd.take(2)

In [None]:
def _key_movie_row(r):
    """Re-shape a movie row into (int(first field), (second, third field)).

    Presumably (movieId, (title, genres)) -- confirm against the printed
    movies header.
    """
    key = int(r[0])
    payload = (r[1], r[2])
    return (key, payload)


# Pair RDD keyed by the integer id, ready for key-based operations.
movies_keyed_rdd = movies_rdd.map(_key_movie_row)
movies_keyed_rdd.take(2)

---

### Average rating per movie

In [None]:
def _second_and_third(r):
    """Keep only fields 1 and 2 of a typed rating row as a (key, value) pair."""
    return (r[1], r[2])


# Pair RDD used as the input of both averaging methods below.
ratings_keyed_rdd = ratings_rdd.map(_second_and_third)
ratings_keyed_rdd.take(2)

In [None]:

"""
METHOD 1
.groupByKey(<calculation>)
"""


def _group_mean(r):
    """Turn (key, grouped values) into (key, (mean rounded to 2 decimals,))."""
    key, values = r[0], r[-1]
    mean = round(sum(values)/len(values), 2)
    # The mean is wrapped in a 1-tuple, matching the shape produced by METHOD 2.
    return (key, (mean,))


# Collect all values of a key together, then average each group.
grouped_rdd = ratings_keyed_rdd.groupByKey()
ratings_avg_byGroup = grouped_rdd.map(_group_mean)

ratings_avg_byGroup.take(10)

In [None]:

"""
METHOD 2
.aggregateByKey(zeroOp, seqFunc, combFunc)
"""


def _fold_value(acc, value):
    """Fold one value into a running (sum, count) accumulator."""
    running_sum, running_count = acc[0], acc[1]
    return (running_sum+value, running_count+1)


def _merge_accumulators(left, right):
    """Merge two (sum, count) accumulators built for the same key."""
    return (left[0]+right[0], left[1]+right[1])


# Starting accumulator for every key: sum 0.0, count 0.
zeroValue = (0.0, 0)

# Result elements have the form (key, (sum of values, number of values)).
ratings_avg_byAggregate_rdd = ratings_keyed_rdd.aggregateByKey(
    zeroValue,
    _fold_value,
    _merge_accumulators,
)

In [None]:
def _mean_from_sum_count(r):
    """Turn (key, (sum, count)) into (key, (mean rounded to 2 decimals,))."""
    key = r[0]
    total, count = r[1][0], r[1][1]
    return (key, (round(total/count, 2),))


# Same output shape as METHOD 1 so the two results can be compared directly.
ratings_avg_byAggregate = ratings_avg_byAggregate_rdd.map(_mean_from_sum_count)

ratings_avg_byAggregate.take(10)

<br>

__Do `.groupByKey()` and `.aggregateByKey()` yield the same result?__

In [None]:
# Comparing `take(10)` would check only ten rows and would depend on the order
# in which each RDD happens to yield them. Compare the complete results of both
# methods instead, sorted so that element order cannot affect the answer.
sorted(ratings_avg_byGroup.collect()) == sorted(ratings_avg_byAggregate.collect())

---

### Top 10 movies by average user rating

In [None]:
" (movieId, avg_rating) in descending order "


def _avg_of(r):
    """Sort key: the average stored inside the 1-tuple value of (key, (avg,))."""
    return r[1][0]


# Highest averages first.
ratings_sorted_rdd = ratings_avg_byAggregate.sortBy(_avg_of, ascending=False)

In [None]:
# Keys of the first ten elements of the sorted RDD, as a plain Python list.
sorted_keys_rdd = ratings_sorted_rdd.keys()
movieId_top10 = sorted_keys_rdd.take(10)
movieId_top10

In [None]:
# Set for O(1) membership tests inside the Spark filters, and a rank lookup
# (0 = best) to restore the ordering after the join.
top10_ids = set(movieId_top10)
top10_rank = {movie_id: rank for rank, movie_id in enumerate(movieId_top10)}

# Restrict BOTH sides to the ten selected ids before joining, so the join only
# has to shuffle a handful of rows instead of the whole ratings RDD.
# Joined elements have the form (id, ((avg,), (field 1, field 2))).
top10_joined = (
    ratings_sorted_rdd
    .filter(lambda r: r[0] in top10_ids)
    .join(movies_keyed_rdd.filter(lambda r: r[0] in top10_ids))
    .collect()
)

# `join` does not keep the sort order of its input, so re-order the collected
# rows by their position in `movieId_top10` (descending average rating).
top10_joined.sort(key=lambda r: top10_rank[r[0]])

# Flatten each row to (movie field 1, movie field 2, average rating).
movies_top10 = [
    (movie_fields[0], movie_fields[-1], avg[0])
    for _movie_id, (avg, movie_fields) in top10_joined
]

movies_top10

#### Pretty print with pandas!

In [None]:
import pandas as pd

# `columns` takes a list-like of labels. A dict is not a rename mapping here:
# pandas would build the column index from its keys (0, 1, 2) and drop the
# intended names. Pass the labels directly, in the order of the tuple fields.
pd.DataFrame(movies_top10, columns=["Movie", "Genre", "User Rating"])

---