In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
sc = SparkContext("local[1]", "MOVIE-LENS")
sc.setLogLevel("WARN")

---

### Source data

In [3]:
rdd_ratings = sc.textFile("hdfs://192.168.93.128:9000/input/movie_lens/ratings.csv")
rdd_movies = sc.textFile("hdfs://192.168.93.128:9000/input/movie_lens/movies.csv")

In [4]:
ratings_header = rdd_ratings.first()
ratings_rdd = (
    rdd_ratings
    .filter(lambda x: x != ratings_header)
    .map(lambda x: tuple(x.split(",")))
)

movies_header = rdd_movies.first()
movies_rdd = (
    rdd_movies
    .filter(lambda x: x != movies_header)
    .map(lambda x: tuple(x.split(",")))
)

---

### Data structure?

In [5]:
print(ratings_header, ' -- ', movies_header, end='\n'*2)
print(ratings_rdd.first(), ' -- ', movies_rdd.first())

userId,movieId,rating,timestamp  --  movieId,title,genres

('1', '1', '4.0', '964982703')  --  ('1', 'Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy')


---

### Initial data cleaning

In [6]:
import datetime as dt

In [7]:
ratings_rdd = ratings_rdd.map(lambda r: (int(r[0]), int(r[1]), float(r[2]), dt.datetime.fromtimestamp(int(r[-1]))))
ratings_rdd.take(2)

[(1, 1, 4.0, datetime.datetime(2000, 7, 31, 0, 15, 3)),
 (1, 3, 4.0, datetime.datetime(2000, 7, 30, 23, 50, 47))]

In [8]:
movies_keyed_rdd = movies_rdd.map(lambda r: (int(r[0]), (r[1], r[2])))
movies_keyed_rdd.take(2)

[(1, ('Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy')),
 (2, ('Jumanji (1995)', 'Adventure|Children|Fantasy'))]

---

### Ratings average?

In [9]:
ratings_keyed_rdd = ratings_rdd.map(lambda r: (r[1], r[2]))
ratings_keyed_rdd.take(2)

[(1, 4.0), (3, 4.0)]

In [10]:

"""
METHOD 1
.groupByKey(<calculation>)
"""

ratings_avg_byGroup = \
ratings_keyed_rdd.groupByKey().map(lambda r: (r[0], (round(sum(r[-1])/len(r[-1]), 2),)))

ratings_avg_byGroup.take(10)

[(1, (3.92,)),
 (3, (3.26,)),
 (6, (3.95,)),
 (47, (3.98,)),
 (50, (4.24,)),
 (70, (3.51,)),
 (101, (3.78,)),
 (110, (4.03,)),
 (151, (3.55,)),
 (157, (2.86,))]

In [11]:

"""
METHOD 2
.aggregateByKey(zeroOp, seqFunc, combFunc)
"""

zeroValue = (0.0, 0)
ratings_avg_byAggregate_rdd = \
ratings_keyed_rdd.aggregateByKey(zeroValue,
                                 lambda zero_values, rdd_value: (zero_values[0]+rdd_value,  # accumulate
                                                                 zero_values[1]+1),         # count
                                 lambda rddL, rddR: (rddL[0]+rddR[0],  # accumulate sums
                                                     rddL[1]+rddR[1])  # accumulate counts
                                )

In [12]:
ratings_avg_byAggregate = \
ratings_avg_byAggregate_rdd.map(lambda r: (r[0], (round(r[1][0]/r[1][1], 2),)))

ratings_avg_byAggregate.take(10)

[(1, (3.92,)),
 (3, (3.26,)),
 (6, (3.95,)),
 (47, (3.98,)),
 (50, (4.24,)),
 (70, (3.51,)),
 (101, (3.78,)),
 (110, (4.03,)),
 (151, (3.55,)),
 (157, (2.86,))]

<br>

__Both `.groupByKey()` & `.aggregateByKey()` yield the same result?__

In [13]:
ratings_avg_byGroup.take(10) == ratings_avg_byAggregate.take(10)

True

---

### Top 10 Movies by user ratings?

In [14]:
" (movieId, avg_rating) in descending order "

ratings_sorted_rdd = \
ratings_avg_byAggregate.sortBy(lambda r: r[1][0], ascending=False)

In [15]:
movieId_top10 = ratings_sorted_rdd.map(lambda r: r[0]).take(10)
movieId_top10

[131724, 5746, 6835, 3851, 1151, 1631, 2075, 176601, 92494, 102217]

In [16]:
movies_top10 = \
ratings_sorted_rdd.join(
    movies_keyed_rdd.filter(lambda r: r[0] in movieId_top10)
).map(lambda r: (r[1][-1][0], r[1][-1][-1], r[1][0][0])).collect()

movies_top10

[('The Jinx: The Life and Deaths of Robert Durst (2015)', 'Documentary', 5.0),
 ('Galaxy of Terror (Quest) (1981)', 'Action|Horror|Mystery|Sci-Fi', 5.0),
 ('Dylan Moran: Monster (2004)', 'Comedy|Documentary', 5.0),
 ('Alien Contamination (1980)', 'Action|Horror|Sci-Fi', 5.0),
 ("I'm the One That I Want (2000)", 'Comedy', 5.0),
 ('Lesson Faust (1994)', 'Animation|Comedy|Drama|Fantasy', 5.0),
 ('"Assignment', ' The (1997)"', 5.0),
 ('Mephisto (1981)', 'Drama|War', 5.0),
 ('Black Mirror', '(no genres listed)', 5.0),
 ('Bill Hicks: Revelations (1993)', 'Comedy', 5.0)]

#### Pretty print with pandas!

In [17]:
import pandas as pd

pd.DataFrame(movies_top10, columns={0: "Movie", 1: "Genre", 2: "User Rating"})

Unnamed: 0,0,1,2
0,The Jinx: The Life and Deaths of Robert Durst ...,Documentary,5.0
1,Galaxy of Terror (Quest) (1981),Action|Horror|Mystery|Sci-Fi,5.0
2,Dylan Moran: Monster (2004),Comedy|Documentary,5.0
3,Alien Contamination (1980),Action|Horror|Sci-Fi,5.0
4,I'm the One That I Want (2000),Comedy,5.0
5,Lesson Faust (1994),Animation|Comedy|Drama|Fantasy,5.0
6,"""Assignment","The (1997)""",5.0
7,Mephisto (1981),Drama|War,5.0
8,Black Mirror,(no genres listed),5.0
9,Bill Hicks: Revelations (1993),Comedy,5.0


---