In [None]:
"""
REQUIREMENT:
    Refactor SparkMovieData notebook 
      1. ratings.csv, movies.csv should be loaded from hadoop
      2. Spark Session using SparkConf, use 4 executor cores,
                                         max 4 executor cores, use Spark Cluster
      3. Write the result to hdfs 
            mostPopularMoviesDf.write.mode('overwrite')\
                              .csv("hdfs:....../output/top-movies.csv")

My Understanding:
    Read from HDFS

    spark-config
    4 executor cores
    4 max cores

    Write back to HDFS
"""

In [1]:
# findspark makes the local Spark installation's pyspark package importable;
# it has to run before any `pyspark` import in the cells below.
import findspark
findspark.init()

In [2]:


"""
Since Spark 2.x, Spark unified Spark APIs, DF, Datasets, & SQL.
SparkSession uses SparkContext internally.
"""

from pyspark.conf import SparkConf

config = SparkConf()
config.setMaster("spark://192.168.11.77:7077").setAppName("MOVIELENSonCLUSTER")


<pyspark.conf.SparkConf at 0x1f419fb7c88>

In [3]:
"""
Configure before creating SparkSession
"""

conf = \
(
    config
    .set("spark.executor.memory", "2g")
    .set("spark.executor.cores", 4)
    .set("spark.cores.max", 4)
    .set("spark.driver.memory", "2g")
)


In [4]:
from pyspark.sql import SparkSession

# Build the session from the prepared SparkConf (or reuse an existing one).
ss = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [5]:
# Echo the session object to confirm it was created.
ss

In [6]:

"""
Read CSV from HDFS
"""

from pyspark.sql.types import StructType, IntegerType, DoubleType, StringType, LongType

schema_movies = (
    StructType()
    .add("movieId", IntegerType(), True)
    .add("title", StringType(), True)
    .add("genres", StringType(), True)
)
schema_ratings = (
    StructType()
    .add("userId", IntegerType(), True)
    .add("movieId", IntegerType(), True)
    .add("rating", DoubleType(), True)
    .add("timestamp", LongType(), True)
)

df_movies_full = (
    ss.read
    .format("csv")
    .option("header", True)
    .schema(schema_movies)
    .load("hdfs://192.168.93.128:9000/input/movie_lens/movies.csv")
)

df_ratings_full = (
    ss.read
    .format("csv")
    .option("header", True)
    .schema(schema_ratings)
    .load("hdfs://192.168.93.128:9000/input/movie_lens/ratings.csv")
)

In [9]:
# Sanity check: first row of each DataFrame, confirming the HDFS read and schemas work.
df_movies_full.first(), df_ratings_full.first()

(Row(movieId=1, title='Toy Story (1995)', genres='Adventure|Animation|Children|Comedy|Fantasy'),
 Row(userId=1, movieId=1, rating=4.0, timestamp=964982703))

In [11]:
# Show the schemas actually attached to both DataFrames.
df_movies_full.schema, df_ratings_full.schema

(StructType(List(StructField(movieId,IntegerType,true),StructField(title,StringType,true),StructField(genres,StringType,true))),
 StructType(List(StructField(userId,IntegerType,true),StructField(movieId,IntegerType,true),StructField(rating,DoubleType,true),StructField(timestamp,LongType,true))))

In [13]:
"""
Sorted averaged ratings
"""

df_ratings_avg = \
df_ratings_full.groupBy("movieId").mean().sort("avg(rating)", ascending=False)[["movieId", "avg(rating)"]]

df_ratings_avg.show(2)

+-------+-----------+
|movieId|avg(rating)|
+-------+-----------+
| 157775|        5.0|
|   8911|        5.0|
+-------+-----------+
only showing top 2 rows



In [27]:
"""
Fetch top 10 first, then join with movies
"""



"""
top = 100000
df_ratings_top = df_ratings_avg.limit(top) --> Will reduce the dataset into 1 partition (outputs a single HDFS file)
"""
df_ratings_top = df_ratings_avg


df_top10 = df_ratings_top.join(df_movies_full, on="movieId", how="inner")
df_top10 = df_top10.withColumnRenamed("avg(rating)", "rating_average")

In [28]:
# Write the joined result to HDFS as CSV, replacing any previous output.
# Each partition of df_top10 is written as its own part file in this directory.
(
    df_top10.write
    .mode('overwrite')
    .csv("hdfs://192.168.93.128:9000/output/all-movies")
)

"""
df_top10.write.mode('overwrite').option("header", True).csv("hdfs://192.168.93.128:9000/output/all-movies")
"""

'\n.option("header", True)\ndf_top10.write.mode(\'overwrite\').csv("hdfs://192.168.93.128:9000/output/all-movies")\n'

In [29]:
# Number of partitions of the result; each one becomes a separate part file when written.
df_top10.rdd.getNumPartitions()

200

In [None]:

"""
Each of those partitions above writes a separate file in HDFS
BAD Solution: .repartition(1) --> performance overhead + loses sorting
"""

df_top10.repartition(1).write.mode('overwrite').csv("hdfs://192.168.93.128:9000/output/all-movies")

In [None]:

"""
Each of those partitions above writes a separate file in HDFS
BETTER Solution: .coalesce() --> may reduce shuffling, but cannot totally stop shuffling

Coalesce helps in repartitioning, but with minimal reshuffling than .repartition(n)
Coalesce will try and reduce shuffling that usually happens while repartitioning.
"""

df_top10.coalesce().write.mode('overwrite').csv("hdfs://192.168.93.128:9000/output/all-movies")