In [5]:
# Run findspark.init() ahead of the pyspark imports in the next cell
# (presumably it makes the local Spark installation importable - confirm for your setup).
import findspark
findspark.init()

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.streaming import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import os

# Build (or reuse) the notebook's SparkSession: master "local", application name "Online".
spark = (
    SparkSession.builder
    .master("local")
    .appName("Online")
    .getOrCreate()
)

# Handle on the underlying SparkContext, used by the RDD-based cells.
sc = spark.sparkContext

In [25]:
# Session-wide Spark settings for this notebook.

# Log level passed to the SparkContext to keep console output short.
sc.setLogLevel("Error")

# Use a fixed number of shuffle partitions. Adaptive query execution (and its
# coalescePartitions feature) would make the partition count dynamic, so both
# are switched off to keep the value below in effect.
spark.conf.set("spark.sql.shuffle.partitions", 3)
spark.conf.set("spark.sql.adaptive.enabled", "false")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "false")

# Arrow-based conversion between Spark and pandas, with fallback to the
# non-Arrow path. Since Spark 3.0 these settings live under "arrow.pyspark.*";
# the former "spark.sql.execution.arrow.enabled" / "...arrow.fallback.enabled"
# keys are deprecated.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)
spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", True)

"""
Online coding round
https://www.learntospark.com/2020/11/spark-interview-question-coding-round.html
https://files.grouplens.org/datasets/movielens/ml-latest-small-README.html

1. Create a CSV file containing the list of movies with the number of users who rated each movie and the average rating per movie.
The file must have three columns, i.e. MovieId, No of users, Average rating. A header row is not required.
[Note - Use RDD for this task (no Dataset or DataFrames)]
2. Create a CSV file containing the list of unique genres and the number of movies under each genre.
The file should contain two columns, i.e. Genres, No of movies. Column headers are not required.
[Note - Use RDD for this task (no Dataset or DataFrames)]
3. Generate an output in Parquet format that contains the top 100 movies based on their ratings.
It should have the following fields: Rank (from 1 - 100), MovieId, Title, Average Rating.
"""

# Base location of the input files and of every output folder written by this notebook.
# It is concatenated directly with relative names such as "Online/ratings.dat",
# so it must end with a "/".
# Set the MOVIELENS_DATA_DIR environment variable to point somewhere else;
# without it the original hard-coded location is used.
filepath = os.environ.get(
    "MOVIELENS_DATA_DIR",
    "file:///C:/Users/venka/PycharmProjects/pythonProject/dataset/",
)

# sc.textFile yields an RDD with one string per line of the file; no header or
# schema handling is involved.
# The RDD is bound to `ratings` because the following cell inspects it under that name.
ratings = sc.textFile(filepath + "Online/ratings.dat")

# Backward-compatible alias: this RDD used to be called `users` even though it
# holds the contents of ratings.dat.
users = ratings

In [24]:
# Task 1: create a CSV file listing, for every movie, the number of users who rated it
# and its average rating.
# The file must have three columns (MovieId, No of users, Average rating); no header row.
# [Note - Use RDD for this task (no Dataset or DataFrames)]
# Peek at the first five raw lines; `ratings` must already have been assigned by a
# previously executed cell.
ratings.take(5)

['1::1193::5::978300760',
 '1::661::3::978302109',
 '1::914::3::978301968',
 '1::3408::4::978300275',
 '1::2355::5::978824291']

In [47]:
# Task 1 (RDD only): write one CSV line per movie -> MovieId, number of ratings, average rating.

# NOTE(review): this schema is not referenced by the RDD pipeline below; it is kept
# only so that any other cell referring to `ratings_sch` keeps working.
ratings_sch = StructType([
    StructField("userId",IntegerType(),False),
    StructField("movieId",IntegerType(),False),
    StructField("rating",IntegerType(),False),
    StructField("timestamp",IntegerType(),False)
])

ratings = sc.textFile(filepath + "Online/ratings.dat")


def _movie_rating_pair(line):
    """Turn one 'userId::movieId::rating::timestamp' line into (movieId, (rating, 1)).

    The line is split only once; the trailing 1 is the per-line counter that is
    summed up during aggregation.
    """
    fields = line.split("::")
    return int(fields[1]), (float(fields[2]), 1)


# RDD of (movieId, (rating, 1))
movie_ratings = ratings.map(_movie_rating_pair)

# Per movie: (sum of ratings, number of ratings)
total_ratings = movie_ratings.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))

# Per movie: average rating
average_ratings = total_ratings.mapValues(lambda x: x[0] / x[1])

# Per movie: number of ratings. This equals the number of users who rated the movie
# as long as each user rates a movie at most once - TODO confirm for this dataset.
user_counts = total_ratings.mapValues(lambda x: x[1])

# (movieId, (number of ratings, average rating)). Both values come from the same
# `total_ratings` record, so they are derived in one pass instead of joining
# `user_counts` with `average_ratings`, which would add another shuffle.
result = total_ratings.mapValues(lambda x: (x[1], x[0] / x[1]))

# Write the header-less CSV lines "movieId,count,average".
# NOTE: saveAsTextFile writes a folder of part files and fails if the target
# folder already exists, so remove it before re-running this cell.
result.map(lambda x: f"{x[0]},{x[1][0]},{x[1][1]}").saveAsTextFile(filepath + "Online/ratings_count")

In [45]:
# Show the first five (movieId, number of ratings) pairs.
user_counts.take(5)

[(1193, 1725), (661, 525), (914, 636), (3408, 1315), (2355, 1703)]

In [43]:
# Show the first five (movieId, (rating, 1)) pairs produced by the parsing step.
movie_ratings.take(5)

[(1193, (5.0, 1)),
 (661, (3.0, 1)),
 (914, (3.0, 1)),
 (3408, (4.0, 1)),
 (2355, (5.0, 1))]

In [63]:
# Task 2 (RDD only - no Dataset, no DataFrame):
# write a header-less CSV with one "Genre,NumberOfMovies" line per distinct genre.


def _split_genres(movie_line):
    """Return the '|'-separated entries of the third '::'-delimited field of a movies.dat line."""
    return movie_line.split("::")[2].split("|")


def _as_csv_line(genre_and_count):
    """Format a (genre, count) pair as 'genre,count'."""
    genre, count = genre_and_count
    return f"{genre},{count}"


movies = sc.textFile(filepath + "Online/movies.dat")

# One RDD element per (movie, genre) occurrence.
movie_genres = movies.flatMap(_split_genres)

# Pair every occurrence with 1, then sum the ones per genre.
genre_ones = movie_genres.map(lambda name: (name, 1))
genre_counts = genre_ones.reduceByKey(lambda left, right: left + right)

# Persist the per-genre totals as text lines.
csv_lines = genre_counts.map(_as_csv_line)
csv_lines.saveAsTextFile(filepath + "Online/genre_count")

In [65]:
# Show the first five raw lines of movies.dat.
movies.take(5)

["1::Toy Story (1995)::Animation|Children's|Comedy",
 "2::Jumanji (1995)::Adventure|Children's|Fantasy",
 '3::Grumpier Old Men (1995)::Comedy|Romance',
 '4::Waiting to Exhale (1995)::Comedy|Drama',
 '5::Father of the Bride Part II (1995)::Comedy']

In [66]:
# Bring every (genre, count) pair to the driver for display.
genre_counts.collect()

[('Animation', 105),
 ("Children's", 251),
 ('Comedy', 1200),
 ('Adventure', 283),
 ('Fantasy', 68),
 ('Romance', 471),
 ('Drama', 1603),
 ('Action', 503),
 ('Crime', 211),
 ('Thriller', 492),
 ('Horror', 343),
 ('Sci-Fi', 276),
 ('Documentary', 127),
 ('War', 143),
 ('Musical', 114),
 ('Mystery', 106),
 ('Film-Noir', 44),
 ('Western', 68)]

In [69]:
# Task 3: generate a Parquet output with the top 100 movies by average rating.
# Required fields, in this order: Rank (1 - 100), MovieId, Title, AverageRating.

# Raw ratings lines as a single-column ("value") DataFrame.
ratings_df = spark.read.text(filepath + "Online/ratings.dat")

# Second and third '::'-separated fields of every line: MovieId and Rating.
ratings_parsed_df = ratings_df.selectExpr(
    "CAST(split(value, '::')[1] AS INT) AS MovieId",
    "CAST(split(value, '::')[2] AS FLOAT) AS Rating",
)

# Average rating per movie; the aggregate column is named "avg(Rating)".
average_ratings_df = ratings_parsed_df.groupBy("MovieId").agg({"Rating": "avg"})

# The 100 best-rated movies. MovieId is a tie-breaker so that movies sharing the same
# average are selected and ranked deterministically.
ranked_movies_df = (
    average_ratings_df
    .withColumnRenamed("avg(Rating)", "AverageRating")
    .orderBy(col("AverageRating").desc(), col("MovieId").asc())
    .limit(100)
)

# Raw movies lines, then the first two '::'-separated fields: MovieId and Title.
movies_raw_df = spark.read.text(filepath + "Online/movies.dat")
movies_df = movies_raw_df.selectExpr(
    "CAST(split(value, '::')[0] AS INT) AS MovieId",
    "split(value, '::')[1] AS Title",
)

# Rank 1 must be the best-rated movie, so the window is ordered by AverageRating
# descending (same ordering as the top-100 selection above).
# A window without partitionBy is evaluated in a single partition; that is fine
# for at most 100 rows.
rank_window = Window.orderBy(col("AverageRating").desc(), col("MovieId").asc())

# Inner join: a top-100 MovieId with no entry in movies.dat is dropped from the result.
top_movies_df = (
    ranked_movies_df
    .join(movies_df, "MovieId")
    .withColumn("Rank", row_number().over(rank_window))
    .select("Rank", "MovieId", "Title", "AverageRating")
)

# Save the result as Parquet, as the task requires.
# NOTE: the write fails if the target folder already exists (e.g. from an earlier
# CSV run), so remove it before re-running this cell.
top_movies_df.write.parquet(filepath + "Online/top100_movies")

In [70]:
# Display the first rows of the per-movie averages (columns MovieId and avg(Rating)).
average_ratings_df.show()

+-------+------------------+
|MovieId|       avg(Rating)|
+-------+------------------+
|   1193| 4.390724637681159|
|    661|3.4647619047619047|
|    914| 4.154088050314465|
|   2355| 3.854374633000587|
|    919| 4.247962747380675|
|   2918|  4.11744738628649|
|   1035| 3.931972789115646|
|   2791|3.9711149624494513|
|   3105|3.7808823529411764|
|    720| 4.426940639269406|
|   1721|3.5834411384217333|
|   2294| 3.483720930232558|
|   3186|3.4779582366589326|
|   1566| 3.279317697228145|
|    783| 3.223076923076923|
|   1961| 4.053383458646617|
|   1962|3.8773006134969323|
|   2692| 4.224813432835821|
|   1028| 3.894164193867458|
|   1029| 3.688380281690141|
+-------+------------------+
only showing top 20 rows

