In [105]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from datetime import datetime
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import pyspark.sql.functions as func
from pyspark.sql.functions import col
from pyspark.sql import Window


# Initialize SparkSession
spark = SparkSession.builder.appName("Final_Project").getOrCreate()

# Read the files
movies_path  = "movies.csv"
ratings_path = "ratings.csv"
tags_path = "tags.csv"

movie_dataset = spark.read.csv(movies_path, header=True, inferSchema=True)
ratings_dataset = spark.read.csv(ratings_path, header=True, inferSchema=True)
tags_dataset = spark.read.csv(tags_path, header=True, inferSchema=True)

movie_dataset.printSchema()
ratings_dataset.printSchema()
tags_dataset.printSchema()



root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- tag: string (nullable = true)
 |-- timestamp: integer (nullable = true)



In [22]:
#Q1: Extract the most rated top 5 movies?

print("Extract the most rated top 5 movies?")

movie_dataset.join(ratings_dataset,movie_dataset.movieId ==  ratings_dataset.movieId,"inner").sort(ratings_dataset.rating.desc()).show(5)



+-------+--------------------+--------------------+------+-------+------+---------+
|movieId|               title|              genres|userId|movieId|rating|timestamp|
+-------+--------------------+--------------------+------+-------+------+---------+
|    101|Bottle Rocket (1996)|Adventure|Comedy|...|     1|    101|   5.0|964980868|
|    216|Billy Madison (1995)|              Comedy|     1|    216|   5.0|964981208|
|    151|      Rob Roy (1995)|Action|Drama|Roma...|     1|    151|   5.0|964984041|
|    157|Canadian Bacon (1...|          Comedy|War|     1|    157|   5.0|964984100|
|     47|Seven (a.k.a. Se7...|    Mystery|Thriller|     1|     47|   5.0|964983815|
+-------+--------------------+--------------------+------+-------+------+---------+
only showing top 5 rows



In [23]:
#Q2: Specify the number of movies according to the tags?

print("Specify the number of movies according to the tags?")
movie_dataset.join(tags_dataset,movie_dataset.movieId ==  tags_dataset.movieId,"inner").groupBy("tag").count().sort(col("count").desc()).show(5)

+-----------------+-----+
|              tag|count|
+-----------------+-----+
| In Netflix queue|  131|
|      atmospheric|   36|
|thought-provoking|   24|
|        superhero|   24|
|          surreal|   23|
+-----------------+-----+
only showing top 5 rows



In [38]:
# Q3: Categorize all funny and actions movies?

print("Categorize all funny and actions movies?")
movie_dataset.join(tags_dataset,movie_dataset.movieId ==  tags_dataset.movieId,"inner").filter((tags_dataset.tag == "action") | (tags_dataset.tag == "funny")).dropDuplicates(['movieId']).show()

+-------+--------------------+--------------------+------+-------+------+----------+
|movieId|               title|              genres|userId|movieId|   tag| timestamp|
+-------+--------------------+--------------------+------+-------+------+----------+
| 148626|Big Short, The (2...|               Drama|   567| 148626| funny|1525287708|
| 101142|  Croods, The (2013)|Adventure|Animati...|   119| 101142| funny|1436563067|
| 167746|The Lego Batman M...|Action|Animation|...|   567| 167746| funny|1525285825|
|  79132|    Inception (2010)|Action|Crime|Dram...|   424|  79132|action|1457844927|
|   3793|        X-Men (2000)|Action|Adventure|...|   184|   3793|action|1537094381|
|   2959|   Fight Club (1999)|Action|Crime|Dram...|   599|   2959|action|1498456930|
|  99114|Django Unchained ...|Action|Drama|Western|    62|  99114| funny|1526078778|
| 183611|   Game Night (2018)|Action|Comedy|Cri...|    62| 183611| funny|1526244688|
| 168248|John Wick: Chapte...|Action|Crime|Thri...|    62| 168248

In [48]:
# Q4: Find the top-rated movie according to genres?

print("Find the top-rated movie according to genres?")
title_w = Window.partitionBy("rating")
movie_dataset.join(ratings_dataset,movie_dataset.movieId ==  ratings_dataset.movieId,"inner").withColumn("count", F.count("genres").over(title_w)).dropDuplicates(['movieId']).orderBy(col("rating").desc()).show()

+-------+--------------------+--------------------+------+-------+------+----------+-----+
|movieId|               title|              genres|userId|movieId|rating| timestamp|count|
+-------+--------------------+--------------------+------+-------+------+----------+-----+
|  26587|Decalogue, The (D...| Crime|Drama|Romance|   105|  26587|   5.0|1446573090|13211|
| 128087|Trinity and Sarta...|      Comedy|Western|    89| 128087|   5.0|1520409504|13211|
|     53|     Lamerica (1994)|     Adventure|Drama|    85|     53|   5.0| 889468268|13211|
| 163112|Winnie the Pooh G...|           Animation|   105| 163112|   5.0|1526207523|13211|
|  90943|Into the Abyss (2...|         Documentary|   419|  90943|   5.0|1372972779|13211|
|   4116|Hollywood Shuffle...|              Comedy|   275|   4116|   5.0|1049076933|13211|
|  86668|Louis Theroux: La...|         Documentary|   419|  86668|   5.0|1321854735|13211|
|   4135|Monster Squad, Th...|Adventure|Comedy|...|   562|   4135|   5.0|1368896069|13211|

In [121]:
# Q5: The most romantic movie according to ratings?

print("The most romantic movie according to ratings?")

ratings_data = ratings_dataset.selectExpr("movieId as ratings_movieId", "rating")
tags_data = tags_dataset.selectExpr("movieId as tags_movieId", "tag")
movie_rating_join = movie_dataset.join(ratings_data, ratings_data.ratings_movieId == movie_dataset.movieId, "inner")
tag_rating_join = movie_rating_join.join(tags_data,movie_rating_join.movieId ==  tags_data.tags_movieId,"inner")
tag_rating_join.filter( col("tag") == "romantic").sort(movie_rating_join.rating.desc()).dropDuplicates(['title']).show()

+-------+--------------------+--------------------+---------------+------+------------+--------+
|movieId|               title|              genres|ratings_movieId|rating|tags_movieId|     tag|
+-------+--------------------+--------------------+---------------+------+------------+--------+
|   7361|Eternal Sunshine ...|Drama|Romance|Sci-Fi|           7361|   5.0|        7361|romantic|
|   4144|In the Mood For L...|       Drama|Romance|           4144|   5.0|        4144|romantic|
+-------+--------------------+--------------------+---------------+------+------------+--------+

