# Big Data Storage Exam - Group Task
Group C: Yves, Timo, Dominik, Hanna

## Data Import and Preparation

In [0]:
# imports:
from pyspark.sql.types import *
from pyspark.sql.functions import explode, expr, col, avg, broadcast, year
import json
import ast

In [0]:
# Explicit schemas for the three CSV inputs; every field is declared nullable.
# Conventions used for all three schemas:
#   - nested (JSON-like) columns are read as plain strings and normalized later
#   - all ids are stored as strings

# (column name, Spark type) pairs of the movies dataframe, in schema order
movies_columns = [
    ("budget", IntegerType()),
    ("genres", StringType()),  # nested column -> string, normalized later
    ("homepage", StringType()),
    ("id", StringType()),  # id -> string
    ("keywords", StringType()),
    ("original_language", StringType()),
    ("original_title", StringType()),
    ("overview", StringType()),
    ("popularity", FloatType()),
    ("production_companies", StringType()),
    ("production_countries", StringType()),
    ("release_date", DateType()),
    ("revenue", LongType()),  # values too big for int, therefore Long
    ("runtime", IntegerType()),
    ("spoken_languages", StringType()),
    ("status", StringType()),
    ("tagline", StringType()),
    ("title", StringType()),
]
movies_schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in movies_columns]
)

# (column name, Spark type) pairs of the credits dataframe
credits_columns = [
    ("movie_id", StringType()),
    ("title", StringType()),
    ("cast", StringType()),
    ("crew", StringType()),
]
credits_schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in credits_columns]
)

# (column name, Spark type) pairs of the recommendations dataframe
recommendations_columns = [
    ("movie_id", StringType()),
    ("user_id", StringType()),
    ("vote", IntegerType()),
]
recommendations_schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in recommendations_columns]
)

In [0]:
# Load the three source CSVs into DataFrames using the explicit schemas.
# Please replace the three paths below with the location of your own CSV uploads.
movies_csv_path = "dbfs:/FileStore/shared_uploads/tiheiss@gmail.com/groupC/groupC/movies_groupC.csv"
credits_csv_path = "dbfs:/FileStore/shared_uploads/tiheiss@gmail.com/groupC/groupC/credits_groupC.csv"
recommendations_csv_path = "dbfs:/FileStore/shared_uploads/tiheiss@gmail.com/groupC/groupC/recommendations_groupC.csv"

# movies: malformed rows are dropped because they would cause problems later on.
# PLEASE NOTE THAT THIS WILL LEAD TO SOME MOVIES NOT BEING CONSIDERED!
df_movies_in = (
    spark.read.format("csv")
    .schema(movies_schema)            # apply schema
    .option("header", "true")
    .option("escape", '"')            # additional quotes are ignored
    .option("mode", "DROPMALFORMED")  # drop malformed rows in the csv
    .load(movies_csv_path)
)

# credits: same quote escaping as movies, default parse mode
df_credits_in = (
    spark.read.format("csv")
    .schema(credits_schema)
    .option("header", "true")
    .option("escape", '"')
    .load(credits_csv_path)
)

# recommendations: flat file, only the header option is needed
df_recommendations_in = (
    spark.read.format("csv")
    .schema(recommendations_schema)
    .option("header", "true")
    .load(recommendations_csv_path)
)

In [0]:
# Pre-Processing:
# credits dataframe
#
# Purpose of this cell: turn the two JSON-string columns of the credits frame ("cast", "crew")
# into two flat dataframes, df_cast and df_crew, with one row per (movie, person) entry.
# Row positions used in the lambdas follow credits_schema: x[0] = movie_id, x[2] = cast, x[3] = crew.
# NOTE(review): rows whose cast/crew string is NULL would fail in .replace()/json.loads() - confirm the input has none.

"""
normalize credits dataframe --> leads to one dataframe for cast and one for crew
json.loads() to convert the json-string (e.g. in column "cast") into a list of dictionaries
explode() to normalize the column --> 1.NF
Note: for some reason, every value in the dictionary resulting from json.loads() is stored with the same dtype; the dtype is derived from the first element in the dict, therefore cast_id is 'converted' to string by adding quotes in the json-string (otherwise all values would be stored as integer, which leads to NULL in case of the name)
"""

# --- cast ---
# step 1: parse the cast JSON string of every movie into a list of dicts
df_cast = df_credits_in.rdd.map(lambda x: (x[0],                    # keep movie_id; quotes are added around the cast_id value in the json string so it is parsed as a string
                                           json.loads(x[2].replace('\"cast_id\":','\"cast_id\":\"').replace(', \"character\"','\", \"character\"')) 
                                           )
                               ).toDF(["movie_id", "cast"])

# step 2: explode the list (one row per cast entry) and pull the dict values out into a flat tuple;
# the tuple order must match cast_schema below
rdd = df_cast.select(df_cast.movie_id, explode(df_cast.cast)).rdd.map(lambda x: (x[0], 
                                                                                 x[1]["name"], 
                                                                                 x[1]["id"], 
                                                                                 x[1]["order"], 
                                                                                 x[1]["character"], 
                                                                                 x[1]["credit_id"], 
                                                                                 x[1]["cast_id"], 
                                                                                 x[1]["gender"]
                                                                                )
                                                                     )
cast_schema = StructType([ # schema for new cast dataframe (all columns start out as strings)
    StructField("movie_id", StringType(),True), 
    StructField("name", StringType(),True), 
    StructField("id", StringType(),True), 
    StructField("order", StringType(), True), 
    StructField("character", StringType(),True), 
    StructField("credit_id", StringType(),True), 
    StructField("cast_id", StringType(),True), 
    StructField("gender", StringType(),True)  # cast to IntegerType not possible here, has to be done separately (see below)
  ])

# step 3: build the flat cast dataframe and convert the numeric columns
df_cast = spark.createDataFrame(rdd, cast_schema)
df_cast = df_cast.withColumn("gender", df_cast.gender.cast('int')).withColumn("order", df_cast.order.cast('int')) # convert gender and order to integer manually
df_cast.limit(3).display() # show top 3 rows (just to check)
df_cast.printSchema() # print schema (just to check)


# --- crew ---
# same procedure as for cast, but the crew JSON string is parsed without any quote injection
df_crew = df_credits_in.rdd.map(lambda x: (x[0], json.loads(x[3]))).toDF(["movie_id", "crew"])
# explode to one row per crew entry; the tuple order must match crew_schema below
rdd = df_crew.select(df_crew.movie_id, explode(df_crew.crew)).rdd.map(lambda x: (x[0], 
                                                                                 x[1]["name"], 
                                                                                 x[1]["job"], 
                                                                                 x[1]["department"], 
                                                                                 x[1]["id"], 
                                                                                 x[1]["credit_id"], 
                                                                                 x[1]["gender"]
                                                                                )
                                                                     )
crew_schema = StructType([ # schema for new crew dataframe (all columns start out as strings)
    StructField("movie_id", StringType(),True), 
    StructField("name", StringType(),True), 
    StructField("job", StringType(),True), 
    StructField("department", StringType(), True), 
    StructField("id", StringType(),True), 
    StructField("credit_id", StringType(),True), 
    StructField("gender", StringType(),True)
  ])

df_crew = spark.createDataFrame(rdd, crew_schema)
df_crew = df_crew.withColumn("gender", df_crew.gender.cast('int')) # cast gender to integer
df_crew.limit(3).display() # show top 3 rows (just to check)
df_crew.printSchema() # print schema (just to check)

movie_id,name,id,order,character,credit_id,cast_id,gender
19995,Sam Worthington,65731,0,Jake Sully,5602a8a7c3a3685532001c9a,242,2
19995,Zoe Saldana,8691,1,Neytiri,52fe48009251416c750ac9cb,3,1
19995,Sigourney Weaver,10205,2,Dr. Grace Augustine,52fe48009251416c750aca39,25,1


root
 |-- movie_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- id: string (nullable = true)
 |-- order: integer (nullable = true)
 |-- character: string (nullable = true)
 |-- credit_id: string (nullable = true)
 |-- cast_id: string (nullable = true)
 |-- gender: integer (nullable = true)



movie_id,name,job,department,id,credit_id,gender
19995,Stephen E. Rivkin,Editor,Editing,1721,52fe48009251416c750aca23,0
19995,Rick Carter,Production Design,Art,496,539c47ecc3a36810e3001f87,2
19995,Christopher Boyes,Sound Designer,Sound,900,54491c89c3a3680fb4001cf7,0


root
 |-- movie_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- job: string (nullable = true)
 |-- department: string (nullable = true)
 |-- id: string (nullable = true)
 |-- credit_id: string (nullable = true)
 |-- gender: integer (nullable = true)



In [0]:
# Pre-Processing: recommendations dataframe
# The raw frame is used unchanged; it is considered already clean and normalized.
df_recommendations = df_recommendations_in

# sanity check: first three rows and the schema
recommendations_preview = df_recommendations.limit(3)
recommendations_preview.display()
df_recommendations.printSchema()

movie_id,user_id,vote
76493,70090,4
20764,47695,1
71547,57703,4


root
 |-- movie_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- vote: integer (nullable = true)



In [0]:
# Pre-Processing:
# movies dataframe
"""
extract genres from movie dataframe and normalize the column in a separate dataframe
-> json.loads() does not work here, therefore ast.literal_eval() is used to convert the json-string into a dictionary list (does basically the same as json.loads())
"""


def _movie_id_with_genres(movie_row):
    """Map one movies row to (movie id, list of genre dicts)."""
    # positions follow movies_schema: index 1 = genres, index 3 = id
    genres_text = movie_row[1]
    # add quotes around the numeric genre id -> "convert" id to string
    genres_text = genres_text.replace('\"id\": ','\"id\": \"')
    genres_text = genres_text.replace(', \"name\"','\", \"name\"')
    return (movie_row[3], ast.literal_eval(genres_text))


def _flatten_genre(exploded_row):
    """Map one exploded (movie_id, genre dict) row to (movie_id, genre name, genre id)."""
    genre = exploded_row[1]
    return (exploded_row[0], genre["name"], genre["id"])


# step 1: one row per movie, holding the parsed list of genre dicts
movies_renamed = df_movies_in.withColumnRenamed("id", "movie_id")
df_genres_nested = movies_renamed.rdd.map(_movie_id_with_genres).toDF(["movie_id", "genres"])

# step 2: explode to one row per (movie, genre) and flatten the dict into columns
genres_exploded = df_genres_nested.select(df_genres_nested.movie_id, explode(df_genres_nested.genres))
rdd = genres_exploded.rdd.map(_flatten_genre)

# schema for the new genre dataframe
genres_schema = StructType([
    StructField("movie_id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("id", StringType(), True),
])

df_genres = spark.createDataFrame(rdd, genres_schema)
df_genres.limit(3).display()
df_genres.printSchema()

# note: other nested columns (such as "keywords") could be normalized with the same procedure,
# but they are not used in the tasks below, so this is not done here

# the genres column now lives in its own dataframe and can be dropped from movies
df_movies = df_movies_in.drop("genres")
df_movies.limit(3).display()
df_movies.printSchema()

movie_id,name,id
19995,Action,28
19995,Adventure,12
19995,Fantasy,14


root
 |-- movie_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- id: string (nullable = true)



budget,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title
237029119,http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""space war""}, {""id"": 3388, ""name"": ""space colony""}, {""id"": 3679, ""name"": ""society""}, {""id"": 3801, ""name"": ""space travel""}, {""id"": 9685, ""name"": ""futuristic""}, {""id"": 9840, ""name"": ""romance""}, {""id"": 9882, ""name"": ""space""}, {""id"": 9951, ""name"": ""alien""}, {""id"": 10148, ""name"": ""tribe""}, {""id"": 10158, ""name"": ""alien planet""}, {""id"": 10987, ""name"": ""cgi""}, {""id"": 11399, ""name"": ""marine""}, {""id"": 13065, ""name"": ""soldier""}, {""id"": 14643, ""name"": ""battle""}, {""id"": 14720, ""name"": ""love affair""}, {""id"": 165431, ""name"": ""anti war""}, {""id"": 193554, ""name"": ""power relations""}, {""id"": 206690, ""name"": ""mind and soul""}, {""id"": 209714, ""name"": ""3d""}]",en,Avatar,"In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.",150.43758,"[{""name"": ""Ingenious Film Partners"", ""id"": 289}, {""name"": ""Twentieth Century Fox Film Corporation"", ""id"": 306}, {""name"": ""Dune Entertainment"", ""id"": 444}, {""name"": ""Lightstorm Entertainment"", ""id"": 574}]","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}, {""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""}]",2009-12-10,2787954796,162,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,Enter the World of Pandora.,Avatar
300022446,http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""name"": ""drug abuse""}, {""id"": 911, ""name"": ""exotic island""}, {""id"": 1319, ""name"": ""east india trading company""}, {""id"": 2038, ""name"": ""love of one's life""}, {""id"": 2052, ""name"": ""traitor""}, {""id"": 2580, ""name"": ""shipwreck""}, {""id"": 2660, ""name"": ""strong woman""}, {""id"": 3799, ""name"": ""ship""}, {""id"": 5740, ""name"": ""alliance""}, {""id"": 5941, ""name"": ""calypso""}, {""id"": 6155, ""name"": ""afterlife""}, {""id"": 6211, ""name"": ""fighter""}, {""id"": 12988, ""name"": ""pirate""}, {""id"": 157186, ""name"": ""swashbuckler""}, {""id"": 179430, ""name"": ""aftercreditsstinger""}]",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with Will Turner and Elizabeth Swann. But nothing is quite as it seems.",139.08261,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""name"": ""Jerry Bruckheimer Films"", ""id"": 130}, {""name"": ""Second Mate Productions"", ""id"": 19936}]","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2007-05-19,961027991,169,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End
244999130,http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name"": ""based on novel""}, {""id"": 4289, ""name"": ""secret agent""}, {""id"": 9663, ""name"": ""sequel""}, {""id"": 14555, ""name"": ""mi6""}, {""id"": 156095, ""name"": ""british secret service""}, {""id"": 158431, ""name"": ""united kingdom""}]",en,Spectre,"A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. While M battles political forces to keep the secret service alive, Bond peels back the layers of deceit to reveal the terrible truth behind SPECTRE.",107.376785,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""name"": ""Danjaq"", ""id"": 10761}, {""name"": ""B24"", ""id"": 69434}]","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""}, {""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2015-10-26,880650185,148,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""}, {""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}, {""iso_639_1"": ""it"", ""name"": ""Italiano""}, {""iso_639_1"": ""de"", ""name"": ""Deutsch""}]",Released,A Plan No One Escapes,Spectre


root
 |-- budget: integer (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: float (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: date (nullable = true)
 |-- revenue: long (nullable = true)
 |-- runtime: integer (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)



In [0]:
# Split the genres table into a fact table and a dimension table.
# There are only few distinct genres in the entire dataset, so the genre names are not stored
# repeatedly anymore, which reduces size; the small dimension table can later be broadcasted.
# NOTE: this cell overwrites df_genres, so it must not be re-run without re-running the cell that builds df_genres.

# fact table: connects movies and genres (one row per movie/genre pair)
genre_id_column = col("id").alias("genre_id")
df_movie_genres = df_genres.select("movie_id", genre_id_column)
df_movie_genres.limit(3).display()
df_movie_genres.printSchema()

# dimension table: one row per distinct genre (only 20 rows = 20 genres)
genre_dimension = df_genres.select("id", "name")
df_genres = genre_dimension.distinct()
df_genres.limit(3).display()
df_genres.printSchema()

movie_id,genre_id
19995,28
19995,12
19995,14


root
 |-- movie_id: string (nullable = true)
 |-- genre_id: string (nullable = true)



id,name
28,Action
12,Adventure
14,Fantasy


root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)



**Note:** For perfectly normalized tables, more preparation would have to be done. However, the current state is considered sufficient for the following tasks.

## Data Storage

In [0]:
# Store every prepared dataframe as a parquet file (reasons for parquet: see respective individual tasks).
# Existing output at a target path is overwritten.
parquet_targets = {
    "/tmp/out/movies.parquet": df_movies,
    "/tmp/out/cast.parquet": df_cast,
    "/tmp/out/crew.parquet": df_crew,
    "/tmp/out/recommendations.parquet": df_recommendations,
    "/tmp/out/genres.parquet": df_genres,
    "/tmp/out/movie_genres.parquet": df_movie_genres,
}
for target_path, frame in parquet_targets.items():
    frame.write.mode("overwrite").parquet(target_path)

## Group Tasks

In [0]:
# Re-load all dataframes from the parquet files created above, so the tasks below
# run against the stored data rather than the in-memory preparation results.
parquet_sources = [
    "/tmp/out/movies.parquet",
    "/tmp/out/cast.parquet",
    "/tmp/out/crew.parquet",
    "/tmp/out/recommendations.parquet",
    "/tmp/out/genres.parquet",
    "/tmp/out/movie_genres.parquet",
]
(df_movies,
 df_cast,
 df_crew,
 df_recommendations,
 df_genres,
 df_movie_genres) = [spark.read.parquet(source_path) for source_path in parquet_sources]

### Exercise 1
Which movie genres have the most movies with a runtime over 120 minutes and how many movies? Please list the top three movie genres.

##### Query:

In [0]:
# Exercise 1: top three genres by number of movies with a runtime over 120 minutes.
#
# Broadcasting the genres table is sensible here:
# - genres are used in a join-operation
# - genres is a dimension table and also relatively small
broadcastGenres = broadcast(df_genres)

df_solution_1 = (df_movies.where(df_movies.runtime > 120) # filter movies: runtime over 120 minutes
                          .join(df_movie_genres, df_movie_genres.movie_id == df_movies.id) # attach genre ids to the movies
                          .join(broadcastGenres, df_movie_genres.genre_id == broadcastGenres.id) # join with broadcast variable to get genre names
                          .groupby(broadcastGenres.name) # group by genre
                          .count() # counts the movies for each genre
                          # sort while the "count" column still exists under that name; sorting after the
                          # renaming projection only works through the analyzer's missing-reference resolution
                          .orderBy("count", ascending=False)
                          .select(broadcastGenres.name, col("count").alias("no. of movies over 120min"))
                          .limit(3)) # select top 3

df_solution_1.display()

name,no. of movies over 120min
Drama,667
Action,297
Thriller,245


##### Answer:

In [0]:
# Build the answer text for exercise 1.
# collect() triggers a Spark job on every call, so the rows are fetched once and reused
# instead of collecting twice per loop iteration.
top_genre_rows = df_solution_1.collect()

result = "The top 3 movie genres with the most movies with a runtime over 120 minutes are: \n"
# iterate over the rows actually returned, so fewer than three genres do not raise an IndexError
for rank, genre_row in enumerate(top_genre_rows, start=1):
    result += f"{rank}. {genre_row['name']} with {genre_row['no. of movies over 120min']} movies \n"

print(result)

The top 3 movie genres with the most movies with a runtime over 120 minutes are: 
1. Drama with 667 movies 
2. Action with 297 movies 
3. Thriller with 245 movies 



### Exercise 2
In how many movies did Johnny Depp take part as an actor? In how many of those did he also act as a producer?

##### Queries:

In [0]:
# Number of movies in which Johnny Depp appears in the cast.
# distinct() on movie_id covers the case that he played multiple roles in one movie.
depp_cast_movie_ids = df_cast.where(df_cast.name == "Johnny Depp").select("movie_id").distinct()
solution_2a = depp_cast_movie_ids.count()
solution_2a

Out[12]: 40

In [0]:
"""
As an alternative an accumulator variable could be used here to count, but accumulators are not always reliable (in particular when using them with the map()-function).
Therefore, we will not use accumulator variables for this and for the following tasks.
Nevertheless a small example:
"""
accum = sc.accumulator(0)


def _count_row(_row):
    """Add one to the accumulator for every row passed through map()."""
    return accum.add(1)


# one row per distinct movie in which Johnny Depp is part of the cast
depp_movies_for_accum = df_cast.where(df_cast.name == "Johnny Depp").select("movie_id").distinct()
rdd = depp_movies_for_accum.rdd.map(_count_row)
rdd.collect()  # action that forces the map (and therefore the accumulator updates) to run
accum.value

Out[13]: 40

In [0]:
# Number of movies in which Johnny Depp was both actor and producer.
# The job only has to contain "Producer", so that e.g. "Executive Producer" is counted as well.
depp_as_actor = df_cast.where(df_cast.name == "Johnny Depp")
depp_as_producer = df_crew.where((df_crew.name == "Johnny Depp") & (df_crew.job.contains("Producer")))

# inner join on the movie keeps only movies where both roles exist;
# distinct() covers multiple acting roles / multiple producing roles in one movie
depp_actor_and_producer = depp_as_actor.join(depp_as_producer, df_cast.movie_id == df_crew.movie_id)
solution_2b = depp_actor_and_producer.select(df_cast.movie_id).distinct().count()
solution_2b

Out[14]: 3

##### Answer:

In [0]:
print(f"Johnny Depp has appeared in {solution_2a} films as an actor, in {solution_2b} of which he also was a producer.")

Johnny Depp has appeared in 40 films as an actor, in 3 of which he also was a producer.


### Exercise 3
List the names and the revenue of the ten movies with the most revenue which were released before 2015. What are the ten financially most successful movies when comparing the revenue to the budget?

##### Queries:

In [0]:
# Top 10 movies by revenue among those released before 2015.
# pyspark's year() extracts the year from the release date for the comparison.
movies_before_2015 = df_movies.where(year(df_movies.release_date) < 2015)
title_and_revenue = movies_before_2015.select(df_movies.title, df_movies.revenue)

# sort by revenue in descending order and keep the first ten rows
df_solution_3a = title_and_revenue.orderBy(df_movies.revenue, ascending=False).limit(10)

df_solution_3a.display()

title,revenue
Avatar,2787954796
Titanic,1845003402
The Avengers,1519534370
Frozen,1274203452
Iron Man 3,1215485685
Transformers: Dark of the Moon,1123753017
The Lord of the Rings: The Return of the King,1118908565
Skyfall,1108523531
Transformers: Age of Extinction,1091449308
The Dark Knight Rises,1084946897


In [0]:
# top 10 movies that were financially most successful
# criterion for financial success: revenue to budget ratio --> "how much revenue did a movie generate from one monetary unit budget".
df_solution_3b = (df_movies.where(year(df_movies.release_date) < 2015) # assumption: "before 2015" also applies for this second part of the question
                           # a ratio is only defined for a positive budget: a budget of 0 yields NULL in legacy mode
                           # and a division-by-zero error when ANSI mode is enabled, so such rows are excluded up front
                           .where(df_movies.budget > 0)
                           .select(df_movies.title, df_movies.revenue, df_movies.budget, (df_movies.revenue/df_movies.budget).alias("revenue_to_budget_rate")) # ratio of revenue to budget
                           .orderBy("revenue_to_budget_rate", ascending=False) # highest ratio first
                           .limit(10))

df_solution_3b.display()

title,revenue,budget,revenue_to_budget_rate
Paranormal Activity,193331808,6117,31605.657675331044
Wild Hogs,253619776,13302,19066.28897910089
Garfield: A Tail of Two Kitties,141690931,9431,15023.95620824939
Observe and Report,24035595,1687,14247.537048014226
The Muppet Christmas Carol,27238020,2131,12781.80197090568
Scooby-Doo 2: Monsters Unleashed,181447854,21198,8559.668553637135
The Campaign,104874999,19560,5361.707515337424
Here Comes the Boom,73097152,16520,4424.7670702179175
The Book Thief,76607648,24066,3183.231446854484
The Blair Witch Project,248009994,81459,3044.5990498287483


##### Answers:

In [0]:
# Build the answer text for exercise 3 (part 1).
# collect() triggers a Spark job on every call, so the rows are fetched once and reused
# instead of collecting twice per loop iteration.
top_revenue_rows = df_solution_3a.collect()

result = "The top 10 movies with the most revenue released before 2015 are:\n"
# iterate over the rows actually returned, so fewer than ten movies do not raise an IndexError
for rank, movie_row in enumerate(top_revenue_rows, start=1):
    result += f"{rank}.  {movie_row['title']} with a revenue of {movie_row['revenue']}\n"

print(result)

The top 10 movies with the most revenue released before 2015 are:
1.  Avatar with a revenue of 2787954796
2.  Titanic with a revenue of 1845003402
3.  The Avengers with a revenue of 1519534370
4.  Frozen with a revenue of 1274203452
5.  Iron Man 3 with a revenue of 1215485685
6.  Transformers: Dark of the Moon with a revenue of 1123753017
7.  The Lord of the Rings: The Return of the King with a revenue of 1118908565
8.  Skyfall with a revenue of 1108523531
9.  Transformers: Age of Extinction with a revenue of 1091449308
10.  The Dark Knight Rises with a revenue of 1084946897



In [0]:
# Build the answer text for exercise 3 (part 2).
# collect() triggers a Spark job on every call, so the rows are fetched once and reused
# instead of collecting twice per loop iteration.
top_ratio_rows = df_solution_3b.collect()

result = "The top ten financially most successful movies (before 2015) when comparing the revenue to the budget are: \n"
# iterate over the rows actually returned, so fewer than ten movies do not raise an IndexError
for rank, movie_row in enumerate(top_ratio_rows, start=1):
    result += f"{rank}.  {movie_row['title']}, which generated {round(movie_row['revenue_to_budget_rate'])} revenue from each monetary unit budget.\n"

print(result)

The top ten financially most successful movies (before 2015) when comparing the revenue to the budget are: 
1.  Paranormal Activity, which generated 31606 revenue from each monetary unit budget.
2.  Wild Hogs, which generated 19066 revenue from each monetary unit budget.
3.  Garfield: A Tail of Two Kitties, which generated 15024 revenue from each monetary unit budget.
4.  Observe and Report, which generated 14248 revenue from each monetary unit budget.
5.  The Muppet Christmas Carol, which generated 12782 revenue from each monetary unit budget.
6.  Scooby-Doo 2: Monsters Unleashed, which generated 8560 revenue from each monetary unit budget.
7.  The Campaign, which generated 5362 revenue from each monetary unit budget.
8.  Here Comes the Boom, which generated 4425 revenue from each monetary unit budget.
9.  The Book Thief, which generated 3183 revenue from each monetary unit budget.
10.  The Blair Witch Project, which generated 3045 revenue from each monetary unit budget.



### Exercise 4
What is the movie genre that has a median rating of at least 3 (over all movies with at least ten recommendations) with the lowest average production budget considering all movies?

##### Query:

In [0]:
# broadcast the small genre dimension table for the join in step 3 (same reasoning as in exercise 1)
broadcastGenres = broadcast(df_genres)

# step 1: ids of all movies with at least 10 recommendations, collected into a Python list on the driver
recommendations_per_movie = df_recommendations.groupBy("movie_id").count()
movies_with_enough_recs = recommendations_per_movie.where(col("count") >= 10).select("movie_id")
movies_min_10_rec = [id_row[0] for id_row in movies_with_enough_recs.collect()]

In [0]:
# step 2: genres with a median rating of at least 3 (considering movies with at least 10 recommendations)

# all votes for the movies from step 1, each vote paired with the genres of its movie
votes_with_genre = (
    df_movie_genres
    .filter(df_movie_genres.movie_id.isin(movies_min_10_rec))  # only movies with at least ten recommendations
    .join(df_recommendations, df_recommendations.movie_id == df_movie_genres.movie_id)
)

# (approximate) median of all votes per genre
median_vote_per_genre = (
    votes_with_genre
    .groupBy(df_movie_genres.genre_id)
    .agg(expr('percentile_approx(vote, 0.5)').alias('median_vote'))
)

# keep only the ids of genres whose median vote is at least 3
genres_atleast_3 = median_vote_per_genre.where(col("median_vote") >= 3).select(df_movie_genres.genre_id)

In [0]:
# step 3: genre (out of the above) with the lowest average production budget considering all movies

# Column.isin() expects literal values. Passing a column of genres_atleast_3 does not look up that
# dataframe's rows: the column resolves to the same genre_id attribute, so the condition compares
# genre_id with itself and filters nothing. The qualifying ids (at most one per genre) are therefore
# collected to the driver and passed as a plain list.
qualifying_genre_ids = [genre_row["genre_id"] for genre_row in genres_atleast_3.collect()]

df_solution_4 = (df_movie_genres.join(df_movies, df_movies.id == df_movie_genres.movie_id) # join with movies to get budgets
                                .filter(df_movie_genres.genre_id.isin(qualifying_genre_ids)) # only include genres with median >=3
                                .groupBy(df_movie_genres.genre_id)
                                .agg(avg(col("budget")).alias("avg_budget"))   # calculates the average budget for each genre
                                .orderBy("avg_budget").limit(1) # take the one with the lowest avg_budget
                                .join(broadcastGenres, df_movie_genres.genre_id==broadcastGenres.id) # join to get genre name
                                .select(broadcastGenres.name, col("avg_budget")))

df_solution_4.display()

name,avg_budget
Foreign,501361.5161290322


##### Answer:

In [0]:
# name of the single genre returned by the exercise 4 query
cheapest_qualifying_genre = df_solution_4.collect()[0]['name']
print(f"{cheapest_qualifying_genre} is the movie genre that has a median rating of at least 3 over all movies with at least ten recommendations with the lowest average production budget considering all movies. ")

Foreign is the movie genre that has a median rating of at least 3 over all movies with at least ten recommendations with the lowest average production budget considering all movies. 
