## Data Import and Preparation

In [0]:
from pyspark.sql.types import *

def _nullable_struct(*fields):
    """Build a StructType whose fields are all nullable from (name, data_type) pairs."""
    return StructType([StructField(name, data_type, True) for name, data_type in fields])


# Schema of the raw movies CSV. Columns such as genres or keywords are read as plain
# strings; they are parsed in later pre-processing cells where needed.
movies_schema = _nullable_struct(
    ("budget", IntegerType()),
    ("genres", StringType()),
    ("homepage", StringType()),
    ("id", StringType()),
    ("keywords", StringType()),
    ("original_language", StringType()),
    ("original_title", StringType()),
    ("overview", StringType()),
    ("popularity", FloatType()),
    ("production_companies", StringType()),
    ("production_countries", StringType()),
    ("release_date", DateType()),
    ("revenue", LongType()),  # too big for a 32-bit integer
    ("runtime", IntegerType()),
    ("spoken_languages", StringType()),
    ("status", StringType()),
    ("tagline", StringType()),
    ("title", StringType()),
)

# Schema of the raw credits CSV; cast and crew are read as strings.
credits_schema = _nullable_struct(
    ("movie_id", StringType()),
    ("title", StringType()),
    ("cast", StringType()),
    ("crew", StringType()),
)

# Schema of the raw recommendations CSV: one vote of one user for one movie per row.
recommendations_schema = _nullable_struct(
    ("movie_id", StringType()),
    ("user_id", StringType()),
    ("vote", IntegerType()),
)

In [0]:
# All three raw CSV files live in the same upload folder.
_upload_dir = "dbfs:/FileStore/shared_uploads/tiheiss@gmail.com/groupC/groupC"

# Movies: header row, '"' as escape character, rows that do not fit the schema are dropped.
df_movies_in = (
    spark.read.format("csv")
    .schema(movies_schema)
    .options(header="true", escape='"', mode="DROPMALFORMED")
    .load(f"{_upload_dir}/movies_groupC.csv")
)

# Credits: header row and '"' as escape character (default parse mode).
df_credits_in = (
    spark.read.format("csv")
    .schema(credits_schema)
    .options(header="true", escape='"')
    .load(f"{_upload_dir}/credits_groupC.csv")
)

# Recommendations: plain CSV with a header row.
df_recommendations_in = (
    spark.read.format("csv")
    .schema(recommendations_schema)
    .options(header="true")
    .load(f"{_upload_dir}/recommendations_groupC.csv")
)

In [0]:
# Pre-Processing:
# credits dataframe
import json
from pyspark.sql.functions import explode

from pyspark.sql.functions import col, explode, from_json


def _flatten_credits(df_credits, json_column, flat_schema):
    """Turn a JSON-array column of the credits table into one row per array entry.

    Parameters
    ----------
    df_credits : DataFrame
        Must contain a ``movie_id`` column and the string column ``json_column``
        holding a JSON array of objects.
    json_column : str
        Name of the column to parse ("cast" or "crew").
    flat_schema : StructType
        Layout of the flattened result. Its first field must be ``movie_id``; every
        remaining field names a key of the JSON objects.

    Returns
    -------
    DataFrame
        One row per array entry with the columns of ``flat_schema``. Rows whose JSON
        column is null or an empty array produce no output rows (``explode`` skips them).
    """
    entry_fields = flat_schema.fields[1:]
    # Parsing inside Spark replaces the former string patching of the JSON text plus
    # json.loads in a Python RDD. All fields are declared as strings in flat_schema,
    # so numeric JSON values are expected to arrive as text and are cast afterwards.
    entries = from_json(col(json_column), ArrayType(StructType(entry_fields)))
    return (
        df_credits
        .select(col("movie_id"), explode(entries).alias("entry"))
        .select(
            col("movie_id"),
            *[col("entry").getField(field.name).alias(field.name) for field in entry_fields]
        )
    )


# --- cast: one row per (movie, cast entry) ---
cast_schema = StructType([
    StructField("movie_id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("id", StringType(), True),
    StructField("order", StringType(), True),
    StructField("character", StringType(), True),
    StructField("credit_id", StringType(), True),
    StructField("cast_id", StringType(), True),
    StructField("gender", StringType(), True)
  ])
df_cast = (
    _flatten_credits(df_credits_in, "cast", cast_schema)
    # gender and order are numeric codes; they are parsed as strings and cast here
    .withColumn("gender", col("gender").cast("int"))
    .withColumn("order", col("order").cast("int"))
)
df_cast.display()
df_cast.printSchema()

# --- crew: one row per (movie, crew entry) ---
crew_schema = StructType([
    StructField("movie_id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("job", StringType(), True),
    StructField("department", StringType(), True),
    StructField("id", StringType(), True),
    StructField("credit_id", StringType(), True),
    StructField("gender", StringType(), True)  # parsed as string, cast to int below
  ])
df_crew = (
    _flatten_credits(df_credits_in, "crew", crew_schema)
    .withColumn("gender", col("gender").cast("int"))
)
df_crew.display()
df_crew.printSchema()

movie_id,name,id,order,character,credit_id,cast_id,gender
19995,Sam Worthington,65731,0,Jake Sully,5602a8a7c3a3685532001c9a,242,2
19995,Zoe Saldana,8691,1,Neytiri,52fe48009251416c750ac9cb,3,1
19995,Sigourney Weaver,10205,2,Dr. Grace Augustine,52fe48009251416c750aca39,25,1
19995,Stephen Lang,32747,3,Col. Quaritch,52fe48009251416c750ac9cf,4,2
19995,Michelle Rodriguez,17647,4,Trudy Chacon,52fe48009251416c750ac9d3,5,1
19995,Giovanni Ribisi,1771,5,Selfridge,52fe48009251416c750ac9e1,8,2
19995,Joel David Moore,59231,6,Norm Spellman,52fe48009251416c750ac9dd,7,2
19995,CCH Pounder,30485,7,Moat,52fe48009251416c750ac9e5,9,1
19995,Wes Studi,15853,8,Eytukan,52fe48009251416c750ac9ed,11,2
19995,Laz Alonso,10964,9,Tsu'Tey,52fe48009251416c750ac9e9,10,2


root
 |-- movie_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- id: string (nullable = true)
 |-- order: integer (nullable = true)
 |-- character: string (nullable = true)
 |-- credit_id: string (nullable = true)
 |-- cast_id: string (nullable = true)
 |-- gender: integer (nullable = true)



movie_id,name,job,department,id,credit_id,gender
19995,Stephen E. Rivkin,Editor,Editing,1721,52fe48009251416c750aca23,0
19995,Rick Carter,Production Design,Art,496,539c47ecc3a36810e3001f87,2
19995,Christopher Boyes,Sound Designer,Sound,900,54491c89c3a3680fb4001cf7,0
19995,Christopher Boyes,Supervising Sound Editor,Sound,900,54491cb70e0a267480001bd0,0
19995,Mali Finn,Casting,Production,1262,539c4a4cc3a36810c9002101,1
19995,James Horner,Original Music Composer,Sound,1729,5544ee3b925141499f0008fc,2
19995,James Cameron,Director,Directing,2710,52fe48009251416c750ac9c3,2
19995,James Cameron,Writer,Writing,2710,52fe48009251416c750ac9d9,2
19995,James Cameron,Editor,Editing,2710,52fe48009251416c750aca17,2
19995,James Cameron,Producer,Production,2710,52fe48009251416c750aca29,2


root
 |-- movie_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- job: string (nullable = true)
 |-- department: string (nullable = true)
 |-- id: string (nullable = true)
 |-- credit_id: string (nullable = true)
 |-- gender: integer (nullable = true)



In [0]:
# Pre-processing: recommendations dataframe.
# The loaded frame already has the final layout (movie_id, user_id, vote), so it is
# used unchanged under its final name.
df_recommendations = df_recommendations_in  # no cleaning applied
# Show a sample of the rows and the resulting schema.
df_recommendations.display()
df_recommendations.printSchema()

movie_id,user_id,vote
76493,70090,4
20764,47695,1
71547,57703,4
9009,6379,5
15037,41954,1
279,36653,3
47327,8564,1
294272,30078,5
139998,31809,5
1430,31892,4


root
 |-- movie_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- vote: integer (nullable = true)



In [0]:
# Pre-Processing:
# movies dataframe

import ast
from pyspark.sql.types import DateType

import json


def _genre_entries(genres_json):
    """Parse the JSON genres array of one movie into a list of string-valued dicts.

    Every value is converted to ``str`` (``None`` stays ``None``) because Spark infers
    a single value type for the resulting map column, so all values must share a type.
    """
    return [
        {key: None if value is None else str(value) for key, value in genre.items()}
        for genre in json.loads(genres_json)
    ]


# Work on the full-row RDD of df_movies_in and pick the fields by name.
# NOTE(review): df_movies_in is read with DROPMALFORMED; selecting only a subset of
# columns first may change which malformed rows Spark drops, so no column pruning here.
df_genres = (
    df_movies_in.rdd
    .map(lambda row: (row["id"], _genre_entries(row["genres"])))
    .toDF(["movie_id", "genres"])
)

# One output row per (movie, genre) pair.
rdd = df_genres.select(df_genres.movie_id, explode(df_genres.genres)).rdd.map(lambda x: (x[0], x[1]["name"], x[1]["id"]))
genres_schema = StructType([
    StructField("movie_id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("id", StringType(), True)
  ])
df_genres = spark.createDataFrame(rdd, genres_schema)
df_genres.display()
df_genres.printSchema()

# The genres now live in their own table, so the raw column is dropped from movies.
df_movies = df_movies_in.drop("genres")
df_movies.display()
df_movies.printSchema()

movie_id,name,id
19995,Action,28
19995,Adventure,12
19995,Fantasy,14
19995,Science Fiction,878
285,Adventure,12
285,Fantasy,14
285,Action,28
206647,Action,28
206647,Adventure,12
206647,Crime,80


root
 |-- movie_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- id: string (nullable = true)



budget,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title
237029119,http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""space war""}, {""id"": 3388, ""name"": ""space colony""}, {""id"": 3679, ""name"": ""society""}, {""id"": 3801, ""name"": ""space travel""}, {""id"": 9685, ""name"": ""futuristic""}, {""id"": 9840, ""name"": ""romance""}, {""id"": 9882, ""name"": ""space""}, {""id"": 9951, ""name"": ""alien""}, {""id"": 10148, ""name"": ""tribe""}, {""id"": 10158, ""name"": ""alien planet""}, {""id"": 10987, ""name"": ""cgi""}, {""id"": 11399, ""name"": ""marine""}, {""id"": 13065, ""name"": ""soldier""}, {""id"": 14643, ""name"": ""battle""}, {""id"": 14720, ""name"": ""love affair""}, {""id"": 165431, ""name"": ""anti war""}, {""id"": 193554, ""name"": ""power relations""}, {""id"": 206690, ""name"": ""mind and soul""}, {""id"": 209714, ""name"": ""3d""}]",en,Avatar,"In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.",150.43758,"[{""name"": ""Ingenious Film Partners"", ""id"": 289}, {""name"": ""Twentieth Century Fox Film Corporation"", ""id"": 306}, {""name"": ""Dune Entertainment"", ""id"": 444}, {""name"": ""Lightstorm Entertainment"", ""id"": 574}]","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}, {""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""}]",2009-12-10,2787954796,162,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,Enter the World of Pandora.,Avatar
300022446,http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""name"": ""drug abuse""}, {""id"": 911, ""name"": ""exotic island""}, {""id"": 1319, ""name"": ""east india trading company""}, {""id"": 2038, ""name"": ""love of one's life""}, {""id"": 2052, ""name"": ""traitor""}, {""id"": 2580, ""name"": ""shipwreck""}, {""id"": 2660, ""name"": ""strong woman""}, {""id"": 3799, ""name"": ""ship""}, {""id"": 5740, ""name"": ""alliance""}, {""id"": 5941, ""name"": ""calypso""}, {""id"": 6155, ""name"": ""afterlife""}, {""id"": 6211, ""name"": ""fighter""}, {""id"": 12988, ""name"": ""pirate""}, {""id"": 157186, ""name"": ""swashbuckler""}, {""id"": 179430, ""name"": ""aftercreditsstinger""}]",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with Will Turner and Elizabeth Swann. But nothing is quite as it seems.",139.08261,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""name"": ""Jerry Bruckheimer Films"", ""id"": 130}, {""name"": ""Second Mate Productions"", ""id"": 19936}]","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2007-05-19,961027991,169,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End
244999130,http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name"": ""based on novel""}, {""id"": 4289, ""name"": ""secret agent""}, {""id"": 9663, ""name"": ""sequel""}, {""id"": 14555, ""name"": ""mi6""}, {""id"": 156095, ""name"": ""british secret service""}, {""id"": 158431, ""name"": ""united kingdom""}]",en,Spectre,"A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. While M battles political forces to keep the secret service alive, Bond peels back the layers of deceit to reveal the terrible truth behind SPECTRE.",107.376785,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""name"": ""Danjaq"", ""id"": 10761}, {""name"": ""B24"", ""id"": 69434}]","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""}, {""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2015-10-26,880650185,148,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""}, {""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}, {""iso_639_1"": ""it"", ""name"": ""Italiano""}, {""iso_639_1"": ""de"", ""name"": ""Deutsch""}]",Released,A Plan No One Escapes,Spectre
250027592,http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853, ""name"": ""crime fighter""}, {""id"": 949, ""name"": ""terrorist""}, {""id"": 1308, ""name"": ""secret identity""}, {""id"": 1437, ""name"": ""burglar""}, {""id"": 3051, ""name"": ""hostage drama""}, {""id"": 3562, ""name"": ""time bomb""}, {""id"": 6969, ""name"": ""gotham city""}, {""id"": 7002, ""name"": ""vigilante""}, {""id"": 9665, ""name"": ""cover-up""}, {""id"": 9715, ""name"": ""superhero""}, {""id"": 9990, ""name"": ""villainess""}, {""id"": 10044, ""name"": ""tragic hero""}, {""id"": 13015, ""name"": ""terrorism""}, {""id"": 14796, ""name"": ""destruction""}, {""id"": 18933, ""name"": ""catwoman""}, {""id"": 156082, ""name"": ""cat burglar""}, {""id"": 156395, ""name"": ""imax""}, {""id"": 173272, ""name"": ""flood""}, {""id"": 179093, ""name"": ""criminal underworld""}, {""id"": 230775, ""name"": ""batman""}]",en,The Dark Knight Rises,"Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's crimes to protect the late attorney's reputation and is subsequently hunted by the Gotham City Police Department. Eight years later, Batman encounters the mysterious Selina Kyle and the villainous Bane, a new terrorist leader who overwhelms Gotham's finest. The Dark Knight resurfaces to protect a city that has branded him an enemy.",112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""name"": ""Warner Bros."", ""id"": 6194}, {""name"": ""DC Entertainment"", ""id"": 9993}, {""name"": ""Syncopy"", ""id"": 9996}]","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2012-07-16,1084946897,165,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises
259973060,http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"": 839, ""name"": ""mars""}, {""id"": 1456, ""name"": ""medallion""}, {""id"": 3801, ""name"": ""space travel""}, {""id"": 7376, ""name"": ""princess""}, {""id"": 9951, ""name"": ""alien""}, {""id"": 10028, ""name"": ""steampunk""}, {""id"": 10539, ""name"": ""martian""}, {""id"": 10685, ""name"": ""escape""}, {""id"": 161511, ""name"": ""edgar rice burroughs""}, {""id"": 163252, ""name"": ""alien race""}, {""id"": 179102, ""name"": ""superhuman strength""}, {""id"": 190320, ""name"": ""mars civilization""}, {""id"": 195446, ""name"": ""sword and planet""}, {""id"": 207928, ""name"": ""19th century""}, {""id"": 209714, ""name"": ""3d""}]",en,John Carter,"John Carter is a war-weary, former military captain who's inexplicably transported to the mysterious and exotic planet of Barsoom (Mars) and reluctantly becomes embroiled in an epic conflict. It's a world on the brink of collapse, and Carter rediscovers his humanity when he realizes the survival of Barsoom and its people rests in his hands.",43.926994,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2012-03-07,284181237,132,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter
258041354,http://www.sonypictures.com/movies/spider-man3/,559,"[{""id"": 851, ""name"": ""dual identity""}, {""id"": 1453, ""name"": ""amnesia""}, {""id"": 1965, ""name"": ""sandstorm""}, {""id"": 2038, ""name"": ""love of one's life""}, {""id"": 3446, ""name"": ""forgiveness""}, {""id"": 3986, ""name"": ""spider""}, {""id"": 4391, ""name"": ""wretch""}, {""id"": 4959, ""name"": ""death of a friend""}, {""id"": 5776, ""name"": ""egomania""}, {""id"": 5789, ""name"": ""sand""}, {""id"": 5857, ""name"": ""narcism""}, {""id"": 6062, ""name"": ""hostility""}, {""id"": 8828, ""name"": ""marvel comic""}, {""id"": 9663, ""name"": ""sequel""}, {""id"": 9715, ""name"": ""superhero""}, {""id"": 9748, ""name"": ""revenge""}]",en,Spider-Man 3,"The seemingly invincible Spider-Man goes up against an all-new crop of villain – including the shape-shifting Sandman. While Spider-Man’s superpowers are altered by an alien organism, his alter ego, Peter Parker, deals with nemesis Eddie Brock and also gets caught up in a love triangle.",115.699814,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""name"": ""Laura Ziskin Productions"", ""id"": 326}, {""name"": ""Marvel Enterprises"", ""id"": 19551}]","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2007-05-01,890843369,139,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""}]",Released,The battle within.,Spider-Man 3
259987039,http://disney.go.com/disneypictures/tangled/,38757,"[{""id"": 1562, ""name"": ""hostage""}, {""id"": 2343, ""name"": ""magic""}, {""id"": 2673, ""name"": ""horse""}, {""id"": 3205, ""name"": ""fairy tale""}, {""id"": 4344, ""name"": ""musical""}, {""id"": 7376, ""name"": ""princess""}, {""id"": 10336, ""name"": ""animation""}, {""id"": 33787, ""name"": ""tower""}, {""id"": 155658, ""name"": ""blonde woman""}, {""id"": 162219, ""name"": ""selfishness""}, {""id"": 163545, ""name"": ""healing power""}, {""id"": 179411, ""name"": ""based on fairy tale""}, {""id"": 179431, ""name"": ""duringcreditsstinger""}, {""id"": 215258, ""name"": ""healing gift""}, {""id"": 234183, ""name"": ""animal sidekick""}]",en,Tangled,"When the kingdom's most wanted-and most charming-bandit Flynn Rider hides out in a mysterious tower, he's taken hostage by Rapunzel, a beautiful and feisty tower-bound teen with 70 feet of magical, golden hair. Flynn's curious captor, who's looking for her ticket out of the tower where she's been locked away for years, strikes a deal with the handsome thief and the unlikely duo sets off on an action-packed escapade, complete with a super-cop horse, an over-protective chameleon and a gruff gang of pub thugs.",48.68197,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""name"": ""Walt Disney Animation Studios"", ""id"": 6125}]","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2010-11-24,591783701,100,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,They're taking adventure to new lengths.,Tangled
279979950,http://marvel.com/movies/movie/193/avengers_age_of_ultron,99861,"[{""id"": 8828, ""name"": ""marvel comic""}, {""id"": 9663, ""name"": ""sequel""}, {""id"": 9715, ""name"": ""superhero""}, {""id"": 9717, ""name"": ""based on comic book""}, {""id"": 10629, ""name"": ""vision""}, {""id"": 155030, ""name"": ""superhero team""}, {""id"": 179431, ""name"": ""duringcreditsstinger""}, {""id"": 180547, ""name"": ""marvel cinematic universe""}, {""id"": 209714, ""name"": ""3d""}]",en,Avengers: Age of Ultron,"When Tony Stark tries to jumpstart a dormant peacekeeping program, things go awry and Earth’s Mightiest Heroes are put to the ultimate test as the fate of the planet hangs in the balance. As the villainous Ultron emerges, it is up to The Avengers to stop him from enacting his terrible plans, and soon uneasy alliances and unexpected action pave the way for an epic and unique global adventure.",134.27924,"[{""name"": ""Marvel Studios"", ""id"": 420}, {""name"": ""Prime Focus"", ""id"": 15357}, {""name"": ""Revolution Sun Studios"", ""id"": 76043}]","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2015-04-22,1405412748,141,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,A New Age Has Come.,Avengers: Age of Ultron
250009336,http://harrypotter.warnerbros.com/harrypotterandthehalf-bloodprince/dvd/index.html,767,"[{""id"": 616, ""name"": ""witch""}, {""id"": 2343, ""name"": ""magic""}, {""id"": 3872, ""name"": ""broom""}, {""id"": 3884, ""name"": ""school of witchcraft""}, {""id"": 6333, ""name"": ""wizardry""}, {""id"": 10164, ""name"": ""apparition""}, {""id"": 10791, ""name"": ""teenage crush""}, {""id"": 12564, ""name"": ""werewolf""}]",en,Harry Potter and the Half-Blood Prince,"As Harry begins his sixth year at Hogwarts, he discovers an old book marked as 'Property of the Half-Blood Prince', and begins to learn more about Lord Voldemort's dark past.",98.885635,"[{""name"": ""Warner Bros."", ""id"": 6194}, {""name"": ""Heyday Films"", ""id"": 7364}]","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""}, {""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2009-07-07,933987309,153,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Dark Secrets Revealed,Harry Potter and the Half-Blood Prince
250049705,http://www.batmanvsupermandawnofjustice.com/,209112,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 7002, ""name"": ""vigilante""}, {""id"": 9715, ""name"": ""superhero""}, {""id"": 9717, ""name"": ""based on comic book""}, {""id"": 9748, ""name"": ""revenge""}, {""id"": 163455, ""name"": ""super powers""}, {""id"": 195242, ""name"": ""clark kent""}, {""id"": 195243, ""name"": ""bruce wayne""}, {""id"": 229266, ""name"": ""dc extended universe""}]",en,Batman v Superman: Dawn of Justice,"Fearing the actions of a god-like Super Hero left unchecked, Gotham City’s own formidable, forceful vigilante takes on Metropolis’s most revered, modern-day savior, while the world wrestles with what sort of hero it really needs. And with Batman and Superman at war with one another, a new threat quickly arises, putting mankind in greater danger than it’s ever known before.",155.79045,"[{""name"": ""DC Comics"", ""id"": 429}, {""name"": ""Atlas Entertainment"", ""id"": 507}, {""name"": ""Warner Bros."", ""id"": 6194}, {""name"": ""DC Entertainment"", ""id"": 9993}, {""name"": ""Cruel & Unusual Films"", ""id"": 9995}, {""name"": ""RatPac-Dune Entertainment"", ""id"": 41624}]","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2016-03-23,873300134,151,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Justice or revenge,Batman v Superman: Dawn of Justice


root
 |-- budget: integer (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: float (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: date (nullable = true)
 |-- revenue: long (nullable = true)
 |-- runtime: integer (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)



In [0]:
from pyspark.sql.functions import col
# Bridge table: one (movie_id, genre_id) pair per genre entry of a movie.
df_movie_genres = df_genres.select("movie_id", df_genres["id"].alias("genre_id"))
df_movie_genres.display()
df_movie_genres.printSchema()

# Lookup table: every (id, name) genre combination listed once.
# Note that this rebinds df_genres, which held one row per movie/genre pair before.
df_genres = df_genres.select("id", "name").distinct()
df_genres.display()
df_genres.printSchema()

movie_id,genre_id
19995,28
19995,12
19995,14
19995,878
285,12
285,14
285,28
206647,28
206647,12
206647,80


root
 |-- movie_id: string (nullable = true)
 |-- genre_id: string (nullable = true)



id,name
80,Crime
37,Western
35,Comedy
28,Action
10769,Foreign
9648,Mystery
27,Horror
12,Adventure
10749,Romance
10751,Family


root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)



## Data Storage

In [0]:
# Persist every prepared table as Parquet under /tmp/out, replacing any earlier output.
_tables_to_store = {
    "movies": df_movies,
    "cast": df_cast,
    "crew": df_crew,
    "recommendations": df_recommendations,
    "genres": df_genres,
    "movie_genres": df_movie_genres,
}
for _table_name, _table in _tables_to_store.items():
    _table.write.mode("overwrite").parquet(f"/tmp/out/{_table_name}.parquet")

## Group Tasks

In [0]:
def _load_table(table_name):
    """Read the Parquet table ``/tmp/out/<table_name>.parquet`` into a DataFrame."""
    return spark.read.parquet(f"/tmp/out/{table_name}.parquet")


# Reload every table from storage so the exercises work on the persisted data.
df_movies = _load_table("movies")
df_cast = _load_table("cast")
df_crew = _load_table("crew")
df_recommendations = _load_table("recommendations")
df_genres = _load_table("genres")
df_movie_genres = _load_table("movie_genres")

### Exercise 1
Which movie genres have the most movies with a runtime of over 120 minutes, and how many such movies does each of them have? Please list the top three movie genres.

In [0]:
from pyspark.sql.functions import broadcast, col
# The genre lookup table is small, so it is marked for a broadcast join.
broadcastGenres = broadcast(df_genres)

# Number of movies longer than 120 minutes per genre name.
long_movie_counts = (
    df_movies.where(df_movies.runtime > 120)
    .join(df_movie_genres, df_movie_genres.movie_id == df_movies.id)
    .join(broadcastGenres, df_movie_genres.genre_id == broadcastGenres.id)
    .groupBy(broadcastGenres.name)
    .count()
)

# Sort and cut while the column is still called "count"; rename only for display.
# (Previously the sort ran after the rename and referred to a column that was no
# longer part of the projection.)
(
    long_movie_counts
    .orderBy(col("count").desc())
    .limit(3)
    .withColumnRenamed("count", "no. of movies over 120min")
    .display()
)

name,no. of movies over 120min
Drama,667
Action,297
Thriller,245


### Exercise 2
In how many movies did Johnny Depp take part as an actor? In how many of those did he also act as a producer?

In [0]:
# Number of movies with Johnny Depp in the cast.
# df_cast has one row per cast entry, so the movie ids are de-duplicated first;
# otherwise a movie with several cast entries for him would be counted repeatedly.
solution_2a = (
    df_cast.where(df_cast.name == "Johnny Depp")
    .select("movie_id")
    .distinct()
    .count()
)
solution_2a

Out[26]: 40

In [0]:
# Alternative count of Johnny Depp's movies with an accumulator.
# Accumulators are not necessarily reliable: updates made inside a transformation
# (such as map) may be applied more than once when a task is re-executed. The update
# therefore runs inside the foreach action instead.
accum = sc.accumulator(0)
rdd = df_cast.where(df_cast.name == "Johnny Depp").select("movie_id").distinct().rdd
rdd.foreach(lambda _row: accum.add(1))  # one increment per distinct movie
accum.value

Out[27]: 40

In [0]:
# Movies in which Johnny Depp is both in the cast and credited as "Producer".
# Both sides are reduced to distinct movie ids before joining, so the result counts
# movies rather than (cast entry, crew entry) combinations.
depp_cast_movies = (
    df_cast.where(df_cast.name == "Johnny Depp")
    .select("movie_id")
    .distinct()
)
depp_producer_movies = (
    df_crew.where((df_crew.name == "Johnny Depp") & (df_crew.job == "Producer"))
    .select("movie_id")
    .distinct()
)
solution_2b = depp_cast_movies.join(depp_producer_movies, on="movie_id", how="inner").count()
solution_2b

Out[28]: 2

In [0]:
# Report both parts of exercise 2 in one sentence.
exercise_2_summary = f"Johnny Depp has appeared in {solution_2a} films as an actor, in {solution_2b} of which he was also a producer."
print(exercise_2_summary)

Johnny Depp has appeared in 40 films as an actor, in 2 of which he was also a producer.


### Exercise 3
List the names and the revenue of the ten movies with the highest revenue that were released before 2015. What are the ten financially most successful movies when comparing the revenue to the budget?

In [0]:
from pyspark.sql.functions import year
from datetime import date
# Ten highest-revenue movies with a release year before 2015.
released_before_2015 = df_movies.where(year(df_movies.release_date) < 2015)
(
    released_before_2015
    .select(df_movies.title, df_movies.revenue)
    .orderBy(df_movies.revenue.desc())
    .limit(10)
    .display()
)

title,revenue
Avatar,2787954796
Titanic,1845003402
The Avengers,1519534370
Frozen,1274203452
Iron Man 3,1215485685
Transformers: Dark of the Moon,1123753017
The Lord of the Rings: The Return of the King,1118908565
Skyfall,1108523531
Transformers: Age of Extinction,1091449308
The Dark Knight Rises,1084946897


In [0]:
# Ten movies released before 2015 with the highest revenue-to-budget ratio.
# Rows without a positive budget are excluded up front: the ratio is undefined for
# them, and a zero budget would raise a division-by-zero error under ANSI SQL mode.
rated_movies = df_movies.where(
    (year(df_movies.release_date) < 2015) & (df_movies.budget > 0)
)
(
    rated_movies
    .select(
        df_movies.title,
        df_movies.revenue,
        df_movies.budget,
        (df_movies.revenue / df_movies.budget).alias("revenue_to_budget_rate"),
    )
    .orderBy("revenue_to_budget_rate", ascending=False)
    .limit(10)
    .display()
)

title,revenue,budget,revenue_to_budget_rate
Paranormal Activity,193331808,6117,31605.657675331044
Wild Hogs,253619776,13302,19066.28897910089
Garfield: A Tail of Two Kitties,141690931,9431,15023.95620824939
Observe and Report,24035595,1687,14247.537048014226
The Muppet Christmas Carol,27238020,2131,12781.80197090568
Scooby-Doo 2: Monsters Unleashed,181447854,21198,8559.668553637135
The Campaign,104874999,19560,5361.707515337424
Here Comes the Boom,73097152,16520,4424.7670702179175
The Book Thief,76607648,24066,3183.231446854484
The Blair Witch Project,248009994,81459,3044.5990498287483


### Exercise 4
Among the movie genres with a median rating of at least 3 (computed over all movies with at least ten recommendations), which one has the lowest average production budget considering all movies?

In [0]:
from pyspark.sql.functions import expr, col, avg, broadcast
# Small genre lookup table, marked for broadcast joins.
broadcastGenres = broadcast(df_genres)

# Ids of all movies that received at least ten recommendations, collected to the driver.
recommendation_counts = df_recommendations.groupBy("movie_id").count()
movies = [
    row["movie_id"]
    for row in recommendation_counts.where(col("count") >= 10).select("movie_id").collect()
]

In [0]:
# Votes of the sufficiently-recommended movies, attached to each of their genres.
votes_per_genre = (
    df_movie_genres
    .filter(df_movie_genres.movie_id.isin(movies))
    .join(df_recommendations, df_recommendations.movie_id == df_movie_genres.movie_id)
)

# Approximate median vote per genre.
median_vote_per_genre = (
    votes_per_genre
    .groupBy(df_movie_genres.genre_id)
    .agg(expr("percentile_approx(vote, 0.5)").alias("median_vote"))
)

# Ids of the genres whose median vote is at least 3.
genres_atleast_3 = (
    median_vote_per_genre
    .where(col("median_vote") >= 3)
    .select(df_movie_genres.genre_id)
)

In [0]:
# Genre with the lowest average budget among the genres listed in genres_atleast_3.
#
# genres_atleast_3 is derived from df_movie_genres, so a Column taken from it refers to
# the very same genre_id attribute. The former `genre_id.isin(genres_atleast_3.genre_id)`
# therefore compared the column with itself and filtered nothing. A left-semi join on
# the column *name* keeps only genres that really occur in genres_atleast_3.
# After that join, columns are addressed by name to avoid ambiguous references.
(
    df_movie_genres
    .join(df_movies, df_movies.id == df_movie_genres.movie_id)
    .join(genres_atleast_3, on="genre_id", how="left_semi")
    .groupBy("genre_id")
    .agg(avg(col("budget")).alias("avg_budget"))
    .orderBy("avg_budget")
    .limit(1)
    .join(broadcastGenres, col("genre_id") == broadcastGenres.id)
    .select(broadcastGenres.name, col("avg_budget"))
    .display()
)

name,avg_budget
Foreign,501361.5161290322
