In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, count, desc, from_json , max
from pyspark.sql.types import ArrayType, StringType, StructType, StructField

spark = SparkSession.builder \
    .appName("TMDB Data Aggregation") \
    .getOrCreate()

data = spark.read.option('header','true').option("escape", "\"").csv('tmdb_5000_movies.csv')

data.show(5)

+---------+--------------------+--------------------+------+--------------------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+------------+----------+-------+--------------------+--------+--------------------+--------------------+------------+----------+
|   budget|              genres|            homepage|    id|            keywords|original_language|      original_title|            overview|popularity|production_companies|production_countries|release_date|   revenue|runtime|    spoken_languages|  status|             tagline|               title|vote_average|vote_count|
+---------+--------------------+--------------------+------+--------------------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+------------+----------+-------+--------------------+--------+--------------------+--------------------+------------+----------+
|        0|[{"id": 28, "name...

In [None]:
import subprocess
# Local file path
local_file_path = "tmdb_5000_movies.csv"

# HDFS directory where you want to store the file
hdfs_directory = "/path/to/hdfs/directory/"

# Use subprocess to execute the Hadoop command to copy the file to HDFS
subprocess.run(["hadoop", "fs", "-put", "tmdb_5000_movies.csv", hdfs_directory])


In [2]:
popular_films = data.groupBy("original_language") \
    .agg(max("popularity").cast('decimal(10,2)').alias("max_popularity")) \
        .orderBy(desc('max_popularity'))
    
popular_films.toPandas().to_csv("popular_film_per_lan.csv", index=False)
popular_films.show(10)


+-----------------+--------------+
|original_language|max_popularity|
+-----------------+--------------+
|               en|         99.69|
|               es|         90.81|
|               it|         88.38|
|               el|         28.86|
|               te|         16.25|
|               he|         14.08|
|               pl|         13.27|
|               nb|         12.98|
|               no|         11.21|
|               zh|          9.95|
+-----------------+--------------+
only showing top 10 rows



In [3]:
genre_schema = ArrayType(StructType([
    StructField("id", StringType(), True),
    StructField("name", StringType(), True)
]))

df_parsed = data.withColumn("genres_array", from_json(col("genres"), genre_schema))

df_exploded = df_parsed.withColumn("genre", explode(df_parsed["genres_array"]))

genres_aggregated = df_exploded.groupBy("genre.id", "genre.name") \
    .agg(count("id").alias("number_of_movies")).orderBy(desc('number_of_movies'))

genres_aggregated.toPandas().to_csv("Genres_Agggregations.csv", index=False)

genres_aggregated.show(5)


+-----+--------+----------------+
|   id|    name|number_of_movies|
+-----+--------+----------------+
|   18|   Drama|            2297|
|   35|  Comedy|            1722|
|   53|Thriller|            1274|
|   28|  Action|            1154|
|10749| Romance|             894|
+-----+--------+----------------+
only showing top 5 rows

