In [None]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=f9ee8aa1357da314266a5071da75b935881ae78e794093a57bc1cff6ed3eab3a
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


1. Load the Dataset:

In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by every cell below
spark = (
    SparkSession.builder
    .appName("Movie Data Transformations")
    .getOrCreate()
)

# Read the movie CSV: the first row is the header, column types are inferred
file_path = "/content/sample_data/movie_date.csv"
df = spark.read.options(header=True, inferSchema=True).csv(file_path)

# Preview the loaded rows
df.show()


+--------+-----------------+---------+------+----------+----------+
|movie_id|            title|    genre|rating|box_office|      date|
+--------+-----------------+---------+------+----------+----------+
|       1|        Inception|   Sci-Fi|   8.8| 830000000|2010-07-16|
|       2|  The Dark Knight|   Action|   9.0|1004000000|2008-07-18|
|       3|     Interstellar|   Sci-Fi|   8.6| 677000000|2014-11-07|
|       4|Avengers: Endgame|   Action|   8.4|2797000000|2019-04-26|
|       5|    The Lion King|Animation|   8.5|1657000000|1994-06-15|
|       6|      Toy Story 4|Animation|   7.8|1073000000|2019-06-21|
|       7|        Frozen II|Animation|   7.0|1450000000|2019-11-22|
|       8|            Joker|    Drama|   8.5|1074000000|2019-10-04|
|       9|         Parasite|    Drama|   8.6| 258000000|2019-05-30|
+--------+-----------------+---------+------+----------+----------+



2. Filter Movies by Genre ("Sci-Fi"):

In [None]:
# Keep only the rows whose genre is exactly "Sci-Fi"
sci_fi_movies = df.where(df["genre"] == "Sci-Fi")
sci_fi_movies.show()


+--------+------------+------+------+----------+----------+
|movie_id|       title| genre|rating|box_office|      date|
+--------+------------+------+------+----------+----------+
|       1|   Inception|Sci-Fi|   8.8| 830000000|2010-07-16|
|       3|Interstellar|Sci-Fi|   8.6| 677000000|2014-11-07|
+--------+------------+------+------+----------+----------+



3. Top-Rated Movies:

In [None]:
# Take the three highest-rated movies.
# Ratings can tie (two movies in this dataset share 8.6), and limit() after a
# sort on rating alone may return either of them. movie_id is added as a
# secondary key so the selected rows are the same on every run.
top_rated_movies = df.orderBy(df.rating.desc(), df.movie_id.asc()).limit(3)
top_rated_movies.show()


+--------+---------------+------+------+----------+----------+
|movie_id|          title| genre|rating|box_office|      date|
+--------+---------------+------+------+----------+----------+
|       2|The Dark Knight|Action|   9.0|1004000000|2008-07-18|
|       1|      Inception|Sci-Fi|   8.8| 830000000|2010-07-16|
|       3|   Interstellar|Sci-Fi|   8.6| 677000000|2014-11-07|
+--------+---------------+------+------+----------+----------+



4. Movies Released After 2010:

In [None]:
from pyspark.sql.functions import year

# Keep movies whose release year is strictly greater than 2010
# (movies released during 2010 itself are excluded).
movies_after_2010 = df.where(year(df["date"]) > 2010)
movies_after_2010.show()


+--------+-----------------+---------+------+----------+----------+
|movie_id|            title|    genre|rating|box_office|      date|
+--------+-----------------+---------+------+----------+----------+
|       3|     Interstellar|   Sci-Fi|   8.6| 677000000|2014-11-07|
|       4|Avengers: Endgame|   Action|   8.4|2797000000|2019-04-26|
|       6|      Toy Story 4|Animation|   7.8|1073000000|2019-06-21|
|       7|        Frozen II|Animation|   7.0|1450000000|2019-11-22|
|       8|            Joker|    Drama|   8.5|1074000000|2019-10-04|
|       9|         Parasite|    Drama|   8.6| 258000000|2019-05-30|
+--------+-----------------+---------+------+----------+----------+



5. Calculate Average Box Office Collection by Genre:


In [None]:
from pyspark.sql.functions import col, avg
# Mean box-office collection for each genre
avg_box_office_by_genre = (
    df.groupBy(col("genre"))
    .agg(avg(col("box_office")).alias("avg_box_office"))
)
avg_box_office_by_genre.show()

+---------+--------------------+
|    genre|      avg_box_office|
+---------+--------------------+
|    Drama|              6.66E8|
|Animation|1.3933333333333333E9|
|   Action|            1.9005E9|
|   Sci-Fi|             7.535E8|
+---------+--------------------+



6. Add a New Column for Box Office in Billions:

In [None]:

# Add a derived column: the raw box-office figure divided by one billion
df_with_billions = df.withColumn(
    "box_office_in_billions",
    df["box_office"] / 1_000_000_000,
)
df_with_billions.show()

+--------+-----------------+---------+------+----------+----------+----------------------+
|movie_id|            title|    genre|rating|box_office|      date|box_office_in_billions|
+--------+-----------------+---------+------+----------+----------+----------------------+
|       1|        Inception|   Sci-Fi|   8.8| 830000000|2010-07-16|                  0.83|
|       2|  The Dark Knight|   Action|   9.0|1004000000|2008-07-18|                 1.004|
|       3|     Interstellar|   Sci-Fi|   8.6| 677000000|2014-11-07|                 0.677|
|       4|Avengers: Endgame|   Action|   8.4|2797000000|2019-04-26|                 2.797|
|       5|    The Lion King|Animation|   8.5|1657000000|1994-06-15|                 1.657|
|       6|      Toy Story 4|Animation|   7.8|1073000000|2019-06-21|                 1.073|
|       7|        Frozen II|Animation|   7.0|1450000000|2019-11-22|                  1.45|
|       8|            Joker|    Drama|   8.5|1074000000|2019-10-04|                 1.074|

7. Sort Movies by Box Office Collection:

In [None]:
# Order all movies from highest to lowest box-office collection
sorted_by_box_office = df.sort("box_office", ascending=False)
sorted_by_box_office.show()

+--------+-----------------+---------+------+----------+----------+
|movie_id|            title|    genre|rating|box_office|      date|
+--------+-----------------+---------+------+----------+----------+
|       4|Avengers: Endgame|   Action|   8.4|2797000000|2019-04-26|
|       5|    The Lion King|Animation|   8.5|1657000000|1994-06-15|
|       7|        Frozen II|Animation|   7.0|1450000000|2019-11-22|
|       8|            Joker|    Drama|   8.5|1074000000|2019-10-04|
|       6|      Toy Story 4|Animation|   7.8|1073000000|2019-06-21|
|       2|  The Dark Knight|   Action|   9.0|1004000000|2008-07-18|
|       1|        Inception|   Sci-Fi|   8.8| 830000000|2010-07-16|
|       3|     Interstellar|   Sci-Fi|   8.6| 677000000|2014-11-07|
|       9|         Parasite|    Drama|   8.6| 258000000|2019-05-30|
+--------+-----------------+---------+------+----------+----------+



8. Count the Number of Movies per Genre:

In [None]:

# Number of movies in each genre
movies_per_genre = (
    df.groupBy(df["genre"])
    .count()
)
movies_per_genre.show()

+---------+-----+
|    genre|count|
+---------+-----+
|    Drama|    2|
|Animation|    3|
|   Action|    2|
|   Sci-Fi|    2|
+---------+-----+

