In [None]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark 3.5.0
!wget -q https://apache.osuosl.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz

In [None]:
ls -l # check the .tgz is there

total 782040
drwxr-xr-x  2 root root      4096 Jan 13 10:59 [0m[01;34mdataset[0m/
drwx------  5 root root      4096 Jan 13 11:59 [01;34mdrive[0m/
drwxr-xr-x  1 root root      4096 Jan 11 17:02 [01;34msample_data[0m/
drwxr-xr-x 13 1000 1000      4096 Sep  9 02:08 [01;34mspark-3.5.0-bin-hadoop3[0m/
-rw-r--r--  1 root root 400395283 Sep  9 02:10 spark-3.5.0-bin-hadoop3.tgz
-rw-r--r--  1 root root 400395283 Sep  9 02:10 spark-3.5.0-bin-hadoop3.tgz.1


In [None]:
# unzip it
!tar xf spark-3.5.0-bin-hadoop3.tgz

In [None]:
!pip install -q findspark

In [None]:
!pip install py4j

# For maps
!pip install folium
!pip install plotly



In [None]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[*] pyspark-shell"

In [None]:
import findspark
findspark.init("spark-3.5.0-bin-hadoop3")# SPARK_HOME

from pyspark.sql import SparkSession

# create the session
spark = SparkSession \
        .builder \
        .appName("Joins") \
        .master("local[*]") \
        .getOrCreate()

spark.version

'3.5.0'

In [None]:
spark

In [None]:
# Import sql functions
from pyspark.sql.functions import *

In [None]:
!mkdir -p dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2324/master/dataset/guitars.json -P /dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2324/master/dataset/guitarPlayers.json -P /dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2324/master/dataset/bands.json -P /dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2324/master/dataset/employees.csv -P /dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2324/master/dataset/salaries.csv -P /dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2324/master/dataset/deptmanagers.csv -P /dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2324/master/dataset/titles.csv -P /dataset
!ls /dataset

bands.json	  deptmanagers.csv.1  guitarPlayers.json    guitars.json.1  titles.csv
bands.json.1	  employees.csv       guitarPlayers.json.1  salaries.csv    titles.csv.1
deptmanagers.csv  employees.csv.1     guitars.json	    salaries.csv.1


In [None]:
from google.colab import drive
drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/imdb_top_1000.csv'

df = spark.read.csv(file_path, header=True, inferSchema=True)

df.printSchema()

df.show()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
root
 |-- Poster_Link: string (nullable = true)
 |-- Series_Title: string (nullable = true)
 |-- Released_Year: string (nullable = true)
 |-- Certificate: string (nullable = true)
 |-- Runtime: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- IMDB_Rating: double (nullable = true)
 |-- Overview: string (nullable = true)
 |-- Meta_score: string (nullable = true)
 |-- Director: string (nullable = true)
 |-- Star1: string (nullable = true)
 |-- Star2: string (nullable = true)
 |-- Star3: string (nullable = true)
 |-- Star4: string (nullable = true)
 |-- No_of_Votes: string (nullable = true)
 |-- Gross: string (nullable = true)

+--------------------+--------------------+-------------+-----------+-------+--------------------+-----------+--------------------+----------+--------------------+------------------+--------------------+------------------

In [None]:
threshold = 8.0
high_rated_movies = df.filter(col("IMDB_Rating") > threshold).count()
print(f"Número de películas con calificación IMDB superior a {threshold}: {high_rated_movies}")

Número de películas con calificación IMDB superior a 8.0: 322


In [None]:
df = df.withColumn("Genre", explode(split(col("Genre"), ", ")))

genre_counts = df.groupBy("Genre").count().orderBy("count", ascending=False)
genre_counts.show()

+---------+-----+
|    Genre|count|
+---------+-----+
|    Drama|  724|
|   Comedy|  233|
|    Crime|  209|
|Adventure|  196|
|   Action|  189|
| Thriller|  137|
|  Romance|  125|
|Biography|  109|
|  Mystery|   99|
|Animation|   82|
|   Sci-Fi|   67|
|  Fantasy|   66|
|   Family|   56|
|  History|   56|
|      War|   51|
|    Music|   35|
|   Horror|   32|
|  Western|   20|
|Film-Noir|   19|
|    Sport|   19|
+---------+-----+
only showing top 20 rows



In [None]:
year_counts = df.groupBy("Released_Year").count().orderBy("Released_Year")
year_counts.show()

+-------------+-----+
|Released_Year|count|
+-------------+-----+
|         1920|    3|
|         1921|    3|
|         1922|    2|
|         1924|    3|
|         1925|    6|
|         1926|    3|
|         1927|    4|
|         1928|    5|
|         1930|    2|
|         1931|    9|
|         1932|    5|
|         1933|    8|
|         1934|    5|
|         1935|    9|
|         1936|    3|
|         1937|    2|
|         1938|    8|
|         1939|   13|
|         1940|   19|
|         1941|    4|
+-------------+-----+
only showing top 20 rows



In [None]:
certificate_counts = df.groupBy("Certificate").count().orderBy("count", ascending=False)
certificate_counts.show()

+-----------+-----+
|Certificate|count|
+-----------+-----+
|          U|  618|
|          A|  489|
|         UA|  457|
|          R|  361|
|       NULL|  235|
|         PG|  101|
|      PG-13|  100|
|     Passed|   92|
|          G|   32|
|   Approved|   30|
|      TV-PG|    7|
|         GP|    6|
|    Unrated|    3|
|         16|    3|
|      TV-MA|    3|
|        U/A|    3|
|      TV-14|    1|
+-----------+-----+



In [None]:
director_counts = df.groupBy("Director").count().orderBy("count", ascending=False)
director_counts.show()

+-----------------+-----+
|         Director|count|
+-----------------+-----+
| Alfred Hitchcock|   37|
|   Hayao Miyazaki|   33|
| Steven Spielberg|   29|
|  Martin Scorsese|   27|
|   Akira Kurosawa|   26|
|     Billy Wilder|   24|
|  Stanley Kubrick|   21|
|Christopher Nolan|   21|
|     Howard Hawks|   21|
|    David Fincher|   21|
|Quentin Tarantino|   21|
|      Woody Allen|   20|
|       Rob Reiner|   17|
|   Clint Eastwood|   17|
|  Charles Chaplin|   17|
|     Ridley Scott|   17|
|     Wes Anderson|   17|
|        Joel Coen|   15|
|   Alfonso Cuarón|   14|
|      John Huston|   14|
+-----------------+-----+
only showing top 20 rows



In [25]:
from pyspark.sql.window import Window

df = df.withColumn("Gross", df["Gross"].cast("double"))

window_spec = Window.partitionBy("Genre").orderBy(desc("Gross"))

df = df.withColumn("rank", row_number().over(window_spec))

result_df = df.filter("rank <= 2")

result_df.show()

+--------------------+--------------------+-------------+-----------+-------+---------+-----------+--------------------+--------------------+----------------+------------------+--------------------+------------------+------------------+---------------+---------+----+
|         Poster_Link|        Series_Title|Released_Year|Certificate|Runtime|    Genre|IMDB_Rating|            Overview|          Meta_score|        Director|             Star1|               Star2|             Star3|             Star4|    No_of_Votes|    Gross|rank|
+--------------------+--------------------+-------------+-----------+-------+---------+-----------+--------------------+--------------------+----------------+------------------+--------------------+------------------+------------------+---------------+---------+----+
|https://m.media-a...|      V for Vendetta|         2005|          A|132 min|   Action|        8.2|"In a future Brit...| plots to overthr...|              62|    James McTeigue|        Hugo Weavin