In [13]:
from pyspark.sql import SparkSession
import socket

# Resolve the IP address of this container; it is advertised to Spark below
# as the driver host.
local_ip = socket.gethostbyname(socket.gethostname())

# Build (or reuse) a session attached to the standalone master.
spark = (
    SparkSession.builder
    .appName("Analises")
    .master("spark://spark-master:7077")
    .config("spark.driver.host", local_ip)
    .getOrCreate()
)

# Confirm which master the session is actually bound to.
print("Conectado ao Spark master:", spark.sparkContext.master)

Conectado ao Spark master: spark://spark-master:7077


In [14]:
# Read the Parquet dataset that every analysis in this notebook works on.
caminho_parquet = "/spark-data/musicas_limpas_cluste.parquet"
df = spark.read.parquet(caminho_parquet)

# Distinct genre values, named once and reused for the listing and the count.
generos_distintos = df.select("main_genre").distinct()

# List the genres alphabetically (up to 100 rows, values not truncated).
generos_distintos.orderBy("main_genre").show(100, truncate=False)

# Report how many different genres the dataset contains.
generos_unicos = generos_distintos.count()
print(f"Quantidade de gêneros únicos: {generos_unicos}")

                                                                                

+-------------------+
|main_genre         |
+-------------------+
|acoustic           |
|alt-country        |
|alternative        |
|alternative rock   |
|ambient            |
|black metal        |
|blues              |
|britpop            |
|chillout           |
|chillwave          |
|christian          |
|classic rock       |
|classical          |
|cloud rap          |
|comedy             |
|country            |
|dance              |
|dancehall          |
|death metal        |
|deathcore          |
|disco              |
|doom metal         |
|dream pop          |
|drum and bass      |
|dub                |
|dubstep            |
|electro            |
|electronic         |
|electropop         |
|emo                |
|emo rap            |
|experimental       |
|folk               |
|funk               |
|garage rock        |
|gospel             |
|grime              |
|grunge             |
|hard rock          |
|hardcore           |
|heavy metal        |
|hip-hop            |
|house    

In [19]:
# Import the functions module under an alias instead of pulling in bare names:
# importing `round` directly would shadow Python's built-in round() for the
# rest of the kernel session.
from pyspark.sql import functions as F

# Number of rows per genre (produces the columns `main_genre` and `count`).
genero_dist = df.groupBy("main_genre").count()

# Total row count, used as the denominator of the percentage.
total = df.count()

# Add each genre's share in percent (rounded to 2 decimals) and print the
# table from the most to the least frequent genre.
(
    genero_dist
    .withColumn("percentual", F.round((F.col("count") / total) * 100, 2))
    .orderBy("count", ascending=False)
    # Row cap of 87 — presumably the number of distinct genres; TODO confirm.
    .show(87, truncate=False)
)


                                                                                

+-------------------+------+----------+
|main_genre         |count |percentual|
+-------------------+------+----------+
|hip-hop            |222984|44.78     |
|rock               |69637 |13.98     |
|pop                |30973 |6.22      |
|alternative rock   |13209 |2.65      |
|electronic         |10730 |2.15      |
|rap                |10225 |2.05      |
|country            |9880  |1.98      |
|folk               |9549  |1.92      |
|pop rock           |8490  |1.7       |
|jazz               |7512  |1.51      |
|indie rock         |7252  |1.46      |
|soul               |6624  |1.33      |
|trap               |6138  |1.23      |
|reggae             |5797  |1.16      |
|indie pop          |4483  |0.9       |
|punk               |4304  |0.86      |
|metal              |4099  |0.82      |
|heavy metal        |3878  |0.78      |
|blues              |3237  |0.65      |
|soundtrack         |2926  |0.59      |
|synthpop           |2517  |0.51      |
|metalcore          |2503  |0.5       |


In [20]:
# Count how many different release years appear in the dataset.
anos_unicos = (
    df.select("release_year")
    .distinct()
    .count()
)
print(f"Quantidade de anos únicos: {anos_unicos}")



Quantidade de anos únicos: 108


                                                                                

In [22]:
# Import the functions module under an alias instead of pulling in bare names:
# importing `round` directly would shadow Python's built-in round() for the
# rest of the kernel session.
from pyspark.sql import functions as F

# Number of rows per release year (columns `release_year` and `count`).
anos_dist = df.groupBy("release_year").count()

# Total row count, used as the denominator of the percentage.
total = df.count()

# Add each year's share in percent (rounded to 2 decimals) and print the
# table from the most to the least frequent year.
(
    anos_dist
    .withColumn("percentual", F.round((F.col("count") / total) * 100, 2))
    .orderBy("count", ascending=False)
    # Row cap of 108 equals the distinct-year count this notebook reported
    # earlier, so every year is printed.
    .show(108, truncate=False)
)

                                                                                

+------------+-----+----------+
|release_year|count|percentual|
+------------+-----+----------+
|2018        |40069|8.05      |
|2019        |38143|7.66      |
|2017        |36566|7.34      |
|2020        |33800|6.79      |
|2016        |28085|5.64      |
|2024        |25652|5.15      |
|2021        |25000|5.02      |
|2015        |21990|4.42      |
|2014        |18073|3.63      |
|2013        |16539|3.32      |
|2012        |13806|2.77      |
|2023        |12193|2.45      |
|2022        |11937|2.4       |
|2011        |11678|2.34      |
|2010        |9703 |1.95      |
|2009        |9006 |1.81      |
|2007        |8265 |1.66      |
|2008        |8155 |1.64      |
|2006        |7958 |1.6       |
|2005        |6956 |1.4       |
|2003        |6133 |1.23      |
|2004        |6113 |1.23      |
|1999        |5230 |1.05      |
|2002        |5148 |1.03      |
|2001        |5017 |1.01      |
|1996        |4677 |0.94      |
|1998        |4639 |0.93      |
|1997        |4482 |0.9       |
|1995   

                                                                                

In [23]:
# Count how many different emotion labels appear in the dataset.
emotion_unicas = (
    df.select("emotion_lower")
    .distinct()
    .count()
)
print(f"Quantidade de emoções únicas: {emotion_unicas}")

Quantidade de emoções únicas: 6


                                                                                

In [24]:
# Import the functions module under an alias instead of pulling in bare names:
# importing `round` directly would shadow Python's built-in round() for the
# rest of the kernel session.
from pyspark.sql import functions as F

# Number of rows per emotion label (columns `emotion_lower` and `count`).
emotion_dist = df.groupBy("emotion_lower").count()

# Total row count, used as the denominator of the percentage.
total = df.count()

# Add each emotion's share in percent (rounded to 2 decimals) and print the
# table from the most to the least frequent emotion.
(
    emotion_dist
    .withColumn("percentual", F.round((F.col("count") / total) * 100, 2))
    .orderBy("count", ascending=False)
    # The row cap of 108 is far above the 6 distinct emotions this notebook
    # reported earlier, so every row is printed.
    .show(108, truncate=False)
)

+-------------+------+----------+
|emotion_lower|count |percentual|
+-------------+------+----------+
|joy          |189360|38.02     |
|sadness      |156798|31.49     |
|anger        |95484 |19.17     |
|fear         |26001 |5.22      |
|love         |25381 |5.1       |
|surprise     |4974  |1.0       |
+-------------+------+----------+



In [25]:
# All analyses are done: stop the Spark session opened at the top of the
# notebook. Cells that use `spark` or `df` cannot be re-run after this point
# without creating the session again.
spark.stop()