In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col as spark_col, greatest, abs, when, lit, coalesce, expr, first
from pyspark.sql.types import IntegerType

# Shared Spark entry point for every cell below; getOrCreate() returns the
# already-active session when there is one instead of starting a new one.
spark = (
    SparkSession.builder
    .appName("Music Data Metrics")
    .getOrCreate()
)

# Column names used by the correlation functions below. "Popularity" is the
# target column; the functions compare every other entry against it.
numeric_cols = [
    "Tempo",
    "Loudness (db)",
    "Energy",
    "Danceability",
    "Positiveness",
    "Speechiness",
    "Liveness",
    "Acousticness",
    "Instrumentalness",
    "Popularity",
]

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/09 01:57:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/07/09 01:57:20 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/07/09 01:57:20 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
def popularity_poor():
    """Show, per genre, the metric most positively correlated with Popularity.

    Reads the parquet dataset, collects the distinct ``main_genre`` values to
    the driver, and then runs one ``DataFrame.stat.corr`` call per
    (genre, metric) pair. The per-pair structure is kept as-is: judging by the
    function name and the timing cells, this appears to be the deliberately
    slow baseline that ``optimized`` is measured against.

    Prints the first 10 rows of the (main_genre, maior_corr_coluna) result and
    returns ``None``.
    """
    df = spark.read.parquet("/spark-data/musicas_limpas_cluste.parquet")

    genres = [row["main_genre"] for row in df.select("main_genre").distinct().collect()]
    # Every numeric column except the target itself.
    metrics = [c for c in numeric_cols if c != "Popularity"]

    results = []

    for genre in genres:
        df_filtered = df.filter(spark_col("main_genre") == genre)
        # -inf is below any real correlation, so the first defined value wins.
        best_corr = float("-inf")
        best_metric = None

        for metric in metrics:
            corr_value = df_filtered.stat.corr("Popularity", metric)

            # Strict '>' keeps the first metric on ties. A NaN correlation
            # compares False and is therefore never selected.
            if corr_value > best_corr:
                best_corr = corr_value
                best_metric = metric

        # Genres for which no metric produced a comparable correlation are skipped.
        if best_metric is not None:
            results.append((genre, best_metric))

    # An explicit schema is required: with only column names Spark has to infer
    # the types from the rows and raises when `results` is empty. The genre
    # type is taken from the source frame rather than assumed.
    genre_type = df.schema["main_genre"].dataType.simpleString()
    result_df = spark.createDataFrame(
        results, f"main_genre {genre_type}, maior_corr_coluna string"
    )
    result_df.show(10)

In [9]:
def optimized():
    """Show, per genre, the metric most positively correlated with Popularity.

    All correlations are computed in a single ``groupBy("main_genre")``
    aggregation (one pass over the data) instead of one aggregation per metric
    stitched together with outer joins. Besides being cheaper, this avoids
    duplicated rows for a NULL ``main_genre``: NULL keys never match in a
    join, so each joined frame would contribute its own NULL-genre row.

    Undefined correlations (NaN) are converted to NULL before picking the
    maximum, because Spark orders NaN above every other number and
    ``greatest`` would otherwise report the undefined metric as the winner.
    Genres with no defined correlation at all are omitted, mirroring the
    ``best_metric is not None`` check in ``popularity_poor``.

    Prints the first 10 rows ordered by ``main_genre`` and returns ``None``.
    """
    df = spark.read.parquet("/spark-data/musicas_limpas_cluste.parquet")

    # Every numeric column except the target itself.
    metrics = [c for c in numeric_cols if c != "Popularity"]

    # Backticks are needed for names such as "Loudness (db)".
    corr_exprs = [
        expr(
            f"nanvl(corr(Popularity, `{metric}`), CAST(NULL AS DOUBLE))"
        ).alias(f"corr_{metric}")
        for metric in metrics
    ]
    corr_by_genre = df.groupBy("main_genre").agg(*corr_exprs)

    # greatest() skips NULLs, so only defined correlations compete.
    max_value = greatest(*[spark_col(f"corr_{metric}") for metric in metrics])

    # coalesce() returns the first metric whose correlation equals the row
    # maximum, i.e. ties resolve in `numeric_cols` order.
    max_column_expr = coalesce(*[
        when(spark_col(f"corr_{metric}") == max_value, lit(metric))
        for metric in metrics
    ])

    final_result = (
        corr_by_genre
        .withColumn("maior_corr_coluna", max_column_expr)
        .where(spark_col("maior_corr_coluna").isNotNull())
        .select("main_genre", "maior_corr_coluna")
        # Sort so that the 10 displayed rows are the same on every run.
        .orderBy("main_genre")
    )

    final_result.show(10)

In [5]:
import time

def medir_tempo_leitura(funcao, *args, **kwargs):
    """Call ``funcao`` once, print the elapsed time, and return its result.

    Args:
        funcao: Callable to time.
        *args: Positional arguments forwarded to ``funcao``.
        **kwargs: Keyword arguments forwarded to ``funcao``.

    Returns:
        Whatever ``funcao`` returns.

    If ``funcao`` raises, the exception propagates and nothing is printed.
    """
    # perf_counter() is monotonic and high-resolution; time.time() follows the
    # wall clock and can jump if the system clock is adjusted mid-measurement.
    inicio = time.perf_counter()
    resultado = funcao(*args, **kwargs)
    duracao = time.perf_counter() - inicio
    print(f"Tempo de execução: {duracao:.4f} segundos")
    return resultado


In [12]:
# Time the groupBy-based implementation; the table is printed by optimized()
# itself and the elapsed time by the timing helper.
medir_tempo_leitura(optimized)

[Stage 2722:=>(4 + 1) / 5][Stage 2723:> (0 + 0) / 5][Stage 2726:> (0 + 0) / 1]

+----------------+-----------------+
|      main_genre|maior_corr_coluna|
+----------------+-----------------+
|        acoustic|     Danceability|
|     alt-country|           Energy|
|     alternative|     Positiveness|
|alternative rock|           Energy|
|         ambient|     Acousticness|
|     black metal|    Loudness (db)|
|           blues|     Danceability|
|         britpop|            Tempo|
|        chillout|    Loudness (db)|
|       chillwave| Instrumentalness|
+----------------+-----------------+
only showing top 10 rows

Tempo de execução: 2.2772 segundos


                                                                                

In [13]:
# Time the per-genre loop implementation (one correlation call per genre and
# metric) for comparison with optimized().
medir_tempo_leitura(popularity_poor)

+-------------+-----------------+
|   main_genre|maior_corr_coluna|
+-------------+-----------------+
|   electropop|           Energy|
|         folk|           Energy|
| experimental|           Energy|
|    indie pop|     Danceability|
|post-hardcore|     Danceability|
|          pop|     Danceability|
|  alternative|     Positiveness|
|     pop rock|     Danceability|
|    math rock|           Energy|
|     new wave|     Positiveness|
|        k-pop|     Danceability|
|          rnb|            Tempo|
|        grime|            Tempo|
|      ambient|     Acousticness|
|     chillout|    Loudness (db)|
|    christian|           Energy|
|      screamo|            Tempo|
|        blues|     Danceability|
|drum and bass|     Danceability|
|        dance|           Energy|
+-------------+-----------------+
only showing top 20 rows

Tempo de execução: 67.9523 segundos
