In [40]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, regexp_replace, when, udf, to_date
from pyspark.sql.types import IntegerType

# Create the Spark session for this notebook (getOrCreate reuses an existing one).
spark = (
    SparkSession.builder
    .appName("Music Data Metrics")
    .getOrCreate()
)

# Load the parquet file into a DataFrame.
df = spark.read.parquet("/spark-data/musicas_limpas_cluste.parquet")

In [41]:
# Print the column names, types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- Artist(s): string (nullable = true)
 |-- text: string (nullable = true)
 |-- Length: string (nullable = true)
 |-- Album: string (nullable = true)
 |-- Key: string (nullable = true)
 |-- Tempo: string (nullable = true)
 |-- Loudness (db): double (nullable = true)
 |-- Time signature: string (nullable = true)
 |-- Explicit: string (nullable = true)
 |-- Popularity: string (nullable = true)
 |-- Energy: string (nullable = true)
 |-- Danceability: string (nullable = true)
 |-- Positiveness: string (nullable = true)
 |-- Speechiness: string (nullable = true)
 |-- Liveness: string (nullable = true)
 |-- Acousticness: string (nullable = true)
 |-- Instrumentalness: string (nullable = true)
 |-- Good for Party: string (nullable = true)
 |-- Good for Work/Study: string (nullable = true)
 |-- Good for Relaxation/Meditation: string (nullable = true)
 |-- Good for Exercise: string (nullable = true)
 |-- Good for Running: string (nullable = true)
 |-- Good for Yoga/Stretching: string (nu

In [42]:
from pyspark.sql.functions import col

# Names of the columns treated as numeric features in the analysis below.
numeric_cols = [
    "Tempo",
    "Loudness (db)",
    "Energy",
    "Danceability",
    "Positiveness",
    "Speechiness",
    "Liveness",
    "Acousticness",
    "Instrumentalness",
    "Popularity",
]

# Build df_casted from df by re-typing each listed column as double,
# one withColumn call per column; all other columns are left untouched.
df_casted = df
for column_name in numeric_cols:
    as_double = col(column_name).cast("double")
    df_casted = df_casted.withColumn(column_name, as_double)


In [None]:
# For every distinct main_genre, print the correlation (DataFrame.stat.corr,
# Pearson by default) between Popularity and each other numeric column, then
# print the column with the highest correlation for that genre.
genres = [row["main_genre"] for row in df_casted.select("main_genre").distinct().collect()]

for genre in genres:
    print(f"\nCorrelações para o gênero: {genre}")
    # eqNullSafe also matches rows whose main_genre is null; a plain `==`
    # against None would select no rows for that group.
    df_filtered = df_casted.filter(col("main_genre").eqNullSafe(genre))

    # Reset the running maximum for each genre so a genre whose correlations
    # are all undefined (NaN) cannot report the previous genre's column or
    # fail with a NameError on the first iteration.
    best_corr = float("-inf")
    maior = None
    for c in numeric_cols:
        if c == "Popularity":
            continue
        corr_value = df_filtered.stat.corr("Popularity", c)
        print(f"  Corr(Popularity, {c}) = {corr_value}")
        # NaN compares False against any number, so undefined correlations
        # are never selected as the maximum.
        if corr_value > best_corr:
            best_corr = corr_value
            maior = c
    # Prints None when no correlation was defined for this genre.
    print("maior: ", maior)

In [46]:
# Correlation between Popularity and every other numeric column, per main_genre.
# All correlations are computed in a single groupBy/agg pass, producing one row
# per main_genre and one corr_<column> column per feature (instead of one
# grouped DataFrame per feature stitched together with outer joins).
#
# The comprehension variable is deliberately not named `col` or `df`: rebinding
# those names at notebook scope would overwrite the imported `col` function and
# the source DataFrame `df`, breaking other cells when they are re-run.
corr_exprs = [
    F.corr("Popularity", feature).alias(f"corr_{feature}")
    for feature in numeric_cols
    if feature != "Popularity"
]

final_result = df_casted.groupBy("main_genre").agg(*corr_exprs)

# Show the results, one record per genre.
final_result.show(vertical=True, truncate=False)

[Stage 17482:>                                                      (0 + 1) / 1]

-RECORD 0--------------------------------------
 main_genre            | acoustic              
 corr_Tempo            | 0.013710778101471687  
 corr_Loudness (db)    | -0.04604451205234664  
 corr_Energy           | -0.1732683080141884   
 corr_Danceability     | 0.5327582811738572    
 corr_Positiveness     | -0.031971290418583764 
 corr_Speechiness      | -0.13498234322521316  
 corr_Liveness         | -0.20597427679267508  
 corr_Acousticness     | -0.08822460283153014  
 corr_Instrumentalness | 0.12406046716987244   
-RECORD 1--------------------------------------
 main_genre            | alt-country           
 corr_Tempo            | 0.04651511857993749   
 corr_Loudness (db)    | -0.29762406998842783  
 corr_Energy           | 0.15268903576059728   
 corr_Danceability     | -0.2587207845586629   
 corr_Positiveness     | -0.10294258305672534  
 corr_Speechiness      | -0.14154559193478033  
 corr_Liveness         | -0.09025950798968008  
 corr_Acousticness     | -0.188632476138

                                                                                