In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list, concat_ws, lower, regexp_replace, split, size, udf
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import CountVectorizer, StopWordsRemover
from pyspark.ml.linalg import SparseVector
import numpy as np
import socket
import time

# --- Configuração do Spark Session ---
local_ip = socket.gethostbyname(socket.gethostname())

spark = SparkSession.builder \
    .appName("Workload 2 - Similaridade de Artistas (Pesado)") \
    .master("spark://spark-master:7077") \
    .config("spark.driver.host", local_ip) \
    .config("spark.sql.shuffle.partitions", "400") \
    .getOrCreate()

print("Spark Session iniciada com sucesso!")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/09 04:26:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark Session iniciada com sucesso!


In [4]:
# --- UDF para Calcular Similaridade de Cosseno ---
def cosine_similarity(v1, v2):
    # Assegura que os vetores não são nulos e têm o mesmo tamanho
    if v1 is None or v2 is None or v1.size != v2.size:
        return 0.0
    # Lida com vetores que podem ser totalmente nulos
    if v1.numNonzeros() == 0 or v2.numNonzeros() == 0:
        return 0.0
    
    # Converte para numpy arrays para cálculo
    vec1 = v1.toArray()
    vec2 = v2.toArray()
    
    dot_product = np.dot(vec1, vec2)
    norm_v1 = np.linalg.norm(vec1)
    norm_v2 = np.linalg.norm(vec2)
    
    # Evita divisão por zero
    if norm_v1 == 0 or norm_v2 == 0:
        return 0.0
        
    return float(dot_product / (norm_v1 * norm_v2))

# Registra a UDF
cosine_similarity_udf = udf(cosine_similarity, DoubleType())


def executar_workload_pesado():
    """
    Executa um workload pesado para encontrar o par de artistas mais similar
    por gênero, com base no vocabulário de suas músicas.
    """
    # --- 1. Carregar e Preparar os Dados ---
    caminho_parquet = "/spark-data/musicas_limpas.parquet"
    df = spark.read.parquet(caminho_parquet)

    # Filtra para os 5 gêneros com mais músicas para tornar o problema tratável, mas ainda pesado
    top_genres = [row['main_genre'] for row in df.groupBy("main_genre").count().orderBy(col("count").desc()).limit(5).collect()]
    df_filtrado = df.filter(col("main_genre").isin(top_genres))

    # --- 2. Processamento de Texto e Agregação por Artista ---
    # Agrega todas as letras de um artista em um único documento por gênero
    df_artistas = df_filtrado.groupBy("main_genre", "Artist(s)").agg(
        concat_ws(" ", collect_list("text")).alias("full_text")
    )

    # Limpa e tokeniza o texto
    df_tokenizado = df_artistas.withColumn("tokens_raw", split(lower(regexp_replace(col("full_text"), r'[\W_]+', ' ')), ' '))
    
    # Remove stopwords (palavras comuns)
    stopwords_remover = StopWordsRemover(inputCol="tokens_raw", outputCol="tokens")
    df_sem_stopwords = stopwords_remover.transform(df_tokenizado).select("main_genre", "Artist(s)", "tokens")

    # --- 3. Vetorização (TF - Term Frequency) ---
    # Cria um vocabulário e converte as palavras em vetores numéricos para cada gênero
    vectorizer = CountVectorizer(inputCol="tokens", outputCol="features", minDF=5) # minDF ignora palavras raras
    model = vectorizer.fit(df_sem_stopwords)
    df_vetorizado = model.transform(df_sem_stopwords)

    # --- 4. Self-Join e Cálculo da Similaridade (A PARTE PESADA) ---
    df1 = df_vetorizado.alias("df1")
    df2 = df_vetorizado.alias("df2")

    # Junta o DataFrame com ele mesmo, criando pares de artistas dentro do mesmo gênero
    # Garante que não comparemos um artista com ele mesmo (df1.`Artist(s)` < df2.`Artist(s)`)
    df_pares = df1.join(
        df2,
        (col("df1.main_genre") == col("df2.main_genre")) & \
        (col("df1.`Artist(s)`") < col("df2.`Artist(s)`")),
        "inner"
    ).select(
        col("df1.main_genre").alias("genre"),
        col("df1.`Artist(s)`").alias("artist1"),
        col("df2.`Artist(s)`").alias("artist2"),
        cosine_similarity_udf(col("df1.features"), col("df2.features")).alias("similarity")
    )
    
    # --- 5. Encontrar o Par Mais Similar por Gênero ---
    # Usa uma window function para encontrar o par com a maior similaridade em cada gênero
    from pyspark.sql.window import Window
    from pyspark.sql.functions import rank

    window_spec = Window.partitionBy("genre").orderBy(col("similarity").desc())
    df_ranked = df_pares.withColumn("rank", rank().over(window_spec))
    df_top_pares = df_ranked.filter(col("rank") == 1).select("genre", "artist1", "artist2", "similarity")

    print(f"Workload Pesado concluído.")
    df_top_pares.show(truncate=False)
    

# --- Medição do Tempo de Execução ---
print("Iniciando a execução do Workload Pesado...")
inicio = time.time()

executar_workload_pesado()

fim = time.time()
duracao = fim - inicio

print(f"\n--- Análise de Desempenho ---")
print(f"Tempo total de execução: {duracao:.2f} segundos")
print(f"----------------------------")

spark.stop()

Iniciando a execução do Workload Pesado...


                                                                                

Workload Pesado concluído.


25/07/09 04:31:03 WARN TaskSetManager: Lost task 0.0 in stage 16.0 (TID 41) (10.0.1.6 executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1231, in main
    func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
                                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1067, in read_udfs
    udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/worker.py", line 529, in read_single_udf
    f, return_type = read_command(pickleSer, infile)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/worker.py", line 90, in 

Py4JJavaError: An error occurred while calling o222.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: ShuffleMapStage 16 (showString at <unknown>:0) has failed the maximum allowable number of times: 4. Most recent failure reason:
org.apache.spark.shuffle.MetadataFetchFailedException: Missing an output location for shuffle 4 partition 109
	at org.apache.spark.MapOutputTracker$.validateStatus(MapOutputTracker.scala:1747)
	at org.apache.spark.MapOutputTracker$.$anonfun$convertMapStatuses$11(MapOutputTracker.scala:1694)
	at org.apache.spark.MapOutputTracker$.$anonfun$convertMapStatuses$11$adapted(MapOutputTracker.scala:1693)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.MapOutputTracker$.convertMapStatuses(MapOutputTracker.scala:1693)
	at org.apache.spark.MapOutputTrackerWorker.getMapSizesByExecutorIdImpl(MapOutputTracker.scala:1335)
	at org.apache.spark.MapOutputTrackerWorker.getMapSizesByExecutorId(MapOutputTracker.scala:1297)
	at org.apache.spark.shuffle.sort.SortShuffleManager.getReader(SortShuffleManager.scala:141)
	at org.apache.spark.shuffle.ShuffleManager.getReader(ShuffleManager.scala:63)
	at org.apache.spark.shuffle.ShuffleManager.getReader$(ShuffleManager.scala:57)
	at org.apache.spark.shuffle.sort.SortShuffleManager.getReader(SortShuffleManager.scala:73)
	at org.apache.spark.sql.execution.ShuffledRowRDD.compute(ShuffledRowRDD.scala:200)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.ZippedPartitionsRDD2.compute(ZippedPartitionsRDD.scala:89)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:621)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:624)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.base/java.lang.Thread.run(Unknown Source)

	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2898)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2834)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2833)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2833)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:2057)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3096)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3036)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3025)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
