In [1]:
# 1. Create the SparkSession

import socket
from pyspark.sql import SparkSession

# Resolve this machine's hostname to an IP address; it is passed to Spark
# below as the "spark.driver.host" setting.
local_ip = socket.gethostbyname(socket.gethostname())

# Build (or reuse) a session attached to the standalone master.
spark = (
    SparkSession.builder
    .appName("Palavras por Emoção")
    .master("spark://spark-master:7077")
    .config("spark.driver.host", local_ip)
    .getOrCreate()
)

# Echo the master URL the session is actually bound to.
print("Conectado ao Spark master:", spark.sparkContext.master)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/09 03:50:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Conectado ao Spark master: spark://spark-master:7077


In [None]:
# 2. Read the Parquet dataset

# Location of the input dataset.
parquet_path = "/spark-data/musicas_limpas_cluster.parquet"
df = spark.read.parquet(parquet_path)

# Progress marker for the notebook reader.
print(f"Sucesso")

                                                                                

Sucesso


In [3]:
# 3. Tokenize the lyrics
from pyspark.ml.feature import RegexTokenizer

# Tokenizer that reads the "text" column and writes a "tokens" column, using
# the regex \W (any non-word character) as its pattern.
tokenizer = RegexTokenizer(
    pattern="\\W",
    inputCol="text",
    outputCol="tokens",
)

# Apply it to the raw DataFrame; the result carries the extra "tokens" column.
df_tokens = tokenizer.transform(df)

# Progress marker for the notebook reader.
print(f"Sucesso")

Sucesso


In [4]:
# 4. Remove stop words
from pyspark.ml.feature import StopWordsRemover

# Custom list of irrelevant words: interjections, filler words and song-structure
# markers ("chorus", "verse").
# NOTE(review): short entries such as "ll", "re", "ve", "m" and "ain" are
# presumably contraction fragments left by tokenizing on non-word characters
# (e.g. "don't" -> "don", "t") -- confirm against the tokenizer settings.
custom_stopwords = [
    "like", "yeah", "got", "get", "uh", "la", "na", "da", "ooh", "ah", "woo", "go", "ll",
    "hey", "yo", "uhh", "uhuh", "gonna", "wanna", "baby", "oh", "whoa", "know", "re", "chorus",
    "ain", "let", "ve", "one", "verse", "cause", "m"
]

# Extend the remover's default stop-word list with the custom entries.
remover = StopWordsRemover(inputCol="tokens", outputCol="tokens_filtrados")
remover.setStopWords(remover.getStopWords() + custom_stopwords)

# This cell overwrites df_tokens with its own output, so on a second run the
# input would already contain the output column and transform() would reject
# it. Dropping that column first (a no-op when it is absent) keeps the cell
# safely re-runnable while preserving the df_tokens name used downstream.
df_tokens = remover.transform(df_tokens.drop(remover.getOutputCol()))

# Progress marker for the notebook reader.
print("Sucesso")

Sucesso


In [5]:
# 5. Explode the tokens so each word becomes one row, count occurrences per
#    (word, emotion) pair, and keep for every word only the emotion in which
#    it occurs most often.
from pyspark.sql.functions import explode
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, desc

# One row per (emotion, word occurrence).
df_exploded = df_tokens.select("emotion_lower", explode("tokens_filtrados").alias("palavra"))

# Occurrence count of each word within each emotion.
df_freq = df_exploded.groupBy("palavra", "emotion_lower").count()

# Rank the emotions of each word by descending count. "emotion_lower" is a
# secondary sort key so that ties on count are broken the same way on every
# run; with count alone, row_number() would pick an arbitrary emotion for a
# word that is equally frequent in two of them.
window_word = Window.partitionBy("palavra").orderBy(desc("count"), "emotion_lower")

# Keep only the top-ranked emotion per word, then discard the helper column.
df_unique_emotion = df_freq.withColumn("rank", row_number().over(window_word)) \
                           .filter("rank == 1") \
                           .drop("rank")

# Progress marker for the notebook reader.
print("Sucesso")

Sucesso


In [6]:
import time

# Wall-clock timestamp marking the start of a timed section; a later cell
# subtracts it from a second timestamp to report the elapsed seconds.
# NOTE(review): if the cells run between the two timestamps only define
# DataFrame transformations (no action such as show/count/collect), the
# measured time presumably covers plan construction only -- confirm that this
# is the intended measurement.
start_time = time.time()

In [7]:
# 6. Rank the words inside each emotion by frequency and keep the top 5
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, desc

# Order each emotion's words by descending count. "palavra" is a secondary
# sort key so that words with equal counts always receive the same rank;
# with count alone, which tied word lands inside the top 5 would be arbitrary
# and could change from run to run.
window_emotion = Window.partitionBy("emotion_lower").orderBy(desc("count"), "palavra")

# Attach the rank and keep ranks 1..5. The "rank" column is kept because the
# display step selects it.
df_top_words = df_unique_emotion.withColumn("rank", row_number().over(window_emotion)) \
                                .filter("rank <= 5")

# Progress marker for the notebook reader.
print("Sucesso")

Sucesso


In [8]:
# Close the timed section opened by start_time and report the elapsed seconds.
end_time = time.time()
elapsed_seconds = end_time - start_time

print(f"Tempo: {elapsed_seconds:.2f} segundos")

Tempo: 0.19 segundos


In [9]:
import time

# Amount of data used as the numerator of the throughput figure.
# NOTE(review): presumably the size of the input dataset in MB -- confirm and
# update this value if the dataset changes.
DATASET_SIZE_MB = 424

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; unlike time.time() it is not affected by system clock changes.
inicio = time.perf_counter()

# Release any cached copy left by a previous run of this cell, then mark the
# DataFrame for caching again.
df_top_words.unpersist()
df_top_words.persist()
# 7. Show the 5 most frequent words per emotion
df_top_words.select("emotion_lower", "palavra", "count", "rank") \
            .orderBy("emotion_lower", "rank") \
            .show(100, truncate=False)

fim = time.perf_counter()

duracao = fim - inicio
# Guard against a zero-length measurement so the report cannot raise
# ZeroDivisionError.
throughput = DATASET_SIZE_MB / duracao if duracao > 0 else float("inf")

print("\n--- Análise de Desempenho ---")
print(f"Tempo total de execução: {duracao:.2f} segundos")
print(f"Throughput da execução: {throughput:.2f} MB/s")
print("----------------------------")



+-------------+---------+------+----+
|emotion_lower|palavra  |count |rank|
+-------------+---------+------+----+
|anger        |nigga    |159591|1   |
|anger        |bitch    |149983|2   |
|anger        |fuck     |141322|3   |
|anger        |shit     |140379|4   |
|anger        |niggas   |120800|5   |
|fear         |scared   |9130  |1   |
|fear         |afraid   |9127  |2   |
|fear         |strange  |4833  |3   |
|fear         |worried  |2315  |4   |
|fear         |nervous  |2104  |5   |
|joy          |love     |311506|1   |
|joy          |see      |191572|2   |
|joy          |time     |184364|3   |
|joy          |never    |176742|4   |
|joy          |want     |173612|5   |
|love         |loving   |11990 |1   |
|love         |longing  |1920  |2   |
|love         |naughty  |1902  |3   |
|love         |loyal    |1737  |4   |
|love         |tender   |1723  |5   |
|sadness      |away     |91410 |1   |
|sadness      |heart    |83310 |2   |
|sadness      |alone    |62525 |3   |
|sadness    

                                                                                