In [0]:
%pyspark

# Path of the Parquet dataset that is loaded into `df` below.
file_path = '/user/tw2770_nyu_edu/final-project/lyrics-emotion-rating'

# `spark` is not defined in this notebook; presumably it is the session
# object injected by the %pyspark interpreter.
df = spark.read.format("parquet").load(file_path)

# Preview the first 20 rows.
df.show(n=20)

## Calculate the distribution of `rating` column

In [2]:
%pyspark

# Number of rows for each distinct `rating` value.
#
# GroupedData.count() is used instead of agg(count("*")): the `count`
# function is only imported in a later cell, so the previous form raised a
# NameError when the notebook was run top to bottom on a fresh interpreter.
# The resulting column is still named "count".
rating_distribution = df.groupBy("rating").count()

# Show the result, ordered by rating
rating_distribution.orderBy("rating").show()

## Calculate the distribution among the emotions

In [4]:
%pyspark

from pyspark.sql.functions import col, round, avg, min, max, count, row_number
from pyspark.sql import Window

# Rows per emotion; the result has the columns `emotion` and `count`.
emotion_counts = df.groupBy("emotion").count()

# Total number of records, used as the denominator below
total = df.count()

# Share of each emotion as a percentage of all rows (rounded to 2 decimals),
# with the most frequent emotion first
emotion_distribution = (
    emotion_counts
    .withColumn("percentage", round(col("count") / total * 100, 2))
    .orderBy("count", ascending=False)
)

# Show the distribution with percentages
emotion_distribution.show()

## Analyze the relationship between `emotion` and `rating`

In [6]:
%pyspark

# Average, minimum and maximum `rating` for each emotion.
# Note: avg/min/max here are the pyspark.sql.functions versions imported in
# an earlier cell, not the Python builtins.
emotion_rating_stats = (
    df.groupBy("emotion")
    .agg(
        avg(col("rating")).alias("average"),
        min(col("rating")).alias("min"),
        max(col("rating")).alias("max"),
    )
)

emotion_rating_stats.show()

In [7]:
%pyspark

# Count the rows for every (rating, emotion) pair, sorted by rating and then
# by emotion
emotion_rating_distribution = (
    df.groupBy("rating", "emotion")
    .count()
    .sort("rating", "emotion")
)

# Print up to 60 rows
emotion_rating_distribution.show(n=60)

In [8]:
%pyspark

# Path of the Parquet dataset that is loaded into `keywords_df` below.
keyword_path = '/user/tw2770_nyu_edu/final-project/lyrics-emotion-keyword-rating'
keywords_df = spark.read.format("parquet").load(keyword_path)

# Preview up to 100 rows.
keywords_df.show(n=100)

## Calculate the number of occurrences of each keyword

In [10]:
%pyspark
# Number of rows per keyword, stored in a column named `total_count`.
keyword_count = (
    keywords_df.groupBy("keyword")
    .count()
    .withColumnRenamed("count", "total_count")
)

# Most frequent keywords first
keyword_count.orderBy("total_count", ascending=False).show()

## Top 10 keywords

In [12]:
%pyspark
# The ten most frequent keywords, as a single-column (`keyword`) DataFrame.
#
# `keyword` is used as a secondary sort key so that ties on `total_count`
# are broken deterministically. This DataFrame is lazy and is evaluated more
# than once (the show() below and the join in a later cell); without a
# tie-breaker, keywords with equal counts at the cut-off could yield a
# different top 10 on each evaluation.
top_10_keywords = (
    keyword_count
    .orderBy(col("total_count").desc(), col("keyword").asc())
    .limit(10)
    .drop("total_count")
)

top_10_keywords.show()

## Calculate the distribution among the emotions for each keyword

In [14]:
%pyspark

# Rows per (keyword, emotion) pair; the count column is named `count`.
keyword_emotion_count = keywords_df.groupBy("keyword", "emotion").count()

# For each keyword, list its emotions from most to least frequent
keyword_emotion_count.sort("keyword", col("count").desc()).show()

## Show the distribution among the emotions of the top 10 keywords

In [16]:
%pyspark

# Filter keyword_emotion_count for only the top 10 keywords
top_10_keyword_emotion_count = keyword_emotion_count.join(
    top_10_keywords, "keyword", "inner"
)

# Show the result
top_10_keyword_emotion_count.orderBy("keyword").show(60)

## Calculate the keyword-rating matrix


In [18]:
%pyspark
from pyspark.sql.functions import col, count, sum, round

# Group by keyword and rating to count occurrences
keyword_rating_counts = keywords_df.groupBy("keyword", "rating").agg(count("*").alias("count"))

# Sum counts by keyword to calculate total counts per keyword
keyword_totals = keyword_rating_counts.groupBy("keyword").agg(sum("count").alias("total_count"))

# Join keyword_rating_counts with keyword_totals to calculate percentages
keyword_rating_percentages = keyword_rating_counts.join(
    keyword_totals, on="keyword"
).withColumn(
    "percentage", round((col("count") / col("total_count")), 2)
)

# Pivot table to create matrix
keyword_rating_matrix = keyword_rating_percentages.groupBy("keyword").pivot("rating").agg(
    sum("percentage")
).fillna(0)

# Show the resulting matrix
keyword_rating_matrix.show(45)


## Calculate the keyword-genre matrix


In [20]:
%pyspark
# Occurrences of every (keyword, tag) pair
keyword_tag_counts = keywords_df.groupBy("keyword", "tag").count()

# Total occurrences per keyword, obtained by adding up the per-tag counts.
# NOTE: this rebinds `keyword_totals`, which the keyword-rating cell also
# assigns.
keyword_totals = (
    keyword_tag_counts
    .groupBy("keyword")
    .agg(sum(col("count")).alias("total_count"))
)

# Share of a keyword's occurrences that falls on each tag.
# NOTE: despite the column name, the value is a fraction (count / total,
# rounded to two decimals) and is not multiplied by 100.
keyword_tag_percentages = (
    keyword_tag_counts
    .join(keyword_totals, "keyword")
    .withColumn("percentage", round(col("count") / col("total_count"), 2))
)

# Matrix with one row per keyword and one column per tag value;
# keyword/tag combinations without any rows are filled with 0
keyword_tag_matrix = (
    keyword_tag_percentages
    .groupBy("keyword")
    .pivot("tag")
    .agg(sum("percentage"))
    .fillna(0)
)

# Show the resulting matrix (up to 45 rows)
keyword_tag_matrix.show(n=45)