In [0]:
%pyspark

# Directory holding the parquet part files with the emotion data.
part_file_dir = '/user/tw2770_nyu_edu/final-project/chunk-merged'

# Generic reader spelling of spark.read.parquet(...): same source, same result.
emotion = spark.read.format("parquet").load(part_file_dir)

# Last expression of the cell, so the row count is the cell's output.
emotion.count()

In [1]:
%pyspark

# Preview the loaded data (first 20 rows, long cell values truncated).
emotion.show(n=20, truncate=True)

## Find the emotion with the highest score

In [3]:
%pyspark
from pyspark import StorageLevel
from pyspark.sql import Window, SparkSession
from pyspark.sql.functions import udf, col, regexp_replace, lower, greatest, ntile, when
from pyspark.sql.types import StringType, ArrayType, StructType, StructField, FloatType


# Score columns, in the priority order used to resolve ties: the first column
# whose value equals the row maximum provides the label.
emotion_labels = ["sadness", "joy", "love", "anger", "fear", "surprise"]

# Row-wise maximum across the six score columns.
top_score = greatest(*emotion_labels)

# Build the when(...).when(...)... chain one label at a time.
label_expr = None
for label in emotion_labels:
    is_top = col("max_score") == col(label)
    if label_expr is None:
        label_expr = when(is_top, label)
    else:
        label_expr = label_expr.when(is_top, label)

# Rows where no comparison holds fall through to "unknown".
label_expr = label_expr.otherwise("unknown")

emotion = emotion.withColumn("max_score", top_score)
emotion = emotion.withColumn("emotion", label_expr)

In [4]:
%pyspark

# Preview the data with the new `max_score` and `emotion` columns.
emotion.show(n=20, truncate=True)

## Drop unused columns

In [6]:
%pyspark

# The six per-emotion scores and the helper `max_score` column are dropped;
# the derived `emotion` label is kept.
columns_to_drop = ["sadness", "joy", "love", "anger", "fear", "surprise"] + ["max_score"]

trimmed = emotion.drop(*columns_to_drop)

# Mark the trimmed frame for caching, since later cells reuse it.
emotion = trimmed.persist()

emotion.show(n=20, truncate=True)

## Calculate the distribution among the emotions

In [8]:
%pyspark

# Group by 'emotion' and count occurrences
emotion_counts = emotion.groupBy("emotion").count()
emotion_counts.show()

In [9]:
%pyspark

# Spark's functions are imported under the `F` namespace rather than with
# `from pyspark.sql.functions import round`, which would rebind Python's
# built-in round() for every cell that runs afterwards.
from pyspark.sql import functions as F
from pyspark.sql.functions import col

# Total number of records: the denominator for each emotion's share.
total = emotion.count()

# Add a 'percentage' column (share of all records, rounded to 2 decimals)
# and list the most frequent emotion first.
emotion_distribution = emotion_counts.withColumn(
    "percentage",
    F.round((col("count") / total) * 100, 2)
).orderBy(col("count").desc())

# Show the distribution with percentages
emotion_distribution.show()

In [10]:
%pyspark

# Parquet dataset that is joined with the emotion labels in a later cell.
df_path = '/user/tw2770_nyu_edu/final-project/lyrics_partitioned.parquet'

# Generic reader spelling of spark.read.parquet(...).
df = spark.read.format("parquet").load(df_path)

df.show(n=20, truncate=True)


## Merge the `emotion` data with the original dataset on the `id` column

In [12]:
%pyspark
from pyspark.sql.functions import broadcast

# Inner join of the emotion labels with the lyrics dataset on `id`.
#
# `df` is the wide side of this join (later cells read and drop columns such
# as `lyrics`, `lyrics_cleaned` and `features` from the joined frame), so it is
# NOT wrapped in broadcast(): a forced broadcast ships the whole table to the
# driver and to every executor, which is only appropriate for a small table.
# Without the hint Spark picks the join strategy from its own size estimates.
# NOTE(review): if `emotion` is known to be small, hinting broadcast(emotion)
# instead would be the right way to request a broadcast join - confirm sizes.
df_merged = emotion.join(df, on="id", how="inner").persist()
df_merged.show()

In [13]:
%pyspark

# Columns of the joined frame that the remaining cells do not use.
columns_to_drop = ["year", "features", "language", "lyrics_cleaned", "views_partition"]

final_df = df_merged.drop(*columns_to_drop)
final_df = final_df.persist()

final_df.show(n=20, truncate=True)

## Create a `rating` column from 1 to 10 based on the decile of the `views` column

In [15]:
%pyspark
from pyspark.sql import Window
# Imported here as well so the cell does not depend on an import made in
# another cell.
from pyspark.sql.functions import ntile

# Rank rows by `views` and split them into 10 equally sized buckets
# (1 = fewest views, 10 = most views).
#
# `id` is added as a secondary sort key: with `views` alone, rows that share
# the same view count have no defined relative order, so the bucket they land
# in could differ from one evaluation of the plan to the next.
#
# NOTE: a window without partitionBy moves all rows into a single partition
# for the ranking step; Spark logs a warning about this.
# NOTE(review): duplicate rows are only removed in a later cell, so they are
# still counted when the buckets are formed here - confirm this is intended.
window_spec = Window.orderBy("views", "id")
final_df = final_df.withColumn("rating", ntile(10).over(window_spec))

final_df.show()

In [16]:
%pyspark

# `views` has been turned into `rating`, so the raw column is removed.
without_views = final_df.drop("views")
final_df = without_views.persist()

final_df.show(n=20, truncate=True)

In [17]:
%pyspark
# Keep a single row per (id, title, tag) combination.
dedup_key = ["id", "title", "tag"]
final_df = final_df.dropDuplicates(subset=dedup_key)

n_unique = final_df.count()
print("Number of unique songs: ", n_unique)

final_df.show(n=20)

## Partition By Top Keywords

In [19]:
%pyspark
from pyspark.sql.functions import col, array, when, lit, explode, expr, lower

# Step 1: Define the list of keywords (all lower-case; some are multi-word phrases)
keywords = [
    'like', 'life', 'love', 'yeah', 'shit', 'fuck', 'bitch', 'world', 'mind', 'heart', 'girl', 'die', 'money', 'baby', 'god', 'leave', 'best', 'alone', 'pain', 'stay', 'night', 'every night', 'need loving', 'nothing left', 'yo bitch', 'say goodbye', 'best friend', 'never ever', 'really want', 'new york', 'brand new', 'hip hop', 'say love', 'leave alone', 'never stop', 'never forget', 'years ago', 'juicy dead girls', 'ay ay ay', 'oh oh oh', 'na na na'
]

# Step 2: Create a column containing an array of matched keywords
# (one slot per keyword: the keyword itself on a match, NULL otherwise).
#
# The keywords are lower-case, so the lyrics are lower-cased before matching;
# a case-sensitive test would miss "Love", "New York", and so on.
# NOTE(review): contains() is a substring test, so 'die' also matches "died"
# and 'god' matches "goddess" - confirm whether whole-word matching is wanted.
lyrics_lower = lower(col("lyrics"))
keywords_df = final_df.withColumn(
    "matched_keywords",
    array(*[when(lyrics_lower.contains(keyword), lit(keyword)) for keyword in keywords])
)

# Step 3: Remove nulls from the `matched_keywords` array
keywords_df = keywords_df.withColumn(
    "matched_keywords",
    expr("filter(matched_keywords, x -> x IS NOT NULL)")
)

print("Count of rows before exploding by keywords", keywords_df.count())

# Step 4: Explode the `matched_keywords` column to create one row per keyword.
# explode() emits no row for an empty array, so songs without any matching
# keyword are dropped at this point.
keywords_df = keywords_df.withColumn("keyword", explode(col("matched_keywords")))

print("Count of rows after exploding by keywords", keywords_df.count())
# Column name spelled `title`, matching its spelling in the other cells.
print("Number of unique songs with lyrics containing top keywords", keywords_df.select("title").distinct().count())
keywords_df.show(10)

In [20]:
%pyspark
# Drop duplicates and only select the needed columns
result_df = keywords_df.dropDuplicates(["id", "rating", "emotion", "keyword"])
result_df = result_df.select("title", "tag", "artist", "rating", "emotion", "keyword")
print("Count of rows after droping duplicates:", result_df.count())
# The label now describes this dataset (songs and lyrics); it previously said
# "books with review". The number is the count of distinct `title` values.
print("Number of unique songs with lyrics containing top keywords", result_df.select("title").distinct().count())
result_df.show(20)

In [21]:
%pyspark

# Destination directory for the final dataset.
output_dir = '/user/tw2770_nyu_edu/final-project/lyrics-emotion-keyword-rating'

# One sub-directory level per partition column, nested in this order.
writer = result_df.write.partitionBy("emotion", "keyword", "rating")

# Replace any existing output and compress the files with snappy.
writer = writer.mode("overwrite").option("compression", "snappy")

writer.format("parquet").save(output_dir)