In [0]:
%spark.pyspark
# Location of the lyrics dataset (stored in Parquet format).
file_path = '/user/tw2770_nyu_edu/final-project/lyrics_final.parquet'

# Read the Parquet data into a DataFrame and preview its first rows.
df = spark.read.format("parquet").load(file_path)
df.show()

In [1]:
%spark.pyspark
# Print the column names and data types of the loaded DataFrame.
df.printSchema()

# Top Keywords

### Remove punctuation from the `lyrics` column

In [4]:
%spark.pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, NGram
from pyspark.sql.functions import explode, col, lower, regexp_replace

# Lowercase the lyrics and strip punctuation.
#
# Spark evaluates the pattern with Java regex, where `\w` matches only ASCII
# [a-zA-Z0-9_].  The earlier pattern `[^\w\s]` therefore also deleted accented
# and non-Latin letters (e.g. "corazón" became "corazn").  Unicode property
# classes keep letters (\p{L}), combining marks (\p{M}) and numbers (\p{N})
# from every script, plus the underscore that `\w` used to keep; whitespace
# handling is unchanged.
df = df.withColumn(
    "lyrics",
    regexp_replace(lower(col("lyrics")), r'[^\p{L}\p{M}\p{N}_\s]', ''),
)

In [5]:
%spark.pyspark
# Re-print the schema after the `lyrics` column has been rewritten.
df.printSchema()

In [6]:
%spark.pyspark
# Preview the rows to inspect the lowercased, punctuation-free `lyrics` column.
df.show()

### Split `lyrics` column into tokens

In [8]:
%spark.pyspark
# Split each `lyrics` string into a list of word tokens stored in `words`.
tokenizer = Tokenizer(
    inputCol="lyrics",
    outputCol="words",
)
tokenized = tokenizer.transform(df)
# Persist the result because the unigram, bigram and trigram sections
# each start from this same DataFrame.
words_data = tokenized.persist()

In [9]:
%spark.pyspark
# Preview the tokenized data, including the new `words` column.
words_data.show()

## Unigram

In [11]:
%spark.pyspark
from pyspark.ml.feature import StopWordsRemover, Tokenizer
from pyspark.sql.functions import col, explode, concat_ws


# Spark's built-in English stop-word list.
default_stop_words = StopWordsRemover.loadDefaultStopWords("english")

# Additional tokens to exclude from the unigram ranking.
# Spark's Tokenizer splits on whitespace, so a token can never contain
# spaces and the "   " entry never matches anything; runs of consecutive
# whitespace produce empty-string tokens instead.  "" is therefore listed
# explicitly so those blank tokens are filtered out ("   " is kept only for
# backward compatibility).
unigram_stop_words = [
    "", "   ",
    'im', 'dont', 'know', 'got', 'get', 'time', 'one', 'cant', 'see', 'de',
    'way', 'take', 'come', 'aint', 'youre', 'la', 'ill', 'that', 'think',
    'let', 'man', 'que', 'back', 'thats', 'feel', 'cause', 'still', 'day',
    'away', 'always', 'ive', 'people', 'going', 'said', 'keep', 'niggas',
    'fucking', 'nigga',
]

# Full stop-word list for the unigram analysis: defaults + custom additions.
extend_uni_stop_words = default_stop_words + unigram_stop_words

In [12]:
%spark.pyspark
# Show the default English stop-word list loaded from StopWordsRemover.
print(default_stop_words)

In [13]:
%spark.pyspark
# Show the combined (default + custom) stop-word list used for unigrams.
print(extend_uni_stop_words)

In [14]:
%spark.pyspark
# Filter the default and custom unigram stop words out of `words`,
# writing the surviving tokens to `filtered_words`.
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
    stopWords=extend_uni_stop_words,
)
unigram_data = remover.transform(words_data)

In [15]:
%spark.pyspark
# Preview the rows with the `filtered_words` column added.
unigram_data.show()

In [16]:
%spark.pyspark
# Unigram frequency table.
# Step 1: one output row per token remaining in `filtered_words`.
unigrams_flattened = unigram_data.select(
    explode(col("filtered_words")).alias("unigram")
)
# Step 2: number of occurrences of each distinct token.
unigram_counts = unigrams_flattened.groupBy("unigram").count()
# Step 3: most frequent first, then display the top 100.
unigram_counts_sorted = unigram_counts.orderBy("count", ascending=False)
unigram_counts_sorted.show(100)

In [17]:
%spark.pyspark

# Hand-picked unigram keywords.
# 'best' was previously listed twice, which put a duplicate entry into the
# combined `keywords` list and inflated its length; each word now appears once.
uni_words = [
    'like', 'life', 'love', 'yeah', 'shit', 'fuck', 'bitch', 'world',
    'mind', 'heart', 'girl', 'die', 'money', 'baby', 'god', 'leave',
    'best', 'alone', 'pain', 'stay', 'night',
]

## Bigram

In [19]:
%spark.pyspark

# Additional tokens to exclude before building bigrams.
# Spark's Tokenizer splits on whitespace, so a token can never contain
# spaces and the "   " entry never matches anything; runs of consecutive
# whitespace produce empty-string tokens instead.  "" is therefore listed
# explicitly so blank tokens do not end up inside bigrams ("   " is kept
# only for backward compatibility).
bigram_stop_words = [
    "", "   ",
    'im', 'dont', 'know', 'got', 'get', 'time', 'one', 'cant', 'see', 'de',
    'way', 'take', 'come', 'aint', 'youre', 'la', 'ill', 'that', 'think',
    'let', 'man', 'que', 'back', 'thats', 'feel', 'cause', 'still', 'day',
    'away', 'always', 'ive', 'people', 'going', 'said', 'keep', 'niggas',
    'fucking', 'interview', 'nigga', 'looks', 'look',
]

# Full stop-word list for the bigram analysis: defaults + custom additions.
extend_bi_stop_words = default_stop_words + bigram_stop_words

In [20]:
%spark.pyspark
# Re-display the tokenized data that the bigram stop-word filter is applied to.
words_data.show()

In [21]:
%spark.pyspark
# Filter the default and custom bigram stop words out of `words`,
# writing the surviving tokens to `filtered_words`.
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
    stopWords=extend_bi_stop_words,
)
bigrams_data = remover.transform(words_data)

In [22]:
%spark.pyspark
# Preview the stop-word-filtered tokens that the bigrams will be built from.
bigrams_data.show()

In [23]:
%spark.pyspark
# Build bigrams from consecutive entries of `filtered_words`.
# Stop words were removed beforehand, so the two words of a bigram are not
# necessarily adjacent in the original lyrics.
bigram = NGram(
    n=2,
    inputCol="filtered_words",
    outputCol="bigrams",
)
bigram_data = bigram.transform(bigrams_data)

In [24]:
%spark.pyspark
# Preview the rows with the generated `bigrams` column.
bigram_data.show()

In [25]:
%spark.pyspark
# Bigram frequency table.
# Step 1: flatten the `bigrams` arrays into one row per bigram.
bigrams_flattened = bigram_data.select(
    explode(col("bigrams")).alias("bigram")
)
# Step 2: number of occurrences of each distinct bigram.
bigram_counts = bigrams_flattened.groupBy("bigram").count()
# Step 3: order from most to least frequent.
bigram_counts_sorted = bigram_counts.orderBy("count", ascending=False)



In [26]:
%spark.pyspark
# Display the 100 most frequent bigrams.
bigram_counts_sorted.show(100)

In [27]:
%spark.pyspark
# Hand-picked bigram keywords.
bi_words = [
    'every night',
    'need loving',
    'nothing left',
    'yo bitch',
    'say goodbye',
    'best friend',
    'never ever',
    'really want',
    'new york',
    'brand new',
    'hip hop',
    'say love',
    'leave alone',
    'never stop',
    'never forget',
    'years ago',
]

## Trigram

In [29]:
%spark.pyspark

# Additional tokens to exclude before building trigrams.
# Spark's Tokenizer splits on whitespace, so a token can never contain
# spaces and the "   " entry never matches anything; runs of consecutive
# whitespace produce empty-string tokens instead.  "" is therefore listed
# explicitly so blank tokens do not end up inside trigrams ("   " is kept
# only for backward compatibility).
trigram_stop_words = [
    "", "   ",
    'im', 'dont', 'know', 'got', 'get', 'time', 'one', 'cant', 'see', 'de',
    'way', 'take', 'come', 'aint', 'youre', 'la', 'ill', 'that', 'think',
    'let', 'man', 'que', 'back', 'thats', 'feel', 'cause', 'still', 'day',
    'away', 'always', 'ive', 'people', 'going', 'said', 'keep', 'niggas',
    'fucking', 'interview', 'nigga', 'looks', 'look',
]

# Full stop-word list for the trigram analysis: defaults + custom additions.
extend_tri_stop_words = default_stop_words + trigram_stop_words

In [30]:
%spark.pyspark

# Filter the default and custom trigram stop words out of `words`,
# writing the surviving tokens to `filtered_words`.
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
    stopWords=extend_tri_stop_words,
)
trigrams_data = remover.transform(words_data)

In [31]:
%spark.pyspark
# Preview the stop-word-filtered tokens that the trigrams will be built from.
trigrams_data.show()

In [32]:
%spark.pyspark
# Build trigrams from consecutive entries of `filtered_words`.
# Stop words were removed beforehand, so the three words of a trigram are not
# necessarily adjacent in the original lyrics.
trigram = NGram(
    n=3,
    inputCol="filtered_words",
    outputCol="trigrams",
)
trigram_data = trigram.transform(trigrams_data)

In [33]:
%spark.pyspark
# Preview the rows with the generated `trigrams` column.
trigram_data.show()

In [34]:
%spark.pyspark
# Trigram frequency table.
# Step 1: flatten the `trigrams` arrays into one row per trigram.
trigrams_flattened = trigram_data.select(
    explode(col("trigrams")).alias("trigram")
)
# Step 2: number of occurrences of each distinct trigram.
trigram_counts = trigrams_flattened.groupBy("trigram").count()
# Step 3: order from most to least frequent.
trigram_counts_sorted = trigram_counts.orderBy("count", ascending=False)

In [35]:
%spark.pyspark
# Display the 100 most frequent trigrams.
trigram_counts_sorted.show(100)

In [36]:
%spark.pyspark
# Hand-picked trigram keywords.
tri_words = [
    'juicy dead girls',
    'ay ay ay',
    'oh oh oh',
    'na na na',
]

## Top Keywords
1. Emotion and Relationships: like, life, love, yeah, mind, heart, girl, die, baby, god, leave, best friend, say goodbye, really want, say love, leave alone, never stop, never forget
2. Hardship and Negativity: shit, fuck, bitch, world, pain, alone, nothing left, yo bitch, years ago
3. Lifestyle and Culture: money, stay, night, every night, need loving, new york, brand new, hip hop, juicy dead girls, ay ay ay, oh oh oh, na na na


In [38]:
%spark.pyspark

# Combine the selected unigrams, bigrams and trigrams (in that order)
# into a single keyword list and print it.
keywords = [*uni_words, *bi_words, *tri_words]
print(keywords)

In [39]:
%spark.pyspark
# Total number of entries in the combined keyword list.
len(keywords)

In [40]:
%spark.pyspark
