# Pretrained Word2Vec, GloVe - Document vectors

Since the music lyrics dataset is not very large (~5k songs) and song lyrics are generally not domain-specific text, we will use pretrained models.

In [1]:
import numpy as np
import pandas as pd
import re
import gensim.downloader
from gensim.models import KeyedVectors

In [2]:
# Print the names of the pretrained models listed in gensim's downloader catalogue.
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [None]:
# Fetch the 'word2vec-google-news-300' vectors through gensim's downloader and
# save them to 'w2v_wv.vectors' so a later cell can reload them from disk.
w2v_news_model = gensim.downloader.load('word2vec-google-news-300')
w2v_news_model.save('w2v_wv.vectors')



In [None]:
# Quick sanity check: nearest neighbours of 'try' in the word2vec vectors.
w2v_news_model.most_similar('try')

In [None]:
# Fetch the 'glove-wiki-gigaword-300' vectors through gensim's downloader and
# save them to 'glove_wv.vectors' so a later cell can reload them from disk.
glove_wiki_model = gensim.downloader.load('glove-wiki-gigaword-300')
glove_wiki_model.save('glove_wv.vectors')



In [None]:
# Quick sanity check: nearest neighbours of 'try' in the GloVe wiki vectors.
glove_wiki_model.most_similar('try')

In [None]:
# Fetch the 'glove-twitter-200' vectors through gensim's downloader and
# save them to 'glove_twitter_wv.vectors' so a later cell can reload them from disk.
glove_twitter_model = gensim.downloader.load('glove-twitter-200')
glove_twitter_model.save('glove_twitter_wv.vectors')



In [None]:
# Reload the word2vec vectors from the 'w2v_wv.vectors' file; get_document_vector reads this name.
w2v_wv = KeyedVectors.load('w2v_wv.vectors')

In [None]:
# Reload the GloVe wiki vectors from the 'glove_wv.vectors' file; get_document_vector reads this name.
glove_wv = KeyedVectors.load('glove_wv.vectors')

In [None]:
# Reload the GloVe Twitter vectors from 'glove_twitter_wv.vectors'; get_document_vector_twitter reads this name.
twitter_wv = KeyedVectors.load('glove_twitter_wv.vectors')

In [16]:
# Look up the embedding of the word 'cover' to confirm the reloaded vectors work.
twitter_wv['cover']

array([-9.0110e-01,  6.7654e-02, -1.7844e-01, -3.0759e-02,  3.4019e-01,
       -1.4142e-01,  4.7423e-01, -4.8548e-01,  4.3065e-01, -1.3109e-01,
       -3.3585e-01, -2.7486e-03, -5.6753e-01, -5.7471e-01,  3.3655e-01,
       -3.4100e-01,  6.4352e-01, -3.5425e-01,  2.5474e-01,  1.6506e-01,
       -3.3035e-01, -1.9022e-01,  4.7724e-01,  3.5875e-01, -7.8454e-02,
       -1.5209e-01,  1.8321e-01,  6.4695e-01,  1.0719e-01, -6.1577e-01,
        9.2943e-01,  1.1258e-01, -1.2998e-01, -1.4972e-01,  1.3988e-01,
       -8.1164e-01,  6.4217e-01, -4.3808e-01, -1.2254e-01,  1.7814e-01,
       -5.5728e-02, -5.5712e-01, -1.3302e-01,  1.6337e-01, -2.3227e-01,
       -1.2997e-01, -1.8018e-01,  3.2479e-02, -5.6179e-01, -5.6761e-01,
        1.2848e-01,  4.1932e-01, -4.8579e-01,  6.2612e-01,  1.6807e-01,
       -2.6855e-01, -1.4158e-01, -1.6578e-01,  4.5544e-01,  1.1801e-01,
        3.2150e-01, -3.2492e-01, -1.4357e-01, -1.8942e-01,  2.1420e-01,
        1.8384e-01, -1.1262e-01, -1.3392e-01,  5.8480e-01, -5.15

## Prepare music lyrics

In [21]:
# Load the lyrics dataset; the first CSV column is used as the DataFrame index.
df = pd.read_csv("music_dataset_final_hopefully.csv", index_col=[0])

In [22]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,track,artist,number_of_emotion_tags,valence_tags,arousal_tags,dominance_tags,spotify_id,seeds_array,artist_name,track_name,...,Country,Opera,Movie,Children's Music,A Capella,lyrics,artist_track,genius_url,new_lyrics,sentiment
0,'Till I Collapse,Eminem,6,4.55,5.273125,5.690625,4xkOaSrkexMciUUogZKVTS,['aggressive'],Eminem,'Till I Collapse,...,0,0,0,0,0,[Intro: Eminem]\n'Cause sometimes you just fee...,Eminem 'Till I Collapse,https://genius.com/Eminem-till-i-collapse-lyrics,[Intro: Eminem]\n'Cause sometimes you just fee...,intense_aggressive
1,St. Anger,Metallica,8,3.71,5.833,5.42725,3fOc9x06lKJBhz435mInlH,['aggressive'],Metallica,St. Anger,...,0,0,0,0,0,[Verse]\nSaint Anger 'round my neck\nSaint Ang...,Metallica St. Anger,https://genius.com/Metallica-st-anger-lyrics,[Verse]\nSaint Anger 'round my neck\nSaint Ang...,intense_aggressive
2,Die MF Die,Dope,7,3.771176,5.348235,5.441765,5bU4KX47KqtDKKaLM4QCzh,['aggressive'],Dope,Die MF Die,...,0,0,0,0,0,[Intro]\nDie!\n[Verse 1]\nI don't need your fo...,Dope Die MF Die,https://genius.com/Dope-die-mf-die-lyrics,[Intro]\nDie!\n[Verse 1]\nI don't need your fo...,intense_aggressive
3,Step Up,Drowning Pool,9,2.971389,5.5375,4.726389,4Q1w4Ryyi8KNxxaFlOQClK,['aggressive'],Drowning Pool,Step Up,...,0,0,0,0,0,"[Intro]\nOne, two, three, go!\n[Verse 1]\nBrok...",Drowning Pool Step Up,https://genius.com/Drowning-pool-step-up-lyrics,"[Intro]\nOne, two, three, go!\n[Verse 1]\nBrok...",intense_aggressive
4,Feedback,Kanye West,1,3.08,5.87,5.49,49fT6owWuknekShh9utsjv,['aggressive'],Kanye West,Feedback,...,0,0,0,0,0,"[Chorus]\nAyy, ya heard about the good news?\n...",Kanye West Feedback,https://genius.com/Kanye-west-feedback-lyrics,"[Chorus]\nAyy, ya heard about the good news?\n...",intense_aggressive


In [23]:
# Scratch check of the section-tag regex on a hand-written sample line.
a = '[Intro: Eminem] Cause y  sometim like  y  you wanna  right, left [Verse 1: Eminem] Til I coin em and hock venom Adrenaline'
# Raw string: in a plain literal "\[" and "\]" are invalid escape sequences.
a = re.sub(r"\[[^\]]+\]", " ", a)
# Whitespace split already discards the empty tokens left by repeated spaces.
b = a.split()

In [24]:
# First-pass tokenisation: artist + title + lyrics, tags stripped, lower-cased.
lyrics_text = df['artist_name'] + ' ' + df['track_name'] + ' ' + df['lyrics']
df['processed_lyrics'] = (
    lyrics_text
    # Drop "[Verse 1]"-style section tags. Using .str keeps missing lyrics as
    # NaN instead of raising inside re.sub.
    .str.replace(r"\[[^\]]+\]", "", regex=True)
    .str.lower()
    # Substring replacement needs the .str accessor: a bare Series.replace only
    # matches whole cell values, so it would leave the punctuation in place.
    .str.replace(r"[\n\r(),!?]", " ", regex=True)
    # Whitespace split drops the empty tokens produced by repeated spaces.
    .str.split()
)

In [25]:
import nltk
# Fetch NLTK's stop-word corpus (the cell output shows it unpacked under /root/nltk_data).
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words as a set; transform_lyrics filters tokens against it.
stops = set(stopwords.words('english'))

def transform_lyrics(row):
    """Build one cleaned, stop-word-free lyrics string for a song row.

    Reads ``row['new_lyrics']``, ``row['artist']`` and ``row['track']``.
    Section tags in square brackets are stripped, repeated lines are dropped
    (first occurrence kept), the track title and artist name are prepended,
    and the text is lower-cased, normalised by an ordered list of regex
    substitutions, and filtered against the module-level ``stops`` set.

    Args:
        row: mapping-like row (e.g. a DataFrame row) holding the three
            string fields named above.

    Returns:
        str: the surviving tokens joined by single spaces.
    """
    lyrics = row['new_lyrics'].replace("\r", " ")
    lyrics = re.sub(r"\[([^\]]+)\]", "", lyrics)

    # dict.fromkeys de-duplicates while keeping first-seen order (a set would not).
    unique_lines = dict.fromkeys(lyrics.split('\n'))
    kept_lines = [line for line in unique_lines if line not in ('', ' ')]

    lyrics = (row['track'] + "  " + row['artist'] + " " + ' '.join(kept_lines)).lower()

    # Ordered (pattern, replacement) pairs; later rules rely on earlier ones.
    substitutions = (
        # '-' is escaped so "+-=" is three literals, not the range '+'..'='
        # (which also let ';' and '<' through). ':' is kept explicitly because
        # a later rule pads it with spaces.
        (r"[^A-Za-z0-9^,!./'+\-=:]", " "),
        (r"what's", "what is "),
        (r"'s", " "),
        (r"'ve", " have "),
        (r"can't", "cannot "),
        # Must run before the generic "n't" rule, otherwise "ain't" has already
        # become "ai not" and this rule can never match.
        (r"ain't", "am not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"'re", " are "),
        (r"'d", " would "),
        (r"'ll", " will "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"-", " - "),
        (r"=", " = "),
        (r"'", " "),
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # "80s" -> "80". The pattern r"\0s" would be NUL followed by 's' and
        # never matches real text.
        (r"0s\b", "0"),
        # Keep the surrounding spaces so neighbouring words are not glued on.
        (r" 9 11 ", " 911 "),
        (r"e - mail", "email"),
        # Word boundaries stop this from merging e.g. "raj king" into "rajking".
        (r"\bj k\b", "jk"),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        lyrics = re.sub(pattern, replacement, lyrics)

    return ' '.join(word for word in lyrics.split() if word not in stops)

# Clean every row with transform_lyrics, then split the cleaned string into tokens.
df['processed_lyrics'] = (
    df.apply(transform_lyrics, axis=1)
      .str.split(" ")
)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [26]:
# Spot check: number of tokens in the processed lyrics of the row at position 3652.
len(df['processed_lyrics'].iloc[3652])

150

## Word Vectors

In [27]:
def _mean_word_vector(tokens, wv, dim):
  """Average the vectors of `tokens` looked up in `wv`.

  Empty-string tokens are ignored. Words missing from `wv` add nothing to the
  sum but still count in the denominator (i.e. they act as zero vectors).
  Returns a zero vector of length `dim` when there is nothing to average.
  """
  words = [tok for tok in tokens if tok != ""]
  doc_vec = np.zeros(dim, dtype=float)
  if not words:
    # Avoid 0/0 -> NaN for rows without any usable token.
    return doc_vec

  for word in words:
    try:
      doc_vec += wv[word]
    except KeyError:
      # Out-of-vocabulary word: skip it, other errors are left to surface.
      continue

  return doc_vec / len(words)


def get_document_vector(row, gl = 0, wv = None):
  """Mean word vector of ``row['processed_lyrics']``.

  Args:
    row: mapping-like row whose 'processed_lyrics' entry is a list of tokens.
    gl: 1 selects the module-level ``glove_wv``, any other value ``w2v_wv``.
      Ignored when `wv` is given.
    wv: optional word -> vector mapping to use instead of the module-level
      models; lookups of unknown words must raise KeyError.

  Returns:
    1-D float array; its length is ``wv.vector_size`` when the mapping has
    that attribute, otherwise 300.
  """
  if wv is None:
    wv = glove_wv if gl == 1 else w2v_wv
  return _mean_word_vector(row['processed_lyrics'], wv, getattr(wv, 'vector_size', 300))


def get_document_vector_twitter(row, wv = None):
  """Mean word vector of ``row['processed_lyrics']`` using the Twitter vectors.

  Args:
    row: mapping-like row whose 'processed_lyrics' entry is a list of tokens.
    wv: optional word -> vector mapping; defaults to the module-level
      ``twitter_wv``. Lookups of unknown words must raise KeyError.

  Returns:
    1-D float array; its length is ``wv.vector_size`` when the mapping has
    that attribute, otherwise 200.
  """
  if wv is None:
    wv = twitter_wv
  return _mean_word_vector(row['processed_lyrics'], wv, getattr(wv, 'vector_size', 200))

In [28]:
# One averaged document vector per song for each pretrained model; extra
# keyword arguments given to DataFrame.apply are forwarded to the function.
df['lyrics_glove'] = df.apply(get_document_vector, axis=1, gl=1)
df['lyrics_w2v'] = df.apply(get_document_vector, axis=1, gl=0)
df['lyrics_glove_twitter'] = df.apply(get_document_vector_twitter, axis=1)

In [29]:
# Preview the frame again, now with the token list and the three embedding columns.
df.head()

Unnamed: 0,track,artist,number_of_emotion_tags,valence_tags,arousal_tags,dominance_tags,spotify_id,seeds_array,artist_name,track_name,...,A Capella,lyrics,artist_track,genius_url,new_lyrics,sentiment,processed_lyrics,lyrics_glove,lyrics_w2v,lyrics_glove_twitter
0,'Till I Collapse,Eminem,6,4.55,5.273125,5.690625,4xkOaSrkexMciUUogZKVTS,['aggressive'],Eminem,'Till I Collapse,...,0,[Intro: Eminem]\n'Cause sometimes you just fee...,Eminem 'Till I Collapse,https://genius.com/Eminem-till-i-collapse-lyrics,[Intro: Eminem]\n'Cause sometimes you just fee...,intense_aggressive,"[till, collapse, eminem, cause, sometimes, fee...","[0.007035506120882928, 0.007414329536550213, -...","[0.05652761459350586, 0.022828451792399087, 0....","[-0.05907789172510497, 0.11155135691709196, 0...."
1,St. Anger,Metallica,8,3.71,5.833,5.42725,3fOc9x06lKJBhz435mInlH,['aggressive'],Metallica,St. Anger,...,0,[Verse]\nSaint Anger 'round my neck\nSaint Ang...,Metallica St. Anger,https://genius.com/Metallica-st-anger-lyrics,[Verse]\nSaint Anger 'round my neck\nSaint Ang...,intense_aggressive,"[st, anger, metallica, saint, anger, round, ne...","[-0.018360453107478945, -0.09990968379530717, ...","[0.09270500414299243, 0.03882668235085227, 0.0...","[0.09798608987203448, 0.2460496624275534, 0.00..."
2,Die MF Die,Dope,7,3.771176,5.348235,5.441765,5bU4KX47KqtDKKaLM4QCzh,['aggressive'],Dope,Die MF Die,...,0,[Intro]\nDie!\n[Verse 1]\nI don't need your fo...,Dope Die MF Die,https://genius.com/Dope-die-mf-die-lyrics,[Intro]\nDie!\n[Verse 1]\nI don't need your fo...,intense_aggressive,"[die, mf, die, dope, die, !, need, forgiveness...","[-0.061807930793451225, -0.11636447740475768, ...","[0.06827423537986865, 0.030379585597826088, 0....","[0.0657521746782721, 0.14336622042068536, -0.1..."
3,Step Up,Drowning Pool,9,2.971389,5.5375,4.726389,4Q1w4Ryyi8KNxxaFlOQClK,['aggressive'],Drowning Pool,Step Up,...,0,"[Intro]\nOne, two, three, go!\n[Verse 1]\nBrok...",Drowning Pool Step Up,https://genius.com/Drowning-pool-step-up-lyrics,"[Intro]\nOne, two, three, go!\n[Verse 1]\nBrok...",intense_aggressive,"[step, drowning, pool, one, two, three, go, !,...","[0.04024819198499929, 0.06810907959067203, -0....","[0.01848656790597098, 0.020073828759131492, 0....","[0.03802624201498829, -0.004990142806396856, 0..."
4,Feedback,Kanye West,1,3.08,5.87,5.49,49fT6owWuknekShh9utsjv,['aggressive'],Kanye West,Feedback,...,0,"[Chorus]\nAyy, ya heard about the good news?\n...",Kanye West Feedback,https://genius.com/Kanye-west-feedback-lyrics,"[Chorus]\nAyy, ya heard about the good news?\n...",intense_aggressive,"[feedback, kanye, west, ayy, ya, heard, good, ...","[-0.02204457062095055, 0.016007030318758032, -...","[0.036414252387152776, 0.02762536835252193, 0....","[-0.00887784880874624, 0.16099679390681984, -0..."


In [30]:
# Write the frame, including the token and embedding columns, to CSV.
# NOTE(review): list/array-valued cells are presumably stored as their text
# representation in the CSV — confirm downstream readers can parse them back.
df.to_csv('word_vector_embeddings_final.csv')