### Lyrics Analysis

In [93]:
# import required packages

# data manipulation
import re
import numpy as np
import pandas as pd
# ignore chain assignment warning 
pd.options.mode.chained_assignment = None

# lyrics retrieval
genius_token = 'JhNas_g4xHymyA1mzCmZ_alk-MpiOVrfihxvlQ4NYjOe2e_XpBRZB7dCgHG3PSij'
import lyricsgenius

# nltk- text tokenization
import nltk
from nltk.tokenize import sent_tokenize

# nltk- vader
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy

# nrc- emotion lexicon scores
from nrclex import NRCLex

# bert- transformer pipelines
from transformers import pipeline

# spotify
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# database
from sqlalchemy import create_engine
import psycopg2 
import io

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/macarthur/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [97]:
# create new object from lyricsgenius
genius = lyricsgenius.Genius(genius_token)

In [141]:
# testing nrc lexicon
t_example = NRCLex('Because the greatest love of all is happening to me. I found the greatest love of all inside of me')
aff_dict_example = t_example.affect_frequencies
try: 
    aff_dict_example['anticip'] = aff_dict_example['anticipation']
except: 
    pass
aff_dict_example.pop('anticipation', None)
aff_dict_example

{'fear': 0.18181818181818182,
 'anger': 0.18181818181818182,
 'anticip': 0.0,
 'trust': 0.09090909090909091,
 'surprise': 0.0,
 'positive': 0.0,
 'negative': 0.18181818181818182,
 'sadness': 0.18181818181818182,
 'disgust': 0.18181818181818182,
 'joy': 0.0}

In [188]:
# spotify api setup and testing
# TERMINAL COMMANDS FOR SPOTIPY AUTH
# create env variables in conda base
# conda env config vars set SPOTIPY_CLIENT_ID='4469d6dd1a6748fb9284c88eb8bb8d31'
# conda env config vars set SPOTIPY_CLIENT_SECRET='9c58dc24df58489d847452384633b64b'
# conda env config vars set SPOTIPY_REDIRECT_URI='http://google.com/'
# list out the variables
# conda env config vars list

# initiate spotipy- spotify api python wrapper
auth_manager = SpotifyClientCredentials()
spot = spotipy.Spotify(auth_manager=auth_manager)

#testing- artist info
urn = 'spotify:artist:3jOstUTkEu2JkjvRdBA5Gu'
artist = spot.artist(urn)
print(artist)

{'external_urls': {'spotify': 'https://open.spotify.com/artist/3jOstUTkEu2JkjvRdBA5Gu'}, 'followers': {'href': None, 'total': 2454438}, 'genres': ['alternative rock', 'modern power pop', 'modern rock', 'permanent wave', 'rock'], 'href': 'https://api.spotify.com/v1/artists/3jOstUTkEu2JkjvRdBA5Gu', 'id': '3jOstUTkEu2JkjvRdBA5Gu', 'images': [{'height': 640, 'url': 'https://i.scdn.co/image/ab6761610000e5ebef2e09aeedd0d8f842d1a690', 'width': 640}, {'height': 320, 'url': 'https://i.scdn.co/image/ab67616100005174ef2e09aeedd0d8f842d1a690', 'width': 320}, {'height': 160, 'url': 'https://i.scdn.co/image/ab6761610000f178ef2e09aeedd0d8f842d1a690', 'width': 160}], 'name': 'Weezer', 'popularity': 75, 'type': 'artist', 'uri': 'spotify:artist:3jOstUTkEu2JkjvRdBA5Gu'}


In [95]:
# Initialize Transformer- BERT Classifier
bert_cls = pipeline("text-classification",model= 'bhadresh-savani/albert-base-v2-emotion' , return_all_scores=True)
albert_column_names = ["sadness", "joy", "love", "anger", "fear", "surprise"]
albert_test_pred_raw = bert_cls("sunday morning rain is falling")
albert_test_pred_raw = albert_test_pred_raw[0]
albert_test_score_l = []
for score_label in albert_test_pred_raw:
    score = score_label['score']
    albert_test_score_l.append(score)
albert_test_score_l

[0.08749508857727051,
 0.044749047607183456,
 0.00836945790797472,
 0.3867079019546509,
 0.4656958281993866,
 0.006982794962823391]

In [100]:
import billboard
chart = billboard.ChartData('hot-100')
# print(chart)

In [96]:
# simulate getting data from json
import json
with open("example_songs.json") as jfile:
    jdata = jfile.read()
    j_ex = json.loads(jdata)
dfj = pd.DataFrame(j_ex)

In [164]:
sdf = pd.DataFrame()
# raw lyrics data
lyrics_raw = []
# cleaned lyrics, each song's lyrics as a list of strings
lyrics_lines = []
# cleaned lyrics, each song's lyrics as one string
lyrics_string = []
# song titles
titles = []
# song artists
artists = []
# isrc numbers
isrcs = []


for i in range(len(j_ex['data'])):
    n = j_ex['data'][i]['attributes']['name']
    #shorten search query in genius due to limitation 
    n_short = re.sub(r'\(.*', '', n)
    a = j_ex['data'][i]['attributes']['artistName']
    isrc = j_ex['data'][i]['attributes']['isrc']
    try:
        song = genius.search_song(n_short, a)
        ly = song.lyrics
    except:
        ly = None
    lyrics_raw.append(ly)
    titles.append(n)
    artists.append(a)
    isrcs.append(isrc)
sdf['isrc'] = isrcs
sdf['title'] = titles
sdf['artist'] = artists
sdf['lyrics'] = lyrics_raw

# filter out the songs that have no lyrics
sdf = sdf.dropna()

# lyrics cleaning and spliting into list of lines

# lyrics cleaning

for lyrics in sdf['lyrics']:
    lyrics = re.sub(r'\n+', '\n', lyrics)
    lyrics = lyrics.replace('\n', '. ')
    lyrics = lyrics.replace('\u2005', '')
    lyrics = re.sub(r'\[[^\]]*\][.]', '', lyrics)
    # lyrics as one string
    ly_concat = lyrics
    lyrics_string.append(ly_concat)
    # return lines as list of strings
    ly_list = sent_tokenize(lyrics)
    lyrics_lines.append(ly_list)

sdf['lyrics_lines'] = lyrics_lines
sdf['lyrics_string'] = lyrics_string

# NRC emotional lexicon analysis
nrc_scores = []
for lines in sdf['lyrics_lines']:
    # nrc affection score list
    nrc_affection_freq_l = []
    for line in lines:
        nrc_txt = NRCLex(line)
        aff_dict = nrc_txt.affect_frequencies
  
        try: 
            aff_dict['anticip'] = aff_dict['anticipation']
        except: 
            pass
        
        aff_dict.pop('anticipation', None)

        affection_freq = list(aff_dict.values())
    

        # print(affection_freq)
        nrc_affection_freq_l.append(affection_freq)
        # affection_freq_arr = np.append(affection_freq_arr, np.array([affection_freq]), axis=0)

    ly_aff_arr = np.array(nrc_affection_freq_l)
    nrc_score = np.sum(ly_aff_arr, axis=0)
    nrc_scores.append(nrc_score)

sdf['nrc_score'] = nrc_scores


# Vader
vader_sent = SentimentIntensityAnalyzer()
# vader scores based on string
sdf['str_vader_score'] = sdf['lyrics_string'].apply(lambda lyrics: vader_sent.polarity_scores(lyrics))

# vader scores based on lines
line_vader_scores = []
for lines in sdf['lyrics_lines']:
    # nrc affection score list
    vader_score_l = []
    for line in lines:
        line_vader_score = vader_sent.polarity_scores(line)
        line_vader_score = list(line_vader_score.values())
        vader_score_l.append(line_vader_score)

    ly_vad_arr = np.array(vader_score_l)
    vader_score = np.sum(ly_vad_arr, axis=0)
    line_vader_scores.append(vader_score)

sdf['line_vader_score'] = line_vader_scores

# Spotify audio feature/ analysis
spot_audio_analysis = []
spot_audio_features = []
for idx, song in sdf.iterrows():
    title = song['title']
    # in case
    t_short = re.sub(r'\(.*', '', title)
    isrc = song['isrc']
    artist = song['artist']
    ### REMINDER: ISRC SEARCH METHOD IS NOT STABLE AS SOME ISRC CODES PROVIDED BY APPLE MAY BE OUTDATED THUS COULD NOT BE USED ###
    # spot_track_obj = sp.search(q='isrc:' + isrc, type='track')
    try:
        spot_track_obj = spot.search(q='isrc:' + isrc, type='track')
        spot_track_id = spot_track_obj['tracks']['items'][0]['id']
        spot_ana = spot.audio_analysis(spot_track_id)
        spot_feat = spot.audio_features(tracks=[spot_track_id])
    except:
        try:
            spot_track_obj = spot.search(q='artist:' + artist + ' track:' + t_short, type='track')
            spot_track_id = spot_track_obj['tracks']['items'][0]['id']
            spot_ana = spot.audio_analysis(spot_track_id)
            spot_feat = spot.audio_features(tracks=[spot_track_id])
        except:
            spot_ana = None
            spot_feat = None
    

    spot_audio_analysis.append(spot_ana)
    spot_audio_features.append(spot_feat[0])

sdf['spot_audio_analysis'] = spot_audio_analysis
sdf['spot_audio_features'] = spot_audio_features

# BERT- ALBERT pretrained model
#TODO: albert scores based on string
#str_albert_scores = []
#for lyrics in sdf['lyrics_string']:
    #albert_pred_raw  = bert_cls(lyrics)
    #albert_score = [score_label['score'] for score_label in albert_pred_raw[0]]
    #str_albert_scores.append(albert_score)
#sdf['str_albert_score'] = str_albert_scores

# albert scores based on lines
line_albert_scores = []
for lines in sdf['lyrics_lines']:

    albert_score_l = []
    for line in lines:
        albert_pred_raw = bert_cls(line)
        albert_pred_raw = albert_pred_raw[0]
        albert_l = [score_label['score'] for score_label in albert_pred_raw]
        albert_score_l.append(albert_l)
    ly_albert_arr = np.array(albert_score_l)
    albert_score = np.sum(ly_albert_arr, axis=0)
    line_albert_scores.append(albert_score)

sdf['line_albert_score'] = line_albert_scores

Searching for "Enchanted" by Taylor Swift...
Done.
Searching for "Empire State of Mind " by JAY-Z & Alicia Keys...
Done.
Searching for "24K Magic" by Bruno Mars...
Done.
Searching for "Sucker for Pain " by Lil Wayne, Wiz Khalifa & Imagine Dragons...
Done.
Searching for "Someone You Loved" by Lewis Capaldi...
Done.


In [180]:
# break down score lists/ dicts from sdf

#nrc
nrc_std_dict = {'fear': 0.0, 'anger': 0.0, 'anticipation': 0.0, 'trust': 0.0, 'surprise': 0.0,'positive': 0.0,'negative': 0.0,'sadness': 0.0,'disgust': 0.0,'joy': 0.0}

nrc_fear = [nrc_score[0] for nrc_score in sdf['nrc_score']]
nrc_anger = [nrc_score[1] for nrc_score in sdf['nrc_score']]
nrc_anticipation = [nrc_score[2] for nrc_score in sdf['nrc_score']]
nrc_trust = [nrc_score[3] for nrc_score in sdf['nrc_score']]
nrc_surprise = [nrc_score[4] for nrc_score in sdf['nrc_score']]
nrc_positive = [nrc_score[5] for nrc_score in sdf['nrc_score']]
nrc_negative = [nrc_score[6] for nrc_score in sdf['nrc_score']]
nrc_sadness = [nrc_score[7] for nrc_score in sdf['nrc_score']]
nrc_disgust = [nrc_score[8] for nrc_score in sdf['nrc_score']]
nrc_joy = [nrc_score[9] for nrc_score in sdf['nrc_score']]

#vader
vader_dict = {'neg': 0.073, 'neu': 0.791, 'pos': 0.137, 'compound': 0.9706}

line_vader_neg = [line_vader_score[0] for line_vader_score in sdf['line_vader_score']]
line_vader_neu = [line_vader_score[1] for line_vader_score in sdf['line_vader_score']]
line_vader_pos = [line_vader_score[2] for line_vader_score in sdf['line_vader_score']]
line_vader_compound = [line_vader_score[3] for line_vader_score in sdf['line_vader_score']]

vader_str_df = sdf.str_vader_score.dropna().apply(pd.Series)
str_vader_neg = vader_str_df[vader_str_df.columns[0]]
str_vader_neu = vader_str_df[vader_str_df.columns[1]]
str_vader_pos = vader_str_df[vader_str_df.columns[2]]
str_vader_compound = vader_str_df[vader_str_df.columns[3]]

#spotify

spot_audio_ft_dict = {'danceability': 0, 'energy': 0, 'key': 0, 'loudness': 0, 'mode': 0, 'speechiness': 0,'acousticness': 0,'instrumentalness': 0,'liveness': 0,'valence': 0,'tempo': 0,'type': 'audio_features','id': 'id','uri': 'spotifyuri','track_href': 'url','analysis_url': 'url','duration_ms': 0,'time_signature': 0}

spot_audio_ana_df = sdf.spot_audio_analysis.dropna().apply(pd.Series)
spot_audio_ft_df = sdf.spot_audio_features.dropna().apply(pd.Series)
spot_danceability = spot_audio_ft_df[spot_audio_ft_df.columns[0]]
spot_energy = spot_audio_ft_df[spot_audio_ft_df.columns[1]]
spot_key = spot_audio_ft_df[spot_audio_ft_df.columns[2]]
spot_loudness = spot_audio_ft_df[spot_audio_ft_df.columns[3]]
spot_mode = spot_audio_ft_df[spot_audio_ft_df.columns[4]]
spot_speechiness = spot_audio_ft_df[spot_audio_ft_df.columns[5]]
spot_acousticness = spot_audio_ft_df[spot_audio_ft_df.columns[6]]
spot_instrumentalness = spot_audio_ft_df[spot_audio_ft_df.columns[7]]
spot_liveness = spot_audio_ft_df[spot_audio_ft_df.columns[8]]
spot_valence = spot_audio_ft_df[spot_audio_ft_df.columns[9]]
spot_tempo = spot_audio_ft_df[spot_audio_ft_df.columns[10]]

#bert
albert_column_names = ["sadness", "joy", "love", "anger", "fear", "surprise"]

line_albert_sadness = [line_albert_score[0] for line_albert_score in sdf['line_albert_score']]
line_albert_joy = [line_albert_score[1] for line_albert_score in sdf['line_albert_score']]
line_albert_love = [line_albert_score[2] for line_albert_score in sdf['line_albert_score']]
line_albert_anger = [line_albert_score[3] for line_albert_score in sdf['line_albert_score']]
line_albert_fear = [line_albert_score[4] for line_albert_score in sdf['line_albert_score']]
line_albert_surprise = [line_albert_score[5] for line_albert_score in sdf['line_albert_score']]

In [181]:
sdf

Unnamed: 0,isrc,title,artist,lyrics,lyrics_lines,lyrics_string,nrc_score,str_vader_score,line_vader_score,spot_audio_analysis,spot_audio_features,line_albert_score
0,USCJY1003663,Enchanted,Taylor Swift,[Verse 1]\nThere I was again tonight\nForcing ...,"[ There I was again tonight., Forcing laughter...","There I was again tonight. Forcing laughter, ...","[0.16666666666666666, 0.8666666666666667, 1.24...","{'neg': 0.073, 'neu': 0.791, 'pos': 0.137, 'co...","[3.5860000000000003, 63.21900000000001, 8.195,...","{'meta': {'analyzer_version': '4.0.0', 'platfo...","{'danceability': 0.455, 'energy': 0.623, 'key'...","[4.637160549988039, 36.223811578936875, 4.9320..."
1,USJZ10900031,Empire State of Mind (feat. Alicia Keys),JAY-Z & Alicia Keys,[Intro: JAY-Z]\nYeah\n\n[Verse 1: Jay-Z]\nYeah...,"[ Yeah., Yeah, I'm out that Brooklyn, now I'm ...","Yeah. Yeah, I'm out that Brooklyn, now I'm d...","[6.633333333333334, 2.216666666666667, 3.13571...","{'neg': 0.053, 'neu': 0.794, 'pos': 0.153, 'co...","[2.9910000000000005, 75.78600000000002, 10.222...","{'meta': {'analyzer_version': '4.0.0', 'platfo...","{'danceability': 0.491, 'energy': 0.956, 'key'...","[4.01865521655418, 34.90297035779804, 3.149679..."
2,USAT21602945,24K Magic,Bruno Mars,[Intro]\nTonight\nI just want to take you high...,"[ Tonight., I just want to take you higher., T...",Tonight. I just want to take you higher. Thro...,"[1.453088578088578, 2.143997668997669, 3.32616...","{'neg': 0.09, 'neu': 0.826, 'pos': 0.083, 'com...","[4.1819999999999995, 82.305, 6.513999999999999...","{'meta': {'analyzer_version': '4.0.0', 'platfo...","{'danceability': 0.818, 'energy': 0.803, 'key'...","[4.3135047677205876, 31.08296650776174, 2.8674..."
3,USAT21601891,"Sucker for Pain (with Logic, Ty Dolla $ign & X...","Lil Wayne, Wiz Khalifa & Imagine Dragons",[Chorus: Dan Reynolds & Sam Harris]\nI torture...,"[ I torture you., Take my hand through the fla...",I torture you. Take my hand through the flame...,"[9.367532467532472, 6.400865800865802, 4.27543...","{'neg': 0.251, 'neu': 0.652, 'pos': 0.098, 'co...","[21.627, 51.77199999999999, 7.6000000000000005...","{'meta': {'analyzer_version': '4.0.0', 'platfo...","{'danceability': 0.502, 'energy': 0.786, 'key'...","[5.4936684526037425, 15.897402132453863, 4.477..."
4,DEUM71807062,Someone You Loved,Lewis Capaldi,"[Verse 1]\nI'm going under, and this time, I f...","[ I'm going under, and this time, I fear there...","I'm going under, and this time, I fear there'...","[2.7595238095238095, 0.6428571428571428, 0.592...","{'neg': 0.078, 'neu': 0.795, 'pos': 0.127, 'co...","[1.645, 28.541000000000004, 2.814, 1.338300000...","{'meta': {'analyzer_version': '4.0.0', 'platfo...","{'danceability': 0.501, 'energy': 0.405, 'key'...","[3.227711145300418, 8.306011193897575, 6.43257..."


In [144]:
vader_dict = {'neg': 0.073, 'neu': 0.791, 'pos': 0.137, 'compound': 0.9706}

sdf['str_vader_score'][0]

{'neg': 0.073, 'neu': 0.791, 'pos': 0.137, 'compound': 0.9706}

In [138]:
nrc_std_dict = {'fear': 0.0, 'anger': 0.0, 'anticipation': 0.0, 'trust': 0.0, 'surprise': 0.0,'positive': 0.0,'negative': 0.0,'sadness': 0.0,'disgust': 0.0,'joy': 0.0}
sdf["nrc_score"][0]

array([0.16666667, 0.86666667, 1.25      , 2.7       , 0.95      ,
       7.45      , 1.83333333, 0.16666667, 0.16666667, 6.45      ])

In [139]:
albert_column_names = ["sadness", "joy", "love", "anger", "fear", "surprise"]
sdf["line_albert_score"][0]

array([ 4.63716055, 36.22381158,  4.93201505, 17.93599799,  9.79658638,
        1.47442829])

In [140]:
sdf["spot_audio_features"][0]

[{'danceability': 0.455,
  'energy': 0.623,
  'key': 8,
  'loudness': -3.878,
  'mode': 1,
  'speechiness': 0.0288,
  'acousticness': 0.0739,
  'instrumentalness': 0.000424,
  'liveness': 0.146,
  'valence': 0.208,
  'tempo': 163.893,
  'type': 'audio_features',
  'id': '10eBRyImhfqVvkiVEGf0N0',
  'uri': 'spotify:track:10eBRyImhfqVvkiVEGf0N0',
  'track_href': 'https://api.spotify.com/v1/tracks/10eBRyImhfqVvkiVEGf0N0',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/10eBRyImhfqVvkiVEGf0N0',
  'duration_ms': 352187,
  'time_signature': 4}]

In [146]:
sdf["spot_audio_analysis"][0]

{'meta': {'analyzer_version': '4.0.0',
  'platform': 'Linux',
  'detailed_status': 'OK',
  'status_code': 0,
  'timestamp': 1571899455,
  'analysis_time': 11.95495,
  'input_process': 'libvorbisfile L+R 44100->22050'},
 'track': {'num_samples': 7765716,
  'duration': 352.18668,
  'sample_md5': '',
  'offset_seconds': 0,
  'window_seconds': 0,
  'analysis_sample_rate': 22050,
  'analysis_channels': 1,
  'end_of_fade_in': 2.53098,
  'start_of_fade_out': 345.0195,
  'loudness': -3.878,
  'tempo': 163.893,
  'tempo_confidence': 0.085,
  'time_signature': 4,
  'time_signature_confidence': 1.0,
  'key': 8,
  'key_confidence': 0.747,
  'mode': 1,
  'mode_confidence': 0.802,
  'codestring': 'eJxVmwlyJDkOBL9ST-B9_P9j4x4sqTW2ZrNNViaTBIFA4NCptd_dz_mUz1m31N7L-dT1RuvO3T_9jO9otfOZY3xuG2uMOhfz-a22fkv51MJC30dPZZnePidfuON8Wqnrz0faKH-_0vb838OX7zgc1a_2Wn6G2dLg27wwamvVDfLt02abu9TPaG18Zpnlnjs_Y-7FqJ9y9_iMy9EyOnxk9tr-jBaHuZ3trcr8Ks55sosA1sj5xhi97j4_y-129nPn4pfNS5_OvysLMBzHA9Y5z2ns9Ozmr3PcMu7nuMH3654IzB3egwC

In [149]:
for d in sdf:
    print(d)

isrc
title
artist
lyrics
lyrics_lines
lyrics_string
nrc_score
str_vader_score
line_vader_score
spot_audio_analysis
spot_audio_features
line_albert_score


In [7]:
# loading kaggle data files

# file with basic song information, such as song name, artist name
df_song_info = pd.read_csv('billboard_hot_100_kaggle/song_info.csv')

# sample 10 songs from dataset
df_song_sample = df_song_info.sample(n=10, random_state=42)

# find lyrics
df_song_sample['lyrics'] = df_song_sample.apply(find_lyrics, axis=1)

# remove songs if lyrics information is None (returned none from genius)
df_lyrics_sample = df_song_sample.dropna()
df_ly = df_lyrics_sample['lyrics']

Searching for "Contrabando Y Traicion" by Ramon Ayala Y Sus Bravos Del Norte...
No results found for: 'Contrabando Y Traicion Ramon Ayala Y Sus Bravos Del Norte'
Searching for "Nordlicht" by Flug 8...
No results found for: 'Nordlicht Flug 8'
Searching for "I've Got A Woman" by Ray Charles...
Done.
Searching for "Crazy in Love" by Daniela Andrade...
Done.
Searching for "Don't Get Too High" by Bryson Tiller...
Done.
Searching for "Wannabe" by Spice Girls...
Done.
Searching for "Best Life" by Hardy Caprio...
Done.
Searching for "Clandestino" by Shakira...
Done.
Searching for "Woman Like Me (feat. Nicki Minaj)" by Little Mix...
Done.
Searching for "Take Me To Church (Acoustic Cover) feat. Matt Wright" by Megan Davies...
No results found for: 'Take Me To Church (Acoustic Cover) feat. Matt Wright Megan Davies'


In [182]:
line_albert_surprise

[1.4744282916362863,
 0.9202406413096469,
 1.6364919737097807,
 1.5113379742833786,
 0.2501261677243747]