# Part 2: Data Pre-Processing

Import Libraries and lyric data collected from Part 1B.

In [2]:
import pandas as pd
import re
from afinn import Afinn
from nrclex import NRCLex

Import audio feature data from Spotify scraping (Part 1A) and lyric data from Lyric Genius (Part 1B). 

In [5]:
lyrics = pd.read_csv('../data/lyrics.csv')
lyrics.drop_duplicates(subset=['id'],inplace=True)
audio = pd.read_csv('../data/audio_features.csv')
audio.drop_duplicates(subset=['id'],inplace=True)

audio.shape, lyrics.shape

((3831, 26), (3652, 4))

In [6]:
df = lyrics.merge(audio[['id','genre']], on='id')

In [7]:
df.genre.value_counts()

rock         1094
dance_pop    1037
hip_hop       935
country       586
Name: genre, dtype: int64

## New Features
- Create features for word counts
- Create features based on lexicons

In [9]:
df['lyrics'] = [re.sub(r'^.*?Lyrics', '', str(lyric)) for lyric in df['lyrics']]
df['cleaned_lyrics'] = [str(lyric).replace('\n',' ') for lyric in df['lyrics']]
df['cleaned_lyrics'] = [re.sub("\[.*?\]","",lyric) for lyric in df['cleaned_lyrics']]

df['title_length'] = [len(title) for title in df['track']]
df['lines'] = [str(lyric).count('\n') for lyric in df['lyrics']]
df['sections'] = [str(lyric).count('\n\n') for lyric in df['lyrics']]
df['verse_count'] = [str(lyric).count('[Verse') for lyric in df['lyrics']]
df['chorus_count'] = [str(lyric).count('[Chorus') for lyric in df['lyrics']]
df['dash_count'] = [str(lyric).count('" -') for lyric in df['lyrics']]
df['words'] = [len(str(lyric).split()) for lyric in df['cleaned_lyrics']]
df['unique_words'] = [len(set(str(lyric).split())) for lyric in df['cleaned_lyrics']]

df.head(3)

Unnamed: 0,id,track,artist,lyrics,genre,cleaned_lyrics,title_length,lines,sections,verse_count,chorus_count,dash_count,words,unique_words
0,6qc34bnVOyqGDPni8H5W0U,Amazed,Lonestar,[Verse 1]\nEvery time our eyes meet\nThis feel...,country,Every time our eyes meet This feeling inside ...,6,55,6,2,3,0,258,99
1,3EUl8M6SzxZl03NPkB8mUd,Neon Moon,Brooks & Dunn,[Verse 1]\nWhen the sun goes down on my side o...,country,When the sun goes down on my side of town Tha...,9,62,9,5,3,0,295,133
2,7lUE02KHkZM44BZgjCaWRO,Meet In the Middle,Diamond Rio,[Verse 1]\nIt was seven hundred fence posts fr...,country,It was seven hundred fence posts from your pl...,18,44,6,2,5,0,311,103


In [196]:
df = df[df.dash_count<3]

In [35]:
df['emot_positive'] = [NRCLex(lyric).affect_frequencies.get('positive') for lyric in df['cleaned_lyrics']]
df['emot_negative'] = [NRCLex(lyric).affect_frequencies.get('negative') for lyric in df['cleaned_lyrics']]
df['emot_anger'] = [NRCLex(lyric).affect_frequencies.get('anger') for lyric in df['cleaned_lyrics']]
df['emot_trust'] = [NRCLex(lyric).affect_frequencies.get('trust') for lyric in df['cleaned_lyrics']]
df['emot_disgust'] = [NRCLex(lyric).affect_frequencies.get('disgust') for lyric in df['cleaned_lyrics']]
df['emot_fear'] = [NRCLex(lyric).affect_frequencies.get('fear') for lyric in df['cleaned_lyrics']]
df['emot_joy'] = [NRCLex(lyric).affect_frequencies.get('joy') for lyric in df['cleaned_lyrics']]
df['emot_surprise'] = [NRCLex(lyric).affect_frequencies.get('surprise') for lyric in df['cleaned_lyrics']]
df['emot_anticp'] = [NRCLex(lyric).affect_frequencies.get('anticip') for lyric in df['cleaned_lyrics']]

In [46]:
df.head(5)

Unnamed: 0,id,track,artist,lyrics,genre,cleaned_lyrics,title_length,lines,sections,verse_count,...,unique_words,emot_positive,emot_negative,emot_anger,emot_trust,emot_disgust,emot_fear,emot_joy,emot_surprise,emot_anticp
0,6qc34bnVOyqGDPni8H5W0U,Amazed,Lonestar,[Verse 1]\nEvery time our eyes meet\nThis feel...,country,Every time our eyes meetThis feeling inside me...,6,55,6,2,...,99,0.28125,0.125,0.0625,0.03125,0.0625,0.0625,0.15625,0.03125,0.0
1,3EUl8M6SzxZl03NPkB8mUd,Neon Moon,Brooks & Dunn,[Verse 1]\nWhen the sun goes down on my side o...,country,When the sun goes down on my side of townThat ...,9,62,9,5,...,124,0.041667,0.166667,0.138889,0.027778,0.041667,0.194444,0.041667,0.083333,0.0
2,7lUE02KHkZM44BZgjCaWRO,Meet In the Middle,Diamond Rio,[Verse 1]\nIt was seven hundred fence posts fr...,country,It was seven hundred fence posts from your pla...,18,44,6,2,...,97,0.163265,0.020408,0.020408,0.122449,0.020408,0.0,0.163265,0.020408,0.0
3,2ulBBx6YQ3qY3ci34RadtN,She's In Love With The Boy - Single Version,Trisha Yearwood,Kelsea Ballerini - Dibs\nDolly Parton - Jolene...,country,Kelsea Ballerini - DibsDolly Parton - JoleneCl...,43,56,0,0,...,166,0.25,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0
4,0158TMiGmHyibaDeaUKayX,She Won't Be Lonely Long,Clay Walker,[Verse 1]\nSomething 'bout the way she’s weari...,country,Something 'bout the way she’s wearing her dres...,24,36,4,2,...,89,0.0,0.236559,0.16129,0.0,0.193548,0.172043,0.0,0.0,0.0


In [10]:
df['track_lower'] = df['track'].str.lower()
#remove words within brackets and paranethes in song titles 
df['track_lower'] = [re.sub("\[.*?\]","",title) for title in df['track_lower']]
df['track_lower'] = [re.sub("\(.*?\)","",title) for title in df['track_lower']]

df['track_lower'] = [str(title).replace('- single version','') for title in df['track_lower']]
df['track_lower'] = [str(title).replace('- single edit','') for title in df['track_lower']]
df['track_lower'] = [str(title).replace('- remix','') for title in df['track_lower']]
df['track_lower'] = [str(title).replace('- remix','') for title in df['track_lower']]

df['lyrics_lower'] = df['cleaned_lyrics'].astype(str)
df['lyrics_lower'] = df['lyrics_lower'].str.lower()

df['lyrics_contain_title'] = [df.iloc[index]['track_lower'] in df.iloc[index]['lyrics_lower'] for index in df.index]

In [11]:
df.lyrics_contain_title.value_counts(normalize=True)

True     0.747809
False    0.252191
Name: lyrics_contain_title, dtype: float64

## Removing Bad Records
Some lyric data contains track lists or miscellaneous content instead of lyrics. 

Example of bad records:

In [23]:
df[df.id == '5QifrqmnjHzdOuBAheeeNU']['lyrics'].item()

"1. ⬆ Nathaniel Rateliff & The Night Sweats - S.O.B\n2. ⬇ Magic City Hippies - Fanfare\n3. ★ Hailee Steinfeld - Love Myself\n4. ⬇ Morningsiders - Empress\n5. ★ Matthew Koma - So Fuckin' Romantic\n6. ★ EL VY - Return to the Moon (Political Song for Didi Bloome to Sing (With Crescendo))\n7. ⬆ Keith James - Not My Day\n8. ★ Johnny Stimson - So. Good\n9. ⬇ Raf Riley - Summer ft. Etta Bond, Avelino and Dun D\n10. ⬆ Tiggs Da Author - Georgia\n11. ★ Cash Cash - Devil ft. Busta Rhymes, B.o.B and Neon Hitch\n12. ⬇ SG Lewis - Warm\n13. ⬇ Alexander Cardinale - Made for You\n14. ⬇ CVIRO & GXNXVS - Sober\n15. ⬇ EDEN - End Credits ft. Leah Kelly\n16. ⬆ Tory Lanez - Say It\n17. ⬆ Keith Ape - IT G MA REMIX ft. A$AP Ferg, Dumbfoundead, Father and Waka Flocka Flame\n18. ★ Mating Ritual - I Wear Glasses\n19. ⬆ Madcon - Don't Worry ft. Ray Dalton\n20. ⬇ Sports - You Are the Right One\n21. ⬆ Ritual - Josephine ft. Lisa Hannigan\n22. ⬇ Lil Dicky - Professional Rapper ft. Snoop Dogg\n23. ⬇ Michael Blume - Ma

In [26]:
df[df.id == '0FYUTxJZvvOVLopuhusqYC']['lyrics'].item()[0:1000]

"1. Chris Brown- Look At Me Now (feat. Lil Wayne & Busta Rhymes)\n2. The Pussycat Dolls & Busta Rhymes- Don't Cha\n3. Busta Rhymes- Break Ya Neck\n4. Busta Rhymes & Mariah Carey- I Know What You Want (feat. Flipmode Squad)\n5. Busta Rhymes- Touch It\n6. Rob Bailey & The Hustle Standard- Beast (feat. Busta Rhymes, KXNG Crooked & Tech N9ne) [Southpaw Remix]\n7. Busta Rhymes- Look Over Your Shoulder (feat. Kendrick Lamar)\n8. Busta Rhymes- Put Your Hands Where My Eyes Could See\n9. M.O.P.- Ante Up (feat. Busta Rhymes, Teflon & Remi Martin) [Remix]\n10. Busta Rhymes- Gimme Some More\n11. Diddy- Victory (feat. The Notorious B.I.G. & Busta Rhymes)\n12. Busta Rhymes- Woo Hah!! Got You All in Check\n13. Busta Rhymes- Pass The Courvoisier Part II (feat. Diddy & Pharrell Williams) [Remix]\n14. Busta Rhymes & Busta Rhymes for Flipmode Entertainment- Turn It Up / Fire It Up (Remix)\n15. A$AP Ferg- East Coast (feat. Busta Rhymes, A$AP Rocky, Dave East, French Montana, Rick Ross & Snoop Dogg) [Remix

Get a list of all records suspected to be faulty based on numerical values.

In [27]:
excl = df[(df['cleaned_lyrics'].str.contains('12.')) & (df['verse_count']==0)]
excl_ids = [i for i in excl['id']]

Exclude these ids from lyric data and audio feature data. Make sure dataframes have the same records in each dataset.

In [36]:
df_lyrics = df[df['id'].isin(excl_ids)==False]
df_lyrics.shape

(3425, 26)

In [32]:
#subste dataset for audio features
song_ids = [i for i in df_lyrics['id']]
df_audio = audio[audio['id'].isin(song_ids)]
df_audio.shape

(3425, 26)

In [37]:
df_lyrics.to_csv('../data/data_lyrics.csv', index=False)
df_audio.to_csv('../data/data_audio.csv',index=False)