In [1]:
#imports 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk

## Data Preparation

In [2]:
# Load the album list: the first CSV column becomes the row index and
# the 'ID' column is exposed as 'Album ID'.
albums = (
    pd.read_csv('Data/Albums.csv', index_col=0)
    .rename(columns={'ID': 'Album ID'})
)
albums.head()

Unnamed: 0,Album ID,Albums
0,758025,Speak Now (Taylor’s Version)
1,1040217,Midnights (The Late Night Edition)
2,1040211,Midnights (The Til Dawn Edition)
3,1027134,folklore: the long pond studio sessions (Recor...
4,1013719,The More Red (Taylor’s Version) Chapter


In [3]:
# (number of rows, number of columns) of the album table
albums.shape #46 albums in total

(46, 2)

Create a DataFrame named `all_tracks` that holds both the track information and the album information:

In [4]:
# Path of the per-album CSV (tracks and their IDs), one entry per album title.
# File names are derived from the title: every punctuation character becomes
# '_' and then all whitespace is removed.
album_paths = [
    'Data/Tabular/' + re.sub(r"[\s]+", '', re.sub(r"[^\w\s]", '_', title)) + '.csv'
    for title in albums['Albums']
]

album_paths[:5]

['Data/Tabular/SpeakNow_Taylor_sVersion_.csv',
 'Data/Tabular/Midnights_TheLateNightEdition_.csv',
 'Data/Tabular/Midnights_TheTilDawnEdition_.csv',
 'Data/Tabular/folklore_thelongpondstudiosessions_RecordStoreDayExclusive_.csv',
 'Data/Tabular/TheMoreRed_Taylor_sVersion_Chapter.csv']

In [5]:
# Build `all_tracks`: one row per track, tagged with the id and name of the
# album whose CSV it was read from.
#
# The per-album frames are collected in a list and concatenated exactly once.
# Re-concatenating the growing frame on every iteration copies all earlier rows
# each time, and the first pass would concatenate with an empty DataFrame.
track_frames = []
# `album_paths` was built by iterating albums['Albums'] in order, so zipping the
# three sequences pairs each CSV path with its own album row positionally
# (no dependence on the index labels of `albums`).
for csv_path, album_id, album_name in zip(album_paths, albums['Album ID'], albums['Albums']):
    tracks = (
        pd.read_csv(csv_path, index_col=0)
        .rename(columns={'ID': 'track_id', 'Tracks': 'track'})
        .assign(album_id=album_id, album=album_name)
    )
    track_frames.append(tracks)

# Concatenate in reverse listing order (last album in `albums` first) so the
# row order is the same as prepending each album in turn.
# pd.concat raises on an empty list, so fall back to an empty frame.
all_tracks = (
    pd.concat(track_frames[::-1], ignore_index=True)
    if track_frames
    else pd.DataFrame()
)

In [6]:
# Display the combined table: one row per track with its album id and album name
all_tracks

Unnamed: 0,track_id,track,album_id,album
0,8442190,Lavender Haze,1040211,Midnights (The Till Dawn Edition)
1,8485907,Maroon,1040211,Midnights (The Till Dawn Edition)
2,8434253,Anti-Hero,1040211,Midnights (The Till Dawn Edition)
3,8445376,Snow On The Beach,1040211,Midnights (The Till Dawn Edition)
4,8485908,"Youre On Your Own, Kid",1040211,Midnights (The Till Dawn Edition)
...,...,...,...,...
568,9199809,When Emma Falls in Love (Taylors Version) [Fro...,758025,Speak Now (Taylor’s Version)
569,4499979,I Can See You (Taylors Version) [From The Vault],758025,Speak Now (Taylor’s Version)
570,4499925,Castles Crumbling (Taylors Version) [From The ...,758025,Speak Now (Taylor’s Version)
571,3947017,Foolish One (Taylors Version) [From The Vault],758025,Speak Now (Taylor’s Version)


To add each song's lyrics to `all_tracks`, we need the path to the album folder that contains that song's lyric file.

In [7]:
# Lookup from track title to album name. A title that appears in several
# albums keeps only the album of its last row in `all_tracks`.
track_album = dict(zip(all_tracks['track'], all_tracks['album']))
list(track_album.items())[:5]

[('Lavender Haze', 'Midnights (The Late Night Edition)'),
 ('Maroon', 'Midnights (The Late Night Edition)'),
 ('Anti-Hero', 'Midnights (The Late Night Edition)'),
 ('Snow On The Beach', 'Midnights (The Late Night Edition)'),
 ('Youre On Your Own, Kid', 'Midnights (The Late Night Edition)')]

In [8]:
# Lookup from album title to the folder that stores that album's lyric files.
def _album_folder(title):
    """Return the lyric folder for an album title.

    Whitespace and the curly apostrophe are dropped first; every remaining
    non-word character is then replaced by '_'.
    """
    squeezed = re.sub(r"[\s’]", '', title)
    return 'Data/Albums/' + re.sub(r"[^\w]", '_', squeezed) + '/'


album_paths_cd = {title: _album_folder(title) for title in all_tracks['album'].unique()}
list(album_paths_cd.items())[:5]

[('Midnights (The Till Dawn Edition)',
  'Data/Albums/Midnights_TheTillDawnEdition_/'),
 ('evermore: the dropped your hand while dancing chapter',
  'Data/Albums/evermore_thedroppedyourhandwhiledancingchapter/'),
 ('evermore: the “forever is the sweetest con” chapter',
  'Data/Albums/evermore_the_foreveristhesweetestcon_chapter/'),
 ('evermore (Japanese Edition)', 'Data/Albums/evermore_JapaneseEdition_/'),
 ('evermore (deluxe version)', 'Data/Albums/evermore_deluxeversion_/')]

Using the album-to-folder dictionary above, we can build the path to each track's lyric file from its album name and its track name.

In [9]:
# One lyric-file path per row of `all_tracks`:
#   <album folder> + <sanitised track title> + '.txt'
# A title is sanitised by removing whitespace and then replacing every
# remaining non-word character with '_'.
lyric_paths = []
for track, album in zip(all_tracks['track'], all_tracks['album']):
    file_stem = re.sub(r"[^\w]", '_', re.sub(r"[\s]", '', track))
    lyric_paths.append(album_paths_cd[album] + file_stem + '.txt')
lyric_paths[:5]

['Data/Albums/Midnights_TheTillDawnEdition_/LavenderHaze.txt',
 'Data/Albums/Midnights_TheTillDawnEdition_/Maroon.txt',
 'Data/Albums/Midnights_TheTillDawnEdition_/Anti_Hero.txt',
 'Data/Albums/Midnights_TheTillDawnEdition_/SnowOnTheBeach.txt',
 'Data/Albums/Midnights_TheTillDawnEdition_/YoureOnYourOwn_Kid.txt']

Combine `all_tracks` with the lyrics (for the tracks that have a lyric file).

In [179]:
# Read every lyric file into a Series aligned with `all_tracks`.
# A track whose lyric file does not exist gets NaN; any other failure
# (permissions, undecodable bytes, interrupt) is allowed to surface instead of
# being silently recorded as "no lyrics".
lyric_texts = []
for path in lyric_paths:
    try:
        # The lyric text is multilingual, so decode explicitly as UTF-8 rather
        # than with the platform's default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            lyric_texts.append(f.read())
    except FileNotFoundError:
        lyric_texts.append(np.nan)

# Build the Series once, with an explicit object dtype (strings plus NaN).
lyric_ser = pd.Series(lyric_texts, index=all_tracks.index, dtype=object)

In [180]:
# Store the lyric text read above as a new 'lyrics' column, then display the table
all_tracks['lyrics'] = lyric_ser
all_tracks

Unnamed: 0,track_id,track,album_id,album,lyrics
0,8442190,Lavender Haze,1040211,Midnights (The Till Dawn Edition),118 ContributorsTranslationsEspañolFrançaisPor...
1,8485907,Maroon,1040211,Midnights (The Till Dawn Edition),131 ContributorsTranslationsEspañolTürkçeFranç...
2,8434253,Anti-Hero,1040211,Midnights (The Till Dawn Edition),192 ContributorsTranslationsPortuguêsEspañolTü...
3,8445376,Snow On The Beach,1040211,Midnights (The Till Dawn Edition),143 ContributorsTranslationsPortuguêsEspañol中文...
4,8485908,"Youre On Your Own, Kid",1040211,Midnights (The Till Dawn Edition),137 ContributorsTranslationsEspañolFrançaisСрп...
...,...,...,...,...,...
568,9199809,When Emma Falls in Love (Taylors Version) [Fro...,758025,Speak Now (Taylor’s Version),79 ContributorsTranslationsPortuguêsItalianoFr...
569,4499979,I Can See You (Taylors Version) [From The Vault],758025,Speak Now (Taylor’s Version),86 ContributorsTranslationsPortuguêsItalianoFr...
570,4499925,Castles Crumbling (Taylors Version) [From The ...,758025,Speak Now (Taylor’s Version),45 ContributorsTranslationsPortuguêsItalianoFr...
571,3947017,Foolish One (Taylors Version) [From The Vault],758025,Speak Now (Taylor’s Version),57 ContributorsTranslationsPortuguêsItalianoFr...


The lyrics are messy. Let's clean them up.

In [197]:
def _clean_lyric(raw):
    """Strip the non-lyric boilerplate from one raw lyric text.

    Parameters
    ----------
    raw : str or missing value
        Raw text as read from a lyric file, or NaN/None when the track has no
        lyric file.

    Returns
    -------
    str or float
        The cleaned lyric text on a single line, or ``np.nan`` when `raw` is
        missing.
    """
    # Handle missing values explicitly instead of round-tripping them through
    # the string 'nan'.
    if pd.isna(raw):
        return np.nan

    text = re.sub(r'\[.*?\]', '', str(raw))  # Remove lyric description: [Verse 1], [Chorus], etc.
    # Replace newlines and digits with spaces (all digits in lyrics are in words).
    # The character class holds only '\n' and '\d', so a literal '+' is kept.
    text = re.sub(r'[\n\d]', ' ', text)
    # Remove the non-lyric header, which ends with "Lyrics ". The match is
    # non-greedy and applied once, so a later "Lyrics " inside the song text
    # cannot cause real lyrics to be deleted.
    text = re.sub(r'^.*?Lyrics ', '', text, count=1)
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse runs of whitespace
    # Remove the non-lyric closing word, only where it ends the text.
    text = re.sub(r' Embed$', '', text)
    return text


cleaned_lyrics = [_clean_lyric(raw) for raw in all_tracks['lyrics']]

# assign the cleaned lyrics back to `lyrics` (missing lyrics stay NaN)
all_tracks['lyrics'] = cleaned_lyrics

## Missingness

Some songs do not have lyrics. Let's see which songs these are and which albums they belong to.

In [199]:
# Fraction of missing values in each column
all_tracks.isnull().mean()

track_id    0.000000
track       0.000000
album_id    0.000000
album       0.000000
lyrics      0.097731
dtype: float64

In [205]:
# Tracks that have no lyric text, together with the album each belongs to
missing_lyrics_mask = all_tracks['lyrics'].isna()
no_lyrics = all_tracks.loc[missing_lyrics_mask, ['track', 'album']]
no_lyrics

Unnamed: 0,track,album
29,cowboy like me,evermore: the “forever is the sweetest con” ch...
30,mirrorball,evermore: the “forever is the sweetest con” ch...
31,evermore,evermore: the “forever is the sweetest con” ch...
32,long story short,evermore: the “forever is the sweetest con” ch...
33,invisible string,evermore: the “forever is the sweetest con” ch...
34,willow,evermore: the “forever is the sweetest con” ch...
256,betty,the “ladies lunching” chapter
257,dorothea,the “ladies lunching” chapter
258,marjorie,the “ladies lunching” chapter
259,august,the “ladies lunching” chapter


In [206]:
# Number of lyric-less tracks in each album
no_lyrics['album'].value_counts()

album
Midnights (The Til Dawn Edition)                                23
Midnights (The Late Night Edition)                              19
evermore: the “forever is the sweetest con” chapter              6
the “ladies lunching” chapter                                    6
Carolina (From The Motion Picture “Where The Crawdads Sing”)     2
Name: count, dtype: int64

In [211]:
# Walk the lyric-less tracks row by row and show each (index label, row) pair.
# DataFrame.iterrows() yields 2-tuples, so it is unpacked into two names.
# TODO: the per-row update of the 'album' value that this loop was meant to
# perform was never written; when adding it, assign through
# `no_lyrics.loc[idx, 'album'] = ...` (not `no_lyrics[idx]['album']`, which
# looks up a column named `idx`).
for idx, row in no_lyrics.iterrows():
    print((idx, row))

(29, track                                       cowboy like me
album    evermore: the “forever is the sweetest con” ch...
Name: 29, dtype: object)
(30, track                                           mirrorball
album    evermore: the “forever is the sweetest con” ch...
Name: 30, dtype: object)
(31, track                                             evermore
album    evermore: the “forever is the sweetest con” ch...
Name: 31, dtype: object)
(32, track                                     long story short
album    evermore: the “forever is the sweetest con” ch...
Name: 32, dtype: object)
(33, track                                     invisible string
album    evermore: the “forever is the sweetest con” ch...
Name: 33, dtype: object)
(34, track                                               willow
album    evermore: the “forever is the sweetest con” ch...
Name: 34, dtype: object)
(256, track                            betty
album    the “ladies lunching” chapter
Name: 256, dtype: object)
(2

In [60]:
# Lyric text of every 'Anti-Hero' row that actually has lyrics
is_anti_hero = all_tracks['track'] == 'Anti-Hero'
has_lyrics = all_tracks['lyrics'].notna()
ser = all_tracks.loc[is_anti_hero & has_lyrics, 'lyrics']

In [71]:
# Compare characters 4 through 199 of the first two entries of `ser`
ser.iloc[0][4:200] == ser.iloc[1][4:200]

False

In [None]:
# NOTE(review): unfinished cell (it has no execution count).
#  - `dic` is not defined anywhere in the visible part of the notebook.
#  - The loop variable `no_lyric_track` is never used; the filter reads `track` instead.
#  - The selected lyrics are neither assigned nor displayed, so the loop has no effect.
# Presumably meant to look up lyrics for the tracks that are missing them — confirm intent.
for no_lyric_track in dic:
    all_tracks.loc[all_tracks['track'] == track, 'lyrics']
    