In [1]:
#imports 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk

In [2]:
# Load the album index. The first CSV column becomes the row index and the
# 'ID' column is exposed as 'Album ID'.
albums = (
    pd.read_csv('Data/Albums.csv', index_col=0)
      .rename(columns={'ID': 'Album ID'})
)
albums.head()

Unnamed: 0,Album ID,Albums
0,758025,Speak Now (Taylor’s Version)
1,1040217,Midnights (The Late Night Edition)
2,1040211,Midnights (The Til Dawn Edition)
3,1027134,folklore: the long pond studio sessions (Recor...
4,1013719,The More Red (Taylor’s Version) Chapter


In [3]:
# One CSV path per album title: every character that is neither a word
# character nor whitespace becomes '_', then all whitespace is dropped.
album_paths = [
    'Data/Tabular/' + re.sub(r"[\s]+", '', re.sub(r"[^\w\s]", '_', title)) + '.csv'
    for title in albums['Albums']
]

In [5]:
# Preview the first five generated album CSV paths.
album_paths[:5]

['Data/Tabular/SpeakNow_Taylor_sVersion_.csv',
 'Data/Tabular/Midnights_TheLateNightEdition_.csv',
 'Data/Tabular/Midnights_TheTilDawnEdition_.csv',
 'Data/Tabular/folklore_thelongpondstudiosessions_RecordStoreDayExclusive_.csv',
 'Data/Tabular/TheMoreRed_Taylor_sVersion_Chapter.csv']

In [7]:
# Build a single DataFrame holding every track together with its album info.
#
# For each album, its track table is read from the matching CSV (first column
# used as the row index), 'ID'/'Tracks' are renamed to 'track_id'/'track', and
# the album's ID and title are attached to every row.
#
# album_paths was generated in the same order as albums['Albums'], so paths and
# album rows are paired positionally with zip() rather than by looking up the
# loop counter as an index label.
track_frames = []
for csv_path, album_id, album_title in zip(
    album_paths, albums['Album ID'], albums['Albums']
):
    tracks = (
        pd.read_csv(csv_path, index_col=0)
          .rename(columns={'ID': 'track_id', 'Tracks': 'track'})
          .assign(album_id=album_id, album=album_title)
    )
    track_frames.append(tracks)

# Concatenate once instead of growing the frame inside the loop (which copies
# all accumulated rows on every iteration). The list is reversed so that the
# most recently read album comes first, matching the row order produced by the
# previous prepend-in-loop approach.
all_tracks = (
    pd.concat(track_frames[::-1], ignore_index=True)
    if track_frames
    else pd.DataFrame()  # no albums: keep an empty frame rather than raising
)

In [31]:
# Display the combined track/album table.
all_tracks

Unnamed: 0,track_id,track,album_id,album
0,8442190,Lavender Haze,1040211,Midnights (The Till Dawn Edition)
1,8485907,Maroon,1040211,Midnights (The Till Dawn Edition)
2,8434253,Anti-Hero,1040211,Midnights (The Till Dawn Edition)
3,8445376,Snow On The Beach,1040211,Midnights (The Till Dawn Edition)
4,8485908,"Youre On Your Own, Kid",1040211,Midnights (The Till Dawn Edition)
...,...,...,...,...
568,9199809,When Emma Falls in Love (Taylors Version) [Fro...,758025,Speak Now (Taylor’s Version)
569,4499979,I Can See You (Taylors Version) [From The Vault],758025,Speak Now (Taylor’s Version)
570,4499925,Castles Crumbling (Taylors Version) [From The ...,758025,Speak Now (Taylor’s Version)
571,3947017,Foolish One (Taylors Version) [From The Vault],758025,Speak Now (Taylor’s Version)


In [26]:
# Inspect the rows whose album title equals this one edition.
all_tracks[all_tracks.album == 'Midnights (The Till Dawn Edition)']

Unnamed: 0,track_id,track,album_id,album
0,8442190,Lavender Haze,1040211,Midnights (The Till Dawn Edition)
1,8485907,Maroon,1040211,Midnights (The Till Dawn Edition)
2,8434253,Anti-Hero,1040211,Midnights (The Till Dawn Edition)
3,8445376,Snow On The Beach,1040211,Midnights (The Till Dawn Edition)
4,8485908,"Youre On Your Own, Kid",1040211,Midnights (The Till Dawn Edition)
5,8485905,Midnight Rain,1040211,Midnights (The Till Dawn Edition)
6,8485912,Question...?,1040211,Midnights (The Till Dawn Edition)
7,8485914,Vigilante Shit,1040211,Midnights (The Till Dawn Edition)
8,8485915,Bejeweled,1040211,Midnights (The Till Dawn Edition)
9,8445366,Labyrinth,1040211,Midnights (The Till Dawn Edition)


We need the path to each album's directory, which contains a lyrics file for each of its songs.

In [10]:
# Map each track title to an album title. A title that occurs in several rows
# of all_tracks keeps a single entry whose value is the album of its last row.
track_album = dict(zip(all_tracks['track'], all_tracks['album']))
list(track_album.items())[:5]

[('Lavender Haze', 'Midnights (The Late Night Edition)'),
 ('Maroon', 'Midnights (The Late Night Edition)'),
 ('Anti-Hero', 'Midnights (The Late Night Edition)'),
 ('Snow On The Beach', 'Midnights (The Late Night Edition)'),
 ('Youre On Your Own, Kid', 'Midnights (The Late Night Edition)')]

In [42]:
# Album title -> album directory (not the CSV file): Data/Albums/<album>/.
# Whitespace and the typographic apostrophe are removed first; every remaining
# non-word character is then replaced by '_'.
album_paths_cd = {
    title: 'Data/Albums/' + re.sub(r"[^\w]", '_', re.sub(r"[\s’]", '', title)) + '/'
    for title in all_tracks['album'].unique()
}
list(album_paths_cd.items())[:5]

[('Midnights (The Till Dawn Edition)',
  'Data/Albums/Midnights_TheTillDawnEdition_/'),
 ('evermore: the dropped your hand while dancing chapter',
  'Data/Albums/evermore_thedroppedyourhandwhiledancingchapter/'),
 ('evermore: the “forever is the sweetest con” chapter',
  'Data/Albums/evermore_the_foreveristhesweetestcon_chapter/'),
 ('evermore (Japanese Edition)', 'Data/Albums/evermore_JapaneseEdition_/'),
 ('evermore (deluxe version)', 'Data/Albums/evermore_deluxeversion_/')]

With the dictionaries above, we can use the name of a song track to derive the path to its album directory, which leads us to the song's lyric file.

In [43]:
# Lyric file path for every row of all_tracks, in row order: the album's
# directory followed by the track title with whitespace removed and every
# remaining non-word character replaced by '_', plus the '.txt' extension.
lyric_paths = [
    album_paths_cd[album_title]
    + re.sub(r"[^\w]", '_', re.sub(r"[\s]", '', track_title))
    + '.txt'
    for track_title, album_title in zip(all_tracks['track'], all_tracks['album'])
]
lyric_paths[:5]

['Data/Albums/Midnights_TheTillDawnEdition_/LavenderHaze.txt',
 'Data/Albums/Midnights_TheTillDawnEdition_/Maroon.txt',
 'Data/Albums/Midnights_TheTillDawnEdition_/Anti_Hero.txt',
 'Data/Albums/Midnights_TheTillDawnEdition_/SnowOnTheBeach.txt',
 'Data/Albums/Midnights_TheTillDawnEdition_/YoureOnYourOwn_Kid.txt']

Combine the `all_tracks` DataFrame with each track's lyrics (if any).

In [48]:
def _read_lyrics(path):
    """Return the contents of the lyric file at ``path``, or NaN if unavailable.

    Parameters
    ----------
    path : str
        Location of a lyric text file.

    Returns
    -------
    str or float
        The full file text, or ``np.nan`` when the file is missing, cannot be
        opened, or cannot be decoded as UTF-8.
    """
    try:
        # The lyrics contain non-ASCII text, so read them explicitly as UTF-8
        # instead of relying on the platform's default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    except (OSError, UnicodeDecodeError):
        # Best effort: tracks without a readable lyric file are marked missing.
        # Only file/decoding errors are caught, so interrupts and genuine
        # programming errors still surface.
        return np.nan


# One entry per row of all_tracks; object dtype holds the text and NaN markers.
lyric_ser = pd.Series(
    [_read_lyrics(path) for path in lyric_paths],
    index=all_tracks.index,
    dtype=object,
)

In [49]:
# Attach the lyrics (aligned on the row index) as the 'lyrics' column.
all_tracks = all_tracks.assign(lyrics=lyric_ser)
all_tracks

Unnamed: 0,track_id,track,album_id,album,lyrics
0,8442190,Lavender Haze,1040211,Midnights (The Till Dawn Edition),118 ContributorsTranslationsEspañolFrançaisPor...
1,8485907,Maroon,1040211,Midnights (The Till Dawn Edition),131 ContributorsTranslationsEspañolTürkçeFranç...
2,8434253,Anti-Hero,1040211,Midnights (The Till Dawn Edition),192 ContributorsTranslationsPortuguêsEspañolTü...
3,8445376,Snow On The Beach,1040211,Midnights (The Till Dawn Edition),143 ContributorsTranslationsPortuguêsEspañol中文...
4,8485908,"Youre On Your Own, Kid",1040211,Midnights (The Till Dawn Edition),137 ContributorsTranslationsEspañolFrançaisСрп...
...,...,...,...,...,...
568,9199809,When Emma Falls in Love (Taylors Version) [Fro...,758025,Speak Now (Taylor’s Version),79 ContributorsTranslationsPortuguêsItalianoFr...
569,4499979,I Can See You (Taylors Version) [From The Vault],758025,Speak Now (Taylor’s Version),86 ContributorsTranslationsPortuguêsItalianoFr...
570,4499925,Castles Crumbling (Taylors Version) [From The ...,758025,Speak Now (Taylor’s Version),45 ContributorsTranslationsPortuguêsItalianoFr...
571,3947017,Foolish One (Taylors Version) [From The Vault],758025,Speak Now (Taylor’s Version),57 ContributorsTranslationsPortuguêsItalianoFr...


In [50]:
# Fraction of missing values in each column.
all_tracks.isnull().mean()

track_id    0.000000
track       0.000000
album_id    0.000000
album       0.000000
lyrics      0.097731
dtype: float64