In [92]:
import json
import os
import datetime
from unidecode import unidecode

In [111]:
# Load the raw song records. The encoding is pinned to UTF-8 so the file is
# decoded the same way on every platform instead of with the locale default.
with open("../data/raw_lyrics.json", "r", encoding="utf-8") as fh:
    data = json.load(fh)

In [112]:
# Normalise every song in place: run each lyric line through unidecode, tidy
# the metadata key names, and flatten list-valued metadata into one string.
for song in data:
    lines = song["lyrics"]
    for idx, text in enumerate(lines):
        lines[idx] = unidecode(text)

    meta = song["metadata"]
    # Iterate over a snapshot of the keys because entries are popped and
    # re-inserted under their tidied name while looping.
    for old_key in list(meta):
        tidy_key = old_key.lower()
        for symbol in ("©", "℗", "&"):
            tidy_key = tidy_key.replace(symbol, "")
        tidy_key = tidy_key.strip().replace(" ", "_")
        meta[tidy_key] = meta.pop(old_key)

    # Only values of existing keys are replaced here, so the dict size does
    # not change during iteration.
    for name, value in meta.items():
        if isinstance(value, list):
            meta[name] = ", ".join(sorted(value))

In [113]:
# Locate section headers (lines starting with "[") in each song's lyrics.
# Every section is recorded as (label, header_index, end_index):
#   label        - the header line with its first and last character removed
#   header_index - raw index of the header line itself
#   end_index    - exclusive raw index where the section stops (the next
#                  header, or the end of the lyrics for the last section)
# A song without any header gets a single empty tuple as a sentinel.
annotations = []
for song in data:
    lyrics = song["lyrics"]
    section_start = -1  # -1 means no header has been seen yet
    annot = []
    for i, line in enumerate(lyrics):
        if line.startswith("["):
            if section_start != -1:
                annot.append((label, section_start, i))
            section_start = i
            label = line[1:-1]
    if section_start != -1:
        # Close the final section at len(lyrics) rather than at the last
        # index: the end bound is exclusive, so stopping at the last index
        # would cut the song's final line when the section is sliced out.
        annot.append((label, section_start, len(lyrics)))
    else:
        annot.append(())
    annotations.append(annot)

In [114]:
# Remove the section-header lines from each song's lyrics and store, per
# section, the [start, end) range it occupies in the cleaned lyrics.
for song, annots in zip(data, annotations):
    lyrics = song["lyrics"]
    clean_lyrics = []
    song["annotations"] = []
    for annot in annots:
        # An entry that is not a 3-tuple is the "no sections" sentinel.
        if len(annot) != 3:
            break
        label, start, end = annot
        # Take the offsets from the cleaned list itself instead of shifting
        # raw indices by the header count: lines that precede the first
        # header are not copied, so raw-index arithmetic would be misaligned.
        section_begin = len(clean_lyrics)
        clean_lyrics += lyrics[start + 1:end]
        song["annotations"].append(
            {"section": label, "start": section_begin, "end": len(clean_lyrics)}
        )
    # Keep the original lyrics when nothing was extracted (e.g. no headers).
    if clean_lyrics:
        song["lyrics"] = clean_lyrics
        

In [115]:
# Rewrite each song's release date: values longer than four characters are
# parsed as "Month day, Year" and emitted as "dd-mm-YYYY"; shorter values are
# parsed and emitted as a bare year. Songs with no (or an empty) release date
# keep that value, and the key is always written back.
for song in data:
    meta = song["metadata"]
    release_date = meta.get("release_date", "")
    if release_date:
        try:
            is_full_date = len(release_date) > 4
            parse_fmt = "%B %d, %Y" if is_full_date else "%Y"
            emit_fmt = "%d-%m-%Y" if is_full_date else "%Y"
            release_date = datetime.datetime.strptime(release_date, parse_fmt).strftime(emit_fmt)
        except:
            # Show the offending record, then let the error propagate.
            print(song)
            raise
    meta["release_date"] = release_date

In [120]:
# Write the processed songs. ensure_ascii=False emits non-ASCII characters
# verbatim, so the file encoding is pinned to UTF-8 rather than left to the
# platform's locale default.
with open(os.path.join("../data", "lyrics.json"), "w", encoding="utf-8") as fh:
    json.dump(data, fh, ensure_ascii=False, indent=4)