# Initial Setup

In [None]:
# Fetch the lyrics repository; later cells read the song files under lyrics/database.
# NOTE: extract_english_lyrics() deletes files from this checkout, so re-clone to get a pristine copy.
!git clone https://github.com/Lyrics/lyrics.git

Cloning into 'lyrics'...
remote: Enumerating objects: 22867, done.[K
remote: Counting objects: 100% (2344/2344), done.[K
remote: Compressing objects: 100% (1073/1073), done.[K
remote: Total 22867 (delta 1035), reused 2258 (delta 984), pack-reused 20523[K
Receiving objects: 100% (22867/22867), 4.24 MiB | 13.82 MiB/s, done.
Resolving deltas: 100% (11814/11814), done.


In [None]:
import os
from collections import defaultdict
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [None]:
def get_char(char):
    """Map one character onto the ASCII-only alphabet used for the lyrics.

    ASCII characters are returned unchanged, the typographic apostrophe
    becomes a plain apostrophe, and any other character becomes a space.
    """
    if char == "’":
        return "'"
    return char if ord(char) < 128 else " "

def get_lyrics(f):
    """Read a lyrics file and return its verses as cleaned strings.

    Verses are runs of consecutive non-blank lines separated by blank lines.
    Reading stops at the first line that starts with five underscores, which
    introduces the metadata section. Every verse, including one that is
    still open when the file ends, is passed character by character through
    ``get_char``.

    Args:
        f: Path of the lyrics file.

    Returns:
        List of verse strings; each keeps the line breaks it had in the file.
    """
    def clean(raw_verse):
        return ''.join(get_char(ch) for ch in raw_verse)

    verses = []
    curr_verse = ""
    # Read-only access is enough, and the context manager closes the handle.
    # UTF-8 is spelled out so decoding does not depend on the platform default.
    with open(f, 'r', encoding='utf-8') as fin:
        for line in fin:
            if line.startswith("_____"):
                # Metadata separator: everything after it is not lyrics.
                break
            if line == "\n":
                if curr_verse:
                    verses.append(clean(curr_verse))
                    curr_verse = ""
            else:
                curr_verse += line
    # Flush the verse that was still open at the separator or at end of file.
    if curr_verse:
        verses.append(clean(curr_verse))
    return verses

def extract_english_lyrics(remove_if_contains=None, database_dir="lyrics/database"):
    """Load every song under ``database_dir`` and rank the vocabulary by frequency.

    A song whose lower-cased word list contains any word from
    ``remove_if_contains`` is skipped AND its file is deleted from disk, so
    later directory walks no longer see it.

    Args:
        remove_if_contains: Iterable of lower-case words that disqualify a
            song. ``None`` (the default) disqualifies nothing.
        database_dir: Root directory that is walked recursively for song files.

    Returns:
        ``(song_lyrics, word_ranks)`` where ``song_lyrics`` maps file path to
        the verse list from ``get_lyrics`` and ``word_ranks`` maps each word to
        its position when words are sorted by ``(count, word)`` descending
        (rank 0 is the most frequent word).
    """
    banned = set(remove_if_contains) if remove_if_contains else set()
    word_counts = defaultdict(int)
    song_lyrics = {}
    filepaths = [
        os.path.join(path, file)
        for path, _dirs, files in os.walk(database_dir)
        for file in files
    ]
    # Sorted so the result does not depend on directory listing order.
    for filepath in sorted(filepaths):
        lyrics = get_lyrics(filepath)
        # Words are split on single spaces only, so empty strings produced by
        # blank lines and repeated spaces are counted as a "word" as well.
        word_list = [
            word.lower()
            for lyric in lyrics
            for line in lyric.split('\n')
            for word in ''.join(get_char(ch) for ch in line).split(" ")
        ]
        if not banned.isdisjoint(word_list):
            os.remove(filepath)
            continue
        for word in word_list:
            word_counts[word] += 1
        song_lyrics[filepath] = lyrics

    ranked = sorted(((count, word) for word, count in word_counts.items()), reverse=True)
    return song_lyrics, {word: rank for rank, (_count, word) in enumerate(ranked)}
# Common German words: a song containing any of them is dropped (and its file
# deleted) by extract_english_lyrics.
# NOTE(review): words are compared after non-ASCII characters have been turned
# into spaces, so 'für' presumably can never match; confirm before relying on it.
GERMAN_MARKER_WORDS = {
    'ich', 'und', 'der', 'du', 'das', 'wir', 'nicht', 'ist', 'es', 'ein',
    'auf', 'zu', 'sie', 'mich', 'doch', 'wenn', 'dich', 'für', 'wie', 'uns',
    'nur', 'sind', 'mir', 'noch',
}
song_lyrics, sorted_words = extract_english_lyrics(GERMAN_MARKER_WORDS)

def lyrics_to_songs(song_lyrics):
    """Build lookups between verse text and the place it first appears.

    Args:
        song_lyrics: Mapping of song key to its list of verse strings.

    Returns:
        ``(lyric_to_song, song_to_lyric)``. ``lyric_to_song`` maps a verse to
        the ``(song, verse_index)`` of its first occurrence in iteration
        order; ``song_to_lyric`` is the inverse and therefore only has entries
        for those first occurrences.
    """
    lyric_to_song, song_to_lyric = {}, {}
    for song, verses in song_lyrics.items():
        for position, verse in enumerate(verses):
            if verse in lyric_to_song:
                continue  # repeated verse: keep the first location only
            location = (song, position)
            lyric_to_song[verse] = location
            song_to_lyric[location] = verse
    return lyric_to_song, song_to_lyric

# Verse text <-> (song path, verse index) lookups used by the similarity and markdown cells.
lyric_to_song, song_to_lyric = lyrics_to_songs(song_lyrics)

# BERT

In [None]:
# The recorded install log shows transformers 4.21.1 being installed.
# NOTE(review): the install is unpinned, so a re-run may pull a different version.
!pip install transformers
from transformers import BertTokenizer, BertModel
import torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 7.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 42.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.8.1 tokenizers-0.12.1 transformers-4.21.1


In [None]:
# Load the pretrained checkpoint and its matching tokenizer from the same name.
BERT_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(
    BERT_CHECKPOINT,
    output_hidden_states=True,  # return the hidden states of every layer
)
# Inference only: put the module in evaluation mode.
model.eval()

def bert(sentence, model, tokenizer, max_length=512):
    """Embed ``sentence`` as the mean of the second-to-last layer's token vectors.

    Args:
        sentence: Text to embed; ``[CLS]`` / ``[SEP]`` markers are added here.
        model: Callable BERT model whose output at index 2 is the sequence of
            per-layer hidden states (the notebook builds it with
            ``output_hidden_states=True``).
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
        max_length: Maximum number of tokens fed to the model, markers
            included. Longer inputs keep their first ``max_length - 1`` tokens
            plus the closing marker. The default of 512 is the position
            limit of bert-base checkpoints.

    Returns:
        1-D numpy array: the average over tokens of ``hidden_states[-2]``.
    """
    marked_text = "[CLS] " + sentence + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    if len(tokenized_text) > max_length:
        # Truncate instead of letting the model fail on over-long verses,
        # while keeping the final [SEP] marker in place.
        tokenized_text = tokenized_text[:max_length - 1] + tokenized_text[-1:]

    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    # The all-ones tensor used to be passed positionally; the second positional
    # parameter of BertModel.forward is attention_mask, so name it explicitly.
    attention_mask = torch.ones_like(tokens_tensor)

    with torch.no_grad():
        outputs = model(tokens_tensor, attention_mask=attention_mask)
        hidden_states = outputs[2]

    # Second-to-last layer, first (only) batch element: one vector per token.
    token_vecs = hidden_states[-2][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)
    return sentence_embedding.detach().numpy()

def bert_embedding(lyric):
    """Embed one verse using the notebook-level ``model`` and ``tokenizer``."""
    return bert(lyric, model=model, tokenizer=tokenizer)

def get_verse_vectors(embed, song_lyrics):
    """Embed every distinct verse once and build both lookup directions.

    This helper was called but not defined anywhere in the notebook, so a
    fresh kernel failed with NameError. Its contract is reconstructed from the
    call site and from how ``vector_to_lyric`` is rebuilt after loading the
    cache.

    Args:
        embed: Callable mapping a verse string to a 1-D numpy vector.
        song_lyrics: Mapping of song key to its list of verse strings.

    Returns:
        ``(lyric_to_vector, vector_to_lyric)``; the second dict is keyed by the
        vector converted to a tuple, so verses with identical vectors share
        one entry (the last verse seen wins).
    """
    lyric_to_vector = {}
    vector_to_lyric = {}
    for lyrics in song_lyrics.values():
        for lyric in lyrics:
            if lyric in lyric_to_vector:
                continue  # repeated verse: the embedding is already known
            vector = embed(lyric)
            lyric_to_vector[lyric] = vector
            vector_to_lyric[tuple(vector.tolist())] = lyric
    return lyric_to_vector, vector_to_lyric

lyric_to_vector, vector_to_lyric = get_verse_vectors(bert_embedding, song_lyrics)

len(lyric_to_vector), len(vector_to_lyric)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(17947, 17890)

In [None]:
def save_vectors(lyric_to_vector, filepath='lyric_vectors_bert.npz'):
    """Write the verse vectors to a compressed ``.npz`` archive.

    Args:
        lyric_to_vector: Mapping of verse text to numpy vector; each verse
            becomes one named array in the archive.
        filepath: Destination archive. Defaults to the file name the notebook
            has always used.
    """
    # NOTE: keys are expanded as keyword arguments, so a verse that is exactly
    # "file" would collide with savez_compressed's own parameter name.
    np.savez_compressed(filepath, **lyric_to_vector)

def load_vectors(filepath):
    """Load the vector of every verse in the global ``song_lyrics`` from an archive.

    Args:
        filepath: ``.npz`` archive written by ``save_vectors``.

    Returns:
        Dict mapping verse text to its numpy vector, in order of first
        appearance in ``song_lyrics``.

    Raises:
        KeyError: If a verse from ``song_lyrics`` is missing from the archive.
    """
    data = {}
    # The context manager closes the archive once everything is read.
    with np.load(filepath) as loader:
        for lyrics in song_lyrics.values():
            for lyric in lyrics:
                # Each lookup reads the array from the archive again, so
                # repeated verses are only fetched the first time.
                if lyric not in data:
                    data[lyric] = loader[lyric]
    return data

## Cache the verse vectors on disk. The original author's note: saving and
## loading takes around 1 minute (and 30MB), versus around 5 minutes to
## recompute with spaCy; no timing was recorded for the BERT embeddings.
## Delete the file to force the cache to be rewritten from fresh vectors.
VECTOR_CACHE = 'lyric_vectors_bert.npz'

if not os.path.exists(VECTOR_CACHE):
    # The save used to be commented out, so the load below failed with
    # FileNotFoundError on any machine that did not already have the file.
    save_vectors(lyric_to_vector)
lyric_to_vector = load_vectors(VECTOR_CACHE)
vector_to_lyric = {tuple(lyric_to_vector[lyric].tolist()): lyric for lyric in lyric_to_vector}

In [None]:
# This takes around 15 seconds to run.
def get_knn(lyric_to_vector):
    """Fit a 30-neighbour model on all verse vectors and return its graph.

    Args:
        lyric_to_vector: Mapping of verse text to its embedding vector.

    Returns:
        ``(row, col, lyric_indices)``: ``row`` and ``col`` are the coordinate
        arrays from ``nonzero()`` on the k-neighbours graph of the data
        against itself, and ``lyric_indices`` maps each verse to its row
        number (its position in ``lyric_to_vector``).
    """
    lyric_indices = {lyric: position for position, lyric in enumerate(lyric_to_vector)}
    data = list(lyric_to_vector.values())
    neigh = NearestNeighbors(n_neighbors=30)
    neigh.fit(data)
    row, col = neigh.kneighbors_graph(data).nonzero()
    return row, col, lyric_indices

# Flattened neighbour-graph coordinates plus the verse -> row number lookup.
knn_rows, knn_cols, lyric_indices = get_knn(lyric_to_vector)

def index_lyrics(lyric_indices):
    """Invert a verse -> index mapping into an index -> verse mapping."""
    return {position: lyric for lyric, position in lyric_indices.items()}
# Row number -> verse text, the inverse of lyric_indices.
index_lyric = index_lyrics(lyric_indices)

# Bag of Words

In [None]:
def bag_of_words(lyric, stopword_rank=23):
    """Return the set of vocabulary ranks of the non-stopwords in a verse.

    Words are looked up (lower-cased) in the global ``sorted_words`` ranking.
    Ranks below ``stopword_rank`` are treated as stopwords and left out.
    Tokens missing from the ranking are skipped: the ranking was built by
    splitting on single spaces, while this function splits on any whitespace,
    so a lookup can legitimately miss.

    Args:
        lyric: Verse text, possibly spanning several lines.
        stopword_rank: Smallest rank that is kept; the default of 23 matches
            the cutoff the notebook has always used.

    Returns:
        Set of integer ranks.
    """
    word_set = set()
    # Splitting the whole verse on whitespace yields the same tokens as
    # splitting it into lines first and then splitting each line.
    for word in lyric.split():
        index = sorted_words.get(word.lower())
        if index is not None and index >= stopword_rank:
            word_set.add(index)
    return word_set

def get_bow_sets(bow, song_lyrics):
    """Apply ``bow`` to every verse of every song.

    Args:
        bow: Callable turning a verse string into its bag-of-words value.
        song_lyrics: Mapping of song key to its list of verse strings.

    Returns:
        Dict mapping each distinct verse to ``bow(verse)``.
    """
    return {
        lyric: bow(lyric)
        for lyrics in song_lyrics.values()
        for lyric in lyrics
    }

# Verse text -> set of vocabulary ranks, for every verse in the corpus.
lyric_to_bow = get_bow_sets(bag_of_words, song_lyrics)

In [None]:
# Takes 13m to run.
def bow_similarity(set1, set2):
    """Overlap score of two sets: ``|intersection| / (|union| + 1)``.

    The extra 1 in the denominator keeps the score defined for two empty sets.
    """
    shared = set1.intersection(set2)
    combined = set1.union(set2)
    return len(shared) / (len(combined) + 1)

def get_bow_knn(lyric_to_bow):
    """Collect the best bag-of-words matches for every verse.

    Every verse is scored against every verse (itself included) with
    ``bow_similarity``. Whenever the running candidate list reaches 60
    entries it is sorted and cut down to its 30 largest, so the returned
    lists hold between 0 and 59 ``(score, (song, verse_index))`` tuples and
    are not guaranteed to be sorted.

    Args:
        lyric_to_bow: Mapping of verse text to its bag-of-words set.

    Returns:
        Dict keyed by the verse's ``(song, verse_index)`` location (looked up
        in the global ``lyric_to_song``).
    """
    bow_knn = {}
    for lyric, lyric_bag in lyric_to_bow.items():
        candidates = []
        for other_lyric, other_bag in lyric_to_bow.items():
            score = bow_similarity(lyric_bag, other_bag)
            candidates.append((score, lyric_to_song[other_lyric]))
            if len(candidates) == 60:
                # Keep only the top half to bound memory.
                candidates = sorted(candidates)[30:]
        bow_knn[lyric_to_song[lyric]] = candidates
    return bow_knn

# All-pairs comparison over every verse; this is the slow step of the cell.
bow_knn = get_bow_knn(lyric_to_bow)

# Merge Similarity Metrics

In [None]:
def get_similar_lyrics(bow_knn, knn_rows, knn_cols):
    """Pick, for every verse, the most similar verse from a different song.

    The best bag-of-words candidate from another song is used when its score
    is at least 0.2. Otherwise the verse's embedding neighbours are scanned in
    stored order and the first one from another song is used, scored by
    cosine similarity.

    Two defects of the previous version are fixed here:
      * ``knn_rows`` / ``knn_cols`` are flattened coordinate arrays, so the
        neighbours of row ``r`` are the entries where ``knn_rows == r``.
        Indexing them directly with ``r`` read another verse's neighbours.
      * The cosine denominator multiplied the row vector's norm by itself
        instead of by the neighbour's norm.

    Args:
        bow_knn: Mapping ``(song, verse_index)`` to a list of
            ``(score, (song, verse_index))`` candidates.
        knn_rows: Row coordinates of the neighbour graph, in non-decreasing
            order (as produced by ``get_knn``).
        knn_cols: Column coordinates aligned with ``knn_rows``.

    Returns:
        ``(bonus, similar_lyric)``: the similarity score and the chosen
        location for every key of ``bow_knn``. A verse with no candidate from
        another song falls back to the best bag-of-words candidate regardless
        of score, or to itself with score 0.0, so every key has an entry.
    """
    knn_rows = np.asarray(knn_rows)
    similar_lyric = {}
    bonus = {}
    for key, candidates in bow_knn.items():
        song = key[0]
        ranked = sorted(candidates, reverse=True)
        best_bow = next((cand for cand in ranked if cand[1][0] != song), None)
        if best_bow is not None and best_bow[0] >= 0.2:
            similar_lyric[key] = best_bow[1]
            bonus[key] = best_bow[0]
            continue

        lyric = song_to_lyric[key]
        row = lyric_indices[lyric]
        # Slice out this row's neighbours from the flattened arrays.
        start = np.searchsorted(knn_rows, row, side="left")
        stop = np.searchsorted(knn_rows, row, side="right")
        match = None
        for col in knn_cols[start:stop]:
            neighbour = index_lyric[int(col)]
            if lyric_to_song[neighbour][0] != song:
                match = neighbour
                break

        if match is not None:
            own_vec = lyric_to_vector[lyric]
            other_vec = lyric_to_vector[match]
            similar_lyric[key] = lyric_to_song[match]
            bonus[key] = np.dot(own_vec, other_vec) / (
                np.linalg.norm(own_vec) * np.linalg.norm(other_vec))
        elif best_bow is not None:
            similar_lyric[key] = best_bow[1]
            bonus[key] = best_bow[0]
        else:
            similar_lyric[key] = key
            bonus[key] = 0.0
    return bonus, similar_lyric
# Per-verse similarity score and the chosen verse location from another song.
bonus, similar_lyric = get_similar_lyrics(bow_knn, knn_rows, knn_cols)

# Markdown Generation

In [None]:
import string
import os

def urlize(s, spacer):
    """Lower-case ``s``, drop ASCII punctuation and replace spaces with ``spacer``."""
    without_punctuation = str.maketrans('', '', string.punctuation)
    lowered = s.lower()
    return lowered.translate(without_punctuation).replace(" ", spacer)

# Anchor ids already written by generate_markdown; shared across all generated
# files so that a given anchor id is attached only once.
tags = set()

def _read_song_metadata(filename):
    """Parse the key/value lines that follow the ``_____`` separator of a song file."""
    metadata_map = {}
    in_metadata = False
    # Read-only access is sufficient, and the context manager closes the file.
    with open(filename, 'r', encoding='utf-8') as fin:
        for line in fin:
            if not in_metadata:
                in_metadata = line[:5] == '_____'
                continue
            # Key is everything up to the first space; the value is the rest
            # with its leading spaces and all colons removed.
            key, _, rest = line.strip().partition(" ")
            metadata_map[key] = rest.lstrip(" ").replace(":", "")
    return metadata_map


def _markdown_path(source_path):
    """Map a path under ``lyrics/database/`` to its generated ``songs/...md`` path."""
    relative = source_path[len("lyrics/database/"):]
    return "songs/" + "/".join(urlize(part, "_") for part in relative.split("/")) + ".md"


def generate_markdown(filename):
    """Write the Quartz markdown page for one song.

    The page has YAML front matter (title, plus artist/album tags when
    present) followed by one ``####`` heading per lyric line. Lines of a verse
    that has an entry in the global ``similar_lyric`` become wiki links to the
    matching verse's page and anchor; the first line of such a verse also
    defines its own anchor the first time that anchor id is seen (tracked in
    the global ``tags``).

    Changes from the previous version: the song file is closed after reading,
    an unused ``similar_lyric[similar]`` lookup that could raise KeyError is
    gone, double quotes and backslashes in the title are escaped so the front
    matter stays valid, and a song without a ``Name`` entry falls back to its
    file name instead of raising KeyError.

    Args:
        filename: Song path below ``lyrics/database/``; must be a key of the
            global ``song_lyrics``.
    """
    lyrics = song_lyrics[filename]
    metadata_map = _read_song_metadata(filename)

    title = metadata_map.get("Name", os.path.basename(filename))
    escaped_title = title.replace("\\", "\\\\").replace('"', '\\"')

    parts = ["---\n", f'title: "{escaped_title}"\n', "tags:\n"]
    if 'Artist' in metadata_map:
        parts.append(f'- {metadata_map["Artist"]}\n')
    if 'Album' in metadata_map:
        parts.append(f'- {metadata_map["Album"]}\n')
    parts.append("---\n")

    for i, lyric in enumerate(lyrics):
        parts.append("&nbsp;\n")
        if (filename, i) not in similar_lyric:
            # No related verse recorded: emit plain headings.
            for line in lyric.strip().split("\n"):
                parts.append(f'#### {line}\n')
            continue

        similar = similar_lyric[(filename, i)]
        other_lyric = song_to_lyric[similar]
        # Link target: anchor derived from the first line of the related verse.
        tag = urlize(other_lyric.split("\n")[0], "-").rstrip("-") + "-vyl-wnanory"
        link = _markdown_path(similar[0])
        for j, line in enumerate(lyric.strip().split("\n")):
            line = line.rstrip(" ")
            parts.append(f'#### [[{link}#{tag}|{line}]]')
            anchor = urlize(line, "-") + "-vyl-wnanory"
            if j == 0 and anchor not in tags:
                tags.add(anchor)
                parts.append(" {#" + anchor + "}")
            parts.append("\n")

    new_filename = _markdown_path(filename)
    os.makedirs(os.path.dirname(new_filename), exist_ok=True)
    with open(new_filename, "w", encoding='utf-8') as f:
        f.write("".join(parts))

def generate_markdowns():
    """Generate a markdown page for every file found under ``lyrics/database``.

    Files are processed in sorted path order.
    """
    filepaths = sorted(
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk("lyrics/database")
        for name in names
    )
    for filepath in filepaths:
        generate_markdown(filepath)

# Write one markdown page under songs/ for every remaining file in lyrics/database.
generate_markdowns()

In [None]:
# Bundle the generated songs/ directory into songs.zip.
import shutil
shutil.make_archive("songs", 'zip', "songs")

'/content/songs.zip'