# Data Pre-processing

In [1]:
import os
import json

## Load lyric files

In [2]:
dataset_path = '../../dataset'

In [3]:
def get_song_files(root_dir: str) -> dict:
    return {int(os.path.splitext(os.path.basename(f))[0]): os.path.join(root_dir, f) for f in os.listdir(root_dir) if f.endswith('.json')}

In [4]:
def load_songs(file_dict: dict):
    song_dict = {}
    for song_id, path in file_dict.items():
        with open(path, 'r') as fp:
            song_dict[song_id] = json.load(fp)
    return song_dict

In [5]:
songid_to_file = get_song_files(os.path.join(dataset_path,'songs'))
songid_to_song = load_songs(songid_to_file)

## Cleanse lyrics

In [6]:
import collections
import itertools
import pickle
import re
from typing import List
import nltk

In [7]:
# Define corpus restrictions.
min_count = 3 # Minimum word count.
min_len = 3 # Minimum sentence length.

In [8]:
# List of tokens that have reserved corpus index.
PAD_token = '<pad>'
SOS_token = '<sos>'
EOS_token = '<eos>'
UNK_token = '<unk>'
reserved_tokens = [PAD_token, SOS_token, EOS_token, UNK_token]

In [9]:
def decontracted(phrase: str):
    """Remove English word contractions.

    Gleaned from: https://stackoverflow.com/a/47091490
    """
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"wanna", "want to", phrase)
    phrase = re.sub(r"gotta", "got to", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase


def clean_lyrics(lyric: str) -> List[List[str]]:
    """Converts a lyric string into a list of sentence tokens.

    Sentences are distinguished by both newline ('\n') and period ('.') separators.
    """
    lyric = lyric.lower() # Convert to common case.
    lyric = re.sub(r'\[[^\]]*\]', '', lyric) # Remove paranthetical content "[*]", like markers for chorus and verses.
    lyric = re.sub(r'\([^\)]*\)', '', lyric) # Remove paranthetical content "(*)", like markers for chorus and verses.
    lyric = lyric.strip() # Remove any extra newlines at the ends.
    lyric = decontracted(lyric) # Remove contractions before tokenizer to handle special cases.

    # Replace any periods with newlines to ensure sentences end with a newline.
    lyric = re.sub(r"\.", r'\n', lyric)

    # Preserve line structure because tokenizer will remove traditional newlines.
    lyric = re.sub(r"(?:\s*\n\s*)+", r'\n', lyric) # Remove repeated newlines.
    lyric = re.sub('\n', ' NEWLINE ', lyric) # Re-map newlines so that tokenizer doesn't remove them.

    # Tokenize the entire song into list of words.
    tokens = nltk.tokenize.word_tokenize(lyric) # Split into word tokens.
    tokens = [word for word in tokens if word.isalpha()] # Careful to remove punct after contractions.

    # Group sentences together by line.
    tokens_lines = list(filter(None, iter(list(group) for key,group in itertools.groupby(tokens, lambda s: s == 'NEWLINE') if not key)))
    return tokens_lines

In [10]:
# Clean the lyrics of each song and combine into 2D list of sentences and tokens.
corpus = list(itertools.chain.from_iterable(clean_lyrics(song['lyrics']) for _,song in songid_to_song.items()))

# Remove sentence duplicates.
corpus = list(map(str.split, set(map(' '.join, corpus))))

# Sort corpus by decreasing length.
corpus.sort(key=len, reverse=True)

# Remove any lines that are below the threshold.
if min_len:
    corpus = list(itertools.takewhile(lambda sen: len(sen) >= min_len, corpus))

# Add EOS token to the end of each sentence.
for i in range(len(corpus)):
    corpus[i].append(EOS_token)

In [11]:
# Generate counts for each word.
counts = collections.Counter(itertools.chain.from_iterable(corpus))

# Remove any counts below the threshold.
if min_count:
    for key,_ in itertools.dropwhile(lambda tup: tup[1] >= min_count, counts.most_common()):
        del counts[key]

In [12]:
# Generate list of unique words.
index2token = reserved_tokens + sorted(set(counts))

# Build mappings for: token --> index
token2index = {token: i for i,token in enumerate(index2token)}

In [13]:
# Generate index vectors based on word assignment.
# Since some words may have been removed, replace all unknown with the UNK token.
vectors = [[token2index.get(token, token2index[UNK_token]) for token in line] for line in corpus]

### Print some vocabulary statistics

In [14]:
print(f"{len(songid_to_song)} songs")
print(f"{len(corpus)} unique lines")
print(f"{len(index2token)} unique words")
print(f"Longest sentence: {len(corpus[0])} words")
print(f"Shortest sentence: {len(corpus[-1])} words")

1339 songs
28735 unique lines
2779 unique words
Longest sentence: 132 words
Shortest sentence: 4 words


In [15]:
print(f"Line:      {corpus[-1]}")
print(f"Embedding: {vectors[-1]}")

Line:      ['lord', 'god', 'almighty', '<eos>']
Embedding: [1393, 1006, 58, 4]


## Collect phonemes

Use the CMU Pronouncing Dictionary (cmudict) to collect phonemes for each word.
- http://www.speech.cs.cmu.edu/cgi-bin/cmudict

In [16]:
# Load CMU dictionary.
cd = nltk.corpus.cmudict.dict()

In [17]:
def count_syllables(word: str):
    """Determine the number of syllables in the given word.

    This function requires the NLTK cmudict.
    """
    return max(iter(sum(re.match(r'.*\d', p) is not None for p in phonemes) for phonemes in cd.get(word, [])), default=0)

In [18]:
# Compute syllable counts for each sentence in the corpus.
syllables = [sum(count_syllables(token) for token in line) for line in corpus]

In [19]:
# Print an example from the corpus.
print(f"{syllables[-1]}: {corpus[-1]}")

5: ['lord', 'god', 'almighty', '<eos>']


## Write embedding to pickle file

In [20]:
# Path to output pickle file.
path = os.path.join(dataset_path, 'lyrics.pickle')

# Contents of pickle will be the following dictionary.
store = {
    'index2token': index2token,
    'token2index': token2index,
    'vectors': vectors,
    'counts': counts,
    'corpus': corpus,
    'syllables': syllables,
}
with open(path, 'wb') as fp:
    pickle.dump(store, fp, protocol=pickle.DEFAULT_PROTOCOL)

print(f'Lyrics vocabulary saved: {path}')

Lyrics vocabulary saved: ../../dataset/lyrics.pickle
