# Lyric Mood Classification - Word Embeddings

Steps:

1. Build the vocabulary from the _full_ set of song lyrics (including those without labels), then save it to disk (TODO: specify the output format and location).

word2vec architectures:

* skip-gram
* CBOW (continuous bag-of-words)

In [3]:
import os
from index_lyrics import read_file_contents
from collections import defaultdict
import time
import pandas as pd

# Directory whose entries are listed and read as lyric files by the cells below.
# This is a relative path, so it is resolved against the notebook's working directory.
lyrics_dir_root = 'data/lyrics/txt'

## How many unique words do we have?

In [13]:
# Count how often each whitespace-delimited token occurs across all lyric files
# under `lyrics_dir_root`, printing a progress line every 10,000 files.
start = time.time()

# `int` yields the same zero default as `lambda: 0`, but keeps the mapping
# picklable (pickle cannot serialise lambdas), so the vocabulary can be saved.
unique_words = defaultdict(int)
# os.listdir returns entries in arbitrary order; sorting makes the progress
# output reproducible from run to run. The final counts are unaffected.
lyricfiles = sorted(os.listdir(lyrics_dir_root))
num_files = len(lyricfiles)
contents_processed = 0
for count, lyricfile in enumerate(lyricfiles):
    lyricfile = os.path.join(lyrics_dir_root, lyricfile)
    # Progress is reported *before* the current file is read, so the figures
    # describe the files handled so far.
    if count % 10000 == 0:
        print('{0}/{1} lyric files processed. {2:.02f} minutes elapsed. {3} contents processed. {4} unique words acquired.'.format(
            count, num_files, (time.time() - start) / 60, contents_processed, len(unique_words)))
    contents = read_file_contents(lyricfile)
    # Skip files that yield nothing usable. NOTE(review): contents[0] is
    # presumably the lyric text -- confirm against index_lyrics.read_file_contents.
    if contents and contents[0]:
        # Plain whitespace split: case and punctuation are left as-is, so
        # e.g. "I" and "i" are counted as different words.
        split = contents[0].split()
        for word in split:
            unique_words[word] += 1
        contents_processed += 1

end = time.time()
elapsed = (end - start) / 60

print('Elapsed Time: {0} minutes.'.format(elapsed))

0/294299 lyric files processed. 0.00 minutes elapsed. 0 contents processed. 0 unique words acquired.
10000/294299 lyric files processed. 0.01 minutes elapsed. 9547 contents processed. 122479 unique words acquired.
20000/294299 lyric files processed. 0.02 minutes elapsed. 19145 contents processed. 195130 unique words acquired.
30000/294299 lyric files processed. 0.03 minutes elapsed. 28736 contents processed. 257393 unique words acquired.
40000/294299 lyric files processed. 0.04 minutes elapsed. 38320 contents processed. 308552 unique words acquired.
50000/294299 lyric files processed. 0.05 minutes elapsed. 47911 contents processed. 357693 unique words acquired.
60000/294299 lyric files processed. 0.06 minutes elapsed. 57519 contents processed. 402886 unique words acquired.
70000/294299 lyric files processed. 0.07 minutes elapsed. 67117 contents processed. 446049 unique words acquired.
80000/294299 lyric files processed. 0.08 minutes elapsed. 76664 contents processed. 484761 unique word

In [14]:
# Tabulate the per-word counts (one row per word, indexed by the word itself),
# order them from most to least frequent, and preview the 20 most common.
df = (
    pd.DataFrame.from_dict(unique_words, orient='index', columns=['count'])
    .sort_values('count', ascending=False)
)
df.head(20)

Unnamed: 0,count
the,1888745
I,1607271
you,1313028
to,1119509
a,1003483
me,748343
and,704609
my,614638
in,613737
of,582333


# TensorFlow Word2vec
Tutorial: http://adventuresinmachinelearning.com/word2vec-tutorial-tensorflow/

Steps:

1. Build a list containing all words in the dataset
2. "Extract the top V most common words to include in our embedding vector"
3. "Gather together all the unique words and index them with a unique integer value – this is what is required to create an equivalent one-hot type input for the word.  We’ll use a dictionary to do this"
4. "Loop through every word in the dataset (vocabulary variable) and assign it to the unique integer word identified, created in Step 2 above [Step 3 in this list].  This will allow easy lookup / processing of the word data stream"

In [None]:
from nltk import word_tokenize
from nltk.corpus import stopwords
import string

# Lazily-built set of tokens to discard; populated on first use by _stop_tokens().
_STOP_TOKENS = None


def _stop_tokens():
    """Return the cached set of English stop words plus punctuation characters."""
    global _STOP_TOKENS
    if _STOP_TOKENS is None:
        # Built once instead of on every call, and stored as a set so each
        # membership test is a hash lookup rather than a scan of a list.
        _STOP_TOKENS = frozenset(stopwords.words('english')) | frozenset(string.punctuation)
    return _STOP_TOKENS


def lyrics_preprocessing(lyrics):
    """
    Apply this function to any lyric file contents before reading for embeddings.

    The text is lower-cased, tokenized with nltk's ``word_tokenize``, and every
    token that is an English stop word or a single ``string.punctuation``
    character is dropped. Multi-character punctuation tokens are not filtered.

    Args:
        lyrics: the lyric text as a string.

    Returns:
        A list of the remaining lower-cased tokens, in their original order.
    """
    # https://stackoverflow.com/questions/17390326/getting-rid-of-stop-words-and-document-tokenization-using-nltk
    stop = _stop_tokens()
    return [token for token in word_tokenize(lyrics.lower()) if token not in stop]