# Doc2Vec Helper Function Class

This notebook contains the following classes/methods that support a Doc2Vec implementation:
* A TaggedLineDocument class which allows the reading of multiple txt files
* convert_lyrics_to_d2v, which takes an iterable of song lyrics and produces a .d2v model

In [7]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from random import shuffle

In [2]:
# Custom TaggedLineDocument class that reads several line-oriented text sources, tagging each line with its source's prefix
# =====================================================================================================================
class TaggedLineDocument(object):
    """Read one or more line-oriented text files as ``TaggedDocument`` objects.

    ``sources`` maps a file path to a tag prefix. Every line of every file
    becomes one ``TaggedDocument`` whose words are the whitespace-separated
    tokens of that line and whose single tag is ``'<prefix>_<line number>'``,
    with line numbers starting at 0 within each file.
    """

    def __init__(self, sources):
        """Store ``sources`` and check that every prefix is used only once.

        Parameters
        ----------
        sources : dict
            Mapping of file path -> tag prefix.

        Raises
        ------
        Exception
            If two sources share the same prefix; the generated tags would
            otherwise collide.
        """
        self.sources = sources
        # Filled by to_array(); None means "not materialised yet".
        self.sentences = None

        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise Exception('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    @staticmethod
    def _open_source(source):
        """Open ``source`` for reading with whichever opener gensim exposes."""
        opener = getattr(utils, 'smart_open', None)
        if opener is not None:
            return opener(source)
        # NOTE(review): newer gensim releases appear to expose only
        # ``utils.open``; binary mode is used so that utils.to_unicode does
        # the decoding in both cases -- confirm against the installed version.
        return utils.open(source, 'rb')

    def __iter__(self):
        """Yield one ``TaggedDocument`` per line, source by source."""
        for source, prefix in self.sources.items():
            with self._open_source(source) as fin:
                for item_no, line in enumerate(fin):
                    words = utils.to_unicode(line).split()
                    yield TaggedDocument(words, [prefix + '_%s' % item_no])

    def to_array(self):
        """Read every source into a list, cache it on ``self.sentences`` and return it."""
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Shuffle the cached documents in place and return them.

        The documents are loaded first if ``to_array()`` has not been called
        yet, so this method no longer fails on a fresh instance.
        """
        if self.sentences is None:
            self.to_array()
        shuffle(self.sentences)
        return self.sentences

# OK, let's make a Doc2Vec function that will take in all the lyrics & spit out a d2v model that you can use.
def convert_lyrics_to_d2v(allLyricsList, model_min_count=1, model_window=10, model_vector_size=100, model_sample=1e-4, model_negative=5, model_workers=7, model_epoch_range=10):
    """
    Train a Doc2Vec model on song lyrics and save it to ``./song_lyrics.d2v``.

    The lyrics are first written to ``all_song_lyrics.txt`` (one song per
    line) in the current working directory, then read back through
    ``TaggedLineDocument`` so that song ``i`` is tagged ``SONG_NUMBER_<i>``.

    Parameters
    ----------
    allLyricsList : iterable of str
        All song lyrics, where each element is a single string holding the
        lyrics of one song.
    model_min_count, model_window, model_vector_size, model_sample, model_negative, model_workers
        Forwarded to ``Doc2Vec`` as ``min_count``, ``window``, ``vector_size``,
        ``sample``, ``negative`` and ``workers``. For their meaning see:
        https://radimrehurek.com/gensim/models/doc2vec.html#gensim.models.doc2vec.Doc2Vec
    model_epoch_range : int
        Number of shuffle-and-train rounds to run.

    Returns
    -------
    Doc2Vec
        The trained model (the same one saved to ``./song_lyrics.d2v``).
    """
    # Write UTF-8 explicitly: the file is decoded as UTF-8 when read back.
    with open('all_song_lyrics.txt', 'w', encoding='utf-8') as master_txt:
        for song_lyrics in allLyricsList:
            # Collapse all whitespace (including embedded newlines) so each
            # song occupies exactly one line. The reader tokenises with
            # str.split(), so the resulting tokens are unchanged.
            master_txt.write(' '.join(song_lyrics.split()))
            master_txt.write('\n')

    sources = {'all_song_lyrics.txt': 'SONG_NUMBER'}
    sentences = TaggedLineDocument(sources)

    # Pass the caller's hyperparameters under the names Doc2Vec expects.
    model = Doc2Vec(
        min_count=model_min_count,
        window=model_window,
        vector_size=model_vector_size,
        sample=model_sample,
        negative=model_negative,
        workers=model_workers,
    )

    model.build_vocab(sentences.to_array())

    # NOTE(review): each round runs model.epochs passes, so the total number
    # of passes is model_epoch_range * model.epochs -- confirm this is intended.
    for _ in range(model_epoch_range):
        model.train(sentences.sentences_perm(), total_examples=model.corpus_count, epochs=model.epochs)

    model.save('./song_lyrics.d2v')
    return model

---