In [1]:
import math
import os
import pandas as pd

from dotenv import load_dotenv
from gensim.models.doc2vec import Doc2Vec
from preprocessing import get_path, prep

In [2]:
# Pull the column names and dataset name from the project's env file.
load_dotenv("variable.env")
titles_env = os.getenv("tit_col")
artists_env = os.getenv("art_col")
lyrics_env = os.getenv("lyr_col")

# Read the CSV named by the `dataset` variable, then keep only the rows
# whose language column equals "en".
dataset_csv = get_path(f"{os.getenv('dataset')}.csv")
df = pd.read_csv(dataset_csv)
is_english = df[os.getenv("lan_col")] == "en"
df = df.loc[is_english]

In [3]:
# Build the preprocessing helper over the English-only frame.
# NOTE(review): the set is presumably the punctuation that `prep` treats
# specially (the corpus preview still contains '?') — confirm in preprocessing.
punctuation_marks = {"!", "?"}
lyr_we = prep(df, punctuation_marks)

In [4]:
# Build the corpus from the lyrics column; the title and artist columns are
# passed alongside (presumably to form each document's tag — confirm in
# preprocessing).
tag_columns = [titles_env, artists_env]
lyr_we.lwe_corpus(lyrics_env, tag_columns)

# Print the rounded-up square root of the vocabulary size.
vocab_sqrt = math.sqrt(lyr_we.vocabular)
print(math.ceil(vocab_sqrt))

# Preview the first five training documents.
lyr_we.lwe_train_corpus[:5]

244


[lyric_doc(words=['i', 'shot', 'the', 'sheriff', 'but', 'i', 'did', 'not', 'shoot', 'no', 'deputy', 'oh', 'no', 'oh', 'i', 'shot', 'the', 'sheriff', 'but', 'i', 'did', 'not', 'shoot', 'no', 'deputy', 'ooh', 'ooh', 'ooh', 'yeah', 'all', 'around', 'in', 'my', 'hometown', 'they', 'trying', 'to', 'track', 'me', 'down', 'yeah', 'they', 'say', 'they', 'want', 'to', 'bring', 'me', 'in', 'guilty', 'for', 'the', 'killing', 'of', 'a', 'deputy', 'for', 'the', 'life', 'of', 'a', 'deputy', 'but', 'i', 'say', 'oh', 'now', 'now', 'oh', 'i', 'shot', 'the', 'sheriff', 'the', 'sheriff', '),', 'but', 'i', 'swear', 'it', 'was', 'in', 'self', 'defense', 'no', 'no', 'ooh', 'ooh', 'ooh', 'yeah', 'i', 'said', 'i', 'shot', 'the', 'sheriff', 'oh', 'lord', 'and', 'they', 'say', 'it', 'is', 'a', 'capital', 'offense', 'oh', 'now', 'yeah', 'ooh', 'ooh', 'ooh', 'yeah', 'hear', 'this', 'sheriff', 'john', 'brown', 'always', 'hated', 'me', 'for', 'what', '?', 'i', 'do', 'not', 'know', 'every', 'time', 'i', 'plant', 'a'

In [5]:
# Compute the words-per-sentence statistic on the lyrics column; the average
# is read back from the helper and reused below as the Doc2Vec window size.
lyr_we.words_per_sentence(lyrics_env)
avg_words = lyr_we.average_words_per_sentence
print(avg_words)

6.191647504182561


In [6]:
# Hyperparameters for the Doc2Vec model, gathered in one place.
doc2vec_params = dict(
    dm=1,             # distributed-memory (PV-DM) training algorithm
    vector_size=168,  # dimensionality of each document vector
    # context window = average words per sentence, rounded up
    window=math.ceil(lyr_we.average_words_per_sentence),
    # NOTE(review): per the gensim docs a fixed seed alone is not fully
    # deterministic unless training is also limited to a single worker.
    seed=42,
    min_count=2,      # ignore words that occur fewer than 2 times
    dm_mean=1,        # average (rather than sum) the context vectors
    epochs=10,
)
lyric_model = Doc2Vec(**doc2vec_params)

# Build the vocabulary, then train on the same corpus for the configured epochs.
train_docs = lyr_we.lwe_train_corpus
lyric_model.build_vocab(train_docs)
lyric_model.train(
    train_docs,
    total_examples=lyric_model.corpus_count,
    epochs=lyric_model.epochs,
)

In [7]:
# Turn the trained document vectors into a flat table: one row per document,
# with the title, the artist and one `lyr_we_<i>` column per vector dimension.
#
# Each document tag is split into two parts at the FIRST underscore only.
# A plain `split("_")` followed by `zip(*...)` silently truncated any tag that
# contained additional underscores, because `zip` stops at the shortest list.
# Splitting once keeps the whole remainder, so no part of a tag is dropped.
# NOTE(review): this assumes tags look like "<title>_<artist>" and that a
# stray underscore belongs to the second part — confirm against the tag
# format built in preprocessing (use `rsplit("_", 1)` if it is the reverse).
tag_parts = [tag.split("_", 1) for tag in lyric_model.dv.index_to_key]

# A tag without any separator cannot be mapped to (title, artist); fail with
# an explicit message instead of an opaque unpacking error.
malformed_tags = [parts[0] for parts in tag_parts if len(parts) != 2]
if malformed_tags:
    raise ValueError(
        f"document tags without a '_' separator: {malformed_tags[:5]}"
    )

name = tuple(title for title, _ in tag_parts)
artists = tuple(artist for _, artist in tag_parts)

df_lyr_we = pd.DataFrame({
    titles_env: name,
    artists_env: artists,
    # Transposing yields one array per embedding dimension (a column of the
    # output table); dimensions are numbered from 1.
    **{f"lyr_we_{i}": list(we) for i, we in enumerate(lyric_model.dv.vectors.T, start = 1)}
})

In [8]:
# Write the embedding table next to the project data, without the index column.
output_csv = f"{get_path('')}/lyr_we.csv"
df_lyr_we.to_csv(output_csv, index=False)