##### This notebook provides a basic exploratory data analysis and demonstrates the interfaces of the `Tokenizer` and `Embedding` modules created for language modeling.
---

In [None]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

## EDA

In [2]:
import pandas as pd
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the lyrics dataset from disk, then show its (rows, columns) shape and
# the first few rows as a quick sanity check.
df = pd.read_csv("Data/df_lyrics.csv")
print(df.shape)
df.head()

(80285, 4)


Unnamed: 0,Artist,Song,Lyrics,Genre
0,Britney Spears,...Baby One More Time,"Oh, baby, baby Oh, baby, baby Oh, baby, baby...",Pop
1,Britney Spears,Toxic,"Baby, can't you see I'm calling? A guy like y...",Pop
2,Britney Spears,Work Bitch,You wanna? You wanna? You want a hot body? Y...,Pop
3,Britney Spears,Oops!... I Did It Again,"Mmm, yeah Yeah, yeah, yeah, yeah, yeah, yeah ...",Pop
4,Britney Spears,If U Seek Amy,"La, la, la, la, la-la, la, la La, la, la, la,...",Pop


In [4]:
# Songs per genre: for each distinct Genre value, count the rows that match it.
for genre in set(df.Genre):
    in_genre = df.Genre == genre
    print(genre, int(in_genre.sum()))

Rock 21962
Rap 18331
Pop 18275
Country 21717


In [5]:
# Peek at the first `limit` songs: print each lyric split on whitespace and
# whether the lyric text contains a full stop.
# `head(limit)` replaces the manual countdown/break over `iterrows()`, so
# `limit` is no longer mutated and the cell gives the same result on re-run.
limit = 5
for lyric_text in df["Lyrics"].head(limit):
    print(lyric_text.split())
    print("." in lyric_text)

['Oh,', 'baby,', 'baby', 'Oh,', 'baby,', 'baby', 'Oh,', 'baby,', 'baby', 'How', 'was', 'I', 'supposed', 'to', 'know', 'That', 'something', "wasn't", 'right', 'here?', 'Oh,', 'baby,', 'baby', 'I', "shouldn't", 'have', 'let', 'you', 'go', 'And', 'now', "you're", 'out', 'of', 'sight,', 'yeah', 'Show', 'me', 'how', 'you', 'want', 'it', 'to', 'be', 'Tell', 'me,', 'baby,', "'cause", 'I', 'need', 'to', 'know', 'now', 'Oh,', 'because', 'My', 'loneliness', 'is', 'killing', 'me', '(And', 'I)', 'I', 'must', 'confess,', 'I', 'still', 'believe', '(Still', 'believe)', 'When', "I'm", 'not', 'with', 'you,', 'I', 'lose', 'my', 'mind', 'Give', 'me', 'a', 'sign', 'Hit', 'me,', 'baby,', 'one', 'more', 'time', 'Oh,', 'baby,', 'baby', 'The', 'reason', 'I', 'breathe', 'is', 'you', '(Oh,', 'yeah)', 'Boy,', 'you', 'got', 'me', 'blinded', 'Oh,', 'pretty,', 'baby', "There's", 'nothing', 'that', 'I', "wouldn't", 'do', "It's", 'not', 'the', 'way', 'I', 'planned', 'it', 'Show', 'me', 'how', 'you', 'want', 'it', 'to

In [6]:
# Cap every lyric at MAX_LENGTH characters; shorter lyrics are kept whole
# because slicing past the end of a string simply returns the full string.
MAX_LENGTH = 1024
lyrics = [text[:MAX_LENGTH] for text in df["Lyrics"]]

## Tokenizer

In [7]:
from preprocessing.tokenize import Tokenizer


tok = Tokenizer()

# Training is left disabled here: the next cell loads an already saved
# tokenizer from the same path. Uncomment to re-fit the tokenizer on `lyrics`
# (presumably `path` is where the fitted tokenizer is written — confirm
# against Tokenizer.fit).
# tok.fit(sentences = lyrics, path = "Weights/tokenizer.json")

In [8]:
# Load the tokenizer state stored in Weights/tokenizer.json.
tok.load(path = "Weights/tokenizer.json")

In [9]:
# Tokenize a sample sentence; the recorded result holds both the token strings
# ('tokens') and their integer ids ('token_ids'), one inner list per sentence.
tok.tokenize("I'm a little teapot", get_token_ids=True)

2023-04-09 21:58:54.162232: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


{'tokens': [['<s>', 'I', '_am', '_a', '_little', '_te', 'ap', 'ot', '</s>']],
 'token_ids': [[0, 432, 51, 57, 778, 2153, 1821, 6834, 87]]}

In [10]:
# Round-trip check: decode the ids the tokenizer produces for the sample
# sentence. Deriving the ids here (instead of pasting a literal id list) keeps
# the cell valid if the tokenizer is retrained and its vocabulary ids change.
sample_encoding = tok.tokenize("I'm a little teapot", get_token_ids=True)
tok.decode(sample_encoding["token_ids"])

['I am a little teapot']

In [11]:
# Number of entries in `tok.index_word` (presumably the tokenizer's vocabulary size — confirm).
len(tok.index_word)

39903

In [12]:
# Repeats the earlier tokenization call in this section (identical input and
# arguments); the recorded output is the same tokens and ids.
tok.tokenize("I'm a little teapot", get_token_ids=True)

{'tokens': [['<s>', 'I', '_am', '_a', '_little', '_te', 'ap', 'ot', '</s>']],
 'token_ids': [[0, 432, 51, 57, 778, 2153, 1821, 6834, 87]]}

In [21]:
# Decode the sample sentence's ids while keeping the special start/end tokens
# in the text. The ids are taken from the tokenizer's own output rather than a
# pasted literal list, so the cell stays valid if the vocabulary ids change.
sample_encoding = tok.tokenize("I'm a little teapot", get_token_ids=True)
tok.decode(sample_encoding["token_ids"], remove_special_tokens=False)

['<s>I am a little teapot</s>']

## Embedding

In [14]:
from preprocessing.embeddings import Embedding

# Size passed to the trainer as `embeddings_size`; also embedded in the output file name.
embedding_size = 300
# Tokenize the whole lyrics list; only the token strings ("tokens") are fed to training below.
tokens = tok.tokenize(lyrics, get_token_ids=True)

embedder = Embedding()
# Train on the tokenized lyrics. The cell's recorded log reports a word2vec
# model being trained and then saved to the given path.
# NOTE(review): this retrains on every run; consider guarding it with a cache check.
embedder.train(sentences=tokens["tokens"], embeddings_size=embedding_size, path=f"Weights/embeddings_{embedding_size}_w2v.txt")

training word2vec model with 80285 sentences
finished training >> saving to Weights/embeddings_300_w2v.txt


In [15]:
# Tokens most similar to "_you" in the trained embedding model; the recorded
# output is a list of (token, similarity score) pairs.
embedder.model.most_similar("_you")
# Alternative inspection, left disabled: fetch the vector for "_you".
# embedder.model.get_vector("_you")

[('_I', 0.7497004270553589),
 ('_me', 0.7384284138679504),
 ('_You', 0.7070844769477844),
 ('_that', 0.6351985931396484),
 ('_shudder', 0.6319519281387329),
 ('_we', 0.6306412220001221),
 ('_legitimately', 0.6248846054077148),
 ('_know', 0.6207215785980225),
 ('_not', 0.6197407841682434),
 ('_reassure', 0.6174041032791138)]

In [16]:
# Vocabulary size reported by the embedder.
embedder.vocab_size

39903

In [17]:
from collections import Counter

from tqdm import tqdm

# Count how often each token occurs across the tokenized corpus.
# Counter is a dict subclass, so len() and key lookups on `word_counts` behave
# as before; `update` adds one count per token in the sentence, replacing the
# manual "initialise to 0 then increment" bookkeeping.
word_counts = Counter()
for sentence_tokens in tqdm(tokens["tokens"]):
    word_counts.update(sentence_tokens)

100%|██████████| 80285/80285 [00:02<00:00, 34818.92it/s]


In [18]:
# Number of distinct tokens that occur in the tokenized corpus.
len(word_counts)

39903

In [20]:
# Tokenize another sample to see sub-word splitting. Note that `get_token_ids`
# is not passed here, yet the recorded output still includes 'token_ids'.
tok.tokenize("this is christmas")

{'tokens': [['<s>', 'this', '_is', '_christ', 'mas', '</s>']],
 'token_ids': [[0, 14092, 40, 16091, 11467, 87]]}