In [22]:
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np
import re
import matplotlib.pyplot as plt
from tqdm import tqdm
import json

In [2]:
f = open("biblia-em-txt.txt", "r")
text = f.read()

Dando uma olhada no txt:

In [3]:
print(text[2995:3150])

 domine ele sobre os peixes do mar, sobre as aves do céu, sobre os animais domésticos, e sobre toda a terra, e sobre todo réptil que se arrasta sobre a ter


In [4]:
def tokenize_sentece(sent):
    tokenized = word_tokenize(sent)
    tokenized = [token.lower() for token in tokenized if token.isalpha()]
    return tokenized

def tokenize_text(text):
    sentences = sent_tokenize(text)
    tokenized_senteces = [tokenize_sentece(sent) for sent in sentences]
    return tokenized_senteces

In [5]:
sent_tokens = tokenize_text(text)

Vamos ver os tokens

In [6]:
sent_tokens[200]

['tudo',
 'o',
 'que',
 'tinha',
 'fôlego',
 'do',
 'espírito',
 'de',
 'vida',
 'em',
 'suas',
 'narinas',
 'tudo',
 'o',
 'que',
 'havia',
 'na',
 'terra',
 'seca',
 'morreu']

In [7]:
def make_vocab_and_freq(sent_tokenized):
    i = 0
    vocab = {}
    freqs = {}
    for sent in sent_tokenized:
        for token in sent:
            if token in vocab:
                freqs[token]+=1
            else:
                vocab[token] = i
                freqs[token] = 1
                i+=1
    return vocab,freqs

In [8]:
vocab,freqs = make_vocab_and_freq(sent_tokens)
V = len(vocab)

In [9]:
sorted(vocab)[:15]

['a',
 'aará',
 'aazai',
 'aba',
 'abadom',
 'abafado',
 'abagta',
 'abaixa',
 'abaixam',
 'abaixando',
 'abaixaram',
 'abaixará',
 'abaixarás',
 'abaixava',
 'abaixavam']

In [10]:
vocab['aazai']

13654

Criando as frequências corrigidas das palavras, para o negative sampling

In [11]:
def fixed_distribution(freqs,alpha=3/4):
    noise_dist = {key: val ** alpha for key, val in freqs.items()}
    denominator = sum(noise_dist.values())
    noise_dist_normalized = {key: val / denominator for key, val in noise_dist.items()}
    return noise_dist_normalized

In [12]:
original_dist = np.random.exponential(size = 10000)
original_dist = original_dist/np.sum(original_dist)
trans_original = original_dist**(3/4)
transformed_dist = trans_original/np.sum(trans_original)

In [13]:
freqs_fixed = fixed_distribution(freqs)

In [14]:
np.random.choice(list(freqs_fixed.keys()), p=list(freqs_fixed.values()), size=3, replace = True)

array(['vir', 'tribulação', 'regeneração'], dtype='<U19')

In [15]:
def get_windows(words, C):
    i = C
    while i < len(words) - C:
        center_word = words[i]
        context_words = words[(i - C):i] + words[(i+1):(i+C+1)]
        yield context_words, center_word
        i += 1

In [16]:
def sigmoid(z):
    """
    Compute sigmoid function
    
    Inputs:
    ----
    z (int or np.array): value on which to perform the sigmoid
    
    Outputs:
    ----
    Value of sigmoid
    
    """
    return 1/(1 + np.exp(-z))

In [17]:
def initialize_params(V, d):
    """
    Initialize params from context and central words
    
    Inputs:
    ----
    V (int): number of words in the vocabulary
    d (int): desired embedding dimention
    
    Outputs:
    ----
    V_W (numpy array): random initialization for words matrix
    V_C (numpy array): random initialization for context matrix
    """
    #central words
    V_W = np.random.randn(V,d)
    #context words
    V_C = np.random.randn(V,d)
    
    return V_W, V_C

In [18]:
def compute_cost(positive_sigmoid, negative_sigmoids):
    """
    Compute cost from training
    
    Inputs:
    ----
    positive_sigmoid (int): σ(v^T_wO.vw_I )
    negative_sigmoids (numpy array): σ(-v^T_wO.vw_I ) para cada negativo
    
    Outputs:
    ----
    E (int): computed cost
    """
    E_1 = -np.log(positive_sigmoid)
    E_2 = -np.sum(np.log(negative_sigmoids))
    E = E_1 + E_2
    return E 

In [19]:
def get_gradients(v_w,v_c, W_neg, positive_sigmoid, negative_sigmoids):
    #gradient central word
    negative_part = [negative_sigmoids[i]*W_neg[i] for i in range(len(W_neg))]
    d_central = (positive_sigmoid - 1)*v_c + np.sum(negative_part)
    #gradient positive word
    d_pos = (positive_sigmoid - 1)*v_w
    #gradient
    d_neg = [n_sig*v_w for n_sig in negative_sigmoids]
    return d_central, d_pos, d_neg

In [23]:
sent_tokens_orig = sent_tokens
vocab = vocab
V = len(vocab)
distribution = freqs_fixed
d=100
C=2
k=5
learning_rate = 0.005
verbose=True
    
V_W, V_C = initialize_params(V, d)
j = -1
costs = []
for i in range(2):
    print(f"EPOCH: {i}")
    for sentence in tqdm(sent_tokens):
        j+=1
        if verbose==True and j%1000==0 and j!=0:
            print("iteração: {:.0f} | custo médio 100 ultimas its: {:.4f}".format(int(j),np.mean(costs[-100:])))
            encoded_embeddings = {}
            for word, i in vocab.items():
                encoded_embeddings[word] = list(V_W[i])
            with open(f'models/bible_embeddings_{j}.json', 'w') as file:
                file.write(json.dumps(encoded_embeddings)) 

        for context_words, central_word in get_windows(sentence,C):
            for context in context_words:
                #negative sampling
                W_neg_words = np.random.choice(list(vocab.keys()), p=list(freqs_fixed.values()), size=k)

                #getting the vectors
                v_w = V_W[vocab[central_word]]
                v_c = V_C[vocab[context]]
                W_neg = np.array([V_C[vocab[w]] for w in W_neg_words])

                #auxiliar value
                positive_sigmoid = sigmoid(np.dot(v_c.T,v_w))
                negative_sigmoids_1 = np.array([sigmoid(np.dot(-w_neg.T,v_w)) for w_neg in W_neg])
                negative_sigmoids = np.array([sigmoid(np.dot(w_neg.T,v_w)) for w_neg in W_neg])

                #get gradients
                d_central, d_pos, d_neg = get_gradients(v_w,v_c, W_neg, positive_sigmoid, negative_sigmoids)

                #calculate cost
                cost = compute_cost(positive_sigmoid, negative_sigmoids_1)
                costs.append(cost)

                #update params 
                V_W[vocab[central_word]]-=learning_rate*d_central
                V_C[vocab[context]]-=learning_rate*d_pos
                for word,g_wj in zip(W_neg_words,d_neg):
                    V_C[vocab[word]] -= learning_rate*g_wj
            

  0%|          | 0/30118 [00:00<?, ?it/s]

EPOCH: 0


  3%|▎         | 1000/30118 [07:24<2:30:16,  3.23it/s]

iteração: 1000 | custo médio 100 ultimas its: 18.6258


  7%|▋         | 2000/30118 [15:46<4:51:34,  1.61it/s] 

iteração: 2000 | custo médio 100 ultimas its: 15.7259


 10%|▉         | 3000/30118 [28:46<6:15:59,  1.20it/s] 

iteração: 3000 | custo médio 100 ultimas its: 12.6970


 13%|█▎        | 4000/30118 [40:29<3:54:48,  1.85it/s] 

iteração: 4000 | custo médio 100 ultimas its: 14.1077


 17%|█▋        | 5000/30118 [54:16<12:00:50,  1.72s/it]

iteração: 5000 | custo médio 100 ultimas its: 10.3169


 20%|█▉        | 6000/30118 [1:05:33<3:22:18,  1.99it/s] 

iteração: 6000 | custo médio 100 ultimas its: 7.5436


 23%|██▎       | 7000/30118 [1:12:10<2:02:05,  3.16it/s]

iteração: 7000 | custo médio 100 ultimas its: 6.9860


 27%|██▋       | 8000/30118 [1:20:09<3:31:54,  1.74it/s]

iteração: 8000 | custo médio 100 ultimas its: 7.2589


 30%|██▉       | 9000/30118 [1:28:26<2:04:13,  2.83it/s] 

iteração: 9000 | custo médio 100 ultimas its: 7.1258


 33%|███▎      | 10000/30118 [1:34:49<2:20:17,  2.39it/s]

iteração: 10000 | custo médio 100 ultimas its: 10.3622


 37%|███▋      | 11000/30118 [1:43:31<3:39:19,  1.45it/s] 

iteração: 11000 | custo médio 100 ultimas its: 6.6594


 40%|███▉      | 12000/30118 [1:52:10<2:10:54,  2.31it/s]

iteração: 12000 | custo médio 100 ultimas its: 10.4516


 43%|████▎     | 13000/30118 [1:58:56<1:49:58,  2.59it/s] 

iteração: 13000 | custo médio 100 ultimas its: 10.1379


 46%|████▋     | 14000/30118 [2:03:37<1:29:02,  3.02it/s]

iteração: 14000 | custo médio 100 ultimas its: 9.6829


 50%|████▉     | 15000/30118 [2:08:14<1:40:02,  2.52it/s]

iteração: 15000 | custo médio 100 ultimas its: 10.7737


 53%|█████▎    | 16000/30118 [2:13:04<53:55,  4.36it/s]  

iteração: 16000 | custo médio 100 ultimas its: 8.2775


 56%|█████▋    | 17000/30118 [2:19:14<2:01:07,  1.81it/s]

iteração: 17000 | custo médio 100 ultimas its: 11.1717


 60%|█████▉    | 18000/30118 [2:27:46<2:20:41,  1.44it/s]

iteração: 18000 | custo médio 100 ultimas its: 9.3903


 63%|██████▎   | 19000/30118 [2:36:22<2:27:20,  1.26it/s]

iteração: 19000 | custo médio 100 ultimas its: 7.0446


 66%|██████▋   | 20000/30118 [2:46:34<1:09:44,  2.42it/s]

iteração: 20000 | custo médio 100 ultimas its: 7.4106


 70%|██████▉   | 21000/30118 [2:56:44<1:32:13,  1.65it/s]

iteração: 21000 | custo médio 100 ultimas its: 4.7350


 73%|███████▎  | 22000/30118 [3:05:49<1:29:56,  1.50it/s]

iteração: 22000 | custo médio 100 ultimas its: 7.1248


 76%|███████▋  | 23000/30118 [3:13:14<22:11,  5.35it/s]  

iteração: 23000 | custo médio 100 ultimas its: 8.7571


 80%|███████▉  | 24000/30118 [3:21:20<1:13:15,  1.39it/s]

iteração: 24000 | custo médio 100 ultimas its: 10.3582


 83%|████████▎ | 25000/30118 [3:29:54<47:39,  1.79it/s]  

iteração: 25000 | custo médio 100 ultimas its: 9.1739


 86%|████████▋ | 25999/30118 [3:37:24<16:30,  4.16it/s]  

iteração: 26000 | custo médio 100 ultimas its: 7.5418


 90%|████████▉ | 27000/30118 [3:42:52<31:52,  1.63it/s]

iteração: 27000 | custo médio 100 ultimas its: 5.7208


 93%|█████████▎| 28000/30118 [3:49:22<12:48,  2.76it/s]  

iteração: 28000 | custo médio 100 ultimas its: 5.7872


 96%|█████████▋| 29000/30118 [3:57:04<06:04,  3.07it/s]

iteração: 29000 | custo médio 100 ultimas its: 7.1040


100%|█████████▉| 30000/30118 [4:05:14<00:49,  2.38it/s]

iteração: 30000 | custo médio 100 ultimas its: 7.4819


100%|██████████| 30118/30118 [4:06:07<00:00,  2.04it/s]
  0%|          | 0/30118 [00:00<?, ?it/s]

EPOCH: 1


  3%|▎         | 882/30118 [05:18<2:47:17,  2.91it/s] 

iteração: 31000 | custo médio 100 ultimas its: 4.4251


  6%|▌         | 1882/30118 [11:36<3:33:05,  2.21it/s]

iteração: 32000 | custo médio 100 ultimas its: 8.3776


 10%|▉         | 2882/30118 [20:19<5:04:52,  1.49it/s] 

iteração: 33000 | custo médio 100 ultimas its: 4.4838


 13%|█▎        | 3881/30118 [30:10<2:43:39,  2.67it/s] 

iteração: 34000 | custo médio 100 ultimas its: 5.9032


 16%|█▌        | 4882/30118 [42:15<4:18:55,  1.62it/s] 

iteração: 35000 | custo médio 100 ultimas its: 6.1807


 20%|█▉        | 5882/30118 [53:23<5:36:34,  1.20it/s] 

iteração: 36000 | custo médio 100 ultimas its: 9.1103


 23%|██▎       | 6882/30118 [1:01:14<2:55:59,  2.20it/s]

iteração: 37000 | custo médio 100 ultimas its: 8.1947


 26%|██▌       | 7882/30118 [1:08:44<2:02:01,  3.04it/s]

iteração: 38000 | custo médio 100 ultimas its: 8.0449


 29%|██▉       | 8882/30118 [1:16:08<2:06:59,  2.79it/s] 

iteração: 39000 | custo médio 100 ultimas its: 7.7488


 33%|███▎      | 9882/30118 [1:22:18<3:22:46,  1.66it/s]

iteração: 40000 | custo médio 100 ultimas its: 7.3609


 36%|███▌      | 10882/30118 [1:30:45<2:48:18,  1.90it/s] 

iteração: 41000 | custo médio 100 ultimas its: 5.4752


 39%|███▉      | 11882/30118 [1:39:36<6:06:26,  1.21s/it] 

iteração: 42000 | custo médio 100 ultimas its: 10.0261


 43%|████▎     | 12882/30118 [1:46:42<1:02:49,  4.57it/s] 

iteração: 43000 | custo médio 100 ultimas its: 9.4719


 46%|████▌     | 13882/30118 [1:51:13<1:03:40,  4.25it/s]

iteração: 44000 | custo médio 100 ultimas its: 10.9990


 49%|████▉     | 14882/30118 [1:55:45<1:20:53,  3.14it/s]

iteração: 45000 | custo médio 100 ultimas its: 5.7045


 53%|█████▎    | 15882/30118 [2:00:18<1:49:57,  2.16it/s]

iteração: 46000 | custo médio 100 ultimas its: 9.4144


 56%|█████▌    | 16882/30118 [2:05:26<1:02:45,  3.52it/s]

iteração: 47000 | custo médio 100 ultimas its: 10.1097


 59%|█████▉    | 17882/30118 [2:12:18<1:52:10,  1.82it/s]

iteração: 48000 | custo médio 100 ultimas its: 9.6912


 63%|██████▎   | 18882/30118 [2:21:06<2:39:15,  1.18it/s]

iteração: 49000 | custo médio 100 ultimas its: 9.4677


 66%|██████▌   | 19882/30118 [2:32:24<56:31,  3.02it/s]  

iteração: 50000 | custo médio 100 ultimas its: 11.0532


 69%|██████▉   | 20882/30118 [2:42:27<3:41:36,  1.44s/it]

iteração: 51000 | custo médio 100 ultimas its: 9.1488


 73%|███████▎  | 21882/30118 [2:51:48<48:07,  2.85it/s]  

iteração: 52000 | custo médio 100 ultimas its: 13.4021


 76%|███████▌  | 22882/30118 [2:59:53<48:04,  2.51it/s]  

iteração: 53000 | custo médio 100 ultimas its: 9.1575


 79%|███████▉  | 23881/30118 [3:05:56<26:12,  3.97it/s]  

iteração: 54000 | custo médio 100 ultimas its: 7.9105


 83%|████████▎ | 24882/30118 [3:12:14<36:08,  2.41it/s]  

iteração: 55000 | custo médio 100 ultimas its: 9.2526


 86%|████████▌ | 25882/30118 [3:18:10<29:54,  2.36it/s]  

iteração: 56000 | custo médio 100 ultimas its: 8.8794


 89%|████████▉ | 26882/30118 [3:24:14<19:27,  2.77it/s]  

iteração: 57000 | custo médio 100 ultimas its: 11.2668


 93%|█████████▎| 27882/30118 [3:32:24<16:27,  2.26it/s]  

iteração: 58000 | custo médio 100 ultimas its: 8.9439


 96%|█████████▌| 28882/30118 [3:41:26<17:37,  1.17it/s]  

iteração: 59000 | custo médio 100 ultimas its: 7.9360


 99%|█████████▉| 29882/30118 [3:51:42<02:33,  1.53it/s]

iteração: 60000 | custo médio 100 ultimas its: 11.4878


100%|██████████| 30118/30118 [3:53:59<00:00,  2.15it/s]


In [25]:
def cossine_similarity(x,y):
    return np.dot(x.T,y)/(np.linalg.norm(x)*np.linalg.norm(y))

In [26]:
def find_most_similar(x, word_emb):
    similar = {}
    for word, vec in word_emb.items():
        sim = cossine_similarity(np.array(word_emb[x]),np.array(vec))
        if word != x:
            similar[word] = sim
    similar =  {k: v for k, v in sorted(similar.items(), key=lambda item: item[1], reverse=True)}
    most_10_similar = list(similar.keys())[:10]
    most_similar = [(x, similar[x]) for x in most_10_similar]
    return most_similar

In [27]:
find_most_similar('rei', encoded_embeddings)

[('irmãos', 0.9289269710573728),
 ('esta', 0.9259679552027191),
 ('ele', 0.9245393838274762),
 ('uma', 0.923422744078026),
 ('dizendo', 0.9232049396251653),
 ('pois', 0.9227717454692973),
 ('ia', 0.9212027810710568),
 ('ó', 0.9211967719337417),
 ('homem', 0.9211084747274799),
 ('davi', 0.9209030807430225)]

In [28]:
find_most_similar('mulher', encoded_embeddings)

[('habitantes', 0.9317869126492995),
 ('átrio', 0.9316115285762017),
 ('portas', 0.9312491221524574),
 ('sois', 0.9274494632600175),
 ('falou', 0.9272689413311233),
 ('chegado', 0.9272580839422789),
 ('fazendo', 0.9272391510330231),
 ('grandes', 0.9271587598731948),
 ('quer', 0.9258465266561224),
 ('vir', 0.9257120289916091)]

In [29]:
find_most_similar('ir', encoded_embeddings)

[('colunas', 0.9335873065725974),
 ('levitas', 0.9328224342815542),
 ('príncipes', 0.9327844203833894),
 ('lá', 0.932681748183787),
 ('águas', 0.9318841324396119),
 ('cativeiro', 0.9317898737899123),
 ('santuário', 0.9314836620538367),
 ('mandamentos', 0.9308822713976851),
 ('arraial', 0.9304567606155713),
 ('trabalho', 0.9300000858074456)]