In [1]:
import itertools
from collections import OrderedDict 
import re
import nltk
from nltk.corpus import brown, gutenberg
from nltk.probability import FreqDist
from nltk.corpus import stopwords

In [2]:
nltk.download('gutenberg')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# corpus

In [3]:
gutenberg.fileids()[3]

'bible-kjv.txt'

### corpus preprocessing

In [4]:
samples = gutenberg.sents(gutenberg.fileids()[3])
pattern = re.compile("[A-Za-z]+")
stop_w =  set(stopwords.words('english'))
corpus = []
for sent in samples:
    sent = [w.lower() for w in sent]
    sent = [w for w in sent if w not in stop_w]
    sent = [w.replace('\n', ' ') for w in sent]
    sent = [w for w in sent if pattern.fullmatch(w)]
    if len(sent) > 5:
        corpus.append(sent)

In [5]:
len(samples) # 문장의 수가 3만개임
len(corpus) # 실제 문장의 수는 corpus에 들어가있음

25481

In [6]:
fre_dist = FreqDist()
for sent in corpus:
    fre_dist.update(sent)
fre_dist = {k : v for k, v in fre_dist.items() if v > 5}

In [7]:
sorted(fre_dist.items(), key = lambda item: item[1]) # 단어의 빈도수를 저장하는 딕셔너리

[('gihon', 6),
 ('heel', 6),
 ('firstlings', 6),
 ('sevenfold', 6),
 ('jared', 6),
 ('methuselah', 6),
 ('seventeenth', 6),
 ('restrained', 6),
 ('husbandman', 6),
 ('salah', 6),
 ('commended', 6),
 ('plagued', 6),
 ('dwelled', 6),
 ('arioch', 6),
 ('eshcol', 6),
 ('thread', 6),
 ('childless', 6),
 ('pleaseth', 6),
 ('shur', 6),
 ('hastened', 6),
 ('blindness', 6),
 ('withheld', 6),
 ('bondwoman', 6),
 ('knife', 6),
 ('kirjatharba', 6),
 ('machpelah', 6),
 ('castles', 6),
 ('twins', 6),
 ('bashemath', 6),
 ('elon', 6),
 ('savoury', 6),
 ('smooth', 6),
 ('subtilty', 6),
 ('mandrakes', 6),
 ('leaped', 6),
 ('quite', 6),
 ('stole', 6),
 ('parcel', 6),
 ('shammah', 6),
 ('dishon', 6),
 ('wandering', 6),
 ('rid', 6),
 ('overseer', 6),
 ('uppermost', 6),
 ('forgat', 6),
 ('knee', 6),
 ('storehouses', 6),
 ('bondman', 6),
 ('changes', 6),
 ('laden', 6),
 ('revived', 6),
 ('jamin', 6),
 ('tola', 6),
 ('stooped', 6),
 ('wolf', 6),
 ('physicians', 6),
 ('babe', 6),
 ('spied', 6),
 ('leprous', 6)

In [8]:
vocab_size = len(fre_dist) # 단어의 수가 저장 되고
idx_to_word = {idx: word for idx,  word in enumerate(fre_dist.keys())}
word_to_idx = {word: idx for idx, word in idx_to_word.items()}

### convert word to index 

In [9]:
corpus_indexed = [[word_to_idx[word] for word in sent if word in word_to_idx]for sent in corpus]
corpus_indexed = [sent for sent in corpus_indexed if len(sent) > 5]
fre_dist_indexed = {word_to_idx[w]: f for w, f in fre_dist.items()}

In [10]:
# corpus_indexed[0]
# for i in corpus_indexed[0]:
#     print(idx_to_word[i])
# corpus[0]

## Huffman Tree

In [11]:
import numpy as np

In [12]:
class HuffmanNode:
    def __init__(self, is_leaf, value=None, fre=0, left=None, right=None):
        self.is_leaf = is_leaf
        self.value = value  # the node's index in huffman tree / char
        self.fre = fre  # word frequency in corpus / freq
        self.code = []  # huffman code 
        self.code_len = 0  # lenght of code
        self.node_path = []  # the path from root node to this node
        self.left = left  # left child
        self.right = right  # right child

In [13]:
freq_dict = sorted(fre_dist_indexed.items(), key=lambda x:x[1], reverse=True)
node_list = [HuffmanNode(is_leaf=True, value=w, fre=fre) for w, fre in freq_dict]  # create leaf node
node_list += [HuffmanNode(is_leaf=False, fre=1e10) for i in range(len(fre_dist_indexed))]  # create non-leaf node

In [14]:
import heapq
from tqdm import tqdm


class HeapNode:
    def __init__(self, char, freq):
        self.char = char
        self.freq = freq
        self.left = None
        self.right = None
        self.index = None
        self.vector = None

    def __lt__(self, other):
        if other is None:
            return -1
        if not isinstance(other, HeapNode):
            return -1
        return self.freq < other.freq


class HuffmanCoding:
    def __init__(self):
        self.heap = []
        self.codes = {}
        self.reverse_mapping = {}
        self.merged_nodes = None

    def make_heap(self, frequency):  # frequency has a shape of  { word : frequency }
        for key in frequency:  # make a node, then push in list of heap queue
            node = HeapNode(key, frequency[key])
            heapq.heappush(self.heap, node)

    def merge_nodes(self):      # make nodes from low to high frequency and merge to tree.
        index = 0
        merged = None
        while len(self.heap) > 1:
            node1 = heapq.heappop(self.heap)
            node2 = heapq.heappop(self.heap)

            merged = HeapNode(None, node1.freq + node2.freq)
            merged.left = node1
            merged.right = node2
            merged.index = index                # index is reversed, i.e. root node has a biggest index.
            heapq.heappush(self.heap, merged)

            index += 1

        return merged

    def make_codes_helper(self, root, current_code):
        if root is None:
            return

        if root.char is not None:
            self.codes[root.char] = current_code
            self.reverse_mapping[current_code] = root.char
            return

        self.make_codes_helper(root.left, current_code + "0")
        self.make_codes_helper(root.right, current_code + "1")

    def make_codes(self):
        root = heapq.heappop(self.heap)
        current_code = ""
        self.make_codes_helper(root, current_code)

    def build(self, frequency):
        self.make_heap(frequency) # frequency를 기준으로 heapnode를 추가한다.
        merged = self.merge_nodes()
        self.make_codes()

        return self.codes, merged


def init_huffman_modified():
    h = HuffmanCoding()

    codes, merged = h.build(fre_dist)
    
    tree = {}
    max_depth = 0

    for word in tqdm(codes.keys(), desc='Building Huffman Tree', ncols=100):
        direction_code = codes[word]
        depth = len(direction_code)
        root = merged
        index_path = [root.index]
        direction_path = []
        for i in range(depth):
            direction_path.append(int(direction_code[i]))
            if direction_code[i] == '0':
                root = root.left
            else:
                root = root.right
            if root.index is not None:
                index_path.append(root.index)
        # if len(index_path) != len(direction_path):
        #     print(word)
        #     print(direction_code)
        #     print(len(direction_code))
        #     print(len(index_path))
        #     print(len(direction_path))
        #     print(index_path)
        #     print(direction_path)
        #     break
        info = {'index_path': index_path, 'direction_path': direction_path, 'depth': depth}
        tree[word_to_idx[word]] = info

        if depth > max_depth:
            max_depth = depth

    return tree, max_depth

In [15]:
tree, max_depth = init_huffman_modified()
total = sum([item[1] for item in fre_dist.items()])

Building Huffman Tree: 100%|█████████████████████████████████| 4533/4533 [00:00<00:00, 32187.46it/s]


In [16]:
import os
import sys

sys.path.append('./')

import argparse
from distutils.util import strtobool as _bool
import numpy as np

from src.preprocess_utils import *
from src.huffman import *
from eval import cosine_similarity

In [21]:
def sigmoid(xs):
    ans = 1 / (1 + np.exp(-xs))
    top = 1 / (1 + math.exp(6))
    bottom = 1 / (1 + math.exp(-6))
    for i, num in enumerate(ans[0]):
        if num < top:
            ans[0, i] = 0
        elif num > bottom:
            ans[0, i] = 1
    return ans


def similar_word(emb):
    index_to_word = pickle.load(open(cfg.index_to_word_path, 'rb'))
    word_to_index = pickle.load(open(cfg.word_to_index_path, 'rb'))
    embedding_norm = np.linalg.norm(emb, axis=1)
    norm_emb = emb / embedding_norm[:, None]
    word1 = word_to_index['king']
    word2 = word_to_index['queen']
    word3 = word_to_index['husband']
    answer = word_to_index['wife']

    target = norm_emb[word2] - norm_emb[word1] + norm_emb[word3]
    target = target / np.linalg.norm(target)

    max_index = answer
    max_sim = cosine_similarity(target, norm_emb[answer])
    for i in tqdm(range(len(word_to_index)), desc="Finding closest word to queen-king+husband", ncols=70):
        if i == word1 or i == word2 or i == word3 or i == answer:
            pass
        else:
            sim = cosine_similarity(norm_emb[i], target)
            if sim > max_sim:
                max_sim = sim
                max_index = i
    print(index_to_word[max_index])


In [18]:
hidden_size = 300
embedding = np.random.uniform(low=-0.5/300, high=0.5/300, size=(vocab_size, hidden_size)).astype('f')
emb_grad_temp = []
context = np.zeros_like(embedding).astype('f')  # for negative sampling
node_mat = np.zeros((vocab_size-1, hidden_size)).astype('f')  # for hierarchical softmax
node_mat_grad_temp = []

starting_lr = 0.05
min_loss = math.inf

In [20]:
print("Start training on {} words".format(vocab_size))
step = 0
update_step = 0
# logging_loss = 0
start_time = time.time()
lr = starting_lr
update_size = 12
epochs = 3
window_size = 10
subsampling_t = 1e-5

Start training on 4533 words


In [None]:
class HierarchicalSoftmaxLayer(nn.Module):
    def __init__(self, vocab_size, embedding_dim, freq_dict):
        super().__init__()
        ## in w2v c implement, syn1 initial with all zero
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.syn1 = nn.Embedding(
            num_embeddings=vocab_size + 1,
            embedding_dim=embedding_dim,
            padding_idx=vocab_size

        )
        torch.nn.init.constant_(self.syn1.weight.data, val=0)
        self.huffman_tree = HuffmanTree(freq_dict)

    def forward(self, neu1, target):
        # neu1: [b_size, embedding_dim]
        # target: [b_size, 1]

        # turns:[b_size, max_code_len_in_batch]
        # paths: [b_size, max_code_len_in_batch]
        turns, paths = self._get_turns_and_paths(target)
        paths_emb = self.syn1(paths)  # [b_size, max_code_len_in_batch, embedding_dim]

        loss = -F.logsigmoid(
            (turns.unsqueeze(2) * paths_emb * neu1.unsqueeze(1)).sum(2)).sum(1).mean()
        return loss

    def _get_turns_and_paths(self, target):
        turns = []  # turn right(1) or turn left(-1) in huffman tree
        paths = []
        max_len = 0
        ''' we have batch of center words ... '''
        for n in target:
            n = n.item()
            node = self.huffman_tree.node_dict[n]

            code = target.new_tensor(node.code).int()  # in code, left node is 0; right node is 1
            turn = torch.where(code == 1, code, -torch.ones_like(code))  # 1 -> 1;  0 -> -1

            turns.append(turn)
            '''node_path records the index from root to leaf node in huffman tree'''
            paths.append(target.new_tensor(node.node_path))

            if node.code_len > max_len:
                max_len = node.code_len

        '''Because each word may has different code length, we should pad them to equal length'''
        turns = [F.pad(t, pad=(0, max_len - len(t)), mode='constant', value=0) for t in turns]
        paths = [F.pad(p, pad=(0, max_len - p.shape[0]), mode='constant', value=net.hs.vocab_size) for p in paths]
        return torch.stack(turns).int(), torch.stack(paths).long()


아래부턴ㅌ 쓰레기\

The way builing huffman tree refer to c's original implement

In [12]:
class HuffmanTree:
    def __init__(self, freq_dict):
        self.root = None
        freq_dict = sorted(freq_dict.items(), key=lambda x:x[1], reverse=True) # 내림차순으로 나열
        self.vocab_size = len(freq_dict)
        self.node_dict = {}
        self._build_tree(freq_dict)
    
    def _build_tree(self, freq_dict):
        '''
            freq_dict is in 내림차순 정렬
            node_list: two part: [leaf node :: internal node]
                leaf node is sorting by frequency in decent order; 
        '''
    
        node_list = [HuffmanNode(is_leaf=True, value=w, fre=fre) for w, fre in freq_dict]  # create leaf node
        node_list += [HuffmanNode(is_leaf=False, fre=1e10) for i in range(self.vocab_size)]  # create non-leaf node

        parentNode = [0] * (self.vocab_size * 2)  # only 2 * vocab_size - 2 be used
        binary = [0] * (self.vocab_size * 2)  # recording turning left or turning right
        
        '''
          pos1 points to currently processing leaf node at left side of node_list
          pos2 points to currently processing non-leaf node at right side of node_list
        '''

        pos1 = self.vocab_size - 1
        pos2 = self.vocab_size
        
        '''
            each iteration picks two node from node_list
            the first pick assigns to min1i
            the second pick assigns to min2i 
            
            min2i's frequency is always larger than min1i
        '''
        min1i = 0
        min2i = 0
        '''
            the main process of building huffman tree
        '''
        for a in range(self.vocab_size - 1):
            '''
                first pick assigns to min1i
            '''
            if pos1 >= 0:
                if node_list[pos1].fre < node_list[pos2].fre:
                    min1i = pos1
                    pos1 -= 1
                else:
                    min1i = pos2
                    pos2 += 1
            else:
                min1i = pos2
                pos2 += 1
            
            '''
               second pick assigns to min2i 
            '''
            if pos1 >= 0:
                if node_list[pos1].fre < node_list[pos2].fre:
                    min2i = pos1
                    pos1 -= 1
                else:
                    min2i = pos2
                    pos2 += 1
            else:
                min2i = pos2
                pos2 += 1
            
            ''' fill information of non leaf node '''
            node_list[self.vocab_size + a].fre = node_list[min1i].fre + node_list[min2i].fre
            node_list[self.vocab_size + a].left = node_list[min1i]
            node_list[self.vocab_size + a].right = node_list[min2i]
            
            '''
                the parent node always is non leaf node
                assigen lead child (min2i) and right child (min1i) to parent node
            '''
            parentNode[min1i] = self.vocab_size + a  # max index = 2 * vocab_size - 2
            parentNode[min2i] = self.vocab_size + a
            binary[min2i] = 1
        
        '''generate huffman code of each leaf node '''
        for a in range(self.vocab_size):
            b = a
            i = 0
            code = []
            point = []

            '''

                backtrace path from current node until root node. (bottom up)
                'root node index' in node_list is  2 * vocab_size - 2 
            '''
            while b != self.vocab_size * 2 - 2:
                code.append(binary[b])  
                b = parentNode[b]
                # point recording the path index from leaf node to root, the length of point is less 1 than the length of code
                point.append(b)
            
            '''
                huffman code should be top down, so we reverse it.
            '''
            node_list[a].code_len = len(code)
            node_list[a].code = list(reversed(code))
            

            '''
                1. Recording the path from root to leaf node (top down). 
                
                2.The actual index value should be shifted by self.vocab_size,
                  because we need the index starting from zero to mapping non-leaf node
                
                3. In case of full binary tree, the number of non leaf node always equals to vocab_size - 1.
                  The index of BST root node in node_list is 2 * vocab_size - 2,
                  and we shift vocab_size to get the actual index of root node: vocab_size - 2
            '''
            node_list[a].node_path = list(reversed([p - self.vocab_size for p in point]))
            
            self.node_dict[node_list[a].value] = node_list[a]
            
        self.root = node_list[2 * vocab_size - 2]
        
        
        

## CBOW + HS
$$
-\log p(w_O| w_I) = -\log \dfrac{\text{exp}({h^\top \text{v}'_O})}{\sum_{w_i \in V} \text{exp}({h^\top \text{v}'_{w_i}})}= - \sum^{L(w)-1}_{l=1}  \log\sigma( [ \cdot ] h^\top \text{v}^{'}_l)
$$

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import tqdm

### create dataset 

In [181]:
class CBOWDataset(torch.utils.data.Dataset):
    def __init__(self, corpus, windows_size=5, sentence_length_threshold=5):
        self.windows_size = windows_size
        self.sentence_length_threshold = sentence_length_threshold
        self.contexts, self.centers = self._generate_pairs(corpus, windows_size)
        
    def _generate_pairs(self, corpus, windows_size):
        contexts = []
        centers = []
        
        for sent in corpus:
            if len(sent) < self.sentence_length_threshold:
                continue
            
            for center_word_pos in range(len(sent)):
                context = []
                for w in range(-windows_size, windows_size + 1):
                    context_word_pos = center_word_pos + w
                    if(0 <= context_word_pos < len(sent) and context_word_pos != center_word_pos):
                        context.append(sent[context_word_pos])
                if(len(context) == 2 * self.windows_size):
                    contexts.append(context)
                    centers.append(sent[center_word_pos])
        return contexts, centers
    
    def __len__(self):
        return len(self.centers)
    
    def __getitem__(self, index):
        return np.array(self.contexts[index]), np.array([self.centers[index]])

### define network

In [182]:
class HierarchicalSoftmaxLayer(nn.Module):
    def __init__(self, vocab_size, embedding_dim, freq_dict):
        super().__init__()
        ## in w2v c implement, syn1 initial with all zero
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.syn1 = nn.Embedding(
            num_embeddings=vocab_size + 1,
            embedding_dim=embedding_dim,
            padding_idx=vocab_size
            
        )
        torch.nn.init.constant_(self.syn1.weight.data, val=0)
        self.huffman_tree = HuffmanTree(freq_dict)

    def forward(self, neu1, target):
        # neu1: [b_size, embedding_dim]
        # target: [b_size, 1]
        
        # turns:[b_size, max_code_len_in_batch]
        # paths: [b_size, max_code_len_in_batch]
        turns, paths = self._get_turns_and_paths(target)
        paths_emb = self.syn1(paths) # [b_size, max_code_len_in_batch, embedding_dim]

        loss = -F.logsigmoid(
            (turns.unsqueeze(2) * paths_emb * neu1.unsqueeze(1)).sum(2)).sum(1).mean()
        return loss
    
    def _get_turns_and_paths(self, target):
        turns = []  # turn right(1) or turn left(-1) in huffman tree
        paths = []
        max_len = 0
        ''' we have batch of center words ... '''
        for n in target:
            n = n.item()
            node = self.huffman_tree.node_dict[n]
            
            code = target.new_tensor(node.code).int()  # in code, left node is 0; right node is 1
            turn = torch.where(code == 1, code, -torch.ones_like(code)) # 1 -> 1;  0 -> -1
            
            turns.append(turn)
            '''node_path records the index from root to leaf node in huffman tree'''
            paths.append(target.new_tensor(node.node_path))
            
            if node.code_len > max_len:
                max_len = node.code_len
        
        '''Because each word may has different code length, we should pad them to equal length'''
        turns = [F.pad(t, pad=(0, max_len - len(t)), mode='constant', value=0) for t in turns] 
        paths = [F.pad(p, pad=(0, max_len - p.shape[0]), mode='constant', value=net.hs.vocab_size) for p in paths]
        return torch.stack(turns).int(), torch.stack(paths).long()
    

In [183]:
class CBOWHierarchicalSoftmax(nn.Module):
    def __init__(self, vocab_size, embedding_dim, freq_dict):
        super().__init__()
        self.syn0 = nn.Embedding(vocab_size, embedding_dim)
        self.hs = HierarchicalSoftmaxLayer(vocab_size, embedding_dim, freq_dict)

    
    def forward(self, context, target):
        # context: [b_size, 2 * window_size]
        # target: [b_size]
        neu1 = self.syn0(context.long()).mean(dim=1)  # [b_size, embedding_dim]
        loss = self.hs(neu1, target.long())
        return loss
    

## training

In [205]:
data_set = CBOWDataset(corpus_indexed)
data_loader = DataLoader(data_set, batch_size=100, num_workers=0)

In [206]:
embedding_dim = 50
net = CBOWHierarchicalSoftmax(vocab_size, embedding_dim, fre_dist_indexed)
optimizer = optim.Adam(net.parameters(), lr=0.001,  weight_decay=1e-5)

In [207]:
log_interval = 100
for epoch_i in range(10):
    total_loss = 0
    net.train()
    tk0 = tqdm.tqdm(data_loader, smoothing=0, mininterval=1.0)
    for i, (context, center) in enumerate(tk0):

        loss = net(context, center)
        net.zero_grad()
        loss.backward()
        
        optimizer.step()

        total_loss += loss.item()
        if(i + 1) % log_interval == 0:
            tk0.set_postfix(loss = total_loss/log_interval)
            total_loss = 0
        

100%|██████████| 1147/1147 [00:32<00:00, 35.12it/s, loss=10.9]
100%|██████████| 1147/1147 [00:29<00:00, 38.94it/s, loss=10.7]
100%|██████████| 1147/1147 [00:30<00:00, 37.59it/s, loss=10.6]
100%|██████████| 1147/1147 [00:22<00:00, 50.96it/s, loss=10.5]
100%|██████████| 1147/1147 [00:29<00:00, 39.32it/s, loss=10.4]
100%|██████████| 1147/1147 [00:26<00:00, 43.69it/s, loss=10.3]
100%|██████████| 1147/1147 [00:23<00:00, 47.86it/s, loss=10.2]
100%|██████████| 1147/1147 [00:26<00:00, 43.45it/s, loss=10.1]
100%|██████████| 1147/1147 [00:27<00:00, 41.68it/s, loss=10.1]
100%|██████████| 1147/1147 [00:28<00:00, 40.07it/s, loss=10]


## fetch word embedding

In [208]:
w2v_embedding = net.syn0.weight.data
w2v_embedding = w2v_embedding.numpy()
l2norm = np.linalg.norm(w2v_embedding, 2, axis=1, keepdims=True)
w2v_embedding = w2v_embedding / l2norm


  after removing the cwd from sys.path.


# Evaluate

In [209]:
class CosineSimilarity:
    def __init__(self, word_embedding, idx_to_word_dict, word_to_idx_dict):
        self.word_embedding = word_embedding # normed already
        self.idx_to_word_dict = idx_to_word_dict
        self.word_to_idx_dict = word_to_idx_dict
        
    def get_synonym(self, word, topK=10):
        idx = self.word_to_idx_dict[word]
        embed = self.word_embedding[idx]
        
        cos_similairty = w2v_embedding @ embed
        
        topK_index = np.argsort(-cos_similairty)[:topK]
        pairs = []
        for i in topK_index:
            w = self.idx_to_word_dict[i]
            pairs.append((w, cos_similairty[i]))
        return pairs
        

In [210]:
cosinSim = CosineSimilarity(w2v_embedding, idx_to_word, word_to_idx)
cosinSim.get_synonym('christ')

[('christ', 1.0),
 ('hope', 0.78780156),
 ('gospel', 0.7656436),
 ('jesus', 0.74575657),
 ('faith', 0.7190881),
 ('godliness', 0.7005944),
 ('offences', 0.70045626),
 ('grace', 0.6946964),
 ('dear', 0.666232),
 ('willing', 0.66131693)]

In [211]:
cosinSim.get_synonym('god')

[('god', 1.0),
 ('saviour', 0.53627664),
 ('remember', 0.51367503),
 ('sure', 0.4997003),
 ('hope', 0.47002873),
 ('purpose', 0.46906227),
 ('praise', 0.45354468),
 ('thanks', 0.4486973),
 ('doubtless', 0.44689322),
 ('formed', 0.44300675)]

In [212]:
cosinSim.get_synonym('jesus')

[('jesus', 0.9999999),
 ('gospel', 0.8051339),
 ('grace', 0.75879383),
 ('church', 0.7542972),
 ('christ', 0.74575657),
 ('manifest', 0.7415799),
 ('believed', 0.7215627),
 ('faith', 0.7198993),
 ('godliness', 0.7091305),
 ('john', 0.7015951)]