In [1]:
import numpy as np
import os
# NOTE: '..' is resolved against the *current* working directory, so every
# re-run of this cell moves one level further up. Restart the kernel before
# running this cell again.
# Presumably this switches to the project root so that the `src.*` imports
# below and the relative "docs/..." path used later resolve -- confirm against
# the repository layout.
os.chdir('..')
from gensim.models import Word2Vec
from src.text.text_wrangler import Corpus
from src.graph.Graph import UndirectedGraph
from scipy import spatial



## Load the corpus and build the initial model

In [2]:
# Build the corpus object from the Shakespeare text file. The path is relative
# to the working directory set by os.chdir('..') in the first cell.
shakespeare = Corpus("docs/shakespeare.txt")

In [3]:
# Skip-gram (sg=1) Word2Vec over the corpus sentences: 120-dimensional vectors,
# a context window of 5, a minimum word count of 5 and 8 worker threads.
model = Word2Vec(
    shakespeare.sentence_matrix,
    size=120,
    window=5,
    min_count=5,
    workers=8,
    sg=1,
)

keyed_vectors = model.wv
embedding_matrix = keyed_vectors.vectors

# Two-way lookup between each vocabulary word and the index stored on its
# vocab entry.
word_to_index = {word: entry.index for word, entry in keyed_vectors.vocab.items()}
index_to_word = {position: word for word, position in word_to_index.items()}

In [4]:
# Graph that receives one node per vocabulary word (added in the next cell)
# and the edges created by update_semantic_network.
semantic_graph = UndirectedGraph()

In [5]:
# Insert the words in embedding-row order, so that a word's row in
# embedding_matrix is intended to line up with its index in the semantic graph.
for row in range(len(embedding_matrix)):
    semantic_graph.add_node(index_to_word[row])

In [6]:
def select_subset(matrix, matrix_height, subset_proportion=0.1):
    """Draw a random sample of distinct row indices.

    Args:
        matrix: Not read by this function; only ``matrix_height`` is used.
        matrix_height: Number of rows to sample from; indices are drawn from
            ``range(matrix_height)``.
        subset_proportion: Fraction of ``matrix_height`` to draw. The sample
            size is ``int(matrix_height * subset_proportion)``, i.e. truncated.

    Returns:
        A NumPy array of indices drawn without replacement using the global
        ``np.random`` state.
    """
    n_samples = int(matrix_height * subset_proportion)
    return np.random.choice(matrix_height, size=n_samples, replace=False)

def update_semantic_network(embedding_matrix, semantic_graph, key_to_index, index_to_key,
                            em_proportion=0.1, sg_proportion=0.1, stop_set=None, thresh=0.4):
    """Add edges between randomly sampled word pairs whose embeddings are close.

    Two index samples are drawn with ``select_subset``: one sized from the
    number of rows of ``embedding_matrix`` and one sized from
    ``len(semantic_graph.nodes)``. Every pair made of one index from each
    sample is compared, and ``semantic_graph.add_edge`` is called with the
    cosine distance as the third argument whenever that distance is at most
    ``thresh``.

    Args:
        embedding_matrix: Array whose row ``i`` is the embedding of
            ``index_to_key[i]``. Indices from both samples are used to index
            it, so the graph must not have more nodes than the matrix has rows.
        semantic_graph: Object exposing ``adjacency_matrix``, ``nodes`` and
            ``add_edge(node_1, node_2, weight)``.
        key_to_index: Not read by this function.
        index_to_key: Mapping from row index to node key.
        em_proportion: Fraction of embedding rows to sample.
        sg_proportion: Fraction of graph nodes to sample.
        stop_set: Container of keys to skip on either side of a pair.
            ``None`` (the default) skips nothing.
        thresh: Largest cosine distance for which an edge is added.

    Returns:
        None. The shapes of the two index samples are printed.
    """
    # A None sentinel avoids a mutable default argument shared across calls.
    stop_words = frozenset() if stop_set is None else stop_set

    em_subset = select_subset(embedding_matrix, embedding_matrix.shape[0], subset_proportion=em_proportion)
    sg_subset = select_subset(semantic_graph.adjacency_matrix, len(semantic_graph.nodes),
                              subset_proportion=sg_proportion)

    # Resolve and filter the graph-side candidates once; they do not depend on
    # the outer word, so redoing this per outer iteration is wasted work.
    candidates = []
    for j in sg_subset:
        node_2 = index_to_key[j]
        if node_2 not in stop_words:
            candidates.append((node_2, embedding_matrix[j]))

    for i in em_subset:
        node_1 = index_to_key[i]
        if node_1 in stop_words:
            continue
        node_1_emb = embedding_matrix[i]
        for node_2, node_2_emb in candidates:
            # scipy's `cosine` is a distance (1 - cosine similarity): smaller
            # means more similar, hence the `<=` comparison against thresh.
            cos_dist = spatial.distance.cosine(node_1_emb, node_2_emb)
            # NOTE(review): an index present in both samples is compared with
            # itself (distance ~0), so add_edge(node, node, ...) is called --
            # confirm that self-edges are wanted.
            if cos_dist <= thresh:
                semantic_graph.add_edge(node_1, node_2, cos_dist)

    print(em_subset.shape)
    print(sg_subset.shape)
    

In [None]:
# Alternate one extra Word2Vec training pass with one random update of the
# semantic graph, five times over.
for i in range(5):
    # compute_loss=True so that the loss can be read back right after the pass.
    model.train(shakespeare.sentence_matrix, total_examples=len(shakespeare.sentence_matrix),
                epochs=1, compute_loss=True)
    loss = model.get_latest_training_loss()
    # Quick glimpse at what Word2Vec finds to be the most similar
    sim = model.wv.most_similar("romeo")
    print("Round {} ==================".format(i))
    # Show the values computed above; previously they were computed and dropped.
    print("Training loss: {}".format(loss))
    print("Most similar to 'romeo': {}".format(sim))
    update_semantic_network(embedding_matrix, semantic_graph, key_to_index=word_to_index,
                            index_to_key=index_to_word, em_proportion=0.1, sg_proportion=0.2)

(814,)
(1629,)
(814,)
(1629,)


In [None]:
# Show the first row of the adjacency matrix twice: once in its default
# printed form, then with its entries concatenated on a single line.
first_row = semantic_graph.adjacency_matrix[0]
print(first_row)
print(*first_row, sep='')