In [1]:
import numpy as np
import os
os.chdir('..')
from gensim.models import Word2Vec
from src.text.text_wrangler import Corpus
from src.graph.Graph import UndirectedGraph
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity



In [2]:
shakespeare = Corpus("docs/shakespeare.txt")

In [3]:
model = Word2Vec(shakespeare.sentence_matrix, size = 120,
                 window = 5, min_count=5, workers=8, sg=1)

keyed_vectors = model.wv
embedding_matrix = keyed_vectors.vectors
word_to_index = {}
index_to_word = {}
for k, v in keyed_vectors.vocab.items():
    word_to_index[k] = v.index
    index_to_word[v.index] = k

In [4]:
semantic_graph = UndirectedGraph()
# semantic_matrix = np.zeros(shape=(embedding_matrix.shape[0], embedding_matrix.shape[0]), dtype="float32")

In [5]:
# Construct so that the word's row in the embedding_matrix lines up to the index in the semantic graph
for i in range(embedding_matrix.shape[0]):
    semantic_graph.add_node(index_to_word[i])

In [6]:
def select_subset(matrix, matrix_height, subset_proportion=0.1):
    subset_size = int(matrix_height * subset_proportion)
    sample_indices = np.random.choice(matrix_height, size=subset_size, replace=False)
    return sample_indices

def update_semantic_network(embedding_matrix, semantic_graph, key_to_index, index_to_key,
                            em_proportion=0.1, sg_proportion=0.1, stop_set=set(), thresh=0.8):
    em_subset = np.array([wd for wd in select_subset(embedding_matrix, embedding_matrix.shape[0],
                                                     subset_proportion=em_proportion) if wd not in stop_set])
    sg_subset = np.array([wd for wd in select_subset(semantic_graph.adjacency_matrix, len(semantic_graph.nodes),
                                                     subset_proportion=sg_proportion) if wd not in stop_set])

    em_embeddings = embedding_matrix[em_subset,]
    sg_embeddings = embedding_matrix[sg_subset,]
    cos_sims = cosine_similarity(em_embeddings, sg_embeddings)
    
    for i, em_i in enumerate(em_subset):
        for j, sg_j in enumerate(sg_subset):
            if cos_sims[i, j] >= thresh:
                semantic_graph.add_edge(index_to_key[em_i], index_to_key[sg_j], cos_sims[i, j])
                
    print("Updated")    

In [7]:
for i in range(5):
    model.train(shakespeare.sentence_matrix, total_examples=len(shakespeare.sentence_matrix),
                epochs=1, compute_loss=True)
    loss = model.get_latest_training_loss()
    # Quick glimpse at what Word2Vec finds to be the most similar
    sim = model.wv.most_similar("romeo")
    print("Round {} ==================".format(i))
    update_semantic_network(embedding_matrix, semantic_graph, key_to_index=word_to_index,
                            index_to_key=index_to_word, em_proportion=0.2, sg_proportion=0.3)

Updated
Updated
Updated
Updated
Updated


In [9]:
updated = np.argwhere(semantic_graph.adjacency_matrix > 0)
len(updated)

3679988