In [1]:
import gensim
import gensim.downloader
import spacy
import nltk
import random
import numpy as np

import clustering_class
from clustering_class import HierarchicalClustering



# Seed both Python's `random` module and NumPy's legacy global RNG, so that any
# stochastic step drawing from either one is reproducible after a kernel restart.
random.seed(1)
np.random.seed(1)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pre-trained word vectors registered in gensim's downloader under the
# name 'glove-wiki-gigaword-50'; every later cell reads them through `glove_vectors`.
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-50')
# The spaCy pipeline is disabled for now; none of the visible cells reference `nlp`.
#nlp = spacy.load('en_core_web_trf')

In [3]:
# Size check: display the number of word tokens in NLTK's Brown corpus.
# NOTE(review): `brown` is not referenced by any other visible cell.
from nltk.corpus import brown
len(brown.words())

1161192

In [4]:
# Only the first `vocab_size` entries of the embedding vocabulary are analysed.
vocab_size = 5000

# POS-tag those entries; `nltk.pos_tag` yields (word, tag) pairs.
tagged_vocab = nltk.pos_tag(glove_vectors.index_to_key[:vocab_size])

# Keep the words tagged as base-form verbs ('VB') and singular nouns ('NN').
verbs = [word for word, tag in tagged_vocab if tag == 'VB']
nouns = [word for word, tag in tagged_vocab if tag == 'NN']

print(f"There are {len(verbs)} verbs and {len(nouns)} nouns.")

# Map words back to their vocabulary indices with the `key_to_index` dict
# (constant-time lookup) rather than a linear `index_to_key.index(...)` scan per word.
verb_indices = [glove_vectors.key_to_index[verb] for verb in verbs]
# NOTE(review): the first two noun indices are dropped by the `[2:]` slice; the
# reason is not visible in this notebook -- confirm it is intentional.
noun_indices = [glove_vectors.key_to_index[noun] for noun in nouns][2:]

chosen_indices = set(noun_indices)

There are 41 verbs and 1619 nouns.


In [5]:
# trying to use wordnet instead: keep every vocabulary entry that has at least
# one noun synset in WordNet.
from nltk.corpus import wordnet as wn

# The upper bound is `vocab_size` (not a second hard-coded 5000) so the selected
# indices always stay inside the slice of vectors taken with the same constant.
# NOTE(review): the first 90 vocabulary entries are skipped; presumably to drop
# very frequent function words / punctuation -- confirm.
wordnet_chosen_words = [
    word
    for word in glove_vectors.index_to_key[90:vocab_size]
    if any(synset.pos() == 'n' for synset in wn.synsets(word))
]

# Vocabulary indices of the selected words, as a set.
wordnet_chosen_indices = {glove_vectors.key_to_index[word] for word in wordnet_chosen_words}
print(len(wordnet_chosen_indices))


3437


In [16]:
# re-import the clustering class in case it was changed.
# The class is accessed below through the module attribute
# (`clustering_class.HierarchicalClustering`) so the freshly reloaded definition is used.
import importlib
importlib.reload(clustering_class)

# constants passed to HierarchicalClustering below
# NOTE(review): their exact meaning is defined inside `clustering_class`, which is
# not visible here -- see that module for units and valid ranges.
proximity_const = 1
reducing_coef_const = 0
increasing_proximity_const = 0.2

# normed vectors, restricted to the first `vocab_size` vocabulary entries
reduced_vectors = glove_vectors.get_normed_vectors()[:vocab_size]

# Build the clustering object over the reduced vectors, using the WordNet-selected
# indices as `chosen_indices`. `verbose=True` is presumably what produces the
# progress log captured in this cell's output.
hierarchical_clustering = clustering_class.HierarchicalClustering(
    word_embedding=glove_vectors,
    list_of_vectors=reduced_vectors,
    chosen_indices=wordnet_chosen_indices,
    initial_proximity=proximity_const,
    proximity_reduc=reducing_coef_const,
    initial_proximity_inc=increasing_proximity_const,
    verbose=True
)


# The call returns two objects; both are indexed by an integer and expose `.keys()`
# in a later cell.
better_hier_list, better_hier_list_w = hierarchical_clustering.get_better_list_of_hierarchical_orders()

# TODO: try to give it the similarity measure instead?

# TODO: named entity recognition, filter out
# TODO: visualization of hierarchy

Setting distance matrix progress:  20.0 %
Setting distance matrix progress:  40.0 %
Setting distance matrix progress:  60.0 %
Setting distance matrix progress:  80.0 %
Setting distance matrix progress:  100.0 %
Finished setting distance matrix!
Starting with the 1563 used and 3437 not used indices.
Starting 1. hierarchical level.
--- Finished sorting: len(not_used)=3436, new_proximity_size = 1763
--- Finished sorting: len(not_used)=1672, new_proximity_size = 308
--- Finished sorting: len(not_used)=1363, new_proximity_size = 178
--- Finished sorting: len(not_used)=1184, new_proximity_size = 147
--- Finished sorting: len(not_used)=1036, new_proximity_size = 79
--- Finished sorting: len(not_used)=956, new_proximity_size = 77
--- Finished sorting: len(not_used)=878, new_proximity_size = 74
--- Finished sorting: len(not_used)=803, new_proximity_size = 57
--- Finished sorting: len(not_used)=745, new_proximity_size = 54
--- Finished sorting: len(not_used)=690, new_proximity_size = 43
--- Fini

In [19]:
# Gather the embedding rows of the WordNet-selected vocabulary indices with a single
# NumPy integer-array indexing step instead of a Python-level loop. Row order follows
# the iteration order of the set, and an empty selection still yields a 2-D array.
# NOTE: despite its name, `chosen_words` holds vectors (rows of `reduced_vectors`),
# not word strings.
chosen_words = reduced_vectors[list(wordnet_chosen_indices)]

In [20]:
# Pass the selected embedding rows to the clustering object's visualisation helper.
# NOTE(review): what is drawn (projection method, labels) is defined in
# `clustering_class`, which is not visible here.
hierarchical_clustering.visualize_vectors(chosen_words)

In [102]:
# Position in the hierarchy lists to inspect.
i = 5

# Number of entries stored at that position (presumably a dict, like the `_w`
# variant whose keys are displayed as this cell's result).
level_entries = better_hier_list[i]
print(len(level_entries))
better_hier_list_w[i].keys()

1


dict_keys(['graf'])

In [262]:
# Look up the vocabulary index of 'philosophy' once and reuse it below, instead of
# discarding the lookup and hard-coding the index.
# NOTE(review): the original loop used the literal 4044, presumably this same index
# (the recorded output lists 'philosophy' among the results) -- confirm.
philosophy_index = glove_vectors.key_to_index['philosophy']

# Reference: nearest neighbours according to gensim's own similarity search.
print(glove_vectors.most_similar('philosophy'))

# Size of the proximity set of vocabulary index 0, for comparison.
print(len(hierarchical_clustering.which_in_proximity(0, 1)))

# Words the clustering object places in the proximity of 'philosophy'.
for key in hierarchical_clustering.which_in_proximity(philosophy_index, 1):
    print(glove_vectors.index_to_key[key])

[('theology', 0.7256718873977661), ('philosophical', 0.6801457405090332), ('psychology', 0.6610170006752014), ('sociology', 0.6346594095230103), ('metaphysics', 0.6081408858299255), ('literature', 0.5989529490470886), ('taught', 0.5878127217292786), ('mathematics', 0.5867588520050049), ('teaching', 0.5839151740074158), ('philosophies', 0.5800243020057678)]
126
science
professor
theory
studied
ideas
religion
teaching
literature
taught
economics
principles
philosophy


So what would be a good measure of how relevant a word is within a corpus of text?

In [59]:
# Display the shape of the raw embedding matrix.
# NOTE(review): the recorded output reports 300 columns, while the load cell names
# the model 'glove-wiki-gigaword-50'; this output presumably comes from a run with a
# different model -- re-run the notebook from a fresh kernel to confirm.
glove_vectors.vectors.shape


(400000, 300)