In [12]:
# imports and set up logging
import spacy
import gensim 
import logging
import glob, os
# Route INFO-level (and above) records to the notebook output with a timestamp,
# so gensim's training progress is visible while cells run.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [13]:
# Folder holding every plain-text source file the model is trained on
data_dir = "../corpus"

In [14]:
# Read every *.txt file in the corpus directory into memory, one string per file.
#
# NOTE: os.chdir() changes the kernel's working directory for the rest of the
# session. It is kept on purpose: relative paths used further down the notebook
# are resolved against whatever directory is current at that point.
os.chdir(data_dir)
documents = []
for filename in sorted(glob.glob("*.txt")):
    # The file names carry a "_utf8" suffix, so decode explicitly as UTF-8 instead
    # of relying on the platform default; the `with` block closes the handle
    # as soon as the text has been read.
    with open(filename, 'r', encoding='utf-8') as corpus_file:
        filedata = corpus_file.read()
    print(filename + " = " + str(len(filedata)) + " chars")
    documents.append(filedata)

Hobbit_input_utf8.txt = 508213 chars
LostTales1_input_utf8.txt = 383135 chars
LostTales2_input_utf8.txt = 462819 chars
LotR_complete_input_utf8.txt = 2565751 chars
Silmarillion_input_utf8.txt = 698110 chars


In [15]:
%%time

# Split each source text into sentences with spaCy (spacy.io) and strip out
# punctuation and whitespace tokens.
#
# Lowercasing, lemmatization and stopword removal would also be possible at this
# point, but are deliberately not applied for now.

# Load the model
nlp = spacy.load('en_core_web_lg')
# Raise spaCy's document-length limit so the largest source text
# (roughly 2.5 million characters) can be processed in one call.
nlp.max_length = 3000000

all_sentences = []
for doc in documents:
    # Turn every line feed into a space, then run the spaCy pipeline
    # over the whole source document in one go.
    spacy_doc = nlp(doc.replace("\n", " "))
    # One cleaned string per detected sentence: the surviving token texts
    # joined by single spaces.
    all_sentences.extend(
        " ".join(
            token.text
            for token in spacy_sentence
            if token.pos_ not in ("SPACE", "PUNCT")
        )
        for spacy_sentence in spacy_doc.sents
    )


CPU times: user 13min 39s, sys: 39.8 s, total: 14min 19s
Wall time: 14min 26s


In [16]:
# How many sentences did the whole corpus yield?
print(len(all_sentences))

54766


In [17]:
from gensim.models import word2vec, Phrases
from gensim.models.phrases import Phraser

# Build phrase-aware training sentences and fit the word2vec model.

# Tokenise each cleaned sentence on single spaces. A sentence that was cleaned
# down to "" would become [''] under split(" ") and put an empty-string "word"
# into the model vocabulary, so empty tokens (and sentences left with no tokens
# at all) are dropped here.
sentence_stream = []
for doc in all_sentences:
    tokens = [tok for tok in doc.split(" ") if tok]
    if tokens:
        sentence_stream.append(tokens)

# Two phrase-detection passes: the first learns two-word phrases, the second is
# trained on the output of the first so it can extend them by another token.
# Detected phrases show up as single '_'-joined tokens (e.g. 'Great_Goblin').
bigram = Phraser(Phrases(sentence_stream))
trigram = Phraser(Phrases(bigram[sentence_stream]))

# Apply both passes to every sentence; bigram[sent] is evaluated once per sentence.
trigram_sentences_project = [trigram[bigram[sent]] for sent in sentence_stream]

# Set values for various parameters
num_features = 300    # Word vector dimensionality
min_word_count = 1    # Minimum word count
num_workers = 20      # Number of threads to run in parallel
context = 5           # Context window size
downsampling = 1e-3   # Downsample setting for frequent words
skip_grams = 1        # 0 for CBOW, 1 for skip-grams

# Constructing Word2Vec with a corpus trains the model immediately.
model = word2vec.Word2Vec(
    trigram_sentences_project,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
    sg=skip_grams,
)

# Peek at the first few vocabulary entries as a sanity check
vocab = list(model.wv.vocab.keys())
print(vocab[:10])


2021-10-09 09:57:21,739 : INFO : collecting all words and their counts
2021-10-09 09:57:21,747 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2021-10-09 09:57:22,099 : INFO : PROGRESS: at sentence #10000, processed 201113 words and 98653 word types
2021-10-09 09:57:22,386 : INFO : PROGRESS: at sentence #20000, processed 365659 words and 153215 word types
2021-10-09 09:57:22,627 : INFO : PROGRESS: at sentence #30000, processed 491531 words and 186645 word types
2021-10-09 09:57:22,843 : INFO : PROGRESS: at sentence #40000, processed 610994 words and 216901 word types
2021-10-09 09:57:23,096 : INFO : PROGRESS: at sentence #50000, processed 752697 words and 248421 word types
2021-10-09 09:57:23,317 : INFO : collected 274160 word types from a corpus of 876493 words (unigram + bigrams) and 54766 sentences
2021-10-09 09:57:23,318 : INFO : using 274160 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
2021-10-09 09:57:23,319 : INF

2021-10-09 09:57:50,285 : INFO : worker thread finished; awaiting finish of 19 more threads
2021-10-09 09:57:50,290 : INFO : worker thread finished; awaiting finish of 18 more threads
2021-10-09 09:57:50,299 : INFO : worker thread finished; awaiting finish of 17 more threads
2021-10-09 09:57:50,341 : INFO : worker thread finished; awaiting finish of 16 more threads
2021-10-09 09:57:50,343 : INFO : worker thread finished; awaiting finish of 15 more threads
2021-10-09 09:57:50,346 : INFO : worker thread finished; awaiting finish of 14 more threads
2021-10-09 09:57:50,360 : INFO : worker thread finished; awaiting finish of 13 more threads
2021-10-09 09:57:50,365 : INFO : worker thread finished; awaiting finish of 12 more threads
2021-10-09 09:57:50,397 : INFO : worker thread finished; awaiting finish of 11 more threads
2021-10-09 09:57:50,410 : INFO : worker thread finished; awaiting finish of 10 more threads
2021-10-09 09:57:50,449 : INFO : worker thread finished; awaiting finish of 9 mo

['JRR', 'TOLKIEN', 'THE', 'HOBBIT', 'Chapter', 'I', 'An', 'Unexpected', 'Party', 'In']


In [18]:
# Print the total number of distinct entries (single words and '_'-joined
# phrases) in the trained model's vocabulary
print(len(model.wv.vocab))

22646


In [19]:
# Ten nearest neighbours of "Bilbo" in the trained embedding space
w1 = "Bilbo"
model.wv.most_similar(positive=[w1], topn=10)

2021-10-09 09:57:53,692 : INFO : precomputing L2-norms of word weight vectors


[('Gollum', 0.9023342132568359),
 ('Sam', 0.8929318785667419),
 ('Frodo', 0.8768751621246338),
 ('Pippin', 0.8751285076141357),
 ('Strider', 0.8512952327728271),
 ('Shagrat', 0.8474864959716797),
 ('Merry', 0.8417835235595703),
 ('wizard', 0.8417626619338989),
 ('Gandalf', 0.835616946220398),
 ('Tom', 0.8336421847343445)]

In [20]:
# Ten nearest neighbours of "dragon" in the trained embedding space
w1 = "dragon"
model.wv.most_similar(positive=[w1], topn=10)

[('rider', 0.8844809532165527),
 ('Rider', 0.8839728832244873),
 ('orc', 0.8817406892776489),
 ('enemy', 0.8769322633743286),
 ('sign', 0.8714866638183594),
 ('key', 0.8694632053375244),
 ('partly', 0.8665182590484619),
 ('challenge', 0.8615357875823975),
 ('Great_Goblin', 0.8601824641227722),
 ('accident', 0.8601601719856262)]

In [21]:
# Ten nearest neighbours of "trees" in the trained embedding space
w1 = "trees"
model.wv.most_similar(positive=[w1], topn=10)

[('holes', 0.8676007390022278),
 ('shapes', 0.8624780774116516),
 ('lights', 0.8590327501296997),
 ('bushes', 0.8565700054168701),
 ('thickets', 0.8529249429702759),
 ('reeds', 0.8488107919692993),
 ('grasses', 0.8482720851898193),
 ('noises', 0.8469080924987793),
 ('lines', 0.844353973865509),
 ('points', 0.8439342379570007)]

In [22]:
# Write the input files to display via https://projector.tensorflow.org/ :
# one tab-separated vector per line in the tensors file, and the matching word
# on the same line number of the metadata file.
# Both paths are relative, so they resolve against the current working directory.
tensorsfp = "../vectors.txt"
metadatafp = "../metadata.txt"

# Plain write mode is sufficient (nothing is read back), and the encoding is set
# explicitly so words with non-ASCII characters do not depend on the platform's
# default encoding.
with open(tensorsfp, 'w', encoding='utf-8') as tensors, \
        open(metadatafp, 'w', encoding='utf-8') as metadata:
    for word in model.wv.index2word:
        metadata.write(word + '\n')
        # Look vectors up through model.wv, as the rest of the notebook does;
        # indexing the Word2Vec model object directly is deprecated in gensim.
        vector_row = '\t'.join(map(str, model.wv[word]))
        tensors.write(vector_row + '\n')

  if __name__ == '__main__':
