In [46]:
# imports and set up logging
import spacy
import gensim 
import logging
import glob, os
# Configure the root logger so INFO-level messages are shown, each prefixed
# with a timestamp and its severity level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [47]:
# Folder holding every source text used to train the model.
# This is a relative path, so it is resolved against the current working directory.
data_dir = "../corpus"

In [48]:
# Read every .txt file in data_dir into memory, one string per source text,
# in sorted filename order.
#
# NOTE: os.chdir changes the kernel's working directory, so relative paths used
# afterwards resolve against data_dir. Because data_dir is itself relative,
# re-running this cell without restarting the kernel applies it a second time.
os.chdir(data_dir)
documents = []
for filename in sorted(glob.glob("*.txt")):
    # The context manager closes the file handle as soon as the text is read.
    # The corpus file names advertise UTF-8, so decode explicitly instead of
    # relying on the platform's default encoding.
    with open(filename, 'r', encoding='utf-8') as source_file:
        filedata = source_file.read()
    print(filename + " = " + str(len(filedata)) + " chars")
    documents.append(filedata)

Hobbit_input_utf8.txt = 508213 chars
LostTales1_input_utf8.txt = 383135 chars
LostTales2_input_utf8.txt = 462819 chars
LotR_complete_input_utf8.txt = 2565751 chars
Silmarillion_input_utf8.txt = 698110 chars


In [49]:
%%time

# Use the NLP library spaCy (spacy.io) to cut each document into sentences and
# drop every punctuation and whitespace token.
#
# We could also lowercase and lemmatize everything here, and remove stopwords,
# but we aren't going to for now.
#

# Load the model
nlp = spacy.load('en_core_web_lg')
# Raise the per-text length limit: the largest source text printed by the
# loading cell is about 2.57 million characters.
nlp.max_length = 3000000

all_sentences = []
for doc in documents:
    # Replace all the line feeds with spaces, then run the spaCy pipeline on
    # the whole source document.
    spacy_doc = nlp(doc.replace("\n", " "))
    for spacy_sentence in spacy_doc.sents:
        # Keep only the tokens that are neither whitespace nor punctuation.
        sentence_clean = [
            token.text
            for token in spacy_sentence
            if token.pos_ != "SPACE" and token.pos_ != "PUNCT"
        ]
        # A sentence made up solely of punctuation/whitespace has nothing left;
        # skip it rather than storing an empty string as a training sentence.
        if sentence_clean:
            all_sentences.append(" ".join(sentence_clean))


CPU times: user 13min 56s, sys: 42.1 s, total: 14min 38s
Wall time: 14min 47s


In [50]:
# Report how many sentences the corpus was split into.
print(len(all_sentences))

54765


In [51]:
from gensim.models import word2vec, Phrases
from gensim.models.phrases import Phraser

#documents = ["the mayor of new york was there", "human computer interaction and machine learning has now become a trending research area","human computer interaction is interesting","human computer interaction is a pretty interesting subject", "human computer interaction is a great and new subject", "machine learning can be useful sometimes","new york mayor was present", "I love machine learning because it is a new subject area", "human computer interaction helps people to get user friendly applications"]

# Tokenize each cleaned sentence on single spaces. Empty sentences are dropped:
# "".split(" ") yields [''], which would add an empty-string "word" to the
# vocabulary.
sentence_stream = [sentence.split(" ") for sentence in all_sentences if sentence]

# Learn frequent two-word collocations, then run phrase detection again on the
# bigram-merged stream so that three-word phrases can be merged as well.
bigram = Phraser(Phrases(sentence_stream))
trigram = Phraser(Phrases(bigram[sentence_stream]))

# Apply both phrase models to every sentence (bigram first, then trigram).
trigram_sentences_project = [trigram[bigram[sent]] for sent in sentence_stream]

# Set values for various parameters
num_features = 300    # Word vector dimensionality
min_word_count = 1    # Minimum word count
num_workers = 20      # Number of threads to run in parallel
context = 5           # Context window size
downsampling = 1e-3   # Downsample setting for frequent words
skip_grams = 1        # 0 for CBOW, 1 for skip-grams

# NOTE(review): `size=` and `model.wv.vocab` are the gensim 3.x names; gensim 4
# renamed them (`vector_size`, `wv.key_to_index`) - confirm the installed version
# before upgrading.
model = word2vec.Word2Vec(
    trigram_sentences_project,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
    sg=skip_grams,
)

# Show a small sample of the learned vocabulary as a sanity check.
vocab = list(model.wv.vocab.keys())
print(vocab[:10])


2021-10-07 16:56:57,240 : INFO : collecting all words and their counts
2021-10-07 16:56:57,241 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2021-10-07 16:56:57,568 : INFO : PROGRESS: at sentence #10000, processed 201113 words and 98653 word types
2021-10-07 16:56:57,832 : INFO : PROGRESS: at sentence #20000, processed 365659 words and 153214 word types
2021-10-07 16:56:58,049 : INFO : PROGRESS: at sentence #30000, processed 491546 words and 186647 word types
2021-10-07 16:56:58,253 : INFO : PROGRESS: at sentence #40000, processed 611015 words and 216902 word types
2021-10-07 16:56:58,486 : INFO : PROGRESS: at sentence #50000, processed 752714 words and 248416 word types
2021-10-07 16:56:58,694 : INFO : collected 274159 word types from a corpus of 876492 words (unigram + bigrams) and 54765 sentences
2021-10-07 16:56:58,695 : INFO : using 274159 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
2021-10-07 16:56:58,696 : INF

2021-10-07 16:57:26,152 : INFO : worker thread finished; awaiting finish of 19 more threads
2021-10-07 16:57:26,235 : INFO : worker thread finished; awaiting finish of 18 more threads
2021-10-07 16:57:26,242 : INFO : worker thread finished; awaiting finish of 17 more threads
2021-10-07 16:57:26,273 : INFO : worker thread finished; awaiting finish of 16 more threads
2021-10-07 16:57:26,274 : INFO : worker thread finished; awaiting finish of 15 more threads
2021-10-07 16:57:26,285 : INFO : worker thread finished; awaiting finish of 14 more threads
2021-10-07 16:57:26,293 : INFO : worker thread finished; awaiting finish of 13 more threads
2021-10-07 16:57:26,312 : INFO : worker thread finished; awaiting finish of 12 more threads
2021-10-07 16:57:26,319 : INFO : worker thread finished; awaiting finish of 11 more threads
2021-10-07 16:57:26,342 : INFO : worker thread finished; awaiting finish of 10 more threads
2021-10-07 16:57:26,344 : INFO : worker thread finished; awaiting finish of 9 mo

['JRR', 'TOLKIEN', 'THE', 'HOBBIT', 'Chapter', 'I', 'An', 'Unexpected', 'Party', 'In']


In [52]:
# Report how many distinct items (words and merged phrases) the model learned.
vocabulary_size = len(model.wv.vocab)
print(vocabulary_size)

22647


In [53]:
# Nearest neighbours of the query word in the trained embedding space.
w1 = "Bilbo"
model.wv.most_similar(positive=[w1])

2021-10-07 16:57:29,799 : INFO : precomputing L2-norms of word weight vectors


[('Sam', 0.8769451975822449),
 ('Gollum', 0.8649412393569946),
 ('Pippin', 0.8555766344070435),
 ('Frodo', 0.849555492401123),
 ('Strider', 0.8470858335494995),
 ('Tom', 0.8436428308486938),
 ('Mr._Butterbur', 0.8343122005462646),
 ('Thorin', 0.8338472843170166),
 ('Shagrat', 0.8319171667098999),
 ('Merry', 0.8291702270507812)]

In [54]:
# Nearest neighbours of the query word in the trained embedding space.
w1 = "dragon"
model.wv.most_similar(positive=[w1])

[('rider', 0.8914847373962402),
 ('sign', 0.8842803835868835),
 ('enemy', 0.8774009943008423),
 ('key', 0.8722566962242126),
 ('wolf', 0.8716298937797546),
 ('troll', 0.8692148327827454),
 ('Nazgyl', 0.8679828643798828),
 ('Great_Goblin', 0.8665998578071594),
 ('Rider', 0.8664273619651794),
 ('marked', 0.8639983534812927)]

In [55]:
# Nearest neighbours of the query word in the trained embedding space.
w1 = "trees"
model.wv.most_similar(positive=[w1])

[('shapes', 0.8648653030395508),
 ('bushes', 0.8618569374084473),
 ('rocks', 0.8409314751625061),
 ('holes', 0.837933361530304),
 ('thickets', 0.836373507976532),
 ('stones', 0.8329076170921326),
 ('points', 0.8316853642463684),
 ('swans', 0.830091118812561),
 ('grass', 0.8296234011650085),
 ('lines', 0.8273912668228149)]

In [56]:
# Write the input files to display via https://projector.tensorflow.org/
# tensorsfp receives one tab-separated vector per line; metadatafp receives the
# matching word on the same line number. Both paths are relative to the current
# working directory.
tensorsfp = "../vectors.txt"
metadatafp = "../metadata.txt"

# Plain write mode is enough (nothing is read back); UTF-8 is set explicitly so
# non-ASCII vocabulary entries are written the same way on every platform.
with open(tensorsfp, 'w', encoding='utf-8') as tensors, \
        open(metadatafp, 'w', encoding='utf-8') as metadata:
    for word in model.wv.index2word:
        metadata.write(word + '\n')
        # Look the vector up through model.wv: indexing the Word2Vec model
        # directly (model[word]) is deprecated.
        vector_row = '\t'.join(map(str, model.wv[word]))
        tensors.write(vector_row + '\n')

  if __name__ == '__main__':
