In [1]:
# imports and set up logging
import spacy
import glob, os
from collections import Counter
from pprint import pprint

In [2]:
# directory containing all source texts for training the model 
data_dir="../corpus"

In [3]:
os.chdir(data_dir)

documents = []
filenames = []

for filename in sorted(glob.glob("*.txt")):
    filedata = open(filename, 'r').read()
    print(filename + " = " + str(len(filedata)) + " chars")
    documents.append(filedata)
    filenames.append(filename)

LotR_complete_input_utf8.txt = 2565751 chars


In [4]:
%%time

# Use the NLP library SpaCy (spacy.io) to find all tokens (words) and lemmas (word roots)
#  that are not spaces, punctuation or "stopwords" (common words)

# Load the model
nlp = spacy.load('en_core_web_lg')
nlp.max_length = 3000000

doc_words = []
doc_lemmas = []
for doc in documents:
    all_words = []
    all_lemmas = []
    # replace all the line feeds with spaces
    doc = doc.replace("\n", " ")
    # run the spacy tokenization/nlp algorithm on each source document
    spacy_doc = nlp(doc)
    for token in spacy_doc:
        if token.pos_ != "SPACE" and token.pos_ != "PUNCT" and token.is_stop != True:
            all_words.append(token.text)
            all_lemmas.append(token.lemma_)
    doc_words.append(all_words)
    doc_lemmas.append(all_lemmas)

#print(all_lemmas)

CPU times: user 11min 19s, sys: 29.1 s, total: 11min 48s
Wall time: 11min 55s


In [5]:
# Provide the top 100 most frequent words for each source

filecounter = 0
for word_doc in doc_words:
    most_freq_words = Counter(word_doc)
    common_words = most_freq_words.most_common(100)
    print("Most common words in " + filenames[filecounter] + "\n")
    pprint(common_words)
    print("-----")
    # Let's also write each list to a file
    outfilename = "../" + filenames[filecounter] + "_most_common_words.txt"
    outfile = open(outfilename, 'w')
    outfile.write("Most common words in " + filenames[filecounter] + "\n\n")
    for word, count in common_words:
        outfile.write(word + "\t" + str(count) + "\n")
    outfile.close()
    # iterate
    filecounter += 1


Most common words in LotR_complete_input_utf8.txt

[('said', 4107),
 ('Frodo', 1988),
 ('Sam', 1290),
 ('came', 1252),
 ('Gandalf', 1120),
 ('like', 1104),
 ('long', 1101),
 ('great', 1033),
 ('come', 979),
 ('away', 923),
 ('far', 762),
 ('way', 735),
 ('Aragorn', 722),
 ('went', 707),
 ('Pippin', 685),
 ('know', 672),
 ('time', 666),
 ('dark', 664),
 ('shall', 662),
 ('old', 635),
 ('looked', 619),
 ('eyes', 613),
 ("'", 578),
 ('light', 578),
 ('little', 572),
 ('Merry', 544),
 ('thought', 539),
 ('saw', 538),
 ('stood', 511),
 ('night', 502),
 ('hand', 499),
 ('heard', 488),
 ('think', 481),
 ('hobbits', 481),
 ('passed', 472),
 ('left', 465),
 ('day', 462),
 ('men', 459),
 ('road', 438),
 ('good', 425),
 ('trees', 414),
 ('things', 412),
 ('Gollum', 406),
 ('going', 406),
 ('land', 396),
 ('found', 392),
 ('turned', 390),
 ('Gimli', 390),
 ('fell', 387),
 ('end', 378),
 ('seen', 375),
 ('water', 371),
 ('hope', 370),
 ('cried', 369),
 ('lay', 368),
 ('Bilbo', 367),
 ('black', 361)

In [6]:
# Provide the top 100 most frequent lemmas for each source

filecounter = 0
for lemma_doc in doc_lemmas:
    most_freq_lemmas = Counter(lemma_doc)
    common_lemmas = most_freq_lemmas.most_common(100)
    print("Most common lemmas in " + filenames[filecounter])
    pprint(common_lemmas)
    print("-----")
    # Let's also write each list to a file
    outfilename = "../" + filenames[filecounter] + "_most_common_lemmas.txt"
    outfile = open(outfilename, 'w')
    outfile.write("Most common lemmas in " + filenames[filecounter] + "\n\n")
    for lemma, count in common_lemmas:
        outfile.write(lemma + "\t" + str(count) + "\n")
    outfile.close()
    # iterate
    filecounter += 1


Most common lemmas in LotR_complete_input_utf8.txt
[('say', 4223),
 ('come', 2749),
 ('Frodo', 1988),
 ('go', 1482),
 ('long', 1329),
 ('Sam', 1290),
 ('look', 1233),
 ('great', 1156),
 ('like', 1147),
 ('know', 1139),
 ('Gandalf', 1120),
 ('see', 968),
 ('away', 949),
 ('think', 938),
 ('man', 874),
 ('day', 831),
 ('far', 812),
 ('time', 808),
 ('way', 800),
 ('fall', 787),
 ('find', 778),
 ('leave', 773),
 ('pass', 765),
 ('hand', 759),
 ('stand', 744),
 ('hear', 734),
 ('light', 731),
 ('Aragorn', 722),
 ('eye', 720),
 ('dark', 696),
 ('old', 693),
 ('Pippin', 685),
 ('hobbit', 684),
 ('shall', 682),
 ('turn', 666),
 ('thing', 663),
 ('speak', 637),
 ('lie', 629),
 ('tree', 608),
 ('good', 597),
 ('little', 582),
 ("'", 578),
 ('cry', 569),
 ('night', 554),
 ('Merry', 544),
 ('road', 541),
 ('land', 538),
 ('tell', 508),
 ('end', 506),
 ('ride', 501),
 ('take', 499),
 ('foot', 485),
 ('feel', 480),
 ('voice', 469),
 ('run', 466),
 ('answer', 461),
 ('let', 460),
 ('shadow', 459),
 