In [98]:
import time
import json
import re
import csv
import random
from pymongo import MongoClient
from utils import basic_parser, prep_definition_text, definition_word_counter
from collections import Counter
import spacy
from nltk.stem import WordNetLemmatizer

In [99]:
lemmatizer = WordNetLemmatizer()
nlp = spacy.load("en_core_web_lg")

In [2]:
# Database 
client = MongoClient()
db = client.MerriamWebster
collection = db.UpdatedMerriamWebsterDictionary

In [8]:
# Two Approaches, Testing their Speeds

# First Approach
# Assembling a List of all Words, Turning that into a set, and checking membership with in

# Second Approach
# Using MongoDB, find_one, if none then not in list (would only become competitive when checking for large amounts of memberships, asynchronously)

In [11]:
# First Approach
# Sets and Lists

In [83]:
with open("./unique_words.txt", "r+", encoding="utf-16") as f:
    unique_terms = f.readlines()
    term_set = set(unique_terms)

In [23]:
random_words = random.sample(unique_words, 100)
print(random_words)

['intertie\n', 'Pathein\n', 'vestiment\n', 'crise de conscience\n', 'gemeled\n', 'Haematobranchia\n', 'adhesive binding\n', 'subfragment\n', 'Wappo\n', 'seaworn\n', 'bolt handle\n', 'kettle of fish\n', 'drawn butter\n', 'garden flea\n', 'scintillation counter\n', 'Dutch-process\n', 'pretension\n', 'gravi-\n', 'zeroth\n', 'loja bark\n', 'corbiestep\n', 'anahau\n', 'predicate\n', 'pommel\n', 'wit-cracker\n', 'odori\n', 'menthane\n', 'missionize\n', 'Apache trout\n', 'translatorese\n', 'sea tangle\n', 'melilotus\n', 'troop carrier\n', 'hexulose\n', 'toolmarking\n', 'low-flung\n', 'harbor porpoise\n', 'pentahydroxy\n', 'stingray\n', 'whapuku\n', 'radiophotograph\n', 'hamadryad\n', 'case-phrase\n', 'Papilionidae\n', 'siccative\n', 'Sagittarius\n', 'allostatic load\n', 'curriery\n', 'Maori\n', 'cooba\n', 'b-ball\n', 'detached service\n', 'drum scale\n', 'woodcutting\n', 'gnomonics\n', 'Parcheesi\n', 'lady-of-the-night\n', 'restimulate\n', 'crackiest\n', 'clamaroo\n', 'xi#g\n', 'natal#g\n', '

In [74]:
start = time.time()
initial_run = []
words_searched = 0
for random_word in random_words:
    word = collection.find_one({"word": random_word})
    definition = word["dictionary_definitions"]
    if definition: 
        parsed_definition = basic_parser(definition)
        definition_counter = definition_word_counter(parsed_definition, remove_stopwords=False)
        for word, count in definition_counter.items():
            words_searched += 1
            if f"{word}\n" not in word_set:
                initial_run.append(word)
print(time.time() - start)
print("Number of Words Searched:" + str(words_searched))

4.76609206199646
Number of Words Searched:1200


In [76]:
initial_run

['interconnection',
 'permitting',
 'systems',
 'myanmar',
 'burma',
 'irrawaddy',
 'yangon',
 'rangoon',
 '216000',
 'paired',
 'arthropods',
 'consisting',
 'trilobites',
 'eurypterids',
 'crabs',
 'larger',
 'indian',
 'california',
 'yukian',
 'wappo',
 'eaten',
 'projecting',
 'operated',
 'affairs',
 'dealt',
 'melted',
 'clarified',
 'detecting',
 'registering',
 'scintillations',
 'treated',
 'pretentiousness',
 'numbered',
 'steps',
 'philippine',
 'livistona',
 'rotundifolia',
 'clubs',
 'bowstrings',
 'thatching',
 'hats',
 'fans',
 'affirmed',
 'denied',
 'designating',
 'expresses',
 'consists',
 'objects',
 'complements',
 'modifiers',
 'removable',
 'u-shaped',
 'handles',
 'makes',
 'wisecracks',
 'japanese',
 'characterized',
 'hydrocarbons',
 'c10h20',
 'hexahydro',
 'derivatives',
 'cymenes',
 'terpenoids',
 'deep-bodied',
 'oncorhynchus',
 'apache',
 'inhabiting',
 'streams',
 'lakes',
 'arizona',
 'yellowish-brown',
 'sides',
 'spots',
 'translator',
 'translated',

In [77]:
start = time.time()
capitalized_run = []
words_searched = 0
for word_lower in initial_run:
    word = collection.find_one({"word": f"{word_lower.capitalize()}\n"})
    if word:
        definition = word["dictionary_definitions"]
        if definition: 
            parsed_definition = basic_parser(definition)
            definition_counter = definition_word_counter(parsed_definition, remove_stopwords=False)
            for word, count in definition_counter.items():
                words_searched += 1
                if f"{word}\n" not in word_set and f"{word.capitalize()}\n" not in word_set:
                    capitalized_run.append(word)
print(time.time() - start)
print("Number of Words Searched:" + str(words_searched))

20.007914066314697
Number of Words Searched:620


In [78]:
len(capitalized_run)

125

In [79]:
capitalized_run

['located',
 'crossed',
 'cancer;',
 'naypyidaw',
 'yangon',
 '261228',
 'miles',
 '676578',
 'kilometers',
 '55623000',
 '1300',
 'miles',
 '2092',
 'kilometers',
 'burma',
 'mouths',
 'yangon',
 '2',
 'languages',
 'indians',
 "nation's",
 'state;',
 '158706',
 'miles',
 '411048',
 'kilometers',
 '37253956',
 'yukian',
 'malaysian',
 'palms',
 'having',
 'resembling',
 'salmons',
 'rays',
 'branchiostegals',
 'caeca',
 'rakers',
 'commercially',
 'fishes',
 'coastal',
 'streams',
 'peoples',
 'languages',
 'criminals',
 'bordering',
 'mexico;',
 '113909',
 'miles',
 '296163',
 'kilometers',
 '6392017',
 'relating',
 'nations',
 'elasmobranchs',
 'dasyatis',
 'comprising',
 'stingrays',
 'including',
 'butterflies',
 'zodiacal',
 'pictured',
 'containing',
 'located',
 'signs',
 'peoples',
 'languages',
 'zealand',
 'relating',
 'inhabitants',
 'languages',
 'biogeographic',
 'comprises',
 'islands',
 'celebes',
 'zealand',
 'languages',
 'indians',
 'shrubs',
 'solanaceae',
 'commonl

In [80]:
def get_lemma(word) -> str:
    doc = nlp(word)
    return doc[0].lemma_ if doc else None

start = time.time()
lemmatized_run = []
words_searched = 0
for word in capitalized_run:
    lemma = get_lemma(word)
    if lemma:
        word_doc = collection.find_one({"word": f"{lemma}\n"})
        if word_doc:
            definition = word_doc["dictionary_definitions"]
            if definition: 
                parsed_definition = basic_parser(definition)
                definition_counter = definition_word_counter(parsed_definition, remove_stopwords=False)
                for word, count in definition_counter.items():
                    words_searched += 1
                    if f"{word}\n" not in word_set and f"{word.capitalize()}\n" not in word_set:
                        lemmatized_run.append(word)
print(time.time() - start)

8.602387189865112


In [81]:
# Combining the Bits of the First Approach

In [86]:
# First Removing Terms, only words
unique_words = [word for word in unique_terms if " " not in word]
word_set = set(unique_words)

In [134]:
start = time.time()
failures = []
actual_words = []

for word in unique_words[:1000]:
    word_doc = collection.find_one({"word": word})
    definition = word_doc["dictionary_definitions"]
    if definition:
        parsed_definition = basic_parser(definition)
        definition_counter = definition_word_counter(parsed_definition, remove_stopwords=False)
        for sub_word, count in definition_counter.items():
            words_searched += 1
            if f"{sub_word}\n" not in word_set and f"{sub_word.capitalize()}\n" not in word_set and f"{lemmatizer.lemmatize(sub_word, 'v')}\n" not in word_set and f"{get_lemma(sub_word)}\n" not in word_set:
                failures.append(sub_word)
            else:
                actual_words.append(sub_word)
print(time.time() - start)
print(len(failures))
print(len(actual_words))

46.18916988372803
500
10358


In [135]:
print(failures)

['insidiously', 'secretly', 'compactly', 'stainable', 'sender', '2', 'c21h19no5', 'papaveraceous', 'possessor', 'extremely', 'sabelloid', 'spirally', 'peninsular', 'distinguishable', 'mystical', '2', 'c18h18', 'abietic', 'manchuria;', '72201', '187723', '27452815', '2', 'yungki', 'ˈyu̇ŋ-ˈjē', '1470000', '4c', '2', '1b', '2', 'niger-congo', 'aquarians', 'progressively', 'nervously', 'indecisively', 'estat—whence', 'uo2', 'terpenoid', 'c10h16o', 'hydrogenation', 'allergenic', 'miner', 'appropriateness', 'gestational', 'babblers', '1877–1960', 'illustrator', 'münzer', 'c6h8n2o2s', 'quickly', '190132', 'asclepiades', '4-wheeled', '4', '2005', "quebec's", 'warty', 'pleural', 'medusoid', '2a', 'imaginatively', '2', '105', '32', 'habitually', 'tufted', 'densely', '478000', '2', '4b', 'plainly', 'tyrant', 'indolyl', 'bantu-speaking', 'tembuland', 'manipulation', "shakespeare's", 'con32', 'azide', 'suavity', 'brusqueness', '2', 'survivor', 'pyrrha', 'systematic', 'completely', 'forcibly', 'nois

In [141]:
def get_lemma(word) -> str:
    doc = nlp(word)
    return doc[0].lemma_ if doc else None
start = time.time()
print(get_lemma("algae"))
print(time.time() - start)

algae
0.004003047943115234


In [140]:
start = time.time()
print(lemmatizer.lemmatize("algae", 'v'))
print(time.time() - start)

algae
0.0
