In [76]:
import time
import json
import re
import csv
from motor.motor_asyncio import AsyncIOMotorClient
from utils import prep_definition_text, definition_word_counter
from collections import Counter
import asyncio
# The selector policy class only exists on Windows builds of asyncio; guard
# the lookup so this cell also runs on other platforms, where the default
# event loop policy is simply kept.
if hasattr(asyncio, "WindowsSelectorEventLoopPolicy"):
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

In [77]:
def basic_parser(full_definition: str) -> str:
    """Reduce a Word's full definition text to a lowercase string of its definition lines.

    Only the spans that begin at a colon and run to the end of that line
    (newline included) are kept; all other text is discarded. The kept spans
    are stripped, concatenated with no separator, and cleared of colons,
    commas, periods and parentheses.

    Args:
        full_definition (str): The full definition text of a Word, to be parsed

    Returns:
        str: the lowercased, stripped concatenation of the kept spans, or an
            empty string when no span matches
    """
    definition_lines = re.findall(":.+\\n", full_definition)
    # NOTE(review): spans are joined without a separator, so the last word of
    # one span and the first word of the next run together unless whitespace
    # follows the colon -- confirm this is acceptable for word counting.
    joined = ''.join(line.strip() for line in definition_lines)
    # Remove the unwanted punctuation characters in a single pass.
    without_punctuation = joined.translate(str.maketrans('', '', ':,.()'))
    return without_punctuation.strip().lower()

In [78]:
# Read the sample word list, one entry per line. readlines() keeps each
# entry's trailing newline. The file is opened read-only: nothing is written
# here, so "r+" only added a needless requirement that the file be writable.
with open("../word_sample.txt", "r", encoding="utf-16") as f:
    words = f.readlines()

In [79]:
# Database: a Motor client built with no arguments, then the dictionary
# collection inside the MerriamWebster database.
client = AsyncIOMotorClient()
db = client["MerriamWebster"]
collection = db["UpdatedMerriamWebsterDictionary"]

In [81]:
def find_and_prep_word(word: str) -> Counter:
    """Look up a word's stored definition and count the words it contains.

    Args:
        word (str): value matched against the "word" field of the collection

    Returns:
        Counter: the result of ``definition_word_counter`` (called with
            ``remove_stopwords=True``) on the parsed definition text

    Raises:
        TypeError: when the lookup result cannot be subscripted, e.g. when
            ``find_one`` returns None
    """
    # NOTE(review): `collection` in this notebook is created from
    # AsyncIOMotorClient, whose find_one returns an awaitable rather than a
    # document; if so, the subscript below raises TypeError for every word.
    # Confirm which client this synchronous helper is meant to run against.
    record = collection.find_one({"word": word})
    raw_definition = record["dictionary_definitions"]
    return definition_word_counter(basic_parser(raw_definition), remove_stopwords=True)

def prep_rows(word: str) -> list[list]:
    """Build the CSV rows for one word.

    Args:
        word (str): the word to look up; passed unmodified to
            ``find_and_prep_word``

    Returns:
        list[list]: one ``[word.strip(), sub_word, count]`` row per entry of
            the word's definition counter
    """
    counts = find_and_prep_word(word)
    return [[word.strip(), sub_word, count] for sub_word, count in counts.items()]

async def find_and_prep_async(word: str) -> "Counter | None":
    """Asynchronously look up a word's definition and count the words in it.

    Args:
        word (str): value matched against the "word" field of the collection

    Returns:
        Counter | None: the result of ``definition_word_counter`` (called
            with ``remove_stopwords=False``) on the parsed definition, or
            None when no document matches or the document has no non-empty
            "dictionary_definitions" value
    """
    # Await the query first and only then index the document. Subscription
    # binds tighter than ``await``, so ``await coll.find_one(...)[key]``
    # subscripts the pending awaitable and raises TypeError.
    document = await collection.find_one({"word": word})
    if not document:
        return None
    definition = document.get("dictionary_definitions")
    if not definition:
        return None
    cleaned_definition = basic_parser(definition)
    # NOTE(review): the synchronous find_and_prep_word passes
    # remove_stopwords=True -- confirm the difference is intended.
    return definition_word_counter(cleaned_definition, remove_stopwords=False)

async def prep_rows_async(word: str) -> list[list]:
    """Asynchronously build the CSV rows for one word.

    Args:
        word (str): the word to look up; passed unmodified to
            ``find_and_prep_async``

    Returns:
        list[list]: one ``[word.strip(), sub_word, count]`` row per entry of
            the word's definition counter, or None when the counter is
            None or empty
    """
    counts = await find_and_prep_async(word)
    # Guard clause: nothing to emit for a missing or empty counter.
    if not counts:
        return None
    return [[word.strip(), sub_word, count] for sub_word, count in counts.items()]

async def make_tasks(words: list[str]):
    """Run ``prep_rows_async`` for every word concurrently.

    Args:
        words (list[str]): the words to look up

    Returns:
        list: one entry per word, in the same order as ``words``; each entry
            is whatever ``prep_rows_async`` returned for that word
    """
    # Schedule every lookup first, then await them together. Awaiting each
    # task right after creating it would run the lookups one at a time and
    # hand gather() finished results instead of awaitables, which it rejects.
    tasks = [asyncio.create_task(prep_rows_async(word)) for word in words]
    return await asyncio.gather(*tasks)

async def main(words: list[str]):
    """Entry-point coroutine: collect the rows for ``words`` via ``make_tasks``.

    Args:
        words (list[str]): the words to look up

    Returns:
        whatever ``make_tasks`` returns for ``words``
    """
    return await make_tasks(words)

In [82]:
all_rows = asyncio.run(main(words[:100]))
print(all_rows)

RuntimeError: asyncio.run() cannot be called from a running event loop

In [68]:
# Export one CSV row per (word, word-in-definition) pair and time the run.
start = time.time()
num_rows = []  # number of rows written for each word whose lookup succeeded
# Write-only mode: the file is never read back inside this cell.
with open("test_graph_items.csv", "w", encoding="utf-16", newline="") as f:
    csvwriter = csv.writer(f)
    csvwriter.writerow(["word", "word_in_definition", "word_count"])
    for word in words:
        # Only the lookup is guarded, so a TypeError raised while writing
        # rows is not silently reported as a failed lookup.
        try:
            rows_to_write = prep_rows(word)
        except TypeError:
            # prep_rows raised TypeError for this word (presumably no matching
            # document); report it without its surrounding whitespace.
            print(word.strip())
            continue
        num_rows.append(len(rows_to_write))
        csvwriter.writerows(rows_to_write)

print(time.time() - start)

Ushak

Macrochiroptera

nokkelost

hallen

septavalent

Homobasidiomycetidae

ponte

hatable

impolder

soogan

Fellini

cambrel

condrodite

choy

hicht

wove

traumatise

paleing

pentamerid

Lithodomus

cooley

Xosa

spitten

Frenchier

Tubinares

romage

aumbry

recule

trigging

colliest

remolade

-clinia

volatilise

gruing

unnourishing

naughts-and-crosses

kest

octine

did

menyie

kourbash

dunum

ladies'-pocket

perits

sinuauricular

arahat

Ancylocladus

furm

Scolopendrium

lives

radioscopic

formy

tracs

70.38991069793701


In [71]:
# Average number of rows written per word. Guarded so an empty num_rows
# (no successful lookup) prints nan instead of raising ZeroDivisionError.
print(sum(num_rows) / len(num_rows) if num_rows else float("nan"))

7.271383315733897


In [72]:
# Number of words for which rows were collected (num_rows gets one entry per
# word whose prep_rows call succeeded in the CSV export cell).
print(len(num_rows))

947
