In [1]:
# Standard Library
import time
import csv
from collections import Counter
import random

# Third-Party Libraries
import pandas as pd
import numpy as np
import spacy
from nltk.stem import WordNetLemmatizer

# Custom Libraries
from utils import definition_word_counter, basic_parser

In [2]:
# Shared NLP resources used further down: an NLTK WordNet lemmatizer instance
# and the spaCy pipeline loaded from the "en_core_web_sm" model.
lemmatizer = WordNetLemmatizer()
nlp = spacy.load("en_core_web_sm")

## Importing and Cleaning MongoDB Data

In [3]:
# Load the dictionary export; the path is relative to the notebook's working directory.
df = pd.read_csv("./data/UpdatedMerriamWebsterDictionaryCSV.csv")

In [4]:
df.head()

Unnamed: 0,_id,dictionary_definitions,part_of_speech,pronunciation,syllables,word
0,63a5c2e55e1fcf636f29eee4,\n\n\nCentral Polynesia \n\n\n\n\n\n\n\n: kava...,noun,\nˈ(ʔ)ävə\n,"[""'ava\n""]",'ava\n
1,63a5c2e55e1fcf636f29eee5,,,,"[""'avas\n""]",'avas\n
2,63a5c2e55e1fcf636f29eee6,\n\n\n\n\n\n\n: because\n\n\n \n\n\n,conjunction,ˈkȯz,"[""'cause\n""]",'cause\n
3,63a5c2e55e1fcf636f29eee7,\n\n\n\n\n\n\n: them\n\n\n \n\n\n,pronoun,"\nəm;\n after p,\nb,\nf,\n or v,\n often ᵊm\n","[""'em\n""]",'em\n
4,63a5c2e55e1fcf636f29eee8,\n\n\n\n\n\n\n: afro\nThe '70s were certainly ...,noun,ˈfrō,"[""'fro\n""]",'fro\n


In [5]:
# Normalizing the words that were part of the Geographical or Biographical Dictionary
def normalize_word(word: str):
    """Return ``word`` stripped of surrounding whitespace, without tags, lower-cased.

    The ``#g`` and ``#b`` markers (presumably the geographical / biographical
    dictionary tags) are removed wherever they occur, ``#g`` first and then ``#b``.
    Whitespace is stripped *before* the tags are removed, so whitespace that
    preceded a trailing tag is kept.
    """
    cleaned = word.strip()
    for source_tag in ("#g", "#b"):
        cleaned = cleaned.replace(source_tag, "")
    return cleaned.lower()
# Set of every normalized headword, used later for membership tests.
normalized_headwords = df["word"].transform(normalize_word)
unique_words = set(normalized_headwords)

In [6]:
# Replace the raw headwords (which still carry trailing newlines, see the preview above)
# with their normalized form.
df["word"] = df.word.transform(normalize_word)

In [7]:
df.head()

Unnamed: 0,_id,dictionary_definitions,part_of_speech,pronunciation,syllables,word
0,63a5c2e55e1fcf636f29eee4,\n\n\nCentral Polynesia \n\n\n\n\n\n\n\n: kava...,noun,\nˈ(ʔ)ävə\n,"[""'ava\n""]",'ava
1,63a5c2e55e1fcf636f29eee5,,,,"[""'avas\n""]",'avas
2,63a5c2e55e1fcf636f29eee6,\n\n\n\n\n\n\n: because\n\n\n \n\n\n,conjunction,ˈkȯz,"[""'cause\n""]",'cause
3,63a5c2e55e1fcf636f29eee7,\n\n\n\n\n\n\n: them\n\n\n \n\n\n,pronoun,"\nəm;\n after p,\nb,\nf,\n or v,\n often ᵊm\n","[""'em\n""]",'em
4,63a5c2e55e1fcf636f29eee8,\n\n\n\n\n\n\n: afro\nThe '70s were certainly ...,noun,ˈfrō,"[""'fro\n""]",'fro


In [9]:
# Keep only the entries that have definition text. The explicit .copy() makes `df`
# an independent frame, so adding columns to it afterwards does not raise pandas'
# SettingWithCopyWarning.
df = df[df.dictionary_definitions.notnull()].copy()

In [11]:
# basic_parser is imported from the project-local utils module. Judging by the preview
# below it reduces the raw scraped text to a lower-cased definition string
# (TODO confirm against utils).
df["parsed_definitions"] = df.dictionary_definitions.transform(basic_parser)

In [12]:
df.head()

Unnamed: 0,_id,dictionary_definitions,part_of_speech,pronunciation,syllables,word,parsed_definitions
0,63a5c2e55e1fcf636f29eee4,\n\n\nCentral Polynesia \n\n\n\n\n\n\n\n: kava...,noun,\nˈ(ʔ)ävə\n,"[""'ava\n""]",'ava,kava
2,63a5c2e55e1fcf636f29eee6,\n\n\n\n\n\n\n: because\n\n\n \n\n\n,conjunction,ˈkȯz,"[""'cause\n""]",'cause,because
3,63a5c2e55e1fcf636f29eee7,\n\n\n\n\n\n\n: them\n\n\n \n\n\n,pronoun,"\nəm;\n after p,\nb,\nf,\n or v,\n often ᵊm\n","[""'em\n""]",'em,them
4,63a5c2e55e1fcf636f29eee8,\n\n\n\n\n\n\n: afro\nThe '70s were certainly ...,noun,ˈfrō,"[""'fro\n""]",'fro,afro from dr j's hindenburg-sized 'fro to howa...
5,63a5c2e55e1fcf636f29eee9,\n\n\n\n\n\n\n: will\nyou'll be late\n\n\n \n\n\n,verb,"\nl,\nəl,\nᵊl\n","[""'ll\n""]",'ll,will


In [13]:
# Renumber the rows 0..n-1 after the filtering above; the previous labels are
# kept in a new "index" column because drop is left at its default.
df = df.reset_index()

## Transforming MongoDB Data to Graph Data

In [14]:
def prep_graph_rows(word: str, parsed_definition_text: str) -> list[list]:
    """Build the graph rows for a single dictionary entry.

    Args:
        word: The headword; surrounding whitespace is stripped in every row.
        parsed_definition_text: The parsed definition text, passed to
            ``definition_word_counter`` with ``remove_stopwords=False``.

    Returns:
        One ``[word, sub_word, count]`` list per item of the mapping returned by
        ``definition_word_counter``; empty when that mapping is empty.
    """
    sub_word_counts = definition_word_counter(parsed_definition_text, remove_stopwords=False)

    edge_rows = []
    for sub_word, count in sub_word_counts.items():
        edge_rows.append([word.strip(), sub_word, count])
    return edge_rows

In [15]:
df.shape

(245423, 8)

In [16]:
# Collect one [word, word_in_definition, count] row per (entry, definition word) pair.
# Iterating the two columns directly avoids building a full row Series twice per
# entry through df.loc[i][...], and does not depend on the index being exactly 0..n-1.
rows = []
for headword, parsed_definition in zip(df["word"], df["parsed_definitions"]):
    rows.extend(prep_graph_rows(headword, parsed_definition))

In [17]:
# Edge list columns: word, word_in_definition, count
edge_columns = ["word", "word_in_definition", "count"]
graph_df = pd.DataFrame(rows, columns=edge_columns)

In [18]:
graph_df.shape

(3063845, 3)

In [19]:
graph_df.head()

Unnamed: 0,word,word_in_definition,count
0,'ava,kava,1
1,'cause,because,1
2,'em,them,1
3,'fro,afro,1
4,'fro,from,1


## Assembling Lemma Dict

In [20]:
def get_lemma(word: str) -> str:
    """Return the lemma of the first token produced by ``nlp(word)``.

    Returns ``None`` (despite the ``str`` annotation) when the parsed document
    is falsy, e.g. when it contains no tokens.
    """
    doc = nlp(word)
    if not doc:
        return None
    return doc[0].lemma_

In [22]:
# Distinct words that occur in at least one definition.
unique_word_in_def = graph_df["word_in_definition"].unique().tolist()

In [23]:
len(unique_word_in_def)

133087

In [24]:
# Definition words that are not themselves normalized headwords; only these are lemmatized.
words_in_def_to_lemmatize = [
    candidate
    for candidate in unique_word_in_def
    if candidate not in unique_words
]

In [25]:
len(words_in_def_to_lemmatize)

73533

In [26]:
# Creating a Dict for Mapping Words to their Lemmas
# The words are streamed through nlp.pipe, which processes texts in batches and is
# spaCy's documented way to run a pipeline over many texts, instead of paying the
# per-call overhead of nlp() once per word.
start = time.perf_counter()
word_to_word_lemmas = {}
for word, doc in zip(words_in_def_to_lemmatize, nlp.pipe(words_in_def_to_lemmatize)):
    # Same rule as get_lemma: lemma of the first token, nothing for an empty doc.
    word_lemma = doc[0].lemma_ if doc else None
    # Words whose lemma is missing or empty are left out of the mapping.
    if word_lemma:
        word_to_word_lemmas[word] = word_lemma
print(time.perf_counter() - start)

186.84618473052979


In [27]:
# Copy the edge list so the lemma swap applied to clean_graph_df leaves graph_df unchanged.
clean_graph_df = graph_df.copy()

In [28]:
graph_df.head()

Unnamed: 0,word,word_in_definition,count
0,'ava,kava,1
1,'cause,because,1
2,'em,them,1
3,'fro,afro,1
4,'fro,from,1


In [40]:
# Split the edge list row-wise into 12 partitions of near-equal size.
# np.array_split is applied to the row positions rather than to the DataFrame itself:
# handing it a DataFrame makes NumPy call DataFrame.swapaxes, which pandas has deprecated.
# Each partition is an independent copy, so it can be modified later without
# touching clean_graph_df and without SettingWithCopyWarning.
N_PARTITIONS = 12
row_positions = np.arange(len(clean_graph_df))
split_clean_df = [
    clean_graph_df.iloc[chunk].copy()
    for chunk in np.array_split(row_positions, N_PARTITIONS)
]

In [44]:
len(split_clean_df)

12

## Mapping Words to their Lemmas

In [41]:
# Lookup helper so the lemma swap can be done with Series.map instead of replace

def swap_with_lemma(word: str) -> str:
    """Return the lemma stored for ``word`` in ``word_to_word_lemmas``.

    Words without an entry in the mapping are returned unchanged.
    """
    return word_to_word_lemmas.get(word, word)

In [45]:
# Lemma-swap each partition in place and write it to its own CSV file; prints the elapsed seconds.
start = time.time()
for partition_number, partition in enumerate(split_clean_df):
    lemmatized_column = partition.word_in_definition.map(swap_with_lemma)
    partition["word_in_definition"] = lemmatized_column
    partition.to_csv(f"clean_graph_df_partition_{partition_number}.csv")
print(time.time() - start)

4.6946022510528564


In [48]:
# Apply the same lemma swap to the full, unpartitioned edge list.
lemmatized_full_column = clean_graph_df["word_in_definition"].map(swap_with_lemma)
clean_graph_df["word_in_definition"] = lemmatized_full_column

## Saving the Data

In [49]:
# Persist the complete lemma-mapped edge list (with default arguments, so the index is written too).
clean_graph_df.to_csv("clean_graph_df_full.csv")

In [39]:
# NOTE(review): this cell's execution count (39) is lower than that of the cells above,
# so it was run out of order. The partition loop already writes a file with this same
# name for partition 4, so this looks like a leftover; confirm before relying on it.
split_clean_df[4].to_csv("clean_graph_df_partition_4.csv")

In [56]:
# Keep only rows whose headword contains no whitespace, i.e. drop multi-word terms.
# The raw string avoids the invalid "\s" escape sequence of a plain string literal, and
# negating the mask replaces the `== False` comparison. na=True makes rows with a missing
# word count as "contains whitespace", so they are dropped exactly as `== False` dropped them.
# .copy() detaches the result from clean_graph_df.
graph_df_no_terms = clean_graph_df[~clean_graph_df.word.str.contains(r"\s", na=True)].copy()

In [59]:
# Persist the edge list restricted to headwords without whitespace.
graph_df_no_terms.to_csv("clean_graph_df_no_terms.csv")

In [201]:
clean_graph_df.head()

Unnamed: 0,word,word_in_definition,count
0,'ava,kava,1
1,'cause,because,1
2,'em,them,1
3,'fro,afro,1
4,'fro,from,1


## Creating Node Data

In [98]:
# NOTE(review): out-of-order cell (execution count 98). `all_words` is assigned again
# in the following cell from graph_df_no_terms, which overwrites this value.
all_words = set(clean_graph_df.word) | set(clean_graph_df.word_in_definition)

In [60]:
# All Words and Terms: every headword plus every definition word of the full edge list

all_words_terms = set(clean_graph_df.word).union(clean_graph_df.word_in_definition)

# All Words only: the same, taken from the edge list without whitespace-containing headwords

all_words = set(graph_df_no_terms.word).union(graph_df_no_terms.word_in_definition)

In [62]:
# Sort before building the Series: the iteration order of a set of strings changes
# between interpreter runs (string hash randomization), so without sorting the row
# order and the index given to each node would differ on every rerun.
all_words_terms_series = pd.Series(sorted(all_words_terms))

In [63]:
# Sort before building the Series: the iteration order of a set of strings changes
# between interpreter runs (string hash randomization), so without sorting the row
# order and the index given to each node would differ on every rerun.
all_words_series = pd.Series(sorted(all_words))

In [64]:
# Export the node list built from the full edge list (words and multi-word terms).
all_words_terms_series.to_csv("word_term_nodes.csv")

In [65]:
# Export the node list built from the edge list without whitespace-containing headwords.
all_words_series.to_csv("word_nodes.csv")