In [1]:
from preprocessing import (
    preprocess_kjv,
    get_word_types_with_tf_idf,
)
from extract_relations import (
    get_directed_relations,
    order_directed_relations,
)
from ontology_algorithm import (
    construct_ontology_hierarchy,
    print_hierarchy_tree_from_ontology,
)

## Preprocessing

We will preprocess words from the kjv Bible using tf-idf and extract the top n words.

In [14]:
kjv_path = "/Users/zebo/Documents/Freelancing/upwork/Peter_J_Worth_Jr/NLP/hierarchical_clustering/data/t_kjv.csv"
kjv_bible_df, genesis_df = preprocess_kjv(
    path_to_kjv=kjv_path,
    get_book="Genesis",
)

# Specify the number of chapters to use.
last_chapter = 50
# Specify the number of top words to use.
n = 50

selected_chapter_verses = []
for chapter_index in range(1, last_chapter + 1):
    chapter = genesis_df[genesis_df["chapter"] == chapter_index]
    chapter_verses = ""
    for verse in chapter["text"].values:
        chapter_verses = chapter_verses + " " + verse
        if "the LORD" in verse and "the LORD God" not in verse:
            # replace "the LORD" with "God"
            verse = verse.replace("the LORD", "God") #TODO change?
    selected_chapter_verses.append(chapter_verses)

tf_idf_pre_filtering = get_word_types_with_tf_idf(
    selected_chapter_verses,
    "tf_idf",
    skip_stopwords=True,
    include_verbs=False,
    include_determiners=False,
    include_pronouns=False,
    include_adverbs=False,
    include_numbers=False,
)

top_n_words = tf_idf_pre_filtering.head(n)["word"].values

Excluding words with the following word types: {'WP$', 'VB', 'CD', 'PRP$', 'WRB', 'RBR', 'WDT', 'DT', 'PRP', 'VBG', 'VBZ', 'RB', 'RBS', 'PDT', 'VBP', 'VBD', 'VBN', 'WP'}


# Extract relations from the corpus

In [15]:
# Create a list of all verses of the corpus.
all_verses = []
for chapter_index in range(1, last_chapter + 1):
    Chapter = genesis_df[genesis_df["chapter"] == chapter_index]
    chapter_verses = []
    for verse in Chapter["text"].values:
        if "the LORD" in verse and "the LORD God" not in verse:
            # replace "the LORD" with "God"
            verse = verse.replace("the LORD", "God") #TODO change?
        chapter_verses.append(verse)
    all_verses.extend(chapter_verses)

In [22]:
directed_relations = get_directed_relations(
    top_n_words=top_n_words,
    all_verses=all_verses,
    verbose=False,
)

In [23]:
ordered_directed_relations = order_directed_relations(
    directed_relations=directed_relations,
    tf_idf_pre_filtering=tf_idf_pre_filtering,
)

## Construct the ontology hierarchy

In [24]:
ontology_hierarchy, words_with_parents = construct_ontology_hierarchy(
    ordered_directed_relations=ordered_directed_relations,
)
print_hierarchy_tree_from_ontology(
    ontological_hierarchy=ontology_hierarchy,
    words_with_parents=words_with_parents,
)

god
├── isaac
│   ├── jacob
│   │   ├── pharaoh
│   │   ├── name
│   │   ├── house
│   │   ├── rachel
│   │   │   └── joseph
│   │   │       ├── servants
│   │   │       ├── egypt
│   │   │       ├── money
│   │   │       ├── cattle
│   │   │       ├── brethren
│   │   │       └── dream
│   │   ├── esau
│   │   ├── laban
│   │   │   └── leah
│   │   ├── years
│   │   ├── daughters
│   │   ├── father
│   │   ├── daughter
│   │   └── wife
│   └── rebekah
│       └── brother
├── servant
│   └── camels
├── covenant
├── sons
│   └── israel
├── cain
├── sodom
├── son
├── noah
│   └── ark
├── seed
├── sarah
├── earth
├── master
├── kind
├── city
├── abram
├── abraham
└── abimelech
