In [1]:
from preprocessing import (
    preprocess_kjv,
    get_word_types_with_tf_idf,
    get_gospel_top_70_words_dictionary,
)
from extract_relations import (
    get_directed_relations,
    order_directed_relations,
)
from ontology_algorithm import (
    construct_ontology_hierarchy,
    print_hierarchy_tree_from_ontology,
)

## Preprocessing

We will preprocess words from the kjv Bible using tf-idf and extract the top n words.

In [2]:
a='Beautiful, is; better*than\nugly'
import re
re.split('; |, |\*|\n',a)

['Beautiful', 'is', 'better', 'than', 'ugly']

In [3]:
theology_reconsidered_path = "/Users/zebo/Documents/Freelancing/upwork/Peter_J_Worth_Jr/NLP/hierarchical_clustering/data/theology_reconsidered.txt"

with open(theology_reconsidered_path, "r") as f:
    theology_reconsidered = f.read()

def split_into_chapters(input_filename, verbose=False):
    """Split the input file into chapters."""
    titles = []
    chapters = []
    # Open the input file and read its contents
    with open(input_filename, 'r', encoding='utf-8') as input_file:
        contents = input_file.read()

    # Split the contents into chunks based on the separator criteria
    chunks = contents.split('\n\n\n\n')             # empty lines in the text

    ch_cnt = 0

    # Create an output file for each chunk
    for i, chunk in enumerate(chunks):
        # Skip empty chunks
        if not chunk.strip():
            continue

        # Extract the title and content from the chunk
        lines = chunk.strip().split('\n')
        title = lines[0]
        content = '\n'.join(lines[1:])

        if verbose:
            print("processing CH" + str(ch_cnt) + ": " + title + "...")

        # Create the output file
        chapters.append(content)
        titles.append(title)

        ch_cnt += 1
    return titles, chapters

titles, chapters = split_into_chapters(theology_reconsidered_path)

chapters[0]

# Should I remove the word Figure?

# Remove \n from the chapters.
chapters = [chapter.replace("\n", "") for chapter in chapters]
# Remove \t from the chapters.
chapters = [chapter.replace("\t", " ") for chapter in chapters]


# import re

# Separate the chapters into sentences.
sentences_per_chapter_prep = [re.split("\.  ", chapter) for chapter in chapters]
sentences_per_chapter_prep = [[sentence + "." for sentence in chapter] for chapter in sentences_per_chapter_prep]

sentences_per_chapter = []
for chapter_prep in sentences_per_chapter_prep:
    chapter = []
    for sentence in chapter_prep:
        chapter.extend(re.split("(?<!i.e|etc|e.g|iii)\. ", sentence))
    sentences_per_chapter.append(chapter)

#sentences_per_chapter[0]
for title_index, title in enumerate(titles):
    print(title_index, title)

0 Introduction
1 Prologue: Mythos and Mysticism in Antiquity
2 A Brief History of the Mystical Arts: Beyond Yoga
3 Meditation as a Mystical Art: A Bridge Through Time 
4 Overarching Themes: The Laurasian Hypothesis and a New Metaphysics
5 Setting the Stage: A Brief of History of Modern Man
6 Part I: On Creation Mythos (Cosmogony)
7 From Language to Writing: The Dawn of History
8 The Ancient Hebrews: The Tanakh, Torah and Five Books of Moses
9 Ancient Egyptian Mythos: The Weighing of the Heart, Ra and Ma'at
10 The Enûma Eliš: Sumer- Babylonian Creation Mythos
11 Ancient Persian Theology: Zarathustra and the Avesta
12 Classic Hellenic Theogony: Chaos, Chronos and Eros
13 Orphic Theogony: Thanes and the Great Cosmic Egg
14 Vedic Cosmogony: Skepticism, Puruṣa and Hiraṇyagarbha 
15 Ancient Chinese Theology: Shàngdì, Pángǔ, Tiān and the Dao
16 Roman Cosmogony: The Metamorphoses of Ovid
17 Eurasian Mythos: Establishing the Laurasian Hypothesis 
18 Part II: On Ancient Philosophy (Logos)
19 Fro

In [4]:
part_indices = [0]
part_indices.extend([index for index, title in enumerate(titles) if title[:4]=="Part"])
part_indices.append(len(titles))
part_indices

[0, 6, 18, 31, 43, 52, 58]

In [5]:
# Specify the chapters used in the analysis.
part = 1
chosen_chapters = range(part_indices[part-1], part_indices[part], 1)
print("Chosen chapters ", chosen_chapters)

# Specify the number of top words to use.
n = 50

text_per_chapter = []
for chapter_idx in chosen_chapters:
    text_per_chapter.append(chapters[chapter_idx])

tf_idf_pre_filtering = get_word_types_with_tf_idf(
    text_per_chapter,
    "tf",
    skip_stopwords=True,
    include_verbs=False,
    include_determiners=False,
    include_pronouns=False,
    include_adverbs=False,
    include_numbers=False,
)

top_n_words = tf_idf_pre_filtering.head(n)["word"].values

top_n_words

Chosen chapters  range(52, 58)


array(['intellectual', 'metaphysics', 'philosophical', 'philosophy',
       'reality', 'metaphysical', 'awareness', 'quality', 'knowledge',
       'ontological', 'experience', 'least', 'order', 'western', 'upon',
       'notion', 'within', 'pirsig', 'fact', 'principle', 'framework',
       'existence', 'sense', 'quantum', 'kant', 'world', 'perspective',
       'science', 'eastern', 'west', 'rational', 'epistemological', 'way',
       'time', 'course', 'example', 'mind', 'mechanics', 'ground',
       'level', 'higher', 'aristotle', 'system', 'modern', 'fundamental',
       'mystical', 'work', 'cognitive', 'paradigm', 'states'],
      dtype=object)

In [6]:
tf_idf_pre_filtering.head(n)

Unnamed: 0,word,word_type,tc,tf,dc,idf,tf_idf
0,intellectual,"{'JJ': 228, 'NN': 3}",234,0.00547,5,0.182322,0.000997
1,metaphysics,"{'NNS': 142, 'NNPS': 10, 'ORGANIZATION': 60, '...",166,0.003881,5,0.182322,0.000708
2,philosophical,"{'JJ': 88, 'NN': 3, 'NNP': 2, 'GSP': 1}",150,0.003507,5,0.182322,0.000639
3,philosophy,"{'NN': 108, 'NNP': 32, 'PERSON': 11, 'GPE': 13...",145,0.00339,5,0.182322,0.000618
4,reality,{'NN': 138},138,0.003226,5,0.182322,0.000588
7,metaphysical,"{'JJ': 117, 'NN': 8}",127,0.002969,5,0.182322,0.000541
8,awareness,"{'NN': 17, 'JJ': 1, 'NNP': 100, 'PERSON': 14, ...",124,0.002899,3,0.693147,0.002009
10,quality,"{'NNP': 93, 'GPE': 25, 'ORGANIZATION': 35, 'PE...",119,0.002782,5,0.182322,0.000507
11,knowledge,"{'NN': 107, 'NNP': 1, 'VB': 2, 'VBP': 2, 'JJ': 1}",116,0.002712,5,0.182322,0.000494
12,ontological,"{'JJ': 109, 'NNP': 1, 'ORGANIZATION': 1}",111,0.002595,5,0.182322,0.000473


In [7]:
# print the row with word "cross"
tf_idf_pre_filtering[tf_idf_pre_filtering["word"] == "kant"]

Unnamed: 0,word,word_type,tc,tf,dc,idf,tf_idf
29,kant,"{'NNP': 75, 'ORGANIZATION': 34, 'PERSON': 30, ...",79,0.001847,3,0.693147,0.00128


# Extract relations from the corpus

In [8]:
# Create a list of all verses of the chosen books.
all_verses = []
for chapter_idx in chosen_chapters:
    all_verses.extend(sentences_per_chapter[chapter_idx])

In [9]:
import spacy

nlp = spacy.load("en_core_web_lg")
doc_1 = nlp(all_verses[0])
spacy.displacy.render(doc_1, style="dep", jupyter=True)

In [10]:
directed_relations = get_directed_relations(
    top_n_words=top_n_words,
    all_verses=all_verses,
    verbose=True,
)


 1  sentences in verse  0
sentence:  .
ents:  []

 1  sentences in verse  1
sentence:  From an ontological perspective, a term that was coined only in the last century or two to denote a specific branch of philosophy related to being, or reality itself, in deep antiquity our ancestors simply had myth.
ents:  [the last century, two]
subject:  that
object:  century
subject:  term
subject:  ancestors
object:  perspective
object:  antiquity
object:  myth

 1  sentences in verse  2
sentence:  Various tales and stories that were handed down from generation to generation, that spoke of topics such as the creation of the world and mankind, stories of great valor and love, and destruction stories too no doubt.
ents:  []
subject:  that
object:  generation
object:  generation
subject:  tales
subject:  that
object:  topics
object:  creation

 1  sentences in verse  3
sentence:  These myths, these tales or narratives - what we refer to throughout collectively as a people's mythos, were the means b

In [11]:
ordered_directed_relations = order_directed_relations(
    directed_relations=directed_relations,
    tf_idf_pre_filtering=tf_idf_pre_filtering,
    order_by="product",
    include_ordering_wrt_occurences=True,
    verbose=True,
)
ordered_directed_relations

                     relation  occurances
0           (pirsig, quality)           7
1       (pirsig, metaphysics)           5
2        (metaphysics, sense)           4
3            (pirsig, notion)           3
4         (awareness, ground)           3
5         (pirsig, framework)           2
6    (awareness, perspective)           2
7      (awareness, framework)           2
8          (awareness, level)           2
9        (quality, awareness)           2
10             (quality, way)           2
11        (philosophy, sense)           2
12          (knowledge, time)           2
13        (system, framework)           2
14         (metaphysics, way)           1
15        (metaphysics, fact)           1
16  (metaphysics, philosophy)           1
17      (metaphysics, notion)           1
18   (metaphysics, framework)           1
19   (metaphysics, principle)           1
20   (metaphysics, awareness)           1
21       (metaphysics, order)           1
22     (metaphysics, reality)     

[('pirsig', 'quality'),
 ('pirsig', 'metaphysics'),
 ('metaphysics', 'sense'),
 ('pirsig', 'notion'),
 ('awareness', 'ground'),
 ('pirsig', 'framework'),
 ('awareness', 'perspective'),
 ('awareness', 'framework'),
 ('awareness', 'level'),
 ('quality', 'awareness'),
 ('quality', 'way'),
 ('philosophy', 'sense'),
 ('knowledge', 'time'),
 ('system', 'framework'),
 ('metaphysics', 'way'),
 ('metaphysics', 'fact'),
 ('metaphysics', 'philosophy'),
 ('metaphysics', 'notion'),
 ('metaphysics', 'framework'),
 ('metaphysics', 'principle'),
 ('metaphysics', 'awareness'),
 ('metaphysics', 'order'),
 ('metaphysics', 'reality'),
 ('pirsig', 'experience'),
 ('pirsig', 'fact'),
 ('pirsig', 'work'),
 ('awareness', 'principle'),
 ('awareness', 'way'),
 ('awareness', 'philosophy'),
 ('quality', 'time'),
 ('quality', 'example'),
 ('quality', 'order'),
 ('quality', 'ground'),
 ('quality', 'existence'),
 ('quality', 'experience'),
 ('quality', 'philosophy'),
 ('notion', 'mechanics'),
 ('notion', 'level'),
 

## Construct the ontology hierarchy

In [12]:
ontology_hierarchy, words_with_parents = construct_ontology_hierarchy(
    ordered_directed_relations=ordered_directed_relations,
)
print_hierarchy_tree_from_ontology(
    ontological_hierarchy=ontology_hierarchy,
    words_with_parents=words_with_parents,
)

pirsig
├── quality
│   ├── awareness
│   │   ├── ground
│   │   ├── perspective
│   │   └── level
│   ├── way
│   │   └── kant
│   │       ├── course
│   │       ├── mind
│   │       └── knowledge
│   │           └── time
│   ├── example
│   └── existence
├── metaphysics
│   ├── sense
│   ├── fact
│   ├── philosophy
│   ├── principle
│   ├── order
│   └── reality
├── notion
│   └── mechanics
├── framework
├── experience
└── work
aristotle
├── science
│   └── world
└── system
paradigm
└── states


## Run for all parts of the text

In [13]:
# Specify the chapters used in the analysis.
for part in range(1, 7):
    print("Part: ", part-1)
    chosen_chapters = range(part_indices[part-1], part_indices[part], 1)
    print("Chosen chapters ", chosen_chapters)
    print("")

    # Specify the number of top words to use.
    n = 80

    text_per_chapter = []
    for chapter_idx in chosen_chapters:
        text_per_chapter.append(chapters[chapter_idx])

    tf_idf_pre_filtering = get_word_types_with_tf_idf(
        text_per_chapter,
        "tf",
        skip_stopwords=True,
        include_verbs=False,
        include_determiners=False,
        include_pronouns=False,
        include_adverbs=False,
        include_numbers=False,
    )

    top_n_words = tf_idf_pre_filtering.head(n)["word"].values

    # Create a list of all verses of the chosen books.
    all_verses = []
    for chapter_idx in chosen_chapters:
        all_verses.extend(sentences_per_chapter[chapter_idx])
    directed_relations = get_directed_relations(
        top_n_words=top_n_words,
        all_verses=all_verses,
        verbose=False,
    )
    ordered_directed_relations = order_directed_relations(
        directed_relations=directed_relations,
        tf_idf_pre_filtering=tf_idf_pre_filtering,
        order_by="product",
        include_ordering_wrt_occurences=True,
        verbose=False,
    )
    ontology_hierarchy, words_with_parents = construct_ontology_hierarchy(
        ordered_directed_relations=ordered_directed_relations,
    )
    print_hierarchy_tree_from_ontology(
        ontological_hierarchy=ontology_hierarchy,
        words_with_parents=words_with_parents,
    )

Part:  0
Chosen chapters  range(0, 6)

meditation
├── context
├── tradition
├── course
├── sense
├── work
│   ├── perspective
│   ├── antiquity
│   ├── traditions
│   └── author
│       └── era
└── time
man
├── years
├── eurasia
└── term
    ├── definition
    └── history
yoga
├── part
│   └── notion
├── west
└── forms
many
├── transmission
│   ├── form
│   └── knowledge
│       └── reality
└── peoples
    └── nature
systems
├── existence
└── turn
system
└── fact
mysticism
└── order
hypothesis
└── philosophy
practices
└── east
Part:  1
Chosen chapters  range(6, 18)

ovid
├── creation
├── narrative
├── bce
│   └── word
├── east
├── work
│   └── mythology
│       └── universe
│           ├── narratives
│           ├── turn
│           ├── deities
│           ├── mythos
│           ├── life
│           ├── order
│           └── day
└── philosophy
god
├── heaven
├── earth
│   ├── generation
│   ├── principles
│   └── man
├── world
│   └── society
│       └── time
└── people
    ├── traditi