In [1]:
# Importing libraries
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

In [2]:
# The data will be taken directly from the gutenberg project's page
url = "https://www.gutenberg.org/files/10681/old/20040627-10681-h-body-pos.htm"

# GET request to the URL; the timeout (seconds) keeps the cell from hanging
# indefinitely if the server stops responding
page = requests.get(url, timeout=30)

# Stop here on an HTTP error status (4xx/5xx) instead of silently parsing an
# error page in the cells below
page.raise_for_status()

# Html layout
soup = BeautifulSoup(page.text, 'html.parser')

The script meticulously parses the entire webpage content, employing regular expressions to categorize words into distinct types. It first initializes storage structures for class names, section titles, and extracted data. As it traverses through each text element, it identifies:

  - Class Names (starting with 'CLASS')
  - Division Titles (containing 'DIVISION')
  - Section Titles (containing 'SECTION')
  - Number Headings (beginning with '#' followed by digits)
  - Ignored Elements (containing '--')
  - Data Entries (title-cased text)

In [3]:
# Accumulators. `data` receives one tuple per extracted entry.
# NOTE(review): `class_text` and `section_text` are created here but nothing in
# this cell ever appends to them.
class_text = []
section_text = []
data = []

# Find all text elements
text_elements = soup.find_all(string=True)

# Parser state: the most recent class / division / section heading and entry
# number seen so far. Each stays None until its first match, and none of them is
# reset when a higher-level heading changes (e.g. the division seen under one
# class is still attached to rows of the next class until a new division appears).
current_class = None
current_division = None
current_section = None
current_number = None

# Iterate through text elements. Only the first matching branch of the
# if/elif chain runs for a given text, so the order of the checks matters.
for text in text_elements:
    # Text starting with 'CLASS' becomes the current class heading
    if text.strip().startswith('CLASS'):
        current_class = text.strip().replace('\r\n', '')
    # Text containing 'DIVISION' becomes the current division heading
    elif 'DIVISION' in text.strip():
        current_division = text.strip().replace('\r\n', '')
    # Text containing 'SECTION' becomes the current section heading (dots removed)
    elif 'SECTION' in text.strip():
        current_section = text.strip().replace('\r\n', '').replace('.', '')
    # Text starting with '#<digits>' is an entry number; keep it without the '#'
    elif re.match(r'^#\d+', text.strip()):
        current_number = text.strip().replace('#', '').strip()
        # Remove '--' from current_number if present
        current_number = current_number.replace('--', '')
    # Any other text containing '--' clears the entry number, so title-cased
    # texts that follow are skipped until the next '#<digits>' text
    elif '--' in text.strip():
        current_number = None
    # Title-cased text (str.istitle) starts a data row, but only once a class,
    # a section and an entry number are all known; a division is not required
    elif text.strip().istitle() and current_class is not None and current_section is not None and current_number is not None:
        words = []
        words.append(text.strip())
        # Collect every following text node until the next one starting with
        # '#<digits>' (or until there are no more text nodes).
        # NOTE(review): the outer loop is not advanced past the nodes consumed
        # here, so a later title-cased node under the same number would start
        # another, overlapping row - confirm this is intended.
        next_element = text.find_next(string=True)
        while next_element is not None and not re.match(r'^#\d+', next_element.strip()):
            words.append(next_element.strip())
            next_element = next_element.find_next(string=True)
        words_combined = ' '.join(words)
        # Split at the first literal 'N.' in the combined text: what precedes it
        # is appended to the number (e.g. '1.' + ' ' + 'Existence'), what follows
        # it becomes the Words value
        if 'N.' in words_combined:
            number_index = words_combined.find('N.')
            relating_words = words_combined[number_index + 2:].strip()
            words_part = words_combined[:number_index].strip()
            data.append((current_class, current_division, current_section, current_number + " " + words_part, relating_words))
        else:
            # No 'N.' marker: keep the bare number and the whole combined text
            data.append((current_class, current_division, current_section, current_number, words_combined))

# Create a DataFrame with one row per collected tuple
df = pd.DataFrame(data, columns=['Class', 'Division', 'Section', 'Number', 'Words'])

# Strip surrounding whitespace from each 'Number' value, then remove a leading
# '--' and any trailing run of '--'
df['Number'] = df['Number'].apply(lambda x: re.sub(r'^--|(--)+$', '', x.strip()))

df.head()

Unnamed: 0,Class,Division,Section,Number,Words
0,CLASS I,,SECTION I,1. Existence,"existence,\r\nbeing,\r\nentity,\r\nens [Lat.] ..."
1,CLASS I,,SECTION I,2. Inexistence,"inexistence;\r\nnonexistence,\r\nnonsubsisten..."
2,CLASS I,,SECTION I,3. Substantiality,"substantiality,\r\nhypostasis;\r\nperson,\r\nb..."
3,CLASS I,,SECTION I,4. Unsubstantiality,"unsubstantiality,\r\ninsubstantiality;\r\nnot..."
4,CLASS I,,SECTION I,5. Intrinsicality,"intrinsicality,\r\ninbeing,\r\ninherence,\r\..."


Each entry stored in the `data` list is a tuple.
Each element of the tuple corresponds to a specific piece of information extracted from the webpage (class, division, section, number and words). <br>
The `class_text` and `section_text` lists are also initialised, but the class and section titles are carried inside each tuple rather than saved in those two lists.
<br>
Now, the 'Words' column obviously needs to be cleaned up.
The stopwords, along with the other symbols in the last column, need to be identified and removed, as they do not offer valuable information.
This will be done with the help of the nltk library.

In [4]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK 'stopwords' corpus (reported as up-to-date when it is already
# present); cleanup_text reads stopwords.words('english') from it
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/aris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Cleanup function
def cleanup_text(text, stop_words=None):
    """Reduce a raw entry to a comma-separated string of meaningful words.

    Every character that is neither a word character nor whitespace is removed,
    the text is lower-cased and split on whitespace. A token is then dropped
    when it is a single character, consists only of digits, is a stop word, or
    is one of the markers in ``ignored_tokens`` below.

    Parameters
    ----------
    text : str
        Raw entry text.
    stop_words : iterable of str, optional
        Words to discard. When omitted, NLTK's English stop-word list is used;
        it is loaded on the first call and reused afterwards, so applying this
        function over a whole column does not re-read the corpus for every row.

    Returns
    -------
    str
        The surviving tokens joined with ``', '`` (an empty string if none).
    """
    if stop_words is None:
        # Cache the default list on the function object: building it per call
        # is loop-invariant work when the function is applied row by row
        cached = getattr(cleanup_text, '_english_stop_words', None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            cleanup_text._english_stop_words = cached
        stop_words = cached
    else:
        # Accept any iterable and get constant-time membership tests
        stop_words = frozenset(stop_words)

    # Markers that carry no meaning as words
    ignored_tokens = {'adj', 'adv', 'non', 'phr', 'ens', 'lat'}

    # Remove special characters and symbols, lowercase, then tokenize
    tokens = re.sub(r'[^\w\s]', '', text).lower().split()

    # Remove single-letter words, numbers, stop words and ignored markers
    kept = [
        token for token in tokens
        if len(token) > 1
        and not token.isdigit()
        and token not in stop_words
        and token not in ignored_tokens
    ]

    # Combine the filtered words into a string
    return ', '.join(kept)

# Clean every entry of the 'Words' column with cleanup_text
df['Words'] = df['Words'].apply(cleanup_text)
df.head()

Unnamed: 0,Class,Division,Section,Number,Words
0,CLASS I,,SECTION I,1. Existence,"existence, entity, esse, subsistence, reality,..."
1,CLASS I,,SECTION I,2. Inexistence,"inexistence, nonexistence, nonsubsistence, non..."
2,CLASS I,,SECTION I,3. Substantiality,"substantiality, hypostasis, person, thing, obj..."
3,CLASS I,,SECTION I,4. Unsubstantiality,"unsubstantiality, insubstantiality, nothingnes..."
4,CLASS I,,SECTION I,5. Intrinsicality,"intrinsicality, inbeing, inherence, inhesion, ..."


<h3> Now the Words column holds all the words that describe each dictionary entry, separated by commas </h3>

In [6]:
#df.to_csv("All_data.csv", index=False) # File has already been written to this directory

Later in the project, it turned out that a hierarchy needs to be specified,
<br> as the dictionary is divided not only into classes and sections but also into divisions, in a particular order.
<p>To solve this, we will create another csv with two levels of hierarchy:</p>

    -The first level is the class, as it is now
    -The second level is either the division or the section, whichever comes first in the hierarchy

In [7]:
# Work out the two hierarchy levels (level2, level3) for a single row
def assign_levels(row):
    """Return the ``(level2, level3)`` pair for one row.

    When the row has a division, the division is level2 and the section is
    level3. When the division is missing, the section moves up to level2 and
    level3 is None.
    """
    division = row['Division']
    section = row['Section']
    if pd.isna(division):
        return section, None
    return division, section

# Compute both hierarchy levels row by row, then attach them as two new columns
levels = df.apply(assign_levels, axis=1, result_type='expand')
df[['level2', 'level3']] = levels

# 'Categories' is the class label immediately followed by the level2 label
df['Categories'] = df['Class'] + df['level2']

# Dropping 'Division' and 'Section' is left disabled here
#df.drop(['Division', 'Section'], axis=1, inplace=True)

df.head(10)

Unnamed: 0,Class,Division,Section,Number,Words,level2,level3,Categories
0,CLASS I,,SECTION I,1. Existence,"existence, entity, esse, subsistence, reality,...",SECTION I,,CLASS ISECTION I
1,CLASS I,,SECTION I,2. Inexistence,"inexistence, nonexistence, nonsubsistence, non...",SECTION I,,CLASS ISECTION I
2,CLASS I,,SECTION I,3. Substantiality,"substantiality, hypostasis, person, thing, obj...",SECTION I,,CLASS ISECTION I
3,CLASS I,,SECTION I,4. Unsubstantiality,"unsubstantiality, insubstantiality, nothingnes...",SECTION I,,CLASS ISECTION I
4,CLASS I,,SECTION I,5. Intrinsicality,"intrinsicality, inbeing, inherence, inhesion, ...",SECTION I,,CLASS ISECTION I
5,CLASS I,,SECTION I,6. Extrinsicality,"extrinsicality, objectiveness, ego, extraneous...",SECTION I,,CLASS ISECTION I
6,CLASS I,,SECTION I,7. State,"state, condition, category, estate, lot, ease,...",SECTION I,,CLASS ISECTION I
7,CLASS I,,SECTION I,8. Circumstance,"circumstance, situation, phase, position, post...",SECTION I,,CLASS ISECTION I
8,CLASS I,,SECTION II,9. Relation,"relation, bearing, reference, connection, conc...",SECTION II,,CLASS ISECTION II
9,CLASS I,,SECTION II,10. Irrelation,"irrelation, dissociation, misrelation, inappli...",SECTION II,,CLASS ISECTION II


In [8]:
# Let's organise it and save it in another file

# Columns to keep, in output order, with 'Words' at the end; every other
# column of df is left out by this selection
hierarchy_columns = ['Class', 'level2', 'Categories', 'Number', 'Words']
df = df[hierarchy_columns]

df.head()

Unnamed: 0,Class,level2,Categories,Number,Words
0,CLASS I,SECTION I,CLASS ISECTION I,1. Existence,"existence, entity, esse, subsistence, reality,..."
1,CLASS I,SECTION I,CLASS ISECTION I,2. Inexistence,"inexistence, nonexistence, nonsubsistence, non..."
2,CLASS I,SECTION I,CLASS ISECTION I,3. Substantiality,"substantiality, hypostasis, person, thing, obj..."
3,CLASS I,SECTION I,CLASS ISECTION I,4. Unsubstantiality,"unsubstantiality, insubstantiality, nothingnes..."
4,CLASS I,SECTION I,CLASS ISECTION I,5. Intrinsicality,"intrinsicality, inbeing, inherence, inhesion, ..."


In [9]:
#df.to_csv("hierarchy.csv",index=False)

There is also a `Categories` column, which combines the first two levels of the hierarchy and will be used in the analysis to define the centroids.