In [1]:
import requests
from bs4 import BeautifulSoup, Comment
import re
import pandas as pd
import os
import json

## Data Collection

### Data Cleaning - helper function

In [6]:
def clean_text(text):
    """Clean the extracted text by removing extra spaces, unnecessary citations, and handling links properly."""
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)\s+\1', r'[\1](\2)', text)
    text = re.sub(r'\[\]', '', text)
    text = re.sub(r'\(https://en.wikipedia.org#cite[^\)]+\)', '', text)
    text = re.sub(r'\(https://en.wikipedia.org/wiki/Wikipedia:Please_clarify\)', '', text)
    return text


## Data Labelling - helper

For wikilinks within a wikipage, we want to capture these titles and paths to update to our wikipages.csv

In [7]:
def label_linked_entities(element):
    """Extract text from an HTML element, preserving links as Markdown."""
    result = ''
    for child in element.descendants:
        if child.name == 'a':
            if 'reference' in child.get('class', []) or 'citation' in child.get('class', []):
                continue
            
            href = child.get('href')
            if href and href.startswith('/wiki/'):
                link_text = child.get_text()
                result += f"[{link_text}](https://en.wikipedia.org{href}) "
        elif child.name == 'sup':
            continue
        elif isinstance(child, str):
            result += child + ' '

    patterns = [
        r'(\[\[.*)(\]?)(\(https:\/\/en\.wikipedia\.org\/wiki\/Wikipedia\:)(\w{3,25}\)\])(?=(\s|\.|,|$))',
        r'(\[\[.*)(\]?)(\(https:\/\/en\.wikipedia\.org\/wiki\/Wikipedia\:)(Manual_of_Style\/)?(Words_to_watch|Dates_and_numbers|Citing_sources)(\#)(\w{3,25}\)\])(?=(\s|\.|,|$))',
    ]
    for pattern in patterns:
        result = re.sub(pattern, '', result)
    
    return result.strip()

### Data Extraction
The following function pulls data from our wikipages, splits it by section and sub-section using h2 and h3 headers. We then use our helper functions to clean the content (within each section) and label any identified wiki titles and paths for our wikipages. 

In [9]:
def extract_wikipedia_contents(wiki_path, ontology_term, kb_lookup):
    """
    Extract and process content from a Wikipedia page, updating the knowledge base lookup.

    Args:
    wiki_path (str): The Wikipedia page path.
    ontology_term (str): The ontology term to associate with the Wikipedia content.
    kb_lookup (dict): The existing knowledge base lookup dictionary.

    Returns:
    tuple: (Updated kb_lookup, DataFrame of extracted content, Status message)
    """
    wiki_path = wiki_path.strip().lower()
    ontology_term = ontology_term.strip().lower()
    
    url = f"https://en.wikipedia.org/wiki/{wiki_path}"
    response = requests.get(url)
    if response.status_code != 200:
        return kb_lookup, pd.DataFrame(), f"Error: Unable to fetch the page '{wiki_path}'. Status code: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.find('h1', id='firstHeading').text.strip().lower()
    
    is_redirect = False
    redirect_span = soup.find('span', class_='mw-redirectedfrom')
    if redirect_span:
        canonical_link = soup.find('link', rel='canonical')
        wiki_path = canonical_link['href'].split("/wiki/")[-1].strip().lower()
        is_redirect = True
    
    if wiki_path in kb_lookup["wiki_paths"]:
        if ontology_term in kb_lookup["wiki_paths"][wiki_path]["ontology_terms"]:
            return kb_lookup, pd.DataFrame(), f"Error: Kb_lookup Entry for ontology_term '{ontology_term}' already exists in wiki_path '{wiki_path}'."
        else:
            kb_lookup["wiki_paths"][wiki_path]["ontology_terms"].append({"term": ontology_term, "redirect": is_redirect})
            return kb_lookup, pd.DataFrame(), f"Success: Kb_lookup Entry for ontology_term '{ontology_term}' has been updated under wiki_path '{wiki_path}'."
    else:
        kb_lookup["wiki_paths"][wiki_path] = {"wiki_title": title, "ontology_terms": [{"term": ontology_term, "redirect": is_redirect}]}
    
    content_div = soup.find('div', class_='mw-parser-output')
    
    for tag in content_div(['script', 'style', 'svg']):
        tag.decompose()
    
    for comment in content_div.find_all(string=lambda string: isinstance(string, Comment)): 
        comment.extract()
       
    for sup in content_div.find_all('sup'):
        sup.decompose()
    
    sections = []
    current_h2 = "Introduction"
    current_h3 = None
    current_content = ""

    for element in content_div.find_all(['h2', 'h3', 'p', 'ul', 'ol']):
        if current_h2 == "Introduction":
            if element.name == 'h2':
                if current_content.strip():
                    sections.append({
                        'wiki_path': wiki_path,
                        'title': title,
                        'h2': current_h2,
                        'h3': current_h3 if current_h3 else '',
                        'content': clean_text(current_content)
                    })
                current_h2 = element.get_text(strip=True)
                current_h3 = None
                current_content = ""
            elif element.name == 'p':
                current_content += label_linked_entities(element) + " "
        elif element.name == 'h2':
            h2_text = element.get_text(strip=True)
            if h2_text == 'References':
                break
            if current_content.strip():
                sections.append({
                    'wiki_path': wiki_path,
                    'title': title,
                    'h2': current_h2,
                    'h3': current_h3 if current_h3 else '',
                    'content': clean_text(current_content)
                })
            current_h2 = h2_text
            current_h3 = None
            current_content = ""
        elif element.name == 'h3':
            if current_h3 and current_content.strip():
                sections.append({
                    'wiki_path': wiki_path,
                    'title': title,
                    'h2': current_h2,
                    'h3': current_h3,
                    'content': clean_text(current_content)
                })
            current_h3 = element.get_text(strip=True)
            current_content = ""
        elif element.name in ['p', 'ul', 'ol']:
            current_content += label_linked_entities(element) + " "

    if current_content.strip():
        sections.append({
            'wiki_path': wiki_path,
            'title': title,
            'h2': current_h2,
            'h3': current_h3 if current_h3 else '',
            'content': clean_text(current_content)
        })

    if sections:
        return kb_lookup, pd.DataFrame(sections), f"Success: New Kb_lookup Entry added for ontology_term '{ontology_term}' and wiki_path '{wiki_path}'."
    else:
        return kb_lookup, pd.DataFrame([{
            'wiki_path': wiki_path,
            'title': title,
            'h2': 'No sections',
            'h3': '',
            'content': 'No content found or all content filtered out.'
        }]), f"Warning: New Kb_lookup Entry added for ontology_term '{ontology_term}' and wiki_path '{wiki_path}', but no content was found."


In [11]:
ontology_links = '../ontology_links.csv'
ontology_links_df = pd.read_csv(ontology_links)

In [17]:
ontology_links_df = ontology_links_df.rename(columns={'taxonomy_term': 'ontology_term'})
ontology_links_wiki = ontology_links_df.dropna(subset=['wiki_title', 'wiki_path'])
ontology_links_wiki.head()

Unnamed: 0,ontology_term,term_regex,wiki_title,wiki_path
1,Cluster Analysis,cluster(\s*|\-)?analysi,cluster analysis,cluster_analysis
4,Sigmoid,sigmoid,sigmoid,sigmoid_function
5,Generative Adversarial Network,generative(\s*|\-)?adversarial(\s*|\-)?network,generative adversarial network,generative_adversarial_network
6,Least-Angle Regression,least(\s*|\-)?angle(\s*|\-)?regression,least-angle regression,least-angle_regression
8,Part-Of-Speech Tagging,part(\s*|\-)?of(\s*|\-)?speech(\s*|\-)?tagging,part-of-speech tagging,part-of-speech_tagging


In [None]:
wiki_content_list = []
kb_lookup = {"wiki_paths": {}, "ontology_terms": {}}

for index, row in ontology_links_wiki.iterrows():
    wiki_path = row['wiki_path']
    ontology_term = row['ontology_term']
    kb_lookup = kb_lookup
    try:
        kb_lookup, df, status = extract_wikipedia_contents(wiki_path, ontology_term, kb_lookup)
        
        if status.startswith("Success: New Kb_lookup Entry added"):
            wiki_content_list.append(df)
            print(f"{status}...and successfully added new content to knowledge base")
        else:
            print(status)
    except Exception as e:
        print(f"Error processing {wiki_path}: {str(e)}")



Success: New Kb_lookup Entry added for ontology_term 'cluster analysis' and wiki_path 'cluster_analysis'....and successfully added new content to knowledge base
Success: New Kb_lookup Entry added for ontology_term 'sigmoid' and wiki_path 'sigmoid_function'....and successfully added new content to knowledge base
Success: New Kb_lookup Entry added for ontology_term 'generative adversarial network' and wiki_path 'generative_adversarial_network'....and successfully added new content to knowledge base
Success: New Kb_lookup Entry added for ontology_term 'least-angle regression' and wiki_path 'least-angle_regression'....and successfully added new content to knowledge base
Success: New Kb_lookup Entry added for ontology_term 'part-of-speech tagging' and wiki_path 'part-of-speech_tagging'....and successfully added new content to knowledge base
Success: New Kb_lookup Entry added for ontology_term 'false positive' and wiki_path 'false_positives_and_false_negatives'....and successfully added new 

In [23]:
ontology_terms_available = len(ontology_links_wiki)
ontology_terms_added = sum(len(entry["ontology_terms"]) for entry in kb_lookup["wiki_paths"].values())
ontology_terms_not_added = ontology_terms_available - ontology_terms_added

print(f"Total number of ontology terms to link: {ontology_terms_available}")
print(f"Total number of ontology terms linked: {ontology_terms_added}")
print(f"Total number of ontology terms with errors: {ontology_terms_not_added}")


Total number of ontology terms to link: 284
Total number of ontology terms linked: 268
Total number of ontology terms with errors: 16


## Save files

In [22]:

if wiki_content_list:
    wiki_content = pd.concat(wiki_content_list, ignore_index=True)
    
    print(f"Number of rows in wiki_content: {len(wiki_content)}")
    
    relative_path = '../../siads_capstone/'
    
    absolute_path = os.path.abspath(relative_path)
    
    file_name = "knowledge_base.csv"
    
    file_path = os.path.join(absolute_path, file_name)

    wiki_content.to_csv(file_path, index=False)
    print(f"wiki_content has been saved as '{file_name}' in the following directory:")
    print(absolute_path)
else:
    print("No valid DataFrames were generated. wiki_content.csv was not created.")


Number of rows in wiki_content: 3110
wiki_content has been saved as 'knowledge_base.csv' in the following directory:
/home/sagemaker-user/siads_capstone


In [24]:
kb_lookup

{'wiki_paths': {'cluster_analysis': {'wiki_title': 'cluster analysis',
   'ontology_terms': [{'term': 'cluster analysis', 'redirect': False}]},
  'sigmoid_function': {'wiki_title': 'sigmoid function',
   'ontology_terms': [{'term': 'sigmoid', 'redirect': False}]},
  'generative_adversarial_network': {'wiki_title': 'generative adversarial network',
   'ontology_terms': [{'term': 'generative adversarial network',
     'redirect': False}]},
  'least-angle_regression': {'wiki_title': 'least-angle regression',
   'ontology_terms': [{'term': 'least-angle regression', 'redirect': False},
    {'term': 'least angle regression', 'redirect': False}]},
  'part-of-speech_tagging': {'wiki_title': 'part-of-speech tagging',
   'ontology_terms': [{'term': 'part-of-speech tagging', 'redirect': False}]},
  'false_positives_and_false_negatives': {'wiki_title': 'false positives and false negatives',
   'ontology_terms': [{'term': 'false positive', 'redirect': True}]},
  'mixture_model#gaussian_mixture_mode

## Save files

In [25]:
if kb_lookup:

    relative_path = '../../siads_capstone/'
    
    absolute_path = os.path.abspath(relative_path)

    file_name = "kb_lookup.json"
    
    file_path = os.path.join(absolute_path, file_name)

    with open(file_path, 'w') as f:
        json.dump(kb_lookup, f, indent=4)
    
    print(f"kb_lookup has been saved as '{file_name}' in the following directory:")
    print(absolute_path)
    
    print(f"Number of wiki_paths in kb_lookup: {len(kb_lookup['wiki_paths'])}")
    print(f"Number of ontology_terms in kb_lookup: {len(kb_lookup['ontology_terms'])}")
else:
    print("kb_lookup is empty. No file was created.")

kb_lookup has been saved as 'kb_lookup.json' in the following directory:
/home/sagemaker-user/siads_capstone
Number of wiki_paths in kb_lookup: 258
Number of ontology_terms in kb_lookup: 0


In [28]:
file_path = '../../siads_capstone/kb_lookup.json'

with open(file_path, 'r') as f:
    kb_lookup = json.load(f)

# Check entries
random_keys = random.sample(list(kb_lookup['wiki_paths'].keys()), 5)

for i, key in enumerate(random_keys, 1):
    value = kb_lookup['wiki_paths'][key]
    print(f"\n{i}. Wiki path: {key}")
    print(f"   Title: {value['wiki_title']}")
    print("   Ontology terms:")
    for term in value['ontology_terms']:
        print(f"     - {term['term']} (Redirect: {term['redirect']})")


1. Wiki path: binary_decoder
   Title: binary decoder
   Ontology terms:
     - decoder (Redirect: False)

2. Wiki path: bleu
   Title: bleu
   Ontology terms:
     - bleu score (Redirect: False)

3. Wiki path: newton%27s_method
   Title: newton's method
   Ontology terms:
     - newton's method (Redirect: False)

4. Wiki path: data_cleansing
   Title: data cleansing
   Ontology terms:
     - data cleaning (Redirect: True)

5. Wiki path: hellinger_distance
   Title: hellinger distance
   Ontology terms:
     - hellinger distance (Redirect: False)


In [29]:
for wiki_path, data in kb_lookup['wiki_paths'].items():
    if len(data['ontology_terms']) > 1:
        print(f"Wiki Path: {wiki_path}")
        print(f"Wiki Title: {data['wiki_title']}")
        print("Ontology Terms:")
        for term in data['ontology_terms']:
            print(f"  - {term['term']} (Redirect: {term['redirect']})")
        print("-" * 50)

Wiki Path: least-angle_regression
Wiki Title: least-angle regression
Ontology Terms:
  - least-angle regression (Redirect: False)
  - least angle regression (Redirect: False)
--------------------------------------------------
Wiki Path: word_embedding
Wiki Title: word embedding
Ontology Terms:
  - embeddings (Redirect: False)
  - word embeddings (Redirect: False)
--------------------------------------------------
Wiki Path: diffusion_model
Wiki Title: diffusion model
Ontology Terms:
  - latent diffusion models (Redirect: False)
  - diffusion models (Redirect: False)
--------------------------------------------------
Wiki Path: cross-validation_(statistics)
Wiki Title: cross-validation (statistics)
Ontology Terms:
  - cross-validation (Redirect: False)
  - k-fold (Redirect: False)
--------------------------------------------------
Wiki Path: bayesian_network
Wiki Title: bayesian network
Ontology Terms:
  - bayesian network (Redirect: False)
  - bayesian belief network (Redirect: True)
-