In [44]:
import requests
from bs4 import BeautifulSoup, Comment
import re
import pandas as pd
import numpy as np
import os

# Ontology
Ontology consists of our Taxonomy, entity labels, synonyms, initialisms/acroynms, and regular expressions for matching.

### Challenges in building an Ontology of Terms

Global Issues

An Ontology term can appear in an article multiple forms:
1. Terms may be abbreviated i.e. acronyms and initialisms
2. Terms may be known by other names i.e. synonyms
3. Terms may be generalised in nature
4. Terms can change within the article i.e. term becomes initialised, singular v. plural

## Unique terms

In [24]:
uniqueterms_path = '../unique_terms.csv'
uniqueterms_df = pd.read_csv(uniqueterms_path)

In [25]:
uniqueterms_df.head()

Unnamed: 0,taxonomy_term,term_regex
0,Trainable Layers,trainable(\s*|\-)?layer
1,Cluster Analysis,cluster(\s*|\-)?analysi
2,Temperature,temperature
3,One-Sample,one(\s*|\-)?sample
4,Resnet,resnet


## Wikipages

The following code was used to generate an initial list of wikipage paths and titles in order to start building the Term Entity Recognition Model. 

Key Taxonomy terms (e.g. Supervised Learning) were manually identified on wikipedia to their page ("Supervised_learning") in order to identify further links we could use to initialise our ontology. 

"""
def get_wiki_page_links(page_name):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": page_name,
        "prop": "links",
        "format": "json"
    }
    response = requests.get(url, params=params)
    data = response.json()

    links_data = []
    for link in data['parse']['links']:
        if link['ns'] == 0:  # Only include main namespace links
            title = link['*']
            page_name = title.replace(' ', '_')
            links_data.append({
                'wiki_title': title,
                'wiki_page': page_name,
                'ignore': False
            })

    return links_data

page_name = "Supervised_learning"
page_name = "Neural_network_(machine_learning)"
page_name = "Outline_of_machine_learning"
page_name = "Neural_network_(machine_learning)"
page_name = "Machine_learning"
links = get_wiki_page_links(page_name)
wikipages = pd.DataFrame(links)
df.to_csv('wikipages.csv', index=False)
"""

In [36]:
wikipages_path = '../wikipages_final.csv'
wikipages_path_df = pd.read_csv(wikipages_path)

wikipages_path_df = wikipages_path_df.drop(['ignore_path', 'redirect'], axis=1, errors='ignore')

wikipages_path_df = wikipages_path_df.dropna(subset=['title', 'path'])

## Unique Wiki titles and paths

In [37]:
def unique_wikipaths():
    """
    Generate a DataFrame of unique Wikipedia paths and titles.

    This function processes the global wikipages*path_df:
    1. Extracts unique combinations of paths and titles
    2. Converts paths and titles to lowercase and strips whitespace
    3. Removes duplicate path-title combinations

    Returns:
    pd.DataFrame: A DataFrame with columns 'path' and 'title', 
                  containing unique lowercase path-title combinations.
    """
    unique_wikipaths = []
    wikipaths = set()
    
    for _, row in wikipages*path_df.iterrows():
        path = row['path'].lower().strip()
        title = row['title'].lower().strip()
        
        path_title_tuple = (path, title)
        
        if path_title_tuple not in wikipaths:
            wikipaths.add(path_title_tuple)
            unique_wikipaths.append({
                'path': path,
                'title': title
            })
    
    unique_wikipaths_df = pd.DataFrame(unique_wikipaths)
    
    return unique_wikipaths_df



In [38]:
unique_wikipaths_df = unique_wikipaths()
unique_wikipaths_df

Unnamed: 0,path,title
0,machine_learning_(journal),machine learning (journal)
1,statistical_learning_in_language_acquisition,statistical learning in language acquisition
2,statistical_learning,statistical learning
3,timeline_of_machine_learning,timeline of machine learning
4,data_compression,data compression
...,...,...
10074,extended_backus%e2%80%93naur_form,extended backus–naur form
10075,feature_(machine_learning),feature
10076,squared_error_loss,squared error loss
10077,gerard_salton,gerard salton


# Ontology linking

In [170]:
ontology_links = uniqueterms_df.copy()

In [171]:
ontology_links['wiki_title'] = np.nan
ontology_links['wiki_path'] = np.nan
ontology_links

Unnamed: 0,taxonomy_term,term_regex,wiki_title,wiki_path
0,Trainable Layers,trainable(\s*|\-)?layer,,
1,Cluster Analysis,cluster(\s*|\-)?analysi,,
2,Temperature,temperature,,
3,One-Sample,one(\s*|\-)?sample,,
4,Resnet,resnet,,
...,...,...,...,...
532,Feature Engineering,feature(\s*|\-)?engineering,,
533,Embedding Layer,embedding(\s*|\-)?layer,,
534,Stacked Generalization,stacked(\s*|\-)?generalization,,
535,Underfitting,underfitting,,


## Manual

In [172]:

# Add manual matches for strategically important terms
path = '../../siads_capstone/db/wiki_data/wiki_search_man.csv' 
wikipages_manual_df = pd.read_csv(path)
wikipages_manual_df.head()

wikipages_manual_df['wiki_path'] = wikipages_manual_df['path'].str.strip().str.replace(' ', '_').str.lower()
wikipages_manual_df['wiki_title'] = wikipages_manual_df['title'].str.strip().str.replace('_', ' ').str.lower()
wikipages_manual_df


Unnamed: 0,id,tax_term,title,path,wiki_path,wiki_title
0,1,Least Absolute Shrinkage and Selection Operato...,Lasso (statistics),Lasso (statistics),lasso_(statistics),lasso (statistics)
1,2,Temperature,Temperature (softmax function),Temperature (softmax_function),temperature_(softmax_function),temperature (softmax function)
2,3,Meta-learning,Meta-learning (computer science),Meta-learning (computer science),meta-learning_(computer_science),meta-learning (computer science)
3,5,t-distributed stochastic neighbor embedding (t...,t-distributed stochastic neighbor embedding,t-distributed stochastic neighbor embedding,t-distributed_stochastic_neighbor_embedding,t-distributed stochastic neighbor embedding
4,6,Gradient Descent Algorithms,Gradient Descent,Gradient Descent,gradient_descent,gradient descent
...,...,...,...,...,...,...
73,78,Self-Attention Layer,Attention (machine learning),Attention_(machine_learning),attention_(machine_learning),attention (machine learning)
74,79,Shot Boundary Detection,Shot transition detection,Shot_transition_detection,shot_transition_detection,shot transition detection
75,80,Stemming,Stemming,Stemming,stemming,stemming
76,81,Variational Lower Bound,Evidence lower bound,Evidence_lower_bound,evidence_lower_bound,evidence lower bound


In [173]:
def manual_wikipage_match(df1, df2):
    """
    Match taxonomy terms from df1 with Wikipedia titles and paths from df2.

    This function:
    1. Normalizes terms in both DataFrames (strips whitespace and converts to lowercase)
    2. Finds exact matches between df1's 'taxonomy_term' and df2's 'tax_term'
    3. For each term in df1, collects matching titles and paths from df2
    4. Joins multiple matches with '|' separator

    Args:
    df1 (pd.DataFrame): DataFrame containing 'taxonomy_term' column
    df2 (pd.DataFrame): DataFrame containing 'tax_term', 'wiki_title', and 'wiki_path' columns

    Returns:
    pd.DataFrame: df1 with additional 'wiki_title' and 'wiki_path' columns
    """
    wikipages_titles = []
    wikipages_paths = []
    
    df1_terms = df1['taxonomy_term'].str.strip().str.lower()
    df2_terms = df2['tax_term'].str.strip().str.lower()
    
    for term in df1_terms:
        matches = df2_terms == term
        
        matching_titles = df2.loc[matches, 'wiki_title'].fillna('').astype(str)
        matching_paths = df2.loc[matches, 'wiki_path'].fillna('').astype(str)
        
        wikipages_titles.append('|'.join(matching_titles) if not matching_titles.empty else None)
        wikipages_paths.append('|'.join(matching_paths) if not matching_paths.empty else None)
    
    df1['wiki_title'] = wikipages_titles
    df1['wiki_path'] = wikipages_paths
    
    return df1

ontology_links_manual = manual_wikipage_match(ontology_links, wikipages_manual_df)


In [174]:
ontology_links_manual_matches = ontology_links_manual.dropna(subset=['wiki_title', 'wiki_path'])

ontology_links_manual_unmatches = ontology_links_manual[ontology_links_manual['wiki_title'].isna() & ontology_links_manual['wiki_path'].isna()]

ontology_links_manual_matches = ontology_links_manual_matches.reset_index(drop=True)
ontology_links_manual_unmatches = ontology_links_manual_unmatches.reset_index(drop=True)

In [175]:
def find_matches(df1, df2):
    """
    Match regex patterns from df1 with Wikipedia titles and paths from df2.

    This function:
    1. Normalizes terms in both DataFrames (strips whitespace and converts to lowercase)
    2. Finds regex matches between df1's 'term_regex' and df2's 'title'
    3. For multiple matches, selects the shortest matching title
    4. For each term in df1, collects matching titles and paths from df2
    5. Joins multiple matches with '|' separator

    Args:
    df1 (pd.DataFrame): DataFrame containing 'term_regex' column
    df2 (pd.DataFrame): DataFrame containing 'title' and 'path' columns

    Returns:
    pd.DataFrame: Copy of df1 with additional 'wiki_title' and 'wiki_path' columns
    """
    df1 = df1.copy()
    
    wikipages_titles = []
    wikipages_paths = []
    
    df1_terms = df1['term_regex'].str.strip().str.lower()
    df2_terms = df2['title'].str.strip().str.lower()
    
    for term in df1_terms:
        matches = df2_terms.str.match(f"^{term}", case=False)
        
        if matches.sum() > 1:
            longest_match = df2_terms[matches].str.len().idxmin()
            matches = matches & (df2_terms.index == longest_match)
        
        matching_titles = df2.loc[matches, 'title'].fillna('').astype(str)
        matching_paths = df2.loc[matches, 'path'].fillna('').astype(str)
        
        wikipages_titles.append('|'.join(matching_titles) if not matching_titles.empty else None)
        wikipages_paths.append('|'.join(matching_paths) if not matching_paths.empty else None)
    
    df1['wiki_title'] = wikipages_titles
    df1['wiki_path'] = wikipages_paths
    
    return df1

ontology_links = find_matches(ontology_links_manual_unmatches, unique_wikipaths_df)


In [176]:
print(ontology_links.info())

ontology_links_all = [ontology_links, ontology_links_manual_matches]

ontology_wikilinks = pd.concat(ontology_links_all)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462 entries, 0 to 461
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   taxonomy_term  462 non-null    object
 1   term_regex     462 non-null    object
 2   wiki_title     210 non-null    object
 3   wiki_path      210 non-null    object
dtypes: object(4)
memory usage: 14.6+ KB
None


In [177]:
match_count = ontology_wikilinks['wiki_title'].notna().sum()
total_rows = len(ontology_wikilinks)
match_percentage = (match_count / total_rows) * 100

print(f"Number of matches: {match_count}")
print(f"Percentage of matches: {match_percentage:.2f}%")
print(f"Number of non-matches: {total_rows - match_count}")

print("\nExample matches:")
match_examples = ontology_wikilinks[ontology_wikilinks['wiki_title'].notna()].sample(3)
for _, row in match_examples.iterrows():
    print(f"Taxonomy Term: {row['taxonomy_term']}")
    print(f"Term Regex: {row['term_regex']}")
    print(f"Matched Title: {row['wiki_title']}")
    print(f"Matched Path: {row['wiki_path']}")
    print("-" * 50)

Number of matches: 285
Percentage of matches: 53.07%
Number of non-matches: 252

Example matches:
Taxonomy Term: Bhattacharyya Distance
Term Regex: bhattacharyya(\s*|\-)?distance
Matched Title: bhattacharyya distance
Matched Path: bhattacharyya_distance
--------------------------------------------------
Taxonomy Term: Group Lasso
Term Regex: group(\s*|\-)?lasso
Matched Title: 
Matched Path: 
--------------------------------------------------
Taxonomy Term: Temporal Difference Learning
Term Regex: temporal(\s*|\-)?difference(\s*|\-)?learning
Matched Title: temporal difference learning
Matched Path: temporal_difference_learning
--------------------------------------------------


In [178]:
ontology_wikilinks.head(5)

Unnamed: 0,taxonomy_term,term_regex,wiki_title,wiki_path
0,Trainable Layers,trainable(\s*|\-)?layer,,
1,Cluster Analysis,cluster(\s*|\-)?analysi,cluster analysis,cluster_analysis
2,One-Sample,one(\s*|\-)?sample,,
3,Adapter Layers,adapter(\s*|\-)?layer,,
4,Sigmoid,sigmoid,sigmoid,sigmoid_function


In [179]:
duplicates = ontology_wikilinks[ontology_wikilinks['wiki_title'].notna() & ontology_wikilinks['wiki_title'].duplicated(keep=False)]

duplicates = duplicates.sort_values('wiki_title')

# Display the duplicates
duplicates[['taxonomy_term', 'term_regex', 'wiki_title']]

Unnamed: 0,taxonomy_term,term_regex,wiki_title
19,Feature Transformation,feature(\s*|\-)?transformation,data transformation (computing)
63,Data Transformation,data(\s*|\-)?transformation,data transformation (computing)
418,Gradient Descent,gradient(\s*|\-)?descent,gradient descent
16,Gradient Descent Algorithms,gradient(\s*|\-)?descent(\s*|\-)?algorithm,gradient descent
6,Least-Angle Regression,least(\s*|\-)?angle(\s*|\-)?regression,least-angle regression
15,Least Angle Regression,least(\s*|\-)?angle(\s*|\-)?regression,least-angle regression


In [181]:
ontology_wikilinks.head()

Unnamed: 0,taxonomy_term,term_regex,wiki_title,wiki_path
0,Trainable Layers,trainable(\s*|\-)?layer,,
1,Cluster Analysis,cluster(\s*|\-)?analysi,cluster analysis,cluster_analysis
2,One-Sample,one(\s*|\-)?sample,,
3,Adapter Layers,adapter(\s*|\-)?layer,,
4,Sigmoid,sigmoid,sigmoid,sigmoid_function


## Save files

In [182]:
relative_path = '../../siads_capstone/'
    
absolute_path = os.path.abspath(relative_path)
    
ontology_links_file_name = "ontology_links.csv"
    
ontology_links_file_path = os.path.join(absolute_path, ontology_links_file_name)

ontology_wikilinks.to_csv(ontology_links_file_path, index=False)

print(f"unique terms has been linked to wikipedia pages, saved as '{ontology_links_file_name}' in the following directory:")
print(absolute_path)

unique terms has been linked to wikipedia pages, saved as 'ontology_links.csv' in the following directory:
/home/sagemaker-user/siads_capstone
