In [1]:
import re
import pandas as pd
import os

# Taxonomy
Build a controlled vocab reflecting hierarchical structure and key concepts of the subject-matter e.g. AI/Machine Learning

a) label/tag taxonomy entities according to wikipage titles

Populate as many pages as possible to begin with for wiki_results table to get started. Why? Wikipedia doesn't do fuzzy matching. These are:

"Neural_network_(machine_learning)" "Machine_learning" Outline_of_machine_learning

In [37]:
# Taxonomy Folder
path = '../../siads_capstone/db/taxonomy_data/'  

# Get all CSV files in the Folder
csv_files = [file for file in os.listdir(path) if file.endswith('.csv')]

dfs = []

for file in csv_files:
    df = pd.read_csv(os.path.join(path, file))
    df = df.drop('id', axis=1, errors='ignore')
    dfs.append(df)

# Concatenate all dataframes in the list
taxonomy_df = pd.concat(dfs, ignore_index=True)



## Unique terms

In [38]:
def clean_term(term):
    """
    Clean and standardize a given term.

    This function performs the following operations:
    1. Handles NaN values
    2. Removes acronyms in parentheses if the result is not too short
    3. Preserves terms with 3 or more consecutive capital letters
    4. Strips whitespace and title-cases the term

    Args:
    term (str or float): The term to be cleaned. Can be a string or NaN.

    Returns:
    str or float: The cleaned term, or NaN if the input was NaN.
    """
    if pd.isna(term):
        return term
    
    if re.search(r'\s*\([A-Z]{2,4}\)\s*', term):
        drop_acronym = re.sub(r'\s*\([A-Z]{2,4}\)\s*', '', term)
        
        if len(drop_acronym.strip()) > 3:
            return drop_acronym.strip().title()
        else:
            return term.strip().title()
    
    if re.search(r'[A-Z]{3,}', term):
        return term.strip()
    
    return term.strip().title()


In [39]:
for column in ['level_1', 'level_2', 'level_3', 'level_4', 'level_5']:
    taxonomy_df[column] = taxonomy_df[column].apply(clean_term)

In [40]:
taxonomy_df

Unnamed: 0,level_1,level_2,level_3,level_4,level_5
0,Model Deployment,,,,
1,Model Deployment,Computational Efficiency,,,
2,Model Deployment,Computational Efficiency,Gradient Accumulation,,
3,Model Deployment,Computational Efficiency,Automatic Mixed Precision,,
4,Model Deployment,Computational Efficiency,Quantization,,
...,...,...,...,...,...
577,Data Preparation And Feature Engineering,Feature Engineering,Class-Imbalanced Datasets,Ensemble Methods,Easyensemble
578,Data Preparation And Feature Engineering,Feature Engineering,Class-Imbalanced Datasets,Ensemble Methods,RUSBoost
579,Data Preparation And Feature Engineering,Feature Engineering,Feature Extraction,Text Features,Regular Expressions
580,Data Preparation And Feature Engineering,Feature Engineering,Data Cleaning,Error Correction,Regular Expressions


## Unique Taxonomy Terms

In order to address both known and unknown spellings of our Ontology terms and increase our matching coverage we'll look to strip as much 'noise' from our Ontology term strings as possible while allowing for scenarios such as hyphenation, capitalisation, and singular/plurals etc. through the strategic design of our regular expressions

In [41]:
def unique_term_regex():
    """
    Generate a DataFrame of unique taxonomy terms with their corresponding regex patterns.

    This function processes the taxonomy terms from all levels in the global taxonomy_df:
    1. Removes duplicates and null values
    2. Cleans and standardizes terms
    3. Removes trailing 's' for plurals
    4. Creates regex patterns allowing for flexible spacing and hyphenation

    Returns:
    pd.DataFrame: A DataFrame with columns 'taxonomy_term' (original term) and 
                  'term_regex' (corresponding regex pattern).
    """
    unique_terms = []
    taxonomy_terms = set()
    for level in ['level_1', 'level_2', 'level_3', 'level_4', 'level_5']:
        taxonomy_terms.update(taxonomy_df[level].dropna().unique())
    
    for term in taxonomy_terms:
        term_proc = term.strip().lower()
        term_proc = re.sub(r's$', '', term_proc)
        
        words = re.split(r'(\s|\-)', term_proc)
        processed_words = []
        for word in words:
            if word in [' ', '-']:
                processed_words.append(r'(\s*|\-)?')
            else:
                processed_words.append(re.escape(word))
        
        term_regex = ''.join(processed_words)
        
        unique_terms.append({
            'taxonomy_term': term,
            'term_regex': term_regex
        })
    
    unique_term_df = pd.DataFrame(unique_terms)
    
    return unique_term_df


In [42]:
unique_term_df = unique_term_regex()

In [43]:

print(unique_term_df.head())
print(f"\nTotal unique terms: {len(unique_term_df)}")
print(f"Sample of term_regex:")
print(unique_term_df['term_regex'].sample(5))
    

      taxonomy_term               term_regex
0  Trainable Layers  trainable(\s*|\-)?layer
1  Cluster Analysis  cluster(\s*|\-)?analysi
2       Temperature              temperature
3        One-Sample       one(\s*|\-)?sample
4            Resnet                   resnet

Total unique terms: 537
Sample of term_regex:
341    area(\s*|\-)?under(\s*|\-)?pr(\s*|\-)?curve
16                          false(\s*|\-)?positive
261                            triplet(\s*|\-)?los
99             mean(\s*|\-)?absolute(\s*|\-)?error
114                   power(\s*|\-)?transformation
Name: term_regex, dtype: object


## Save files

In [44]:

relative_path = '../../siads_capstone/'
    
absolute_path = os.path.abspath(relative_path)
    
taxonomy_file_name = "taxonomy.csv"
uniqueterms_file_name = "unique_terms.csv"
    
taxonomy_file_path = os.path.join(absolute_path, taxonomy_file_name)
uniqueterms_file_path = os.path.join(absolute_path, uniqueterms_file_name)

taxonomy_df.to_csv(taxonomy_file_path, index=False)
unique_term_df.to_csv(uniqueterms_file_path, index=False)

print(f"unique terms has been saved as '{taxonomy_file_name}' in the following directory:")
print(absolute_path)
print(f"unique terms has been saved as '{uniqueterms_file_name}' in the following directory:")
print(absolute_path)


unique terms has been saved as 'taxonomy.csv' in the following directory:
/home/sagemaker-user/siads_capstone
unique terms has been saved as 'unique_terms.csv' in the following directory:
/home/sagemaker-user/siads_capstone


In [45]:
taxonomy_path = '../taxonomy.csv'
taxonomy_df = pd.read_csv(taxonomy_path)

In [50]:
taxonomy_df.head()

Unnamed: 0,level_1,level_2,level_3,level_4,level_5
0,Model Deployment,,,,
1,Model Deployment,Computational Efficiency,,,
2,Model Deployment,Computational Efficiency,Gradient Accumulation,,
3,Model Deployment,Computational Efficiency,Automatic Mixed Precision,,
4,Model Deployment,Computational Efficiency,Quantization,,


In [48]:
uniqueterms_path = '../unique_terms.csv'
uniqueterms_df = pd.read_csv(uniqueterms_path)

In [49]:
uniqueterms_df.head()

Unnamed: 0,taxonomy_term,term_regex
0,Trainable Layers,trainable(\s*|\-)?layer
1,Cluster Analysis,cluster(\s*|\-)?analysi
2,Temperature,temperature
3,One-Sample,one(\s*|\-)?sample
4,Resnet,resnet
