# Preprocessing and Data Extractions with GROBID
***

The purpose of this notebook is to process and extract information from a collection of MICCAI 2023 XML documents. The notebook organises this information into structured data formats (e.g., CSV files, dataframes) by:

1. A text-wrapping utility function that wraps text lines within a dataframe row.
2. Interacting with the GROBID client for processing documents (the GROBID server has to be started in a terminal before the client is used in the notebook).
3. Renaming MICCAI 2023 XML files based on their individual titles
4. Parsing XML files to extract headers and related text into a dataframe.
5. Aggregating and cleaning data from multiple sources.
6. Identifying specific content (e.g., papers related to cancer) within the aggregated data.

In [2]:
# Libraries 
import os
import re
import pandas as pd

# The GROBID client to process and extract information from XML files
from grobid_client.grobid_client import GrobidClient

# For parsing XML files
from xml.etree import ElementTree as et



1. Text-wrapping utility function that wraps text lines within a dataframe row
***

In [2]:
def wrap_text(text, width=80):
    """Greedily wrap ``text`` so that lines hold at most ``width`` characters.

    Each ``'\\n'``-separated paragraph is wrapped on its own, so existing line
    breaks (including blank lines) are kept. Runs of whitespace inside a
    paragraph collapse to single spaces.

    Parameters
    ----------
    text : str or null-like
        The text to wrap. Null values (as judged by ``pd.isnull``) are
        returned unchanged so the function can be applied to a dataframe
        column that contains missing entries.
    width : int, default 80
        Maximum line length. A single word longer than ``width`` is never
        split; it is placed on a line of its own.

    Returns
    -------
    str or null-like
        The wrapped text, or the original null value.
    """
    if pd.isnull(text):
        return text

    wrapped_lines = []
    for paragraph in text.split('\n'):
        line = ''
        for word in paragraph.split():
            if not line:
                # First word of a line: always accept it. Checking the width
                # here would flush an empty line in front of any word that is
                # at least `width` characters long.
                line = word
            elif len(line) + 1 + len(word) > width:
                wrapped_lines.append(line)
                line = word
            else:
                line += ' ' + word
        wrapped_lines.append(line)
    return '\n'.join(wrapped_lines)

2. Interacting with the GROBID client for processing documents 
***

In [3]:
def process_fulltext_document(client, process_file, output_dir):
    """Ask ``client`` to run the ``processFulltextDocument`` service.

    Parameters
    ----------
    client
        Object exposing a ``process(service, input, output=..., force=...)``
        method; in this notebook it is a ``GrobidClient``.
    process_file : str
        Input location handed to the client as its second positional argument.
    output_dir : str
        Passed to the client as ``output``.

    Returns
    -------
    None
        The client call's result is not returned.
    """
    service = 'processFulltextDocument'
    # NOTE(review): force=True presumably re-processes inputs that already
    # have an output file -- confirm against the grobid_client documentation.
    client.process(service, process_file, output=output_dir, force=True)

3. Renaming MICCAI 2023 XML files based on their individual titles
***

In [None]:

def rename_xml_files_in_folder(folder_path):
    """Rename every ``.xml`` file in ``folder_path`` after the title it contains.

    For each file the XML is parsed, ``find_title`` looks up a title, and the
    file is renamed to ``<title with spaces replaced by underscores>.xml``
    inside the same folder. Files are left untouched when

    * the XML cannot be parsed (a message is printed),
    * no title is found,
    * the file already carries the target name, or
    * another file with the target name already exists (a message is
      printed), so an existing file is never overwritten.

    Parameters
    ----------
    folder_path : str
        Directory whose ``.xml`` files are renamed in place.
    """
    for filename in os.listdir(folder_path):
        if not filename.endswith('.xml'):
            continue

        file_path = os.path.join(folder_path, filename)
        try:
            root = et.parse(file_path).getroot()
        except et.ParseError as e:
            print(f"Error parsing '{filename}': {e}")
            continue

        paper_title = find_title(root)
        if not paper_title:
            continue

        # A path separator inside the title would make os.rename target a
        # (usually non-existent) sub-directory, so neutralise it as well.
        safe_title = re.sub(r'[\\/]', '_', paper_title.replace(" ", "_"))
        new_filename = safe_title + '.xml'
        new_file_path = os.path.join(folder_path, new_filename)

        if new_file_path == file_path:
            continue
        if os.path.exists(new_file_path):
            # Two papers with the same title: keep both files rather than
            # letting os.rename replace the earlier one.
            print(f"Skipping '{filename}': '{new_filename}' already exists")
            continue

        os.rename(file_path, new_file_path)

def find_title(element):
    """Return the first non-empty title text found in ``element``'s subtree.

    The subtree is searched depth-first in document order. An element is a
    candidate when its tag contains ``'title'`` (case-insensitive), which also
    covers namespaced tags such as ``{namespace}title``.

    A candidate whose own text is missing or only whitespace is not accepted
    as the answer; the search continues into its children instead. This
    matters for wrapper elements whose tag merely contains ``'title'`` (for
    example ``titleStmt``) and whose text is just indentation.

    Parameters
    ----------
    element : xml.etree.ElementTree.Element
        Root of the subtree to search.

    Returns
    -------
    str or None
        The stripped title text, or ``None`` when nothing suitable exists.
    """
    tag = element.tag
    # Comment / processing-instruction nodes carry a non-string tag.
    if isinstance(tag, str) and 'title' in tag.lower() and element.text:
        text = element.text.strip()
        if text:
            return text
    for child in element:
        title = find_title(child)
        if title:
            return title
    return None

4. Parsing XML files to extract headers and related text into a dataframe
***

In [None]:

def parse_xml_and_extract_headers(file_path):
    """Extract every TEI ``<head>`` of one XML file together with its text.

    The file is parsed with the standard-library ``ElementTree`` module that
    the notebook imports as ``et``; no lxml-only features (``xpath``,
    ``getparent``) are needed.

    Parameters
    ----------
    file_path : str
        Path of the XML file to read.

    Returns
    -------
    pandas.DataFrame
        One row per ``<head>`` element, in document order, with the columns

        * ``Paper Title`` -- text of the first ``<title>`` below the root, or
          ``"No Title Found"`` when there is no such element,
        * ``Header Number`` -- the head's ``n`` attribute (``None`` if absent),
        * ``Header Title`` -- the head's own text,
        * ``Text`` -- all text found inside ``<p>`` elements anywhere below
          the head's parent element, concatenated without a separator.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    """
    tei_namespace = 'http://www.tei-c.org/ns/1.0'
    ns = {'tei': tei_namespace}
    head_tag = f'{{{tei_namespace}}}head'
    paragraph_tag = f'{{{tei_namespace}}}p'

    root = et.parse(file_path).getroot()

    # Extract the paper title: first <title> element below the root
    paper_title_element = root.find('.//tei:title', ns)
    paper_title = paper_title_element.text if paper_title_element is not None else "No Title Found"

    # ElementTree elements keep no reference to their parent, so build the
    # child -> parent lookup once for the whole document.
    parent_of = {child: parent for parent in root.iter() for child in parent}

    headers = list(root.iter(head_tag))
    print(f"Found {len(headers)} headers in '{paper_title}'")

    data = []
    for header in headers:
        # The text of a section lives in the <p> elements that share the
        # head's parent; fall back to the head itself if it has no parent.
        section = parent_of.get(header, header)
        text_content = ''.join(_paragraph_text_parts(section, paragraph_tag))
        data.append({
            'Paper Title': paper_title,
            'Header Number': header.get('n'),
            'Header Title': header.text,
            'Text': text_content
        })

    df = pd.DataFrame(data, columns=['Paper Title', 'Header Number', 'Header Title', 'Text'])
    return df


def _paragraph_text_parts(container, paragraph_tag):
    """List, in document order, all text inside ``paragraph_tag`` descendants of ``container``."""
    parts = []
    for child in container:
        if child.tag == paragraph_tag:
            # itertext() yields the paragraph's own text plus the text and
            # tails of everything nested inside it (e.g. inline references).
            parts.extend(child.itertext())
        else:
            parts.extend(_paragraph_text_parts(child, paragraph_tag))
    return parts

5. Aggregating and cleaning data from multiple sources.
***

In [3]:
def process_xml_folder(folder_path):
    """Aggregate the header tables of all ``.xml`` files in ``folder_path``.

    Every file whose name ends in ``.xml`` is passed to
    ``parse_xml_and_extract_headers`` and the resulting frames are stacked
    with a fresh 0..n-1 index.

    Parameters
    ----------
    folder_path : str
        Directory to scan (not recursive).

    Returns
    -------
    pandas.DataFrame
        The concatenated rows. When the folder holds no usable XML file, an
        empty frame that still has the columns ``Paper Title``,
        ``Header Number``, ``Header Title`` and ``Text`` is returned, so that
        later steps selecting those columns keep working.
    """
    all_data_frames = []
    # os.listdir gives names in arbitrary order; sorting makes the row order
    # of the result reproducible between runs and machines.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".xml"):
            continue
        file_path = os.path.join(folder_path, file_name)
        try:
            all_data_frames.append(parse_xml_and_extract_headers(file_path))
        except et.ParseError as e:
            # Same policy as the renaming step: report the broken file and
            # carry on with the rest of the folder.
            print(f"Error parsing '{file_name}': {e}")

    if not all_data_frames:
        return pd.DataFrame(columns=['Paper Title', 'Header Number', 'Header Title', 'Text'])
    return pd.concat(all_data_frames, ignore_index=True)

# Drop rows that carry no text (missing values only; rows are not de-duplicated)
def clean_dataframe(df):
    """Return a copy of ``df`` without the rows whose text is missing.

    Two ``dropna`` passes are applied: first rows where both ``Header Title``
    and ``Text`` are missing, then rows where ``Text`` is missing. Only
    NaN/None count as missing -- empty strings are kept. The input frame is
    not modified and the surviving rows keep their original index labels.
    """
    has_some_content = df.dropna(subset=['Header Title', 'Text'], how='all')
    df_cleaned = has_some_content.dropna(subset=['Text'], how='any')
    return df_cleaned

# Merge the per-volume dataframes (one dataframe per MICCAI 2023 volume) into a single dataframe with all papers
def merge_dataframes(cleaned_dataframes):
    """Tag each volume's rows with their volume number and stack them.

    Parameters
    ----------
    cleaned_dataframes : dict[str, pandas.DataFrame]
        Maps a volume label containing a number (e.g. ``'vol1'``) to that
        volume's dataframe. Each frame needs a ``Paper Title`` column.

    Returns
    -------
    pandas.DataFrame
        All rows with a fresh 0..n-1 index, where ``Paper Title`` has
        `` (vol<N>)`` appended and a ``Volume`` column holds ``N``.

    Raises
    ------
    ValueError
        If a key contains no digits, so no volume number can be derived.

    Notes
    -----
    The input frames are left untouched. Tagging them in place would append
    the suffix again every time the cell is re-run.
    """
    tagged_frames = []
    for vol, df in cleaned_dataframes.items():
        match = re.search(r'\d+', vol)
        if match is None:
            raise ValueError(f"No volume number found in key {vol!r}")
        volume_number = int(match.group())
        # assign() returns a new frame, so the caller's data is not modified.
        tagged_frames.append(
            df.assign(**{
                'Paper Title': df['Paper Title'] + f' (vol{volume_number})',
                'Volume': volume_number,
            })
        )
    return pd.concat(tagged_frames, ignore_index=True)

In [5]:
# The GROBID server must already be running in a terminal before this cell is
# executed; the output recorded for this cell shows a
# ServerUnavailableException when the server cannot be reached.
# Installation and start-up commands:
# wget https://github.com/kermitt2/grobid/archive/0.8.0.zip
# unzip 0.8.0.zip
# cd grobid-0.8.0
# ./gradlew run

# Address of the locally running GROBID server and the client that talks to it
grobid_server = 'http://localhost:8070'
client = GrobidClient(grobid_server=grobid_server)

# MICCAI 2023 PDF files, organised by volumes into separate folders
# 730 PDFs in total, divided into 10 folders
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere
process_file = '/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/miccai_2023'

# Output folder that receives the XML files produced from the PDFs
output_dir = './processed_documents'

# Start the GROBID full-text processing of the PDFs (the call returns nothing)
process_fulltext_document(client, process_file, output_dir)

# NOTE(review): process_fulltext_document has no return value, so the two
# disabled lines below would not yield a dataframe if re-enabled as written.
#combined_df = process_fulltext_document(client, process_file, output_dir)

#combined_df.to_csv('refined_all_papers_extracted_w_text.csv')

GROBID server does not appear up and running, the connection to the server failed


ServerUnavailableException: 

### **Scope of papers #1: cancer-related medical AI papers**
***

6. Selection of papers: identifying cancer-related keywords in papers within the aggregated data.
***

In [7]:
# Load the previously exported table of paper sections; the first CSV column becomes the index
filename = '/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/outputs/databases/refined_all_papers_extracted_w_text.csv'
combined_df = pd.read_csv(filename, index_col=0)

# Number of distinct values in the title column (a missing title counts as one value)
print(combined_df['title'].nunique(dropna=False))
combined_df

730


Unnamed: 0,title,header_no,header_title,text,volume
0,AMAE: Adaptation of Pre-trained Masked Autoenc...,1.0,Introduction,to reduce radiologists' reading burden and mak...,1
1,AMAE: Adaptation of Pre-trained Masked Autoenc...,2.0,Method,notation. we first formally define the problem...,1
2,AMAE: Adaptation of Pre-trained Masked Autoenc...,2.1,Stage 1-Proxy Task to Detect Synthetic Anomalies,amae starts the first training stage using onl...,1
3,AMAE: Adaptation of Pre-trained Masked Autoenc...,2.2,Stage 2-MAE Inter-Discrepancy Adaptation,the proposed mae adaptation scheme is inspired...,1
4,AMAE: Adaptation of Pre-trained Masked Autoenc...,3.0,Experiments,datasets. we evaluated our method on three pub...,1
...,...,...,...,...,...
6963,LightNeuS: Neural Surface Reconstruction in En...,3.1,Using Illumination Decline as a Depth Cue,the neus formulation of sect. 2 assumes distan...,10
6964,LightNeuS: Neural Surface Reconstruction in En...,3.2,Endoscope Photometric Model,"apart from illumination decline, there are sev...",10
6965,LightNeuS: Neural Surface Reconstruction in En...,4.0,Experiments,we validate our method on the c3vd dataset [4]...,10
6966,LightNeuS: Neural Surface Reconstruction in En...,5.0,Conclusion,we have presented a method for 3d dense multi-...,10


In [8]:
# Sections whose text mentions 'cancer', 'tumor' or 'tumour' (case-insensitive);
# sections with missing text count as non-matching
cancer_papers_mask = combined_df['text'].str.contains('cancer|tumor|tumour', case=False, na=False)
papers_with_cancer = combined_df[cancer_papers_mask]

# Titles of the papers that have at least one matching section
unique_titles_with_cancer = papers_with_cancer['title'].unique()

# Keep every section (header + text) of those papers, not only the matching
# ones. A single vectorised filter replaces growing a dataframe with
# pd.concat inside a loop, which copies all collected rows on each iteration.
# Rows keep the order they have in combined_df; the index is renumbered.
extracted_info = (
    combined_df[combined_df['title'].isin(unique_titles_with_cancer)]
    .reset_index(drop=True)
)

unique_paper_titles_with_cancer = extracted_info['title'].unique()
print(len(unique_paper_titles_with_cancer))

# Save the extracted information to a CSV file
#extracted_info.to_csv('cancer_related_papers_w_text.csv')

263


### **Scope of papers #2: Cancer-related medical AI papers that use the word 'patients'**
***

As an experiment, I have narrowed the 263 cancer-related medical AI papers down to those that work with datasets containing a subgroup described as 'patient/patients'. Note that this code selects papers by keyword match only.

The total number of papers is now down to 155.

In [None]:
# Keyword groups to look for in each paper
categories = {
    'age': ['age'],
    'gender': ['gender', 'sex', 'women', 'woman', 'female', 'male'],
    'ethnicity': ['ethnicity', 'ethnicities', 'race', 'white patients', 'black patients'],
    'geolocation': ['geolocation', 'geographical', 'geographic', 'country', 'countries', 'city', 'cities', 
                    'hospital', 'hospitals', 'clinic', 'clinics'],
    'patients': ['patient', 'patients'],
    'bias': ['bias', 'biases'],
}

# Flat list of all keywords; the 'patients' group is left out because it is
# already used as the scope filter below
all_keywords = [
    keyword
    for category, keywords in categories.items()
    if category != 'patients'
    for keyword in keywords
]

# Match each keyword as a whole word. Plain substring matching flags 'age'
# inside 'image' or 'stage', 'male' inside 'female', 'race' inside 'trace' and
# 'city' inside 'capacity'. re.escape keeps the keyword literal in the regex.
keyword_patterns = {keyword: rf'\b{re.escape(keyword)}\b' for keyword in all_keywords}

# Sections that mention 'patient' or 'patients' (case-insensitive)
scope_mask = extracted_info['text'].str.contains('patient|patients', case=False, na=False)
papers_with_patients = extracted_info[scope_mask]

# One dictionary of 0/1 keyword flags per paper
papers_info_list = []

# NOTE(review): only the sections that matched the scope filter are searched
# for keywords, not every section of the paper -- confirm this is intended.
for title in papers_with_patients['title'].unique():
    paper_info = papers_with_patients[papers_with_patients['title'] == title]
    paper_text = paper_info['text']
    # 1 if any of the paper's sections contains the keyword, else 0
    paper_keywords = {
        keyword: int(paper_text.str.contains(pattern, case=False, na=False).any())
        for keyword, pattern in keyword_patterns.items()
    }
    paper_keywords['title'] = title
    papers_info_list.append(paper_keywords)

# Create a DataFrame from the list of dictionaries (keyword columns first, then 'title')
keywords_per_paper = pd.DataFrame(papers_info_list)

# Display or work with the keywords_per_paper DataFrame
#keywords_per_paper.to_csv('keywords_per_paper.csv')
#papers_with_patients.to_csv('papers_with_patients.csv')

print('Number of unique titles for papers containing the keyword <patient/patients>:', len(papers_with_patients['title'].unique()))

Number of unique titles for papers containing the keyword <patient/patients>: 155
