***
## Libraries
***

In [32]:
%%capture
#!pip install chime
import chime 

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import re
import fitz  # PyMuPDF
import uuid # for generating unique identifiers for each paper

In [11]:
%%capture
# Setup and installation of the required packages
#!pip install spacy nltk PyMuPDF
#!python -m spacy download en_core_web_sm

***
## Important paths
***

In [34]:
# Base path to folder where output files will be stored
output_path = '/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/code/finals'

# Base path to folders 
base_path = '/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/'

# Path to the MICCAI 2023 pdfs
pdf_path = base_path + 'miccai_2023/'

# Path to the MICCAI 2023 database of all 730 papers and their metadata
database_path = base_path + 'databases/'

In [5]:
def save_to_csv(df, title):
    df.to_csv(title + '.csv', index=True)

def read_csv_file(path, filename, var_name):
    var_name = pd.read_csv(path + filename + '.csv')
    return var_name

***
### Dataframe 1: MICCAI 2023
***

In [4]:
# Read the database of all MICCAI 2023 papers
#df_miccai = pd.read_csv(database_path +'updated_database_miccai_2023.csv', index_col=[0], header=[0], encoding='utf-8')

# Refine the database by adding a unique identifier for each paper
#df_miccai.sort_values(by='Title', inplace=True)
#df_miccai.reset_index(drop=True, inplace=True)
#df_miccai['paper_id'] = range(1, len(df_miccai) + 1)

# Save the dataframe to a csv file
#df_miccai.to_csv(database_path + 'updated_df_miccai.csv', index=True)
filename = '/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/databases/updated_df_miccai.csv'
df_miccai = pd.read_csv(filename, index_col=[0], header=[0], encoding='utf-8')
df_miccai

Unnamed: 0,title,authors,page_numbers,doi,publication_year,volume,paper_id
0,3D Arterial Segmentation via Single 2D Project...,"Alina F. Dima, Veronika A. Zimmer, Martin J. M...",141-151,10.1007/978-3-031-43907-0_14,2023,1,1
1,3D Dental Mesh Segmentation Using Semantics-Ba...,"Fan Duan, Li Chen",456-465,10.1007/978-3-031-43990-2_43,2023,7,2
2,3D Medical Image Segmentation with Sparse Anno...,"Heng Cai, Lei Qi, Qian Yu, Yinghuan Shi, Yang Gao",614-624,10.1007/978-3-031-43898-1_59,2023,3,3
3,3D Mitochondria Instance Segmentation with Spa...,"Omkar Thawakar, Rao Muhammad Anwer, Jorma Laak...",613-623,10.1007/978-3-031-43993-3_59,2023,8,4
4,3D Teeth Reconstruction from Panoramic Radiogr...,"Sihwa Park, Seongjun Kim, In-Seok Song, Seung ...",376-386,10.1007/978-3-031-43999-5_36,2023,10,5
...,...,...,...,...,...,...,...
725,\(\mathrm {H^{2}}\)GM: A Hierarchical Hypergra...,"Zhibin He, Wuyang Li, Tuo Zhang, Yixuan Yuan",548-558,10.1007/978-3-031-43999-5_52,2023,10,726
726,"\(\textsf{GLSFormer}\): Gated - Long, Short Se...","Nisarg A. Shah, Shameema Sikder, S. Swaroop Ve...",386-396,10.1007/978-3-031-43996-4_37,2023,9,727
727,atTRACTive: Semi-automatic White Matter Tract ...,"Robin Peretzke, Klaus H. Maier-Hein, Jonas Boh...",237-246,10.1007/978-3-031-43993-3_23,2023,8,728
728,cOOpD: Reformulating COPD Classification on Ch...,"Silvia D. Almeida, Carsten T. Lüth, Tobias Nor...",33-43,10.1007/978-3-031-43904-9_4,2023,5,729


In [5]:
df_miccai.info()

#730 entries, 0 to 729 
#6 columns in total
#title, authors, page numbers, doi, year of publication, part of publication
#dtype: int64(3), object(4)

<class 'pandas.core.frame.DataFrame'>
Index: 730 entries, 0 to 729
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   title             730 non-null    object
 1   authors           730 non-null    object
 2   page_numbers      730 non-null    object
 3   doi               730 non-null    object
 4   publication_year  730 non-null    int64 
 5   volume            730 non-null    int64 
 6   paper_id          730 non-null    int64 
dtypes: int64(3), object(4)
memory usage: 45.6+ KB


There is a total of 730 papers in MICCAI 2023. 

In [6]:
# count the number of papers for each Volume. There is 10 Volumes in total

print('Number of papers in Volume 1:', df_miccai['volume'].value_counts()[1]) #73
print('Number of papers in Volume 2:', df_miccai['volume'].value_counts()[2]) #73
print('Number of papers in Volume 3:', df_miccai['volume'].value_counts()[3]) #72
print('Number of papers in Volume 4:', df_miccai['volume'].value_counts()[4]) #75
print('Number of papers in Volume 5:', df_miccai['volume'].value_counts()[5]) #76
print('Number of papers in Volume 6:', df_miccai['volume'].value_counts()[6]) #77
print('Number of papers in Volume 7:', df_miccai['volume'].value_counts()[7]) #75
print('Number of papers in Volume 8:', df_miccai['volume'].value_counts()[8]) #65
print('Number of papers in Volume 9:', df_miccai['volume'].value_counts()[9]) #70
print('Number of papers in Volume 10:', df_miccai['volume'].value_counts()[10]) #74

# count the total number of papers in the dataframe
print('Total number of papers:', df_miccai['volume'].value_counts().sum()) #730

Number of papers in Volume 1: 73
Number of papers in Volume 2: 73
Number of papers in Volume 3: 72
Number of papers in Volume 4: 75
Number of papers in Volume 5: 76
Number of papers in Volume 6: 77
Number of papers in Volume 7: 75
Number of papers in Volume 8: 65
Number of papers in Volume 9: 70
Number of papers in Volume 10: 74
Total number of papers: 730


***
# **Selecting a scope of papers from MICCAI 2023**
***
***

**Scope criteria:** Selecting papers, that researched within the field of cancer-related illnesses by searching for cancer-related keywords in the text of each research paper. The text is defined from the start of Abstraction ending with the last line of Conclusion, exluding the Title of the paper, the authors and affiliations, the Acknowlegdement and the References. 

Cancer-related keywords could be words such as 'cancer', 'tumor' and/or 'tumours'.


### Extract papers that contain the word 'cancer' in the text
***

In [None]:
# Function to extract the full text from the PDF
def extract_text(pdf_path):
    with fitz.open(pdf_path) as doc:
        full_text = ""
        for page in doc:
            full_text += page.get_text()
    return full_text

# Function to find if any of the keywords appear in the section between Abstract and Conclusion
def find_keywords_section(full_text, keywords):
     # Regular expressions to find the end of the affiliations section
    affiliations_end = re.search(r'\d{1,2}\s+(?:\w+\.)+@\w+\.\w{2,}', full_text)
    
    # Start searching from the end of affiliations if found, otherwise from the start of the text
    start_idx = affiliations_end.end() if affiliations_end else 0
    
    # Look for the Abstract and Conclusion sections
    abstract_idx = full_text.lower().find("abstract", start_idx)
    conclusion_idx = full_text.lower().rfind("conclusion", abstract_idx)
    acknowledgements_idx = full_text.lower().find("acknowledgements", conclusion_idx)
    
    # Adjust the end index to stop at Acknowledgements if it exists, otherwise use Conclusion index
    end_search_idx = acknowledgements_idx if acknowledgements_idx != -1 else conclusion_idx
    
    # If neither Abstract nor Conclusion is found, search the entire text
    if abstract_idx == -1 and conclusion_idx == -1:
        searchable_text = full_text[start_idx:]
    else:
        # Search from Abstract to Conclusion or Acknowledgements
        searchable_text = full_text[abstract_idx:end_search_idx].lower()
    
    # Search for each keyword within the determined section, stop at first match
    for keyword in keywords:
        if keyword.lower() in searchable_text:
            return True
    return False

# Function to extract the title from the PDF
def extract_title(pdf_path):
    with fitz.open(pdf_path) as doc:
        first_page_text = doc[0].get_text("text")
        
        # Regular expression to find the start of affiliations or author names
        # Looks for sequences in author lists or affiliations, such as numbers and parentheses
        author_or_affiliations_start = re.search(r'\b[A-Z][a-z]+ [A-Z]\.|\b[A-Z][a-z]+\s[A-Z][a-z]+[1-9]', first_page_text)

        title = ""
        if author_or_affiliations_start:
            # Extract text up to the start of the author list or affiliations as potential title text
            potential_title_text = first_page_text[:author_or_affiliations_start.start()].strip()
            title_lines = potential_title_text.split('\n')
            
            # The title is expected to be a continuous block of text at the top of the page,
            # possibly after a journal header or similar: look for a large continuous block of text.
            for line in reversed(title_lines):
                if line.strip():  
                    # Prepend to keep the title in the correct order
                    title = line + " " + title
                else:
                    # An empty line might indicate the end of the title block
                    break
        else:
            # If no author list or affiliation section is identified, use the first non-empty line
            for line in first_page_text.split('\n'):
                if line.strip():
                    title = line
                    break

        title = title.strip()  # Clean up whitespace
        return title

selected_papers = []
titles = []

# List of keywords to search for
keywords = ["cancer"]

# Iterate over each volume and search for keywords
for i in range(1, 11):  # Volumes 1 to 10
    folder_name = f"miccai23vol{i}"
    folder_path = os.path.join(pdf_path, folder_name)
    
    for pdf in os.listdir(folder_path):
        if pdf.endswith(".pdf"):
            pdf_path_ = os.path.join(folder_path, pdf)
            full_text = extract_text(pdf_path_)
            if find_keywords_section(full_text, keywords):
                selected_papers.append(os.path.join(folder_name, pdf))

# Extract titles from selected papers
for paper_path in selected_papers:
    full_paper_path = os.path.join(pdf_path, paper_path)
    title = extract_title(full_paper_path)
    titles.append(title)

print(f"Extracted titles from {len(titles)} selected papers.")
print(f"With the keyword(s) being {keywords}, {len(selected_papers)} papers were selected as relevant to cancer research.")

### Store the selected papers into a dataframe with their related paths
***

In [17]:
# Save the selected papers and their paths to a CSV file
#selected_papers_df = pd.DataFrame({"path": selected_papers, "title": titles})

# Refine the selection of papers by adding a unique identifier for each paper
# This will be useful for tracking papers in the pipeline
#filename = 'selected_papers_paperid'

# Rename the columns to lowercase
#selected_papers_df.rename(columns={'Title': 'title', 'Path': 'path'}, inplace=True)

# Sort the dataframe by title
#selected_papers_df.sort_values(by='title', inplace=True)

# Reset the index
##selected_papers_df.reset_index(drop=True, inplace=True)

# Add a unique identifier for each paper
#selected_papers_df['paper_id'] = range(1, len(selected_papers_df) + 1)

# Save the dataframe to a CSV file for later use in the pipeline. columns: path, title, paper_id
#selected_papers_df.to_csv(filename +'.csv', index=False)

***
## **Selected papers**
***
***

In [35]:
selected_papers_df = pd.read_csv('/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/code/finals/1st_attempt/selected_papers_paperid.csv')
selected_papers_df

Unnamed: 0,path,title,paper_id
0,miccai23vol1/paper_14.pdf,3D Arterial Segmentation via Single 2D Project...,1
1,miccai23vol8/paper_59.pdf,3D Mitochondria Instance Segmentation with Spa...,2
2,miccai23vol2/paper_46.pdf,A Spatial-Temporal Deformable Attention Based ...,3
3,miccai23vol8/paper_46.pdf,A Texture Neural Network to Predict,4
4,miccai23vol6/paper_74.pdf,ALL-IN: A Local GLobal Graph-Based DIstillatio...,5
...,...,...,...
184,miccai23vol3/paper_72.pdf,WeakPolyp: You only Look Bounding Box for Poly...,185
185,miccai23vol1/paper_22.pdf,Weakly-Supervised Positional Contrastive Learn...,186
186,miccai23vol10/paper_66.pdf,X2Vision: 3D CT Reconstruction from Biplanar X...,187
187,miccai23vol5/paper_5.pdf,YONA: You Only Need One Adjacent Reference-Fra...,188


### Store the list of complete paths
***

In [36]:
# List of selected_papers by path
selected_papers = selected_papers_df['path'].tolist()
selected_papers

['miccai23vol1/paper_14.pdf',
 'miccai23vol8/paper_59.pdf',
 'miccai23vol2/paper_46.pdf',
 'miccai23vol8/paper_46.pdf',
 'miccai23vol6/paper_74.pdf',
 'miccai23vol2/paper_9.pdf',
 'miccai23vol6/paper_61.pdf',
 'miccai23vol6/paper_68.pdf',
 'miccai23vol6/paper_56.pdf',
 'miccai23vol10/paper_5.pdf',
 'miccai23vol3/paper_45.pdf',
 'miccai23vol1/paper_6.pdf',
 'miccai23vol7/paper_50.pdf',
 'miccai23vol4/paper_53.pdf',
 'miccai23vol2/paper_33.pdf',
 'miccai23vol6/paper_50.pdf',
 'miccai23vol7/paper_40.pdf',
 'miccai23vol5/paper_1.pdf',
 'miccai23vol8/paper_50.pdf',
 'miccai23vol1/paper_71.pdf',
 'miccai23vol5/paper_43.pdf',
 'miccai23vol4/paper_33.pdf',
 'miccai23vol7/paper_53.pdf',
 'miccai23vol5/paper_35.pdf',
 'miccai23vol7/paper_38.pdf',
 'miccai23vol9/paper_69.pdf',
 'miccai23vol4/paper_1.pdf',
 'miccai23vol6/paper_47.pdf',
 'miccai23vol2/paper_20.pdf',
 'miccai23vol8/paper_48.pdf',
 'miccai23vol7/paper_51.pdf',
 'miccai23vol5/paper_15.pdf',
 'miccai23vol7/paper_68.pdf',
 'miccai23vol3

In [37]:
# Store the complete paths of the selected papers in a list for later use in the pipeline
selected_papers_paths = []
for i in range(0, len(selected_papers)):  # Volumes 1 to 10
    selected_papers_paths.append([base_path + 'miccai_2023/' + selected_papers[i]])

# Check if the total number of paths is equal to the number of selected papers
len(selected_papers_paths)
selected_papers_paths[:5]

[['/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/miccai_2023/miccai23vol1/paper_14.pdf'],
 ['/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/miccai_2023/miccai23vol8/paper_59.pdf'],
 ['/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/miccai_2023/miccai23vol2/paper_46.pdf'],
 ['/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/miccai_2023/miccai23vol8/paper_46.pdf'],
 ['/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/miccai_2023/miccai23vol6/paper_74.pdf']]

## Function for keyword searching and sentence extractions
***

In [114]:
import spacy
nlp = spacy.load("en_core_web_sm")

# check text for keywords
 # Function to extract the full text from the PDF
def extract_text(pdf_path):
    with fitz.open(pdf_path) as doc:
        full_text = ""
        for page in doc:
            full_text += page.get_text()
            # Regular expressions to find the end of the affiliations section
            affiliations_end = re.search(r'\d{1,2}\s+(?:\w+\.)+@\w+\.\w{2,}', full_text)
            
            # Start searching from the end of affiliations if found, otherwise from the start of the text
            start_idx = affiliations_end.end() if affiliations_end else 0
            
            # Look for the Abstract and Conclusion sections
            abstract_idx = full_text.lower().find("abstract", start_idx)
            conclusion_idx = full_text.lower().rfind("conclusion", abstract_idx)
            acknowledgements_idx = full_text.lower().find("acknowledgements", conclusion_idx)
            
            # Adjust the end index to stop at Acknowledgements if it exists, otherwise use Conclusion index
            end_search_idx = acknowledgements_idx if acknowledgements_idx != -1 else conclusion_idx
            
            # If neither Abstract nor Conclusion is found, search the entire text
            if abstract_idx == -1 and conclusion_idx == -1:
                searchable_text = full_text[start_idx:]
            else:
                # Search from Abstract to Conclusion or Acknowledgements
                searchable_text = full_text[abstract_idx:end_search_idx].lower()          
        
    return searchable_text

In [156]:
import fitz  # PyMuPDF
import re

def extract_text(pdf_path):
    with fitz.open(pdf_path) as doc:
        full_text = ""
        for page in doc:
            full_text += page.get_text()

        # Find the end of the affiliations section
        affiliations_end = re.search(r'\d{1,2}\s+(?:\w+\.)+@\w+\.\w{2,}', full_text)
        print(affiliations_end)
        start_idx = affiliations_end.end() if affiliations_end else 0
        
        # Find the start of the References section
        references_idx = full_text.lower().rfind('references', start_idx)
        
        # Find the Abstract, Conclusion, and Acknowledgements sections
        abstract_idx = full_text.lower().find("abstract", start_idx)
        conclusion_idx = full_text.lower().rfind("conclusion", abstract_idx)
        acknowledgements_idx = full_text.lower().find("acknowledgements", conclusion_idx)

        # Use References index if it exists, otherwise fall back to Acknowledgements or Conclusion
        if references_idx != -1:
            end_search_idx = references_idx
        elif acknowledgements_idx != -1:
            end_search_idx = acknowledgements_idx
        elif conclusion_idx != -1:
            end_search_idx = conclusion_idx
        else:
            end_search_idx = len(full_text)  # No References, Acknowledgements, or Conclusion found

        # Extract the searchable text
        searchable_text = full_text[start_idx:end_search_idx].lower()

    return searchable_text


In [157]:
import spacy
import re

# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

def is_valid_sentence(sentence):
    # Exclude blocks with too few words (threshold can be adjusted)
    if len(sentence.split()) < 3:
        return False
    
    # Exclude blocks that don't end with sentence-ending punctuation
    if not re.search(r'[.!?]$', sentence.strip()):
        return False
    
    # Exclude blocks that contain mostly non-alphabetic characters
    if sum(c.isalpha() for c in sentence) / len(sentence) < 0.5:
        return False

    # Exclude lines that match the pattern for tables, figures, or author citations
    if re.search(r'^Table\s+\d+|^Figure\s+\d+|^\d+\s+[A-Z][a-z]+\s+et\s+al\.$', sentence.strip()):
        return False

    return True


In [121]:
import re

def is_valid_sentence(sentence):
    # Exclude blocks with too few words (threshold can be adjusted)
    if len(sentence.split()) < 3:
        return False
    
    # Exclude blocks that don't end with sentence-ending punctuation
    if not re.search(r'[.!?]$', sentence.strip()):
        return False
    
    # Exclude blocks that contain mostly non-alphabetic characters
    if sum(c.isalpha() for c in sentence) / len(sentence) < 0.5:
        return False

    # Exclude lines that match the pattern for tables, figures, or author citations
    if re.search(r'^Table\s+\d+|^Figure\s+\d+|^\d+\s+[A-Z][a-z]+\s+et\s+al\.$', sentence.strip()):
        return False

    # Additional exclusion for lines with page numbers followed by author lists
    if re.search(r'^\d+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+et\s+al\.', sentence.strip()):
        return False

    return True


In [158]:

# Function to extract text blocks around keywords, avoiding duplicates
def extract_text_blocks(text, keywords, context=1):
    doc = nlp(text)
    text_blocks = []
    pattern = r'\b(' + '|'.join(re.escape(keyword) for keyword in keywords) + r')\b'
    sentences = [sentence for sentence in doc.sents if is_valid_sentence(sentence.text)]
    
    # We use a set to track which sentences have been included based on index
    included_indices = set()

    for index, sentence in enumerate(sentences):
        if re.search(pattern, sentence.text, re.IGNORECASE):
            # Calculate start and end indices for context sentences
            start = max(index - context, 0)
            end = min(index + context + 1, len(sentences))
            context_indices = set(range(start, end))

            # If the sentence's context indices haven't been included already, we add the block
            if not context_indices & included_indices:
                block = ' '.join(sentences[i].text.strip() for i in range(start, end))
                text_blocks.append(block)
                included_indices.update(context_indices)

    return text_blocks


In [159]:
# Initialize an empty dictionary to hold the extracted info
def extract_sents_by_keywords(list_of_pdf_paths, keywords, col_title):
    extracted_sents = {}

    for pdf_path in list_of_pdf_paths:
        path = pdf_path[0]  # pdf_path is a list with the first element being the file path
        text = extract_text(path)
        #relevant_sentences = extract_relevant_sentences(text, keywords)
        relevant_sentences = extract_text_blocks(text, keywords)
        
        # If no relevant sentences were extracted, include the paper with extracted_sentence set to None
        if not relevant_sentences:
            extracted_sents[path] = [None] # Probably better to change this to 0
        else:
            extracted_sents[path] = relevant_sentences

    # Convert to DataFrame
    rows = []
    for paper_id, sentences in extracted_sents.items():
        if sentences == 0:  # Check if the list contains only None, indicating no sentences were extracted
            rows.append({'paper_id': paper_id, col_title: None})
        else:
            for sentence in sentences:
                rows.append({'paper_id': paper_id, col_title: sentence})

    extracted_sents_df = pd.DataFrame(rows)
    return extracted_sents_df

### Sentence extraction
***

#### Extract sentences by keyword = cancer

In [37]:
# # Search for 'cancer' in text for all 730 papers
# # Check procedure to see if the selected paper contains the word 'cancer'
# keywords = ["cancer"]

# # Extract sentences by cancer
# sents_by_cancer = extract_sents_by_keywords(selected_papers_paths, keywords, col_title='extracted_sent_cancer')

In [None]:
# Save to CSV
#csv_filename = 'extracted_sent_cancer.csv'
#sents_by_cancer.to_csv(csv_filename, index=False)

#### Extract sentences by list of keywords

#### 1st attempt

https://github.com/yasminsarkhosh/machine-learning-bsc-thesis-2024/blob/b5e3dbb5669b8629d81fa080d3b1ea375d35bc81/code/finals/1st_attempt

In [17]:
# # Extract relevant sentences from the selected papers by these keywords
# keywords = ['age', 'gender', 'sex', 'women', 'woman', 'female', 'male',
#             'geolocation', 'geographical', 'geographic', 'country', 'countries', 'city', 'cities', 'hospital', 'hospitals', 'clinic', 'clinics', 
#             'society', 'societies',
#             'etnicity', 'etnicities', 'race', 
#             'bias', 'biases', 'fair', 'unfair', 'fairness', 'transparency',
#             'imbalance', 'imbalanced', 'balance', 'balanced']

# # Extract sentences by keywords
# sents_by_keywords = extract_sents_by_keywords(selected_papers_paths, keywords, col_title='extracted_sents_keywords')

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1063d7550>>
Traceback (most recent call last):
  File "/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/venv/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [23]:
#sents_by_keywords.to_csv('sents_keywords.csv', index=False)

In [None]:
# Save to CSV
#csv_filename = 'extracted_sents_keywords.csv'
#sents_by_keywords.to_csv(csv_filename, index=False)

#### 2nd attempt

https://github.com/yasminsarkhosh/machine-learning-bsc-thesis-2024/blob/b5e3dbb5669b8629d81fa080d3b1ea375d35bc81/code/finals/2nd_attempt

In [160]:
# Extract relevant sentences from the selected papers by these keywords
keywords = ['age', 'gender', 'sex', 'women', 'woman', 'female', 'male',
            'geolocation', 'geographical', 'geographic', 'country', 'countries', 'city', 'cities', 
            'hospital', 'hospitals', 'clinic', 'clinics', 'society', 'societies',
            'etnicity', 'etnicities', 'race', 
            'bias', 'biases', 'fair', 'unfair', 'fairness', 'transparency', 'awareness',
            'imbalance', 'imbalanced', 'balance', 'balanced',
            'problem', 'problems', 'issue', 'issues', 'challenge', 'challenges', 
            'difficult', 'difficulty', 'difficulties']

# Extract sentences by keywords
sents_by_keywords = extract_sents_by_keywords(selected_papers_paths, keywords, col_title='extracted_sents_keywords')

None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None


In [129]:
sents_by_keywords.to_csv('sents_keywords_2.csv', index=False)

In [128]:
sents_by_keywords

Unnamed: 0,paper_id,extracted_sents_keywords
0,/Users/yasminsarkhosh/Documents/GitHub/machine...,"[13], obtaining results comparable to full 3d ..."
1,/Users/yasminsarkhosh/Documents/GitHub/machine...,4\nexperimental design\ndataset. the cohort co...
2,/Users/yasminsarkhosh/Documents/GitHub/machine...,we implement [13] as a baseline on our dataset...
3,/Users/yasminsarkhosh/Documents/GitHub/machine...,3d mitochondria instance segmentation\nwith sp...
4,/Users/yasminsarkhosh/Documents/GitHub/machine...,"however, automatic 3d mito-\nchondria instance..."
...,...,...
1025,/Users/yasminsarkhosh/Documents/GitHub/machine...,vox2vec: a framework for self-supervised\ncont...
1026,/Users/yasminsarkhosh/Documents/GitHub/machine...,the reason is that producing a fea-\nture map ...
1027,/Users/yasminsarkhosh/Documents/GitHub/machine...,we\ndemonstrate an example of the excellent pe...
1028,/Users/yasminsarkhosh/Documents/GitHub/machine...,the number of trainable paramaters of\ndiﬀeren...


In [18]:
# Save to CSV
#csv_filename = 'extracted_sents_keywords_2.csv'
#sents_by_keywords.to_csv(csv_filename, index=False)

#### Extract sentences by list of organs
***

In [26]:
# Extract relevant sentences from the selected papers by organs

organs = pd.read_csv(database_path + 'search words/organs.csv', header=None)
organs = organs[0].tolist()
organs

keywords = ['adrenal', 'anal', 'anusarteries', 'gi', 'tract', 'gi-tract', 'colon', 'bladder', 'bone', 'marrow', 'bronchi', 'bronchioles', 
            'bulbourethral', 'capillaries', 'cecum', 'cerebellum', 'cerebral', 'cervix', 'choroid', 'plexus', 'ciliary', 'body', 'clitoris', 
            'cochlea', 'cornea', 'cranial', 'nerves', 'duodenum', 'eardrum', 'nervous', 'system', 'epididymis', 'esophagus', 'fallopian', 'tubes', 
            'gallbladder', 'ganglia', 'heart', 'skeleton', 'hypothalamus', 'ileum', 'interstitium', 'iris', 'jejunum', 'joint', 'joints', 'kidneys', 
            'larynx', 'ligament', 'ligaments', 'liver', 'lung', 'lungs', 'lymph', 'node', 'lymphatic', 'vessel', 'glands', 'oblongata', 'mesentery', 
            'brain', 'ear', 'ossicles', 'muscles', 'nasal', 'cavity', 'olfactory', 'epithelium', 'ovaries', 'pancreas', 'parathyroid', 'parotid', 'penis', 
            'pharynx', 'pineal', 'pituitary', 'placenta', 'prostate', 'rectum', 'retina', 'sigmoid', 'skin', 'spinal', 'nerves', 'spleen', 'stomach', 
            'tissue', 'sublingual', 'submandibular', 'teeth', 'tendons', 'testes', 'thalamus', 'spinal', 'cord', 'thymus', 'thyroid', 'tongue', 'tonsils', 
            'trachea', 'transverse', 'ureter', 'urethra', 'uterus', 'vagina', 'veins', 'vulva', 'lung', 'lungs', 'pulmonary', 'respiratory', 'bronchial', 
            'bronchi', 'bronchus', 'bronchial', 'trachea', 'tracheal', 'thoracic', 'thorax', 'diaphragm', 'diaphragmatic', 'pleural', 'pleura', 'alveolar', 
            'alveoli', 'gi-tract', 'gastrointestinal', 'gastro', 'intestinal', 'digestive', 'digestion', 'stomach', 'gastric', 'intestine', 'intestines', 
            'intestinal', 'colon', 'colonic', 'rectum', 'rectal', 'anus', 'anal', 'liver', 'hepatic', 'hepatitis', 'hepatocellular', 'hepatoma', 'hepatocarcinoma',
            'cervical', 'cervix', 'uterus', 'uterine', 'endometrial', 'ovarian', 'ovary', 'fallopian', 'tube', 'vaginal', 'gland', 'prostate gland',
            'prostate glands','testicular', 'testis', 
            'penile', 'breast', 'breast tissue']

# Extract sentences by keywords
sents_by_organs = extract_sents_by_keywords(selected_papers_paths, keywords, col_title='extracted_sents_organs')

In [29]:
# Save to CSV
#csv_filename = 'extracted_sents_organs.csv'
#sents_by_organs.to_csv(csv_filename, index=False)

## Extract information about datasets used in each paper
***

In [130]:
%%capture
!pip install gensim 

In [135]:
selected_papers_paths

[['/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/miccai_2023/miccai23vol1/paper_14.pdf'],
 ['/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/miccai_2023/miccai23vol8/paper_59.pdf'],
 ['/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/miccai_2023/miccai23vol2/paper_46.pdf'],
 ['/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/miccai_2023/miccai23vol8/paper_46.pdf'],
 ['/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/miccai_2023/miccai23vol6/paper_74.pdf'],
 ['/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/miccai_2023/miccai23vol2/paper_9.pdf'],
 ['/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/miccai_2023/miccai23vol6/paper_61.pdf'],
 ['/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/miccai_2023/miccai23vol6/paper_68.pdf'],
 ['/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-202

In [149]:
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import nltk

documents = []

for pdf_path in selected_papers_paths:
    path = pdf_path[0]  # pdf_path is a list with the first element being the file path
    text = extract_text(path)
    documents.append(text)

In [155]:
documents

['3d arterial segmentation via single 2d\nprojections and depth supervision\nin contrast-enhanced ct images\nalina f. dima1,2(b), veronika a. zimmer1,2, martin j. menten1,4,\nhongwei bran li1,3, markus graf2, tristan lemke2, philipp raﬄer2,\nrobert graf1,2, jan s. kirschke2, rickmer braren2, and daniel rueckert1,2,4\n1 school of computation, information and technology,\ntechnical university of munich, munich, germany\nalina.dima@tum.de\n2 school of medicine, klinikum rechts der isar,\ntechnical university of munich, munich, germany\n3 department of quantitative biomedicine, university of zurich, zurich, switzerland\n4 department of computing, imperial college london, london, uk\nabstract. automated segmentation of the blood vessels in 3d volumes\nis an essential step for the quantitative diagnosis and treatment of many\nvascular diseases. 3d vessel segmentation is being actively investigated\nin existing works, mostly in deep learning approaches. however, training\n3d deep networks req

In [152]:

# Tokenize, remove stopwords, and lemmatize
nltk.download('stopwords')
stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
texts = [[lemmatizer.lemmatize(word) for word in document.lower().split() if word not in stop] for document in documents]

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(texts)

# Create a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]
# Fit LDA model
ldamodel = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=15)

# Print topics
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yasminsarkhosh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[[(0, 1), (1, 1), (2, 2), (3, 2), (4, 1), (5, 1), (6, 1), (7, 3), (8, 3), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 3), (25, 1), (26, 1), (27, 1), (28, 4), (29, 1), (30, 1), (31, 2), (32, 2), (33, 1), (34, 4), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 2), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 3), (51, 1), (52, 1), (53, 2), (54, 2), (55, 4), (56, 4), (57, 2), (58, 4), (59, 1), (60, 3), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 2), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 2), (78, 1), (79, 1), (80, 1), (81, 3), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 6), (99, 1), (100, 1), (101, 3), (102, 1), (103, 2), (104, 1), (105, 2), (106, 1), (107, 1), (108, 2), (109, 1), (110, 2)

In [58]:
# Extract relevant sentences from the selected papers by these keywords
keywords = ['data', 'dataset', 'datasets', 'database', 'databases']

# Extract sentences by keywords
info_datasets = extract_sents_by_keywords(selected_papers_paths, keywords, col_title='datasets_info')

In [60]:
info_datasets.to_csv('datasets_info_2.csv', index=False)

In [59]:
info_datasets

Unnamed: 0,paper_id,datasets_info
0,/Users/yasminsarkhosh/Documents/GitHub/machine...,"in this work, we pro-\npose a novel method to ..."
1,/Users/yasminsarkhosh/Documents/GitHub/machine...,"furthermore, by mapping the 2d labels to the 3..."
2,/Users/yasminsarkhosh/Documents/GitHub/machine...,"[13], obtaining results comparable to full 3d ..."
3,/Users/yasminsarkhosh/Documents/GitHub/machine...,our contribution to 3d vessel segmentation is ...
4,/Users/yasminsarkhosh/Documents/GitHub/machine...,2\nrelated work\nlearning from weak annotation...
...,...,...
3266,/Users/yasminsarkhosh/Documents/GitHub/machine...,"in the ﬁne-tuning setup, we freeze the backbon..."
3267,/Users/yasminsarkhosh/Documents/GitHub/machine...,we\ndemonstrate an example of the excellent pe...
3268,/Users/yasminsarkhosh/Documents/GitHub/machine...,612\nm. goncharov et al.\ntable 1. average cro...
3269,/Users/yasminsarkhosh/Documents/GitHub/machine...,our method expands the contrastive\nlearning s...


In [71]:
df = info_datasets

In [79]:
import pandas as pd
import spacy
import re

# Load the spaCy model for English
nlp = spacy.load('en_core_web_sm')

# Define a function to clean non-relevant text and retain relevant sentences
def clean_dataset_info(text):
    # Remove common non-relevant patterns (e.g., references like [3], [4], [22], etc.)
    text = re.sub(r'\[\d+\]', '', text)

    # Exclude blocks that don't end with sentence-ending punctuation
    text = re.sub(r'[.!?]$', '', text)
    
    # Remove standalone mathematical expressions and isolated symbols/numbers
    text = re.sub(r'\b\w+\d+|\d+\.\d+|\(\d+\)|\w+=\w+\b', '', text)

    # Pattern to match equations and isolated mathematical expressions
    equation_pattern = r'\b\d+\.?\d*|\([\w\s]*?\)|\[[\w\s]*?\]|\{[\w\s]*?\}|\w+\d+|\d+\w+|[\+\-\*\/=<>^]'
    text = re.sub(equation_pattern, '', text)
    
    # Use spaCy to segment the text into sentences and extract relevant ones
    doc = nlp(text)
    relevant_sentences = []
    for sent in doc.sents:
        # Check if the sentence is likely to be relevant
        # (e.g., contains more than a certain number of words, not just a sequence of numbers, etc.)
        if len(sent.text.split()) > 3 and not re.fullmatch(r'\d+', sent.text.strip()):
            relevant_sentences.append(sent.text.strip())
    return ' '.join(relevant_sentences)


# Apply the cleaning function to each row in the 'dataset_info' column
df['datasets_info'] = df['datasets_info'].apply(clean_dataset_info)


In [80]:

# Display the cleaned DataFrame
df

Unnamed: 0,paper_id,datasets_info
0,/Users/yasminsarkhosh/Documents/GitHub/machine...,automated segmentation of the blood vessels in...
1,/Users/yasminsarkhosh/Documents/GitHub/machine...,accurate d mitochondria instance segmentation ...
2,/Users/yasminsarkhosh/Documents/GitHub/machine...,detecting breast lesion in videos is crucial f...
3,/Users/yasminsarkhosh/Documents/GitHub/machine...,brachial plexopathy is a form of peripheral ne...
4,/Users/yasminsarkhosh/Documents/GitHub/machine...,the utility of machine learning models in hist...
...,...,...
209,/Users/yasminsarkhosh/Documents/GitHub/machine...,n\nj̸i exp\n⎞\n⎠\n\nby deﬁning p {i : yi yt...
210,/Users/yasminsarkhosh/Documents/GitHub/machine...,"where wσ(dt, di) is normalized over i ∈ p. in ..."
211,/Users/yasminsarkhosh/Documents/GitHub/machine...,we propose an unsupervised deep learning metho...
212,/Users/yasminsarkhosh/Documents/GitHub/machine...,accurate polyp detection is essential for assi...


In [81]:
# Save to CSV
csv_filename = 'extracted_info_datasets.csv'
df.to_csv(csv_filename, index=False)

### CSV files with extracted sentences
***


In [21]:
# Read from CSV
f_name_keywords = 'extracted_sents_keywords'
f_name_keywords_2 = 'extracted_sents_keywords_2'
f_name_cancer = 'extracted_sents_cancer'
f_name_organs = 'extracted_sents_organs'

1st attempt

In [31]:
# Refine the dataframe for extracted sentences by the list of keywords: 1st attempt
f_name = f_name_keywords
file_path = '/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/code/finals/' + f_name + '.csv'
extracted_sents = pd.read_csv(file_path)

# Fill NaN values with 'None'
extracted_sents.fillna('None', inplace=True)

# Refine the paper_id column to only include part of the pdf path 
extracted_sents['path'] = extracted_sents['paper_id'].str.split('/').apply(lambda x: '/'.join(x[-2:]))
extracted_sents.rename(columns={'paper_id': 'path_long'}, inplace=True)

# Save to CSV file for later use in the pipeline. columns: complete path, title, path
# extracted_sents.to_csv('extracted_sents_keywords_refined.csv', index=False)

2nd attempt

In [25]:
# Refine the dataframe for extracted sentences by the list of keywords: 2nd attempt
f_name = f_name_keywords_2
file_path = '/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/code/finals/' + f_name + '.csv'
extracted_sents = pd.read_csv(file_path)

# Fill NaN values with 'None'
extracted_sents.fillna('None', inplace=True)

# Refine the paper_id column to only include part of the pdf path 
extracted_sents['path'] = extracted_sents['paper_id'].str.split('/').apply(lambda x: '/'.join(x[-2:]))
extracted_sents.rename(columns={'paper_id': 'path_long'}, inplace=True)

# Save to CSV file for later use in the pipeline. columns: complete path, title, path
#extracted_sents.to_csv('extracted_sents_keywords_2_refined.csv', index=False)

In [24]:
extracted_sents

Unnamed: 0,path_long,extracted_sents_keywords,path
0,/Users/yasminsarkhosh/Documents/GitHub/machine...,"however, novel unsupervised\nanomaly detectors...",miccai23vol1/paper_29.pdf
1,/Users/yasminsarkhosh/Documents/GitHub/machine...,this transforms the problem to learning the co...,miccai23vol1/paper_29.pdf
2,/Users/yasminsarkhosh/Documents/GitHub/machine...,we found when training the vq-vae model on dat...,miccai23vol1/paper_29.pdf
3,/Users/yasminsarkhosh/Documents/GitHub/machine...,"compared to\ncenterline segmentation, where th...",miccai23vol1/paper_14.pdf
4,/Users/yasminsarkhosh/Documents/GitHub/machine...,the cohort consists of 141 patients with pancr...,miccai23vol1/paper_14.pdf
...,...,...,...
1298,/Users/yasminsarkhosh/Documents/GitHub/machine...,such inconsis-\ntent metrics suggest that the ...,miccai23vol10/paper_3.pdf
1299,/Users/yasminsarkhosh/Documents/GitHub/machine...,our work challenges the conventional\nassumpti...,miccai23vol10/paper_21.pdf
1300,/Users/yasminsarkhosh/Documents/GitHub/machine...,"to address these issues, we concentrate on the...",miccai23vol10/paper_24.pdf
1301,/Users/yasminsarkhosh/Documents/GitHub/machine...,lowering the dose of ct\nscans has been widely...,miccai23vol10/paper_24.pdf


### Merge selected papers with metadata from the complete list of MICCAI 2023 papers
***

#### List of keywords

1 attempt

In [62]:
# Merge MICCAI 2023 database with the selected papers to get the metadata 
#papers = pd.merge(df_miccai, selected_papers_df, on='paper_id', how='inner').drop(columns=['title_y']).rename(columns={'title_x': 'title'})
#papers.to_csv('papers.csv', index=False)

# Merge the papers with the extracted sentences by keywords
#pd.merge(papers, extracted_sents, on='path', how='inner').to_csv('papers_with_sents_by_keywords_metadata.csv', index=False)

2nd attempt

In [27]:
# Merge MICCAI 2023 database with the selected papers to get the metadata 
#papers = pd.merge(df_miccai, selected_papers_df, on='paper_id', how='inner').drop(columns=['title_y']).rename(columns={'title_x': 'title'})
#papers.to_csv('papers.csv', index=False)

# Merge the papers with the extracted sentences by keywords
#pd.merge(papers, extracted_sents, on='path', how='inner').to_csv('papers_with_sents_by_keywords_metadata.csv_2', index=False)

#### Organs

In [40]:
# Merge MICCAI 2023 database with the selected papers to get the metadata 
papers = pd.merge(df_miccai, selected_papers_df, on='paper_id', how='inner').drop(columns=['title_y']).rename(columns={'title_x': 'title'})
#papers.to_csv('papers.csv', index=False)

# Merge the papers with the extracted sentences by keywords
papers_sents_organs = pd.merge(papers, extracted_sents, on='path', how='inner').to_csv('papers_with_sents_by_organs_metadata.csv', index=False)

## Final dataframe with extracted sentences by the list of keywords and metadata

In [30]:
def move_col_position_to_first(df, col_name):
    # Column to move to the first position
    column_to_move = col_name

    # Create a new list of column names with the specified column first
    new_columns = [column_to_move] + [col for col in df.columns if col != column_to_move]

    # Reindex the DataFrame with the new column order
    df = df[new_columns]
    return df

def move_col_position_to_last(df, col_name):
    # Column to move to the first position
    column_to_move = col_name

    # Create a new list of column names with the specified column first
    new_columns = [column_to_move] + [col for col in df.columns if col != column_to_move]

    # Create a new list of column names with the specified column last
    new_columns = [col for col in df.columns if col != column_to_move] + [column_to_move] 

    # Reindex the DataFrame with the new column order
    df = df[new_columns]
    return df

### List of keywords
***

1st attempt

In [68]:
filename = '/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/code/papers_with_sents_by_keywords_metadata.csv'
papers_with_sentences_df = pd.read_csv(filename)

# Fill NaN values with 'None'
papers_with_sentences_df.fillna('None', inplace=True)
papers_with_sentences_df

Unnamed: 0,title,authors,page_numbers,doi,publication_year,volume,paper_id,path,path_long,extracted_sents_keywords
0,3D Arterial Segmentation via Single 2D Project...,"Alina F. Dima, Veronika A. Zimmer, Martin J. M...",141-151,10.1007/978-3-031-43907-0_14,2023,1,1,miccai23vol1/paper_14.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,the cohort consists of 141 patients with pancr...
1,3D Arterial Segmentation via Single 2D Project...,"Alina F. Dima, Veronika A. Zimmer, Martin J. M...",141-151,10.1007/978-3-031-43907-0_14,2023,1,1,miccai23vol1/paper_14.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,we distinguish between models selected accordi...
2,3D Dental Mesh Segmentation Using Semantics-Ba...,"Fan Duan, Li Chen",456-465,10.1007/978-3-031-43990-2_43,2023,7,2,miccai23vol8/paper_59.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,"during training\nof mitoem, for the fair compa..."
3,3D Dental Mesh Segmentation Using Semantics-Ba...,"Fan Duan, Li Chen",456-465,10.1007/978-3-031-43990-2_43,2023,7,2,miccai23vol8/paper_59.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,"for fair comparison with previous\nworks, we u..."
4,3D Medical Image Segmentation with Sparse Anno...,"Heng Cai, Lei Qi, Qian Yu, Yinghuan Shi, Yang Gao",614-624,10.1007/978-3-031-43898-1_59,2023,3,3,miccai23vol2/paper_46.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,video\nresnet-50 40.0 70.3\n43.3\nthe previous...
...,...,...,...,...,...,...,...,...,...,...
656,DeepGraphDMD: Interpretable Spatio-Temporal De...,"Md Asadullah Turja, Martin Styner, Guorong Wu",358-368,10.1007/978-3-031-43993-3_35,2023,8,186,miccai23vol1/paper_22.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,"for d2\nhisto, which has fewer patients than t..."
657,DeepGraphDMD: Interpretable Spatio-Temporal De...,"Md Asadullah Turja, Martin Styner, Guorong Wu",358-368,10.1007/978-3-031-43993-3_35,2023,8,186,miccai23vol1/paper_22.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,the method depth-aware manages to correctly en...
658,DeepSOZ: A Robust Deep Model for Joint Tempora...,"Deeksha M. Shama, Jiasen Jing, Archana Venkata...",184-194,10.1007/978-3-031-43993-3_18,2023,8,187,miccai23vol10/paper_66.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,"by grid search on the\nvalidation set, we sele..."
659,Democratizing Pathological Image Segmentation ...,"Ruining Deng, Yanwei Li, Peize Li, Jiacheng Wa...",497-507,10.1007/978-3-031-43987-2_48,2023,6,188,miccai23vol5/paper_5.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,"for the fairness of the experiments, we keep t..."


In [73]:
# Rearrange the columns to have paper_id first
df = move_col_position_to_last(papers_with_sentences_df, 'extracted_sents_keywords')
df = move_col_position_to_first(df, 'paper_id')
#df.to_csv('papers_with_sents_by_keywords_metadata.csv', index=False)

2nd attempt

In [31]:
filename = '/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/code/papers_with_sents_by_keywords_metadata_2.csv'
papers_with_sentences_df = pd.read_csv(filename)

# Fill NaN values with 'None'
papers_with_sentences_df.fillna('None', inplace=True)
papers_with_sentences_df

Unnamed: 0,title,authors,page_numbers,doi,publication_year,volume,paper_id,path,path_long,extracted_sents_keywords
0,3D Arterial Segmentation via Single 2D Project...,"Alina F. Dima, Veronika A. Zimmer, Martin J. M...",141-151,10.1007/978-3-031-43907-0_14,2023,1,1,miccai23vol1/paper_14.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,"compared to\ncenterline segmentation, where th..."
1,3D Arterial Segmentation via Single 2D Project...,"Alina F. Dima, Veronika A. Zimmer, Martin J. M...",141-151,10.1007/978-3-031-43907-0_14,2023,1,1,miccai23vol1/paper_14.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,the cohort consists of 141 patients with pancr...
2,3D Arterial Segmentation via Single 2D Project...,"Alina F. Dima, Veronika A. Zimmer, Martin J. M...",141-151,10.1007/978-3-031-43907-0_14,2023,1,1,miccai23vol1/paper_14.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,we distinguish between models selected accordi...
3,3D Arterial Segmentation via Single 2D Project...,"Alina F. Dima, Veronika A. Zimmer, Martin J. M...",141-151,10.1007/978-3-031-43907-0_14,2023,1,1,miccai23vol1/paper_14.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,with the exception of the single ﬁxed viewpoin...
4,3D Dental Mesh Segmentation Using Semantics-Ba...,"Fan Duan, Li Chen",456-465,10.1007/978-3-031-43990-2_43,2023,7,2,miccai23vol8/paper_59.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,accurate 3d mitochondria instance segmentation...
...,...,...,...,...,...,...,...,...,...,...
1298,Democratizing Pathological Image Segmentation ...,"Ruining Deng, Yanwei Li, Peize Li, Jiacheng Wa...",497-507,10.1007/978-3-031-43987-2_48,2023,6,188,miccai23vol5/paper_5.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,"to address the problem of fast-moving polyps, ..."
1299,Dense Transformer based Enhanced Coding Networ...,"Wangduo Xie, Matthew B. Blaschko",77-86,10.1007/978-3-031-43907-0_8,2023,1,189,miccai23vol1/paper_58.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,"meanwhile, to be suitable for many downstream ..."
1300,Dense Transformer based Enhanced Coding Networ...,"Wangduo Xie, Matthew B. Blaschko",77-86,10.1007/978-3-031-43907-0_8,2023,1,189,miccai23vol1/paper_58.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,we reproduce the key results on msd challenge ...
1301,Dense Transformer based Enhanced Coding Networ...,"Wangduo Xie, Matthew B. Blaschko",77-86,10.1007/978-3-031-43907-0_8,2023,1,189,miccai23vol1/paper_58.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,cross validation dice score on ct tasks of msd...


In [32]:
# Rearrange the columns to have paper_id first
df = move_col_position_to_last(papers_with_sentences_df, 'extracted_sents_keywords')
df = move_col_position_to_first(df, 'paper_id')
#df.to_csv('papers_with_sents_by_keywords_metadata_2.csv', index=False)

### Organs
***

In [43]:
filename = '/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/code/papers_with_sents_by_organs_metadata.csv'
papers_with_sentences_organs_df = pd.read_csv(filename)

# Fill NaN values with 'None'
papers_with_sentences_organs_df.fillna('None', inplace=True)
papers_with_sentences_organs_df

Unnamed: 0,title,authors,page_numbers,doi,publication_year,volume,paper_id,path,path_long,extracted_sents_organs
0,3D Arterial Segmentation via Single 2D Project...,"Alina F. Dima, Veronika A. Zimmer, Martin J. M...",141-151,10.1007/978-3-031-43907-0_14,2023,1,1,miccai23vol1/paper_14.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,3d vessel segmentation is being actively inves...
1,3D Arterial Segmentation via Single 2D Project...,"Alina F. Dima, Veronika A. Zimmer, Martin J. M...",141-151,10.1007/978-3-031-43907-0_14,2023,1,1,miccai23vol1/paper_14.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,this is especially the case for 3d\nvessel seg...
2,3D Arterial Segmentation via Single 2D Project...,"Alina F. Dima, Veronika A. Zimmer, Martin J. M...",141-151,10.1007/978-3-031-43907-0_14,2023,1,1,miccai23vol1/paper_14.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,our\ncode is available at: https://github.com/...
3,3D Arterial Segmentation via Single 2D Project...,"Alina F. Dima, Veronika A. Zimmer, Martin J. M...",141-151,10.1007/978-3-031-43907-0_14,2023,1,1,miccai23vol1/paper_14.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,automatic vessel segmentation has been extensi...
4,3D Arterial Segmentation via Single 2D Project...,"Alina F. Dima, Veronika A. Zimmer, Martin J. M...",141-151,10.1007/978-3-031-43907-0_14,2023,1,1,miccai23vol1/paper_14.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,"[8], or more recently with deep learning [3,5,..."
...,...,...,...,...,...,...,...,...,...,...
2660,Dense Transformer based Enhanced Coding Networ...,"Wangduo Xie, Matthew B. Blaschko",77-86,10.1007/978-3-031-43907-0_8,2023,1,189,miccai23vol1/paper_58.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,[8] or non-contrastive [9] joint embedding met...
2661,Dense Transformer based Enhanced Coding Networ...,"Wangduo Xie, Matthew B. Blaschko",77-86,10.1007/978-3-031-43907-0_8,2023,1,189,miccai23vol1/paper_58.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,[24]. unet’s back-\nbone returns a feature map...
2662,Dense Transformer based Enhanced Coding Networ...,"Wangduo Xie, Matthew B. Blaschko",77-86,10.1007/978-3-031-43907-0_8,2023,1,189,miccai23vol1/paper_58.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,"[1,3,5,15,21,27], totaling\nmore than 6550 cts..."
2663,Dense Transformer based Enhanced Coding Networ...,"Wangduo Xie, Matthew B. Blaschko",77-86,10.1007/978-3-031-43907-0_8,2023,1,189,miccai23vol1/paper_58.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,4.2\nevaluation\nwe evaluate our method on the...


In [194]:
#df_cancer = pd.read_csv('/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/code/finals/finalspapers_with_sentences_metadata.csv')
#df_cancer.drop(columns='extracted_sents_keywords', inplace=True)
#df_cancer.to_csv(output_path + 'papers_with_sentences_cancer_metadata.csv')

***
***

# **Preliminary analysis of MICCAI 2023 - Selected papers**

***

In [33]:
# Read the dataframe with extracted sentences by list of keywords
df

Unnamed: 0,paper_id,title,authors,page_numbers,doi,publication_year,volume,path,path_long,extracted_sents_keywords
0,1,3D Arterial Segmentation via Single 2D Project...,"Alina F. Dima, Veronika A. Zimmer, Martin J. M...",141-151,10.1007/978-3-031-43907-0_14,2023,1,miccai23vol1/paper_14.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,"compared to\ncenterline segmentation, where th..."
1,1,3D Arterial Segmentation via Single 2D Project...,"Alina F. Dima, Veronika A. Zimmer, Martin J. M...",141-151,10.1007/978-3-031-43907-0_14,2023,1,miccai23vol1/paper_14.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,the cohort consists of 141 patients with pancr...
2,1,3D Arterial Segmentation via Single 2D Project...,"Alina F. Dima, Veronika A. Zimmer, Martin J. M...",141-151,10.1007/978-3-031-43907-0_14,2023,1,miccai23vol1/paper_14.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,we distinguish between models selected accordi...
3,1,3D Arterial Segmentation via Single 2D Project...,"Alina F. Dima, Veronika A. Zimmer, Martin J. M...",141-151,10.1007/978-3-031-43907-0_14,2023,1,miccai23vol1/paper_14.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,with the exception of the single ﬁxed viewpoin...
4,2,3D Dental Mesh Segmentation Using Semantics-Ba...,"Fan Duan, Li Chen",456-465,10.1007/978-3-031-43990-2_43,2023,7,miccai23vol8/paper_59.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,accurate 3d mitochondria instance segmentation...
...,...,...,...,...,...,...,...,...,...,...
1298,188,Democratizing Pathological Image Segmentation ...,"Ruining Deng, Yanwei Li, Peize Li, Jiacheng Wa...",497-507,10.1007/978-3-031-43987-2_48,2023,6,miccai23vol5/paper_5.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,"to address the problem of fast-moving polyps, ..."
1299,189,Dense Transformer based Enhanced Coding Networ...,"Wangduo Xie, Matthew B. Blaschko",77-86,10.1007/978-3-031-43907-0_8,2023,1,miccai23vol1/paper_58.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,"meanwhile, to be suitable for many downstream ..."
1300,189,Dense Transformer based Enhanced Coding Networ...,"Wangduo Xie, Matthew B. Blaschko",77-86,10.1007/978-3-031-43907-0_8,2023,1,miccai23vol1/paper_58.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,we reproduce the key results on msd challenge ...
1301,189,Dense Transformer based Enhanced Coding Networ...,"Wangduo Xie, Matthew B. Blaschko",77-86,10.1007/978-3-031-43907-0_8,2023,1,miccai23vol1/paper_58.pdf,/Users/yasminsarkhosh/Documents/GitHub/machine...,cross validation dice score on ct tasks of msd...


In [34]:
import pandas as pd
from collections import Counter

# Function to count keywords in a text
def count_keywords(text, keywords):
    # Counter object to count occurrences of each keyword
    counts = Counter()
    for keyword in keywords:
        # Count occurrences of the keyword in the text
        counts[keyword] = text.lower().count(keyword)
    return counts

def agg_keywords(df, col_title, keywords):
    # Aggregate 'extracted_sentences' for each 'title' and count keywords
    results = {}
    for title, group in df.groupby('title'):
        # Combine all extracted sentences into one large text block
        aggregated_text = " ".join(group[col_title].tolist())
        # Count the keywords in this aggregated text
        keyword_counts = count_keywords(aggregated_text, keywords)
        # Store the result
        results[title] = keyword_counts

    # Convert the results dictionary to a DataFrame 
    results_df = pd.DataFrame.from_dict(results, orient='index')
    return results_df


In [35]:
# Reverse the mapping for aggregation
def agg_columns_to_categories(df, keyword_to_category):
    category_to_keywords = {}
    for keyword, category in keyword_to_category.items():
        category_to_keywords.setdefault(category, []).append(keyword)

    # Aggregate columns into categories
    for category, keywords in category_to_keywords.items():
        if category in df.columns:
            # If the category already exists, add to it
            df[category] += df[keywords].sum(axis=1)
        else:
            # Otherwise, create a new column for the category
            df[category] = df[keywords].sum(axis=1)
        # Drop the original keyword columns
        df.drop(columns=keywords, inplace=True)

    return df

In [36]:
# Mapping of keywords to main categories
keyword_to_category = {
    'age'   : 'age_',
    'gender': 'gender_',
    'sex'   : 'gender_',
    'female': 'gender_',
    'women' : 'gender_',
    'woman' : 'gender_',
    'male'  : 'gender_',
    'geolocation'   : 'geolocation_',
    'geographical'  : 'geolocation_',
    'geographic'    : 'geolocation_',
    'country'       : 'geolocation_',
    'countries'     : 'geolocation_',
    'city'          : 'geolocation_',
    'cities'        : 'geolocation_',
    'hospital'      : 'geolocation_',
    'hospitals'     : 'geolocation_',
    'clinic'        : 'geolocation_',
    'clinics'       : 'geolocation_',
    'society'       : 'social factors',
    'societies'     : 'social factors',
    'etnicity'      : 'etnicity_',
    'etnicities'    : 'etnicity_',
    'race'          : 'etnicity_',
    'bias'          : 'bias_',
    'biases'        : 'bias_',
    'unfair'        : 'fairness_',
    'fair'          : 'fairness_',
    'fairness'      : 'fairness_',
    'transparency'  : 'fairness_',
    'imbalance'     : 'fairness_',
    'imbalanced'    : 'fairness_',
    'balance'       : 'fairness_',
    'balanced'      :'fairness_',
}

In [37]:
# Convert counts to binary values
def convert_to_binary_values(df):
    columns_to_convert = df.columns.tolist()

    # Convert to binary: 1 if the count is greater than 0, else 0
    for column in columns_to_convert:
        df[column] = df[column].apply(lambda x: 1 if x > 0 else 0)
    
    return df

### List of keywords
***

In [None]:
# Search for keywords in the selected papers
keywords = ['age', 'gender', 'sex', 'women', 'woman', 'female', 'male',
            'geolocation', 'geographical', 'geographic', 'country', 'countries', 'city', 'cities', 'hospital', 'hospitals', 'clinic', 'clinics', 
            'society', 'societies',
            'etnicity', 'etnicities', 'race', 
            'bias', 'biases', 'fair', 'unfair', 'fairness', 'transparency',
            'imbalance', 'imbalanced', 'balance', 'balanced']

#keywords_df = pd.DataFrame(keywords, columns=['keyword'])
#keywords_df.to_csv('list_of_keywords.csv', index=False)

In [124]:
# Count the number of occurrences of each keyword in the extracted sentences
count_keywords_df = agg_keywords(df, 'extracted_sents_keywords', keywords)
#count_keywords_df.to_csv('keyword_counts.csv', index=True)

In [111]:
# Aggregate the keywords into categories and aggregate the counts by category
res = agg_columns_to_categories(count_keywords_df, keyword_to_category)
#save_to_csv(res, 'agg_counts')

In [116]:
# Convert the counts to binary values for each category
binary_df =  convert_to_binary_values(res)
#save_to_csv(binary_df, 'agg_columns_binary_values')

In [125]:
count_keywords_df

Unnamed: 0,age,gender,sex,women,woman,female,male,geolocation,geographical,geographic,...,bias,biases,fair,unfair,fairness,transparency,imbalance,imbalanced,balance,balanced
3D Arterial Segmentation via Single 2D Projections and Depth Supervision in Contrast-Enhanced CT Images,0,0,0,0,0,1,2,0,0,0,...,0,0,2,1,0,0,0,0,0,0
3D Dental Mesh Segmentation Using Semantics-Based Feature Learning with Graph-Transformer,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
3D Medical Image Segmentation with Sparse Annotation via Cross-Teaching Between 3D and 2D Networks,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3D Mitochondria Instance Segmentation with Spatio-Temporal Transformers,1,0,0,0,0,1,2,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3D Teeth Reconstruction from Panoramic Radiographs Using Neural Implicit Functions,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Deep Unsupervised Clustering for Conditional Identification of Subgroups Within a Digital Pathology Image Set,0,0,0,0,0,0,0,0,0,0,...,6,0,0,0,0,0,0,0,0,0
DeepGraphDMD: Interpretable Spatio-Temporal Decomposition of Non-linear Functional Brain Network Dynamics,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,3
DeepSOZ: A Robust Deep Model for Joint Temporal and Spatial Seizure Onset Localization from Multichannel EEG Data,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Democratizing Pathological Image Segmentation with Lay Annotators via Molecular-Empowered Learning,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


In [122]:
res

Unnamed: 0,age_,gender_,geolocation_,social factors,etnicity_,bias_,fairness_
3D Arterial Segmentation via Single 2D Projections and Depth Supervision in Contrast-Enhanced CT Images,0,1,0,0,0,0,1
3D Dental Mesh Segmentation Using Semantics-Based Feature Learning with Graph-Transformer,0,0,0,0,0,0,1
3D Medical Image Segmentation with Sparse Annotation via Cross-Teaching Between 3D and 2D Networks,0,0,0,0,0,0,1
3D Mitochondria Instance Segmentation with Spatio-Temporal Transformers,1,1,1,0,0,0,1
3D Teeth Reconstruction from Panoramic Radiographs Using Neural Implicit Functions,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...
Deep Unsupervised Clustering for Conditional Identification of Subgroups Within a Digital Pathology Image Set,0,0,0,0,0,1,0
DeepGraphDMD: Interpretable Spatio-Temporal Decomposition of Non-linear Functional Brain Network Dynamics,1,0,1,0,0,0,1
DeepSOZ: A Robust Deep Model for Joint Temporal and Spatial Seizure Onset Localization from Multichannel EEG Data,0,0,0,0,0,0,1
Democratizing Pathological Image Segmentation with Lay Annotators via Molecular-Empowered Learning,0,0,0,0,0,0,1


In [121]:
binary_df

Unnamed: 0,age_,gender_,geolocation_,social factors,etnicity_,bias_,fairness_
3D Arterial Segmentation via Single 2D Projections and Depth Supervision in Contrast-Enhanced CT Images,0,1,0,0,0,0,1
3D Dental Mesh Segmentation Using Semantics-Based Feature Learning with Graph-Transformer,0,0,0,0,0,0,1
3D Medical Image Segmentation with Sparse Annotation via Cross-Teaching Between 3D and 2D Networks,0,0,0,0,0,0,1
3D Mitochondria Instance Segmentation with Spatio-Temporal Transformers,1,1,1,0,0,0,1
3D Teeth Reconstruction from Panoramic Radiographs Using Neural Implicit Functions,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...
Deep Unsupervised Clustering for Conditional Identification of Subgroups Within a Digital Pathology Image Set,0,0,0,0,0,1,0
DeepGraphDMD: Interpretable Spatio-Temporal Decomposition of Non-linear Functional Brain Network Dynamics,1,0,1,0,0,0,1
DeepSOZ: A Robust Deep Model for Joint Temporal and Spatial Seizure Onset Localization from Multichannel EEG Data,0,0,0,0,0,0,1
Democratizing Pathological Image Segmentation with Lay Annotators via Molecular-Empowered Learning,0,0,0,0,0,0,1


***
2nd attempt

In [56]:
# Mapping of keywords to main categories
keyword_to_category = {
    'age'   : 'age_',
    'gender': 'gender_',
    'sex'   : 'gender_',
    'female': 'gender_',
    'women' : 'gender_',
    'woman' : 'gender_',
    'male'  : 'gender_',
    'geolocation'   : 'geolocation_',
    'geographical'  : 'geolocation_',
    'geographic'    : 'geolocation_',
    'country'       : 'geolocation_',
    'countries'     : 'geolocation_',
    'city'          : 'geolocation_',
    'cities'        : 'geolocation_',
    'hospital'      : 'geolocation_',
    'hospitals'     : 'geolocation_',
    'clinic'        : 'geolocation_',
    'clinics'       : 'geolocation_',
    'society'       : 'social factors',
    'societies'     : 'social factors',
    'etnicity'      : 'etnicity_',
    'etnicities'    : 'etnicity_',
    'race'          : 'etnicity_',
    'bias'          : 'bias_',
    'biases'        : 'bias_',
    'unfair'        : 'fairness_',
    'fair'          : 'fairness_',
    'fairness'      : 'fairness_',
    'transparency'  : 'fairness_',
    'imbalance'     : 'fairness_',
    'imbalanced'    : 'fairness_',
    'balance'       : 'fairness_',
    'balanced'      :'fairness_',
    'problem'       : 'concerns',
    'problems'      : 'concerns',
    'issue'         : 'concerns',
    'issues'        : 'concerns',
    'challenge'     : 'concerns',
    'challenges'    : 'concerns',
    'difficulty'    : 'concerns',
    'difficulties'  : 'concerns',
    'critic'        : 'criticism_',
    'critics'       : 'criticism_',
    'criticism'     : 'criticism_',
    'criticize'     : 'criticism_',
    'criticized'    : 'criticism_',        
    'criticizing'   : 'criticism_',
    'critique'      : 'criticism_',
    'critiques'     : 'criticism_',
    'critiqued'     : 'criticism_',
    'critiquing'    : 'criticism_',
}

In [57]:
# Extract relevant sentences from the selected papers by these keywords
keywords = ['age', 'gender', 'sex', 'women', 'woman', 'female', 'male',
            'geolocation', 'geographical', 'geographic', 'country', 'countries', 'city', 'cities', 'hospital', 'hospitals', 'clinic', 'clinics', 
            'society', 'societies',
            'etnicity', 'etnicities', 'race', 
            'bias', 'biases', 'fair', 'unfair', 'fairness', 'transparency',
            'imbalance', 'imbalanced', 'balance', 'balanced',
            'problem', 'problems', 'issue', 'issues', 'challenge', 'challenges', 'difficulty', 'difficulties',
            'critic', 'critics', 'criticism', 'criticize', 'criticized', 'criticizing', 'critique', 'critiques', 'critiqued', 'critiquing']


#keywords_2_df = pd.DataFrame(keywords, columns=['keyword'])
#keywords_2_df.to_csv('list_of_keywords_2.csv', index=False)

In [58]:
# Count the number of occurrences of each keyword in the extracted sentences
count_keywords_2_df = agg_keywords(df, 'extracted_sents_keywords', keywords)
count_keywords_2_df.to_csv('keyword_counts_2.csv', index=True)

In [59]:
# Aggregate the keywords into categories and aggregate the counts by category
res = agg_columns_to_categories(count_keywords_2_df, keyword_to_category)
save_to_csv(res, 'agg_counts_2')

In [60]:

# Convert the counts to binary values for each category
binary_df =  convert_to_binary_values(res)
save_to_csv(binary_df, 'agg_columns_binary_values_2')

***
### Organs

In [49]:
df = papers_with_sentences_organs_df

In [50]:
# Search for organs in the selected papers
keywords = ['adrenal', 'anal', 'anusarteries', 'gi', 'tract', 'gi-tract', 'colon', 'bladder', 'bone', 'marrow', 'bronchi', 'bronchioles', 
            'bulbourethral', 'capillaries', 'cecum', 'cerebellum', 'cerebral', 'cervix', 'choroid', 'plexus', 'ciliary', 'body', 'clitoris', 
            'cochlea', 'cornea', 'cranial', 'nerves', 'duodenum', 'eardrum', 'nervous', 'system', 'epididymis', 'esophagus', 'fallopian', 'tubes', 
            'gallbladder', 'ganglia', 'heart', 'skeleton', 'hypothalamus', 'ileum', 'interstitium', 'iris', 'jejunum', 'joint', 'joints', 'kidneys', 
            'larynx', 'ligament', 'ligaments', 'liver', 'lung', 'lungs', 'lymph', 'node', 'lymphatic', 'vessel', 'glands', 'oblongata', 'mesentery', 
            'brain', 'ear', 'ossicles', 'muscles', 'nasal', 'cavity', 'olfactory', 'epithelium', 'ovaries', 'pancreas', 'parathyroid', 'parotid', 'penis', 
            'pharynx', 'pineal', 'pituitary', 'placenta', 'prostate', 'rectum', 'retina', 'sigmoid', 'skin', 'spinal', 'nerves', 'spleen', 'stomach', 
            'tissue', 'sublingual', 'submandibular', 'teeth', 'tendons', 'testes', 'thalamus', 'spinal', 'cord', 'thymus', 'thyroid', 'tongue', 'tonsils', 
            'trachea', 'transverse', 'ureter', 'urethra', 'uterus', 'vagina', 'veins', 'vulva', 'lung', 'lungs', 'pulmonary', 'respiratory', 'bronchial', 
            'bronchi', 'bronchus', 'bronchial', 'trachea', 'tracheal', 'thoracic', 'thorax', 'diaphragm', 'diaphragmatic', 'pleural', 'pleura', 'alveolar', 
            'alveoli', 'gi-tract', 'gastrointestinal', 'gastro', 'intestinal', 'digestive', 'digestion', 'stomach', 'gastric', 'intestine', 'intestines', 
            'intestinal', 'colon', 'colonic', 'rectum', 'rectal', 'anus', 'anal', 'liver', 'hepatic', 'hepatitis', 'hepatocellular', 'hepatoma', 'hepatocarcinoma',
            'cervical', 'cervix', 'uterus', 'uterine', 'endometrial', 'ovarian', 'ovary', 'fallopian', 'tube', 'vaginal', 'gland', 'prostate gland',
            'prostate glands','testicular', 'testis', 
            'penile', 'breast', 'breast tissue']

In [56]:

female = [
    'clitoris',       
    'cervix',        
    'fallopian',     
    'tubes',         
    'uterus',        
    'vagina',        
    'vulva',         
    'cervical',      
    'uterus',        
    'uterine',       
    'endometrial', 
    'ovarian', 
    'ovary', 
    'fallopian tube', 
    'vaginal', 
    'vaginal gland',
    'placenta',
    'ureter',
    'urethra']
 

male = [
    'prostate',      
    'prostate gland',
    'testes',
    'testicular', 
    'testis', 
    'penile',
    'penis',
    'pineal',
    'prostate glands']


both_sex = [
    'adrenal',
    'anal',
    'anusarteries',
    'gi',
    'tract',
    'gi-tract',
    'colon',
    'bladder',
    'bone',
    'marrow',
    'bronchi',
    'bronchioles',
    'bulbourethral',
    'capillaries',
    'cecum',
    'cerebellum',
    'cerebral',
    'choroid',
    'plexus',
    'ciliary',
    'body',
    'cochlea',
    'cornea',
    'cranial',
    'nerves',
    'duodenum',
    'eardrum',
    'nervous',
    'system',
    'epididymis',
    'esophagus',
    'gallbladder',
    'ganglia',
    'heart',
    'skeleton',
    'hypothalamus',
    'ileum',
    'interstitium',
    'iris',
    'jejunum',
    'joint',
    'joints',
    'kidneys',
    'larynx',
    'ligament',
    'ligaments',
    'liver',
    'lung',
    'lungs',
    'lymph',
    'node',
    'lymphatic',
    'vessel',
    'glands',
    'oblongata',
    'mesentery',
    'brain',
    'ear',
    'ossicles',
    'muscles',
    'nasal',
    'cavity',
    'olfactory',
    'epithelium',
    'pancreas',
    'parathyroid',
    'parotid',
    'pharynx',
    'pituitary',
    'rectum',
    'retina',
    'sigmoid',
    'skin',
    'spinal',
    'nerves',
    'spleen',
    'stomach',
    'tissue',
    'sublingual',
    'submandibular',
    'teeth',
    'tendons',
    'thalamus',
    'spinal',
    'cord',
    'thymus',
    'thyroid',
    'tongue',
    'tonsils',
    'trachea',
    'transverse',
    'veins',
    'vulva',
    'lung',
    'lungs',
    'pulmonary',
    'respiratory',
    'bronchial',
    'bronchi',
    'bronchus',
    'bronchial',
    'trachea',
    'tracheal',
    'thoracic',
    'thorax',
    'diaphragm',
    'diaphragmatic',
    'pleural',
    'pleura',
    'alveolar',
    'alveoli',
    'gi-tract',
    'gastrointestinal',
    'gastro',
    'intestinal',
    'digestive',
    'digestion',
    'stomach',
    'gastric',
    'intestine',
    'intestines',
    'intestinal',
    'colon',
    'colonic',
    'rectum',
    'rectal',
    'anus',
    'anal',
    'liver',
    'hepatic',
    'hepatitis',
    'hepatocellular',
    'hepatoma',
    'hepatocarcinoma',
    'breast',
    'breast tissue']



In [59]:
# Count the number of occurrences of each keyword in the extracted sentences
count_organs_df = agg_keywords(df, 'extracted_sents_organs', keywords)
count_organs_df
count_organs_df.to_csv('count_organs_df.csv', index=True)

In [70]:

# Convert the counts to binary values for each category
binary_organs_df =  convert_to_binary_values(count_organs_df)
#binary_organs_df.to_csv('agg_organs_binary_values.csv')

In [72]:
df_organs = pd.read_csv('/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/code/agg_organs_binary_values.csv')
df_organs.rename(columns={'Unnamed: 0': 'title'}, inplace=True)
df_organs

Unnamed: 0,title,adrenal,anal,anusarteries,gi,tract,gi-tract,colon,bladder,bone,...,tube,vaginal,gland,prostate gland,prostate glands,testicular,testis,penile,breast,breast tissue
0,3D Arterial Segmentation via Single 2D Project...,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,3D Dental Mesh Segmentation Using Semantics-Ba...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3D Medical Image Segmentation with Sparse Anno...,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,3D Mitochondria Instance Segmentation with Spa...,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,3D Teeth Reconstruction from Panoramic Radiogr...,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,Deep Unsupervised Clustering for Conditional I...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
185,DeepGraphDMD: Interpretable Spatio-Temporal De...,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
186,DeepSOZ: A Robust Deep Model for Joint Tempora...,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
187,Democratizing Pathological Image Segmentation ...,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
for i,row in df_organs.iterrows():
    if row[i] != 0:
        print(row)

title            3D Arterial Segmentation via Single 2D Project...
adrenal                                                          0
anal                                                             0
anusarteries                                                     0
gi                                                               1
                                       ...                        
testicular                                                       0
testis                                                           0
penile                                                           0
breast                                                           0
breast tissue                                                    0
Name: 0, Length: 150, dtype: object
title            3D Teeth Reconstruction from Panoramic Radiogr...
adrenal                                                          0
anal                                                             0
anusarteries              

  if row[i] != 0:
  if row[i] != 0:
  if row[i] != 0:
  if row[i] != 0:
  if row[i] != 0:
  if row[i] != 0:
  if row[i] != 0:
  if row[i] != 0:


IndexError: index 150 is out of bounds for axis 0 with size 150