In [6]:
import os
import re
import pandas as pd
import numpy as np
from collections import Counter

In [1]:
# Function to count keywords in a text
def count_keywords(text, keywords):
    # Counter object to count occurrences of each keyword
    counts = Counter()
    for keyword in keywords:
        # Count occurrences of the keyword in the text
        counts[keyword] = text.lower().count(keyword)
    return counts

def agg_keywords(df, col_title, keywords):
    # Aggregate 'extracted_sentences' for each 'title' and count keywords
    results = {}
    for title, group in df.groupby('title'):
        # Combine all extracted sentences into one large text block
        aggregated_text = " ".join(group[col_title].tolist())
        # Count the keywords in this aggregated text
        keyword_counts = count_keywords(aggregated_text, keywords)
        # Store the result
        results[title] = keyword_counts

    # Convert the results dictionary to a DataFrame 
    results_df = pd.DataFrame.from_dict(results, orient='index')
    return results_df

In [2]:
# Convert counts to binary values
def convert_to_binary_values(df):
    columns_to_convert = df.columns.tolist()

    # Convert to binary: 1 if the count is greater than 0, else 0
    for column in columns_to_convert:
        df[column] = df[column].apply(lambda x: 1 if x > 0 else 0)
    
    return df

In [3]:
# Reverse the mapping for aggregation
def agg_columns_to_categories(df, keyword_to_category):
    category_to_keywords = {}
    for keyword, category in keyword_to_category.items():
        category_to_keywords.setdefault(category, []).append(keyword)

    # Aggregate columns into categories
    for category, keywords in category_to_keywords.items():
        if category in df.columns:
            # If the category already exists, add to it
            df[category] += df[keywords].sum(axis=1)
        else:
            # Otherwise, create a new column for the category
            df[category] = df[keywords].sum(axis=1)
        # Drop the original keyword columns
        df.drop(columns=keywords, inplace=True)

    return df

In [4]:
# Mapping of keywords to main categories
keyword_to_category = {
    'age'   : 'age_',
    'gender': 'gender_',
    'sex'   : 'gender_',
    'female': 'gender_',
    'women' : 'gender_',
    'woman' : 'gender_',
    'male'  : 'gender_',
    'geolocation'   : 'geolocation_',
    'geographical'  : 'geolocation_',
    'geographic'    : 'geolocation_',
    'country'       : 'geolocation_',
    'countries'     : 'geolocation_',
    'city'          : 'geolocation_',
    'cities'        : 'geolocation_',
    'hospital'      : 'geolocation_',
    'hospitals'     : 'geolocation_',
    'clinic'        : 'geolocation_',
    'clinics'       : 'geolocation_',
    'society'       : 'social factors',
    'societies'     : 'social factors',
    'etnicity'      : 'etnicity_',
    'etnicities'    : 'etnicity_',
    'race'          : 'etnicity_',
    'bias'          : 'bias_',
    'biases'        : 'bias_',
    'unfair'        : 'fairness_',
    'fair'          : 'fairness_',
    'fairness'      : 'fairness_',
    'transparency'  : 'fairness_',
    'imbalance'     : 'fairness_',
    'imbalanced'    : 'fairness_',
    'balance'       : 'fairness_',
    'balanced'      :'fairness_',
    'problem'       : 'concerns',
    'problems'      : 'concerns',
    'issue'         : 'concerns',
    'issues'        : 'concerns',
    'challenge'     : 'concerns',
    'challenges'    : 'concerns',
    'difficulty'    : 'concerns',
    'difficulties'  : 'concerns'
}

***

In [9]:
# List of keywords
keywords_demographics_long = [
    'age', 'gender', 'sex', 'women', 'woman', 'female', 'male',
    'geolocation', 'geographical', 'geographic', 'country', 'countries', 'city', 'cities', 
    'hospital', 'hospitals', 'clinic', 'clinics', 'society', 'societies',
    'etnicity', 'etnicities', 'race', 
    'bias', 'biases', 'fair', 'unfair', 'fairness', 'transparency', 'awareness',
    'imbalance', 'imbalanced', 'balance', 'balanced',
    'problem', 'problems', 'issue', 'issues', 'challenge', 'challenges', 
    'difficult', 'difficulty', 'difficulties']
# List of keywords
keywords_demographics_short = [
    'age', 'gender', 'sex', 'women', 'woman', 'female', 'male',
    'etnicity', 'etnicities', 'race', 
    'bias', 'biases', 'fair', 'unfair', 'fairness', 'transparency', 
    'imbalance', 'imbalanced', 'balance', 'balanced']

In [7]:
extracted_df = pd.read_csv("/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/code/databases/extracted_sentences.csv", index_col=0)
extracted_df

Unnamed: 0,title,extracted_keyword_sent
0,Anatomy-Driven Pathology Detection on Chest X-...,none
1,Self-supervised Learning for Physiologically-B...,none
2,AME-CAM: Attentive Multiple-Exit CAM for Weakl...,"to address this issue, recent research has foc..."
3,AME-CAM: Attentive Multiple-Exit CAM for Weakl...,"to meet this need, many researchers have devot..."
4,AME-CAM: Attentive Multiple-Exit CAM for Weakl...,our proposed method has the following contribu...
...,...,...
324,Trackerless Volume Reconstruction from Intraop...,liver cancer is the most prevalent indication ...
325,Trackerless Volume Reconstruction from Intraop...,such motion is predominant in the context high...
326,CoLa-Diff: Conditional Latent Diffusion Model ...,it shows superiority in model training however...
327,CoLa-Diff: Conditional Latent Diffusion Model ...,-propose an auto-weight adaptation to balance ...


In [10]:
# Store keyword-match for extracted sentences

# Load the DataFrame from the uploaded file
df = extracted_df

# List of keywords
keywords = keywords_demographics_long

# Initialize columns for each keyword with default value 0
for keyword in keywords:
    df[keyword] = 0

# Function to update the keyword columns based on matches
def update_keyword_columns(row):
    text = row['extracted_keyword_sent']  # Assuming this column holds the extracted sentences
    for keyword in keywords:
        # Update the column for the keyword if it's found in the text
        if keyword in text:
            row[keyword] = 1
    return row

# Apply the function to each row in the DataFrame
df = df.apply(update_keyword_columns, axis=1)
df.head()  # Display the first few rows of the updated DataFrame

# Save the updated DataFrame, if needed
#df.to_csv('extracted_sentences_keyword_counts.csv', index=False)



Unnamed: 0,title,extracted_keyword_sent,age,gender,sex,women,woman,female,male,geolocation,...,balanced,problem,problems,issue,issues,challenge,challenges,difficult,difficulty,difficulties
0,Anatomy-Driven Pathology Detection on Chest X-...,none,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Self-supervised Learning for Physiologically-B...,none,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AME-CAM: Attentive Multiple-Exit CAM for Weakl...,"to address this issue, recent research has foc...",0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,AME-CAM: Attentive Multiple-Exit CAM for Weakl...,"to meet this need, many researchers have devot...",1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,AME-CAM: Attentive Multiple-Exit CAM for Weakl...,our proposed method has the following contribu...,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0


In [11]:
# Count the number of occurrences of each keyword in the extracted sentences
keyword_counts = agg_keywords(df, 'extracted_keyword_sent', keywords)
#keyword_counts.to_csv('total_keyword_counts.csv', index=True)

In [12]:
# Aggregate the keywords into categories and aggregate the counts by category
refined_keyword_counts_by_categories = agg_columns_to_categories(keyword_counts, keyword_to_category)
#refined_keyword_counts_by_categories.to_csv('refined_keyword_counts_by_categories.csv')


In [13]:
# Convert the counts to binary values for each category
binary_keyword_counts_by_categories =  convert_to_binary_values(refined_keyword_counts_by_categories)
#binary_keyword_counts_by_categories.to_csv('binary_keyword_counts_by_categories.csv')

In [14]:
binary_keyword_counts_by_categories

Unnamed: 0,awareness,difficult,age_,gender_,geolocation_,social factors,etnicity_,bias_,fairness_,concerns
3D Mitochondria Instance Segmentation with Spatio-Temporal Transformers (vol8),0,0,0,0,0,0,0,0,0,0
A Novel Video-CTU Registration Method with Structural Point Similarity for FURS Navigation (vol9),0,1,1,0,0,0,0,0,0,1
A Sheaf Theoretic Perspective for Robust Prostate Segmentation (vol4),0,0,0,0,0,0,0,0,0,0
A Spatial-Temporal Deformable Attention Based Framework for Breast Lesion Detection in Videos (vol2),0,0,1,0,0,0,0,0,0,1
A Texture Neural Network to Predict the Abnormal Brachial Plexus from Routine Magnetic Resonance Imaging (vol8),0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
Unsupervised Discovery of 3D Hierarchical Structure with Generative Diffusion Features (vol1),0,0,1,0,0,0,0,0,0,1
WeakPolyp: You only Look Bounding Box for Polyp Segmentation (vol3),0,0,0,0,0,0,0,1,0,0
X2Vision: 3D CT Reconstruction from Biplanar X-Rays with Deep Structure Prior (vol10),0,1,0,0,0,0,0,0,0,1
atTRACTive: Semi-automatic White Matter Tract Segmentation Using Active Learning (vol8),0,0,0,0,0,0,0,0,0,0
