In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

In [8]:
# List of keywords used for sentence extractions and annotations
keywords_age        = ['age']

keywords_gender     = ['gender', 'sex', 'women', 'woman', 'female', 'male']

keywords_etnicity   = ['etnicity', 'etnicities', 'race', 'white patients', 'black patients']

keywords_geoloc     = ['geolocation', 'geographical', 'geographic', 'country', 'countries', 'city', 'cities', 
                        'hospital', 'hospitals', 'clinic', 'clinics', 'society', 'societies',]

keywords_bias       = ['bias', 'biases','fairness']

# Unified keyword to category mapping including all themes
keywords_by_category = {
    'demographics_': [
        'age', 'gender', 'sex', 'women', 'woman', 'female', 'male',
        'etnicity', 'etnicities', 'race', 'white patients', 'black patients'],
    'geolocation_': [
        'geolocation', 'geographical', 'geographic', 'country', 'countries', 
        'city', 'cities', 'hospital', 'hospitals', 'clinic', 'clinics'],
    'bias_': ['bias', 'biases', 'fairness'],
    'patient_': ['patient', 'patients']
}

In [9]:
def count_keywords(text, keywords):
    # Counter object to count occurrences of each keyword
    counts = Counter()
    for keyword in keywords:
        # Count occurrences of the keyword in the text
        counts[keyword] = text.lower().count(keyword)
    return counts

In [10]:
def aggregate_keyword_counts(df, keywords, text_column = 'extracted_keyword_sent'):
    # Aggregate 'extracted_sentences' for each 'title' and count keywords
    results = {}
    for title, group in df.groupby('title'):
        # Combine all extracted sentences into one large text block
        aggregated_text = " ".join(group[text_column].tolist())
        # Count the keywords in this aggregated text
        keyword_counts = count_keywords(aggregated_text, keywords)
        # Store the result
        results[title] = keyword_counts

    # Convert the results dictionary to a DataFrame 
    results_df = pd.DataFrame.from_dict(results, orient='index')
    return results_df


In [11]:
def convert_counts_to_binary(df):
    return df.map(lambda x: 1 if x > 0 else 0)

In [46]:
# Reverse the mapping for aggregation
def agg_columns_to_categories(df, keyword_to_category):
    category_to_keywords = {}
    for keyword, category in keyword_to_category.values():
        category_to_keywords.setdefault(category, []).append(keyword)

    # Aggregate columns into categories
    for category, keywords in category_to_keywords.items():
        if category in df.columns:
            # If the category already exists, add to it
            df[category] += df[keywords].sum(axis=1)
        else:
            # Otherwise, create a new column for the category
            df[category] = df[keywords].sum(axis=1)
        # Drop the original keyword columns
        df.drop(columns=keywords, inplace=True)

    return df

In [47]:
keywords_by_category.values()

dict_values([['age', 'gender', 'sex', 'women', 'woman', 'female', 'male', 'etnicity', 'etnicities', 'race', 'white patients', 'black patients'], ['geolocation', 'geographical', 'geographic', 'country', 'countries', 'city', 'cities', 'hospital', 'hospitals', 'clinic', 'clinics'], ['bias', 'biases', 'fairness'], ['patient', 'patients']])

In [48]:
# With customized header title (each graph named after doc-title)
def visualize_category_counts(df, document_title):
    # Title-case the document title
    document_title = document_title.title()

    # Filter categories with counts greater than 0
    category_counts = df.sum().sort_values()
    filtered_counts = category_counts[category_counts > 0]

    # Plot only the filtered categories
    filtered_counts.plot(kind='barh', figsize=(10, 6))
    plt.title(f'Frequency of Keywords Across Selected MICCAI 2023 Papers - | {document_title} |')
    plt.xlabel('Total Counts')
    plt.ylabel('Keywords')
    plt.show()

In [49]:
from matplotlib import category


def analyze_keywords_in_documents(df, category, keywords_by_category, text_column='extracted_keyword_sent'):
    # Aggregate keyword counts
    keyword_counts = agg_columns_to_categories(df, keywords_by_category[category])
    
    # Convert counts to binary
    binary_keyword_counts = convert_counts_to_binary(keyword_counts)
    
    # Visualize category counts
    visualize_category_counts(binary_keyword_counts, keywords_by_category[category])

In [50]:
# Unified keyword to category mapping including all themes
keywords_by_category = {
    'demographics_': [
        'age', 'gender', 'sex', 'women', 'woman', 'female', 'male',
        'etnicity', 'etnicities', 'race', 'white patients', 'black patients'],
    'geolocation_': [
        'geolocation', 'geographical', 'geographic', 'country', 'countries', 
        'city', 'cities', 'hospital', 'hospitals', 'clinic', 'clinics'],
    'bias_': ['bias', 'biases', 'fairness'],
    'patient_': ['patient', 'patients']
}

In [51]:
def analyze_keywords_from_file(filenames, category):
    dfs = []  # List to store dataframes for merging
    for filename in filenames:
        # Load each file into a DataFrame and add it to the list
        df = pd.read_csv(filename)
        dfs.append(df)
    # Concatenate all dataframes in the list into a single dataframe
    merged_df = pd.concat(dfs, ignore_index=True)
    print(f"Analyzing category: {category}")
    print(f"Number of unique titles: {len(merged_df['title'].unique())}")
    # Here, you can replace this with any function that analyzes the merged_df  

    analyze_keywords_in_documents(merged_df, category, keywords_by_category) 

# Path to the folder where the files are stored 
base_path = '/Users/yasminsarkhosh/Documents/GitHub/machine-learning-bsc-thesis-2024/outputs/extracted_sentences'

# Categorizing files into categories for analysis
files_by_category = {
    'demographics': [
        base_path + '/age_related_sentences.csv',
        base_path + '/gender_related_sentences.csv',
        base_path + '/ethnicity_related_sentences.csv',
        base_path + '/geolocation_related_sentences.csv',
    ],
    'patients': [
        base_path + '/patients_related_sentences.csv',
    ],
    'bias': [
        base_path + '/bias_related_sentences.csv',
    ]
}

# Loop over each category and perform analysis
for category, filenames in files_by_category.items():
    analyze_keywords_from_file(filenames, category)


Analyzing category: demographics
Number of unique titles: 263


KeyError: 'demographics'