### Comparing Final Filtered df with originally scraped.

In [1]:
import os
import pandas as pd

file = "data/processed_final.csv"
main_df = pd.read_csv(file)
print(main_df.columns)
main_df.head(1)



Index(['year', 'paper_id', 'title', 'cleaned_title', 'authors', 'abstract',
       'citation_count', 'code', 'ai', 'venue', 'processed_emails',
       'mimic_count', 'eicu_count', 'uk_biobank_count', 'chest_x-ray14_count',
       'adni_count', 'physionet_count', 'oasis_count', 'tcga_count',
       'gdc_count', 'seer_count', 'tuh_eeg_corpus_count',
       'tuh_abnormal_eeg_corpus_count', 'tuh_eeg_artifact_corpus_count',
       'tuh_eeg_epilepsy_corpus_count', 'tuh_eeg_events_corpus_count',
       'tuh_eeg_seizure_corpus_count', 'tuh_eeg_slowing_corpus_count', 'topic',
       'affiliation', 'affiliation_class', 'paper_with_code_data_count'],
      dtype='object')


Unnamed: 0,year,paper_id,title,cleaned_title,authors,abstract,citation_count,code,ai,venue,...,tuh_abnormal_eeg_corpus_count,tuh_eeg_artifact_corpus_count,tuh_eeg_epilepsy_corpus_count,tuh_eeg_events_corpus_count,tuh_eeg_seizure_corpus_count,tuh_eeg_slowing_corpus_count,topic,affiliation,affiliation_class,paper_with_code_data_count
0,2018,26262333,A Frequency-based Strategy of Obtaining Senten...,,"['Dingcheng Li', 'Majid Rastegar Mojarad', 'Ya...","In clinical NLP, one major barrier to adopting...",0,0,2,pubmed,...,0,0,0,0,0,0,E.H.R,"Mayo Clinic, Rochester, MN, USA.||Mayo Clinic,...",academic,0


#### convert to excel

In [2]:
# import pandas as pd
# import re

# def clean_column_names(df):
#     # Function to clean a single string
#     def clean_string(s):
#         # Remove control characters and other non-printable characters
#         s = ''.join(char for char in s if ord(char) >= 32)
#         # Replace spaces and other problematic characters with underscores
#         s = re.sub(r'[^\w\s-]', '_', s)
#         # Replace multiple underscores with a single underscore
#         s = re.sub(r'_+', '_', s)
#         # Remove leading/trailing underscores
#         return s.strip('_')

#     # Clean column names
#     df.columns = [clean_string(col) for col in df.columns]
    
#     # Clean data in all string columns
#     for col in df.select_dtypes(include=['object']):
#         df[col] = df[col].astype(str).apply(clean_string)
    
#     return df


# # Clean the DataFrame
# main_df = clean_column_names(main_df)

# # Save to Excel
# # df.to_excel('your_file.xlsx', index=False, engine='openpyxl')

# # Save the DataFrame as an Excel file
# main_df.to_excel('AI4H_Reproducibility_Data.xls', index=False, engine='openpyxl')

# Validation Dataset

In [3]:
import pandas as pd
import numpy as np

# Load the main dataframe
# Assuming the main dataframe is stored in a CSV file named 'main_df.csv'
# main_df = pd.read_csv('main_df.csv')

# Randomly sample 30 papers
sampled_df = main_df.sample(n=30, random_state=42)  # Set random_state for reproducibility

# Use 'cleaned_title' if it's not NaN, otherwise use 'title'
sampled_df['final_title'] = np.where(sampled_df['cleaned_title'].notna(), 
                                     sampled_df['cleaned_title'], 
                                     sampled_df['title'])

# Determine if they have code or not
sampled_df['Has Code'] = sampled_df['code'] > 0

# Determine if they have used a public dataset
# This assumes that the dataset columns follow the pattern *_count, but excludes citation_count
dataset_columns = [col for col in sampled_df.columns if col.endswith('_count') and col != 'citation_count']

def has_public_dataset(row):
    used_datasets = []
    for col in dataset_columns:
        if row[col] > 0:
            dataset_name = col.replace('_count', '').replace('_', ' ').title()
            used_datasets.append(dataset_name)
    return ', '.join(used_datasets) if used_datasets else 'No Public Dataset'

sampled_df['Has Public Dataset'] = sampled_df.apply(has_public_dataset, axis=1)

# Select the relevant columns, now including 'topic'
result_df = sampled_df[['final_title', 'year', 'venue', 'topic', 'Has Code', 'Has Public Dataset']]

# Rename 'final_title' back to 'title' for consistency
result_df = result_df.rename(columns={'final_title': 'title'})

# Save the result to a new CSV file
result_df.to_csv('sampled_papers_with_code_and_dataset.csv', index=False)

print("Sample of 30 papers with code, dataset information, and topic has been saved to 'sampled_papers_with_code_and_dataset.csv'")

Sample of 30 papers with code, dataset information, and topic has been saved to 'sampled_papers_with_code_and_dataset.csv'


In [4]:
def calculate_paper_stats(topic_df):
    """
    Calculates various statistics about papers in the dataset:
    - Overall percentage of MLHC papers that do not share code
    - Total number of MLHC papers
    - Total number of PubMed papers
    - Total number of non-PubMed papers

    Parameters:
    topic_df (pandas.DataFrame): The input dataframe containing 'venue' and 'code' columns.

    Returns:
    dict: A dictionary containing the calculated statistics.
    """
    # Filter for MLHC papers
    mlhc_papers = topic_df[topic_df['venue'] == 'MLHC']
    
    # Count total MLHC papers
    total_mlhc_papers = len(mlhc_papers)
    
    # Count MLHC papers with no code shared (assuming 'code' column is 0 for no code)
    no_code_papers = len(mlhc_papers[mlhc_papers['code'] == 0])
    
    # Calculate overall percentage for MLHC papers not sharing code
    if total_mlhc_papers > 0:
        percentage_no_code = (no_code_papers / total_mlhc_papers) * 100
    else:
        percentage_no_code = 0  # Avoid division by zero if there are no MLHC papers

    # Count PubMed papers
    pubmed_papers = len(topic_df[topic_df['venue'] == 'pubmed'])
    
    # Count non-PubMed papers
    non_pubmed_papers = len(topic_df[topic_df['venue'] != 'pubmed'])
    
    # Prepare results
    results = {
        "mlhc_no_code_percentage": percentage_no_code,
        "total_mlhc_papers": total_mlhc_papers,
        "total_pubmed_papers": pubmed_papers,
        "total_non_pubmed_papers": non_pubmed_papers
    }
    
    return results

# Usage example:
stats = calculate_paper_stats(main_df)

print(f"Overall percentage of MLHC papers that do not share code: {stats['mlhc_no_code_percentage']:.2f}%")
print(f"Total number of MLHC papers: {stats['total_mlhc_papers']}")
print(f"Total number of PubMed papers: {stats['total_pubmed_papers']}")
print(f"Total number of non-PubMed papers: {stats['total_non_pubmed_papers']}")
def calculate_detailed_paper_stats(topic_df):
    """
    Calculates detailed statistics about papers in the dataset.

    Parameters:
    topic_df (pandas.DataFrame): The input dataframe containing 'venue' and 'code' columns.

    Returns:
    dict: A dictionary containing the calculated statistics.
    """
    results = calculate_paper_stats(topic_df)  # Get the basic stats
    
    # Get counts for each non-PubMed venue
    venue_counts = topic_df[topic_df['venue'] != 'pubmed']['venue'].value_counts().to_dict()
    
    results["non_pubmed_breakdown"] = venue_counts
    
    return results

# Usage example:
detailed_stats = calculate_detailed_paper_stats(main_df)

print(f"Overall percentage of MLHC papers that do not share code: {detailed_stats['mlhc_no_code_percentage']:.2f}%")
print(f"Total number of MLHC papers: {detailed_stats['total_mlhc_papers']}")
print(f"Total number of PubMed papers: {detailed_stats['total_pubmed_papers']}")
print(f"Total number of non-PubMed papers: {detailed_stats['total_non_pubmed_papers']}")
print("\nBreakdown of non-PubMed papers by venue:")
for venue, count in detailed_stats['non_pubmed_breakdown'].items():
    print(f"  {venue}: {count}")
print(len(main_df))

Overall percentage of MLHC papers that do not share code: 58.14%
Total number of MLHC papers: 258
Total number of PubMed papers: 2082
Total number of non-PubMed papers: 528
Overall percentage of MLHC papers that do not share code: 58.14%
Total number of MLHC papers: 258
Total number of PubMed papers: 2082
Total number of non-PubMed papers: 528

Breakdown of non-PubMed papers by venue:
  MLHC: 258
  CHIL: 147
  ML4H: 123
2610


#### Total Papers Across Time of Finally Processed Dataframe

In [6]:
import pandas as pd
import plotly.express as px

def plot_papers_per_year_plotly(df, width=800, height=600, save_path=None,
                                legend_font_size=24, axis_title_font_size=24,
                                axis_tick_font_size=20, total_annotation_font_size=20,
                                save_scale=3):
    # Ensure 'year' is of type int
    df['year'] = df['year'].astype(int)
    
    # Create a mapping for venue names
    venue_mapping = {'pubmed': 'PubMed', 'CHIL': 'CHIL', 'ML4H': 'ML4H', 'MLHC': 'MLHC'}
    df['venue_display'] = df['venue'].map(venue_mapping)
    
    # Group by year and venue, count the papers
    df_grouped = df.groupby(['year', 'venue_display']).size().unstack(fill_value=0).reset_index()
    
    # Ensure all venues are present, fill with 0 if missing
    all_venues = ['PubMed', 'CHIL', 'ML4H', 'MLHC']
    for venue in all_venues:
        if venue not in df_grouped.columns:
            df_grouped[venue] = 0
    
    # Calculate total papers per year
    df_grouped['Total'] = df_grouped[all_venues].sum(axis=1)
    
    # Get the Set2 color palette
    set2_colors = px.colors.qualitative.Set2[1:]
    print(set2_colors)
    color_palette = [
    'rgb(252,141,98)',   # Orange
    'rgb(255,179,150)',  # Lighter Orange (new color)
    'rgb(141,160,203)',  # Grayish-blue
    'rgb(103,122,165)',  # Darker Grayish-Blue
    # 'rgb(28,144,153)'    # Deeper Teal (accent color)
]
    
    color_palette = color_palette[::-1]
    set2_colors = color_palette
    # set2_colors = px.colors.qualitative.Pastel[:len(all_venues)]
    # set2_colors = ['#3594cc', '#ea801c', '#8cc5e3', '#f0b077']
    # set2_colors = ['#d31f11', '#62c8d3', '#f47a00', '#007191']
    # ggplot2_colors = ['#F8766D', '#7CAE00', '#00BFC4', '#C77CFF', '#00A9FF', '#FF61CC']
    # set2_colors = ggplot2_colors[:len(all_venues)]
    # set2_colors = px.colors.sequential.Blues[:len(all_venues)]
    # Create the stacked bar chart with Set2 colors
    fig = px.bar(df_grouped, x='year', y=all_venues,
                 labels={'value': 'Number of Papers', 'variable': 'Venue'},
                 color_discrete_sequence=set2_colors)
    
    # Add total numbers on top of each bar
    for year in df_grouped['year']:
        total = df_grouped.loc[df_grouped['year'] == year, 'Total'].values[0]
        fig.add_annotation(x=year, y=total,
                           text=str(total),
                           showarrow=False,
                           yshift=10,
                           font=dict(size=total_annotation_font_size))
    
    # Update layout for better readability and customization
    fig.update_layout(
        width=width,
        height=height,
        xaxis_title='Year',
        yaxis_title='Number of Papers',
        legend_title='',
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="center",
            x=0.5,
            font=dict(size=legend_font_size)
        ),
        barmode='stack',
        hovermode='x unified',
        font=dict(size=axis_tick_font_size),
        xaxis=dict(title=dict(font=dict(size=axis_title_font_size))),
        yaxis=dict(title=dict(font=dict(size=axis_title_font_size)))
    )
    
    # Show the figure
    fig.show()
    
    # Save the figure if a save path is provided
    if save_path:
        fig.write_image(save_path, scale=save_scale)

# Plot the data
plot_papers_per_year_plotly(
    main_df,
    width=800,
    height=600,
    save_path="figures/papers_per_year_high_res.png"
)

['rgb(252,141,98)', 'rgb(141,160,203)', 'rgb(231,138,195)', 'rgb(166,216,84)', 'rgb(255,217,47)', 'rgb(229,196,148)', 'rgb(179,179,179)']


#### Citation Counts over time

In [5]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

def plot_avg_citations_per_year_plotly(df, width=800, height=600, save_path=None,
                                       legend_font_size=12, axis_title_font_size=14, 
                                       axis_tick_font_size=10, line_thickness=2,
                                       only_overall_average=False, save_scale=3):
    # Ensure 'year' is of type int
    df['year'] = df['year'].astype(int)
    
    # Create a mapping for venue names
    venue_mapping = {'pubmed': 'PubMed', 'CHIL': 'CHIL', 'ML4H': 'ML4H', 'MLHC': 'MLHC'}
    df['venue_display'] = df['venue'].map(venue_mapping)
    
    # Calculate overall average citations per paper per year
    overall_avg = df.groupby('year').apply(lambda x: x['citation_count'].sum() / len(x)).reset_index()
    overall_avg.columns = ['year', 'Overall Average']
    
    if not only_overall_average:
        # Group by year and venue, calculate average citations per paper
        df_grouped = df.groupby(['year', 'venue_display']).agg({
            'citation_count': ['sum', 'count']
        }).reset_index()
        
        # Flatten the column multi-index
        df_grouped.columns = ['_'.join(col).strip() for col in df_grouped.columns.values]
        
        # Rename columns for clarity
        df_grouped = df_grouped.rename(columns={
            'year_': 'year',
            'venue_display_': 'venue_display',
            'citation_count_sum': 'total_citations',
            'citation_count_count': 'paper_count'
        })
        
        # Calculate average citations
        df_grouped['avg_citations'] = df_grouped['total_citations'] / df_grouped['paper_count']
        
        # Pivot the data
        df_pivoted = df_grouped.pivot(index='year', columns='venue_display', values='avg_citations').reset_index()
        
        # Ensure all venues are present, fill with 0 if missing
        all_venues = ['PubMed', 'CHIL', 'ML4H', 'MLHC']
        for venue in all_venues:
            if venue not in df_pivoted.columns:
                df_pivoted[venue] = 0
        
        # Merge overall average with pivoted data
        df_final = pd.merge(df_pivoted, overall_avg, on='year', how='outer')
    else:
        df_final = overall_avg
    
    # Create the line plot
    fig = go.Figure()
    
    # Get the Plotly Express orange color
    px_colors = px.colors.qualitative.Plotly
    orange_color = px_colors[1]  # The second color in the Plotly default color sequence is orange
    
    if only_overall_average:
        fig.add_trace(go.Scatter(x=df_final['year'], y=df_final['Overall Average'], 
                                 mode='lines', name='Overall Average', 
                                 line=dict(width=line_thickness, color=orange_color)))
    else:
        for venue in all_venues + ['Overall Average']:
            if venue == 'Overall Average':
                color = orange_color
            else:
                color = None  # Let Plotly choose colors for other venues
            fig.add_trace(go.Scatter(x=df_final['year'], y=df_final[venue], 
                                     mode='lines', name=venue, 
                                     line=dict(width=line_thickness, color=color)))

    # Update layout for better readability and customization
    fig.update_layout(
        width=width,
        height=height,
        xaxis_title='Year',
        yaxis_title='Average Citations per Paper',
        legend_title='Venue',
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="center",
            x=0.5,
            font=dict(size=legend_font_size)
        ),
        hovermode='x unified',
        font=dict(size=axis_tick_font_size),
        xaxis=dict(
            title=dict(font=dict(size=axis_title_font_size)),
            tickfont=dict(size=axis_tick_font_size)
        ),
        yaxis=dict(
            title=dict(font=dict(size=axis_title_font_size)),
            tickfont=dict(size=axis_tick_font_size)
        )
    )

    # Show the figure
    fig.show()

    # Save the figure if a save path is provided
    if save_path:
        fig.write_image(save_path, scale=save_scale)

# Example usage
# Assuming main_df is your DataFrame
plot_avg_citations_per_year_plotly(
    main_df, 
    width=800, 
    height=600, 
    save_path="figures/general/avg_citations_per_year_high_res.png",
    legend_font_size=20,
    axis_title_font_size=24,
    axis_tick_font_size=20,
    line_thickness=4,
    only_overall_average=True
)

In [6]:
import pandas as pd
import plotly.express as px

def plot_citation_boxplot_per_year(df, width=1000, height=600, save_path=None,
                                   axis_title_font_size=14, axis_tick_font_size=10,
                                   y_axis_cap=150, save_scale=3):
    # Ensure 'year' is of type int
    df['year'] = df['year'].astype(int)
    
    # Cap the citation count at y_axis_cap
    df['capped_citation_count'] = df['citation_count'].clip(upper=y_axis_cap)
    
    # Get the first color from the Set2 palette
    set2_color = px.colors.qualitative.Set2[2]
    # set2_color = '#1a80bb'
    # Create the box plot using Plotly Express
    fig = px.box(df, x='year', y='capped_citation_count',
                 labels={'year': 'Year', 'capped_citation_count': 'Citations'},
                 width=width, height=height,
                 color_discrete_sequence=[set2_color])  # Set color to the first Set2 color
    
    # Update the layout for better readability and customization
    fig.update_layout(
        font=dict(size=axis_tick_font_size),
        xaxis=dict(
            title=dict(font=dict(size=axis_title_font_size)),
            tickfont=dict(size=axis_tick_font_size)
        ),
        yaxis=dict(
            title=dict(font=dict(size=axis_title_font_size)),
            tickfont=dict(size=axis_tick_font_size),
            range=[0, y_axis_cap],
            zeroline=True,
            zerolinewidth=2,
            zerolinecolor='lightgray'
        ),
        plot_bgcolor='white',
        showlegend=False
    )
    
    # Customize outlier points
    fig.update_traces(
        boxpoints='outliers',  # Show outliers
        jitter=0.3,  # Add jitter to outlier points
        pointpos=-1.8,  # Offset the outlier points
        marker=dict(size=2, color=set2_color)  # Make outlier points smaller and use Set2 color
    )
    
    # Show the figure
    fig.show()
    
    # Save the figure if a save path is provided
    if save_path:
        fig.write_image(save_path, scale=save_scale)

# Example usage
# Assuming main_df is your DataFrame
plot_citation_boxplot_per_year(
    main_df,
    width=800,
    height=400,
    save_path="figures/general/citation_boxplot_per_year_capped_high_res.png",
    axis_title_font_size=24,
    axis_tick_font_size=20,
    y_axis_cap=100
)

### Affiliation Classification and Plotting

In [7]:
import pandas as pd
import re

def categorize_emails(email_list):
    industry_domains = ['.com', '.co', '.org', '.io']
    academic_domains = ['.edu', '.ac', '.eth.ch', '.cispa.de', 'dkfz-heidelberg.de']
    has_industry = False
    has_academic = False
    
    if isinstance(email_list, str):
        emails = [e.strip() for e in email_list.split('\n')]
    else:
        emails = []
    
    for email in emails:
        email_lower = email.lower()
        if any(domain in email_lower for domain in industry_domains):
            has_industry = True
        if any(domain in email_lower for domain in academic_domains):
            has_academic = True
    
    if has_industry and has_academic:
        return 'mixed'
    if has_industry:
        return 'industry'
    elif has_academic:
        return 'academic'
    else:
        return 'other'

def categorize_affiliations(affiliation):
    if not isinstance(affiliation, str):
        return 'other'
    
    affiliation_lower = affiliation.lower()
    
    # Keywords for industry and academic affiliations
    industry_keywords = ['google', '.com','widex','axispoint', 'optum','ibm', 'iqvia', 'microsoft', 'apple', 'amazon', 'facebook', 'intel', 'nvidia', 'deargen', 'riken', 'inc', 'corporation', 'tencent', 'analytix', 'llc']
    more_industry_keywords = ['google', 'ibm', 'iqvia', 'microsoft', 'apple', 'amazon', 'facebook', 'intel', 'nvidia', 'deargen', 'riken', 
                         'corporation', 'inc', 'llc', 'ltd', 'gmbh', 'company', 'biotech', 'analytics', 'corp', 'tno', 'leo innovation lab']
    industry_keywords = list(set(industry_keywords + more_industry_keywords))
    
    academic_keywords = ['national', 'isi foundation','institute for health','amsterdam umc','university', 'iust','school','univ' , 'college', 'institute of technology', 'polytechnic', 'instituto superior tecnico', 'universit', 'hospital', 'mayo clinic', 'cancer center', 'NIH', '.ac.uk', 'universidad', '.edu', 'ethz.ch']
    more_academic_keywords = ['university', 'school', 'college', 'institute of technology', 'polytechnic', 'instituto superior tecnico', 
                         'universit', 'hospital', 'medical center', 'health science', 'academia', 'faculty', 'department of', 
                         'school of medicine', 'research institute', 'national institute', 'cancer center', 'clinic', 'basque center for applied mathematics', 'association', 'fau erlangen-nurnberg']
    academic_keywords = list(set(academic_keywords + more_academic_keywords))


    has_industry = any(keyword in affiliation_lower for keyword in industry_keywords)
    has_academic = any(keyword in affiliation_lower for keyword in academic_keywords)
    
    if has_industry and has_academic:
        return 'mixed'
    elif has_industry:
        return 'industry'
    elif has_academic:
        return 'academic'
    else:
        return 'other'

def classify_row(row):
    if row['venue'] == 'pubmed':
        return categorize_affiliations(row['affiliation'])
    else:
        email_category = categorize_emails(row['processed_emails'])
        author_category = categorize_affiliations(row['authors'])
        
        if email_category == author_category:
            return email_category
        elif 'mixed' in [email_category, author_category]:
            return 'mixed'
        elif 'academic' in [email_category, author_category] and 'industry' in [email_category, author_category]:
            return 'mixed'
        elif email_category == 'other':
            return author_category
        elif author_category == 'other':
            return email_category
        else:
            return 'mixed'

# Load the dataframe
topic_df = pd.read_csv("data/topic_df_with_affiliations_filtered_validated.csv")

# Apply classification
topic_df['affiliation_class'] = topic_df.apply(classify_row, axis=1)

# Print classification results
print("\nAffiliation Classification Results:")
print(topic_df['affiliation_class'].value_counts(normalize=True) * 100)

# Save the updated dataframe
output_file = "data/topic_df_with_affiliation_classification.csv"
topic_df.to_csv(output_file, index=False)
print(f"\nUpdated dataframe saved as '{output_file}'")

# Print some examples
print("\nExamples of classified entries:")
for venue in topic_df['venue'].unique():
    print(f"\nVenue: {venue}")
    sample = topic_df[topic_df['venue'] == venue].sample(n=min(3, len(topic_df[topic_df['venue'] == venue])))
    for _, row in sample.iterrows():
        print(f"Title: {row['title']}")
        print(f"Affiliation: {row['affiliation']}")
        print(f"Processed Emails: {row['processed_emails']}")
        print(f"Authors: {row['authors']}")
        print(f"Classification: {row['affiliation_class']}")
        print("---")


Affiliation Classification Results:
academic    58.888889
mixed       36.858238
industry     2.260536
other        1.992337
Name: affiliation_class, dtype: float64

Updated dataframe saved as 'processed_data/topic_df_with_affiliation_classification.csv'

Examples of classified entries:

Venue: pubmed
Title: d-StructMAn: Containerized structural annotation on the scale from genetic variants to whole proteomes
Affiliation: Helmholtz Institute for Pharmaceutical Research Saarland (HIPS)/Helmholtz Centre for Infection Research (HZI), Saarbrucken 8: 66123, Germany.||Graduate School of Computer Science, Saarland University, Saarbrucken 5: 101990, Germany.||Helmholtz Institute for Pharmaceutical Research Saarland (HIPS)/Helmholtz Centre for Infection Research (HZI), Saarbrucken 8: 66123, Germany.||Graduate School of Computer Science, Saarland University, Saarbrucken 5: 101990, Germany.||Interdisciplinary Graduate School of Natural Product Research, Saarland University, Saarbrucken 6: 119991,

### Let's just accept that we can't clean all of them, and just ignore the 50 or so of the papers.

In [8]:
def print_other_papers(df):
    other_papers = df[df['affiliation_class'] == 'other']
    
    print(f"Total number of papers classified as 'other': {len(other_papers)}")
    print(f"Percentage of papers classified as 'other': {len(other_papers) / len(df) * 100:.2f}%")
    print("\nDetails of papers classified as 'other':")
    
    for idx, row in other_papers.iterrows():
        print(f"\nIndex: {idx}")
        print(f"Title: {row['title']}")
        if row['venue'] != 'pubmed':
            print(row['cleaned_title'])
        print(f"Venue: {row['venue']}")
        print(f"Affiliation: {row['affiliation']}")
        print(f"Processed Emails: {row['processed_emails']}")
        print(f"Authors: {row['authors']}")
        print("-" * 50)

# Load the dataframe
# topic_df = pd.read_csv("processed_data/topic_df_with_affiliation_classification.csv")

# Call the function to print 'other' papers
print_other_papers(topic_df)

Total number of papers classified as 'other': 52
Percentage of papers classified as 'other': 1.99%

Details of papers classified as 'other':

Index: 2098
Title: M Mi Da S- A E: Multi-modal Missing Dataaware Stacked Autoencoderfor Biomedical Abstract Screening
Multi-Modal Missing Data-Aware Stacked Autoencoder for Biomedical Abstract Screening
Venue: CHIL
Affiliation: nan
Processed Emails: No email addresses found. We are not using code here.
Authors: nan
--------------------------------------------------

Index: 2160
Title: Interpretable Missing Valuesin Healthcare Figure7: Impactoffather’seducationoninfantmor-talityrisk,. Appendix A. Testingfor M C A Rwith E B M: Case Study Insomecaseswehaveinformationaboutthemech-anismgeneratingmissingvaluesandthelikelihoodthatasimilarmechanismwillgeneratedatainthefuture. Asanexample,consider D Birth Cohort Linked Birth– Infant Death Data Files United States De-partmentof Healthand Human Services( U S H S)etal.. Thedatasetdescribespregnancyandbirthva

In [47]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

def create_affiliation_pie_chart(df, column_name='affiliation_class', exclude_category='other',
                                 width=800, height=600, text_size=12, legend_font_size=10,
                                 save_path=None, save_scale=3):
    """
    Create an interactive pie chart of affiliations using Plotly Express.
    Parameters:
    df (pandas.DataFrame): The input DataFrame containing the affiliation data.
    column_name (str): The name of the column containing affiliation categories.
    exclude_category (str): The category to exclude from the pie chart.
    width (int): Width of the figure in pixels.
    height (int): Height of the figure in pixels.
    text_size (int): Size of the text displayed on the pie slices.
    legend_font_size (int): Size of the font in the legend.
    save_path (str): The path to save the figure. If None, the figure is not saved.
    save_scale (int): The scale factor for saved image resolution.
    Returns:
    plotly.graph_objs._figure.Figure: The created figure object.
    """
    # Filter out the excluded category and calculate the counts
    affiliation_counts = df[df[column_name] != exclude_category][column_name].value_counts().reset_index()
    affiliation_counts.columns = ['Affiliation', 'Count']
    
    # Capitalize the affiliation categories
    affiliation_counts['Affiliation'] = affiliation_counts['Affiliation'].str.capitalize()
    
    # Calculate percentages and round to whole numbers
    total = affiliation_counts['Count'].sum()
    affiliation_counts['Percentage'] = (affiliation_counts['Count'] / total * 100).round().astype(int)
    
    # Create custom text for labels
    affiliation_counts['Label'] = affiliation_counts['Percentage'].astype(str) + '%'
    

    # color_discrete_sequence = px.colors.sequential.Reds[3:6]
    # color_discrete_sequence = ["#aa0000", "#c46666", "#d8a6a6"] # reds
    color_discrete_sequence = px.colors.qualitative.Set2[:len(affiliation_counts)]
    color_palette = [
        'rgb(252,141,98)',   # Orange
        'rgb(255,179,150)',  # Lighter Orange (new color)
        'rgb(141,160,203)',  # Grayish-blue
        'rgb(103,122,165)',  # Darker Grayish-Blue
        # 'rgb(28,144,153)'    # Deeper Teal (accent color)
    ]
    
    # color_palette = color_palette[::-1]
    color_discrete_sequence = color_palette
    # color_discrete_sequence = ['#2066a8', '#3594cc', '#8cc5e3'] # blues
    # color_discrete_sequence = px.colors.sequential.Oranges[3:6]
    # color_discrete_sequence = ['#1a80bb', '#ea801c', '#b8b8b8']
    # Create an interactive pie chart
    fig = px.pie(affiliation_counts,
                 values='Count',
                 names='Affiliation',
                 hover_data=['Percentage'],
                 labels={'Count':'Number of Papers'},
                 color_discrete_sequence=color_discrete_sequence,
                 width=width,
                 height=height)
    
    # Update traces to show whole number percentage on slices and keep hover template
    fig.update_traces(textposition='inside', textinfo='text',
                      text=affiliation_counts['Label'],
                      textfont_size=text_size,
                      hovertemplate="<b>%{label}</b><br>" +
                                    "Number of Papers: %{value}<br>" +
                                    "Percentage: %{text}<extra></extra>")
    
    # Move legend to the top and set font size
    fig.update_layout(
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1.0,
            font=dict(size=legend_font_size)
        ),
        margin=dict(t=80), # Increase top margin to accommodate legend
    )
    
    # Show the figure
    fig.show()
    
    # Save the figure if a save path is provided
    if save_path:
        fig.write_image(save_path, scale=save_scale)
        print(f"Pie chart saved as '{save_path}'")
    
    # Print the counts and percentages
    for _, row in affiliation_counts.iterrows():
        print(f"{row['Affiliation']}: {row['Count']} ({row['Percentage']}%)")
    print(f"\nTotal papers (excluding '{exclude_category.capitalize()}'): {total}")
    
    return fig

# Create the pie chart
fig = create_affiliation_pie_chart(
    main_df,
    width=600,
    height=400,
    text_size=28,  # Specify the desired text size for pie slices
    legend_font_size=24,  # Specify the desired font size for the legend
    save_path="figures/general/affiliation_distribution_pie_chart_capitalized_with_whole_percentages.png",
    save_scale=3
)

Pie chart saved as 'figures/general/affiliation_distribution_pie_chart_capitalized_with_whole_percentages.png'
Academic: 1537 (60%)
Mixed: 962 (38%)
Industry: 59 (2%)

Total papers (excluding 'Other'): 2558


In [10]:
topic_df.columns

Index(['year', 'paper_id', 'title', 'cleaned_title', 'authors', 'abstract',
       'citation_count', 'code', 'ai', 'venue', 'processed_emails',
       'mimic_count', 'eicu_count', 'uk_biobank_count', 'chest_x-ray14_count',
       'adni_count', 'physionet_count', 'oasis_count', 'tcga_count',
       'gdc_count', 'seer_count', 'tuh_eeg_corpus_count',
       'tuh_abnormal_eeg_corpus_count', 'tuh_eeg_artifact_corpus_count',
       'tuh_eeg_epilepsy_corpus_count', 'tuh_eeg_events_corpus_count',
       'tuh_eeg_seizure_corpus_count', 'tuh_eeg_slowing_corpus_count', 'topic',
       'affiliation', 'affiliation_class'],
      dtype='object')

In [8]:
import plotly.graph_objects as go

def plot_code_vs_nocode_overall(df, width=600, height=400, text_size=14, legend_font_size=12, save_path=None, save_scale=3):
    # Calculate percentages
    total = len(df)
    code_percentage = round((df['code'] > 0).sum() / total * 100)
    nocode_percentage = 100 - code_percentage

    # Create custom text for labels
    labels = ['Code', 'No Code']
    values = [code_percentage, nocode_percentage]
    custom_text = [f'{v}%' for v in values]

    color_scheme = px.colors.qualitative.Set2[1:3]  # Use the first two colors from the Set2 palette
    color_scheme = ['#ea801c', '#1a80bb']
    color_map = {'Private Data': color_scheme[0], 'Public Data': color_scheme[1]}
    color_scheme = [
    'rgb(252,141,98)',   # Orange
    # 'rgb(255,179,150)',  # Lighter Orange (new color)
    'rgb(141,160,203)',  # Grayish-blue
    # 'rgb(103,122,165)',  # Darker Grayish-Blue
        # 'rgb(28,144,153)'    # Deeper Teal (accent color)
    ]
    # Create the pie chart
    fig = go.Figure(data=[go.Pie(labels=labels,
                                 values=values,
                                 marker_colors=[color_scheme[1], color_scheme[0]],
                                 text=custom_text,
                                 textposition='inside',
                                 textfont_size=text_size,
                                 hovertemplate="%{label}: %{text}<extra></extra>")])

    fig.update_traces(textinfo='text')

    fig.update_layout(
        width=width,
        height=height,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="center",
            x=0.5,
            font=dict(size=legend_font_size)
        ),
        margin=dict(t=80, b=20, l=20, r=20)
    )

    fig.show()
    
    if save_path:
        fig.write_image(save_path, scale=save_scale)
        print(f"Pie chart saved as '{save_path}'")

    # Print the percentages
    print(f"Code: {code_percentage}%")
    print(f"No Code: {nocode_percentage}%")

# Usage:
# plot_code_vs_nocode_overall(your_dataframe, text_size=14, legend_font_size=12, save_path="code_vs_nocode_overall_piechart.png")

# Usage:
# plot_code_vs_nocode_overall(your_dataframe, text_size=14, legend_font_size=12, save_path="code_vs_nocode_overall_piechart.png")

# Usage:
plot_code_vs_nocode_overall(main_df, text_size=28, legend_font_size=24, save_path="figures/code_vs_nocode_overall_piechart.png")

Pie chart saved as 'figures/code_vs_nocode_overall_piechart.png'
Code: 26%
No Code: 74%


In [9]:
import plotly.graph_objects as go

def plot_public_vs_private_overall(df, width=600, height=400, text_size=14, legend_font_size=12, save_path=None, save_scale=3):
    # List of public dataset columns
    public_datasets = [
        'mimic_count', 'eicu_count', 'uk_biobank_count', 'chest_x-ray14_count',
        'adni_count', 'physionet_count', 'oasis_count', 'tcga_count', 'gdc_count',
        'seer_count', 'tuh_eeg_corpus_count', 'tuh_abnormal_eeg_corpus_count',
        'tuh_eeg_artifact_corpus_count', 'tuh_eeg_epilepsy_corpus_count',
        'tuh_eeg_events_corpus_count', 'tuh_eeg_seizure_corpus_count',
        'tuh_eeg_slowing_corpus_count', 'paper_with_code_data_count'
    ]

    # Calculate if any public dataset was used
    df['public_data_used'] = df[public_datasets].any(axis=1)

    # Calculate percentages
    total = len(df)
    public_percentage = round(df['public_data_used'].sum() / total * 100)
    private_percentage = 100 - public_percentage

    # Create custom text for labels
    labels = ['Public Data', 'Private Data']
    values = [public_percentage, private_percentage]
    custom_text = [f'{v}%' for v in values]

    color_scheme = px.colors.qualitative.Set2[1:3]  # Use the first two colors from the Set2 palette
    # color_scheme = ['#1a80bb', '#ea801c']
    # color_scheme = ['#ea801c', '#1a80bb']
    color_scheme = [
    'rgb(252,141,98)',   # Orange
    # 'rgb(255,179,150)',  # Lighter Orange (new color)
    'rgb(141,160,203)',  # Grayish-blue
    # 'rgb(103,122,165)',  # Darker Grayish-Blue
        # 'rgb(28,144,153)'    # Deeper Teal (accent color)
    ]
    color_map = {'Private Data': color_scheme[0], 'Public Data': color_scheme[1]}
    # Create the pie chart
    fig = go.Figure(data=[go.Pie(labels=labels,
                                 values=values,
                                 marker_colors=[color_scheme[1], color_scheme[0]],
                                 text=custom_text,
                                 textposition='inside',
                                 textfont_size=text_size,
                                 hovertemplate="%{label}: %{text}<extra></extra>")])

    fig.update_traces(textinfo='text')

    fig.update_layout(
        width=width,
        height=height,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="center",
            x=0.5,
            font=dict(size=legend_font_size)
        ),
        margin=dict(t=80, b=20, l=20, r=20)
    )

    fig.show()
    
    if save_path:
        fig.write_image(save_path, scale=save_scale)
        print(f"Pie chart saved as '{save_path}'")
    # Print the percentages
    print(f"Public Data: {public_percentage}%")
    print(f"Private Data: {private_percentage}%")

# Usage:
# plot_public_vs_private_overall(your_dataframe, text_size=14, legend_font_size=12, save_path="public_vs_private_overall_piechart.png")

# Usage:
# plot_public_vs_private_overall(your_dataframe, text_size=14, legend_font_size=12, save_path="public_vs_private_overall_piechart.png")
# Usage
plot_public_vs_private_overall(main_df, text_size=28, legend_font_size=24, save_path="figures/public_vs_private_overall_piechart.png")

Pie chart saved as 'figures/public_vs_private_overall_piechart.png'
Public Data: 26%
Private Data: 74%


In [52]:
import plotly.graph_objects as go
from collections import Counter
import plotly.express as px

def plot_topic_distribution(df, width=800, height=600, text_size=14, legend_font_size=12, save_path=None, save_scale=3):
    # Calculate topic distribution
    topic_counts = Counter(df['topic'])
    total = len(df)
    
    # Sort topics by count (descending order)
    sorted_topics = sorted(topic_counts.items(), key=lambda x: x[1], reverse=True)
    
    # Separate labels and values
    labels = [topic for topic, _ in sorted_topics]
    values = [count for _, count in sorted_topics]
    
    # Calculate percentages
    percentages = [count / total * 100 for count in values]
    
    # Create custom text for labels
    custom_text = [f'{p:.0f}%' for p in percentages]
    
    # Get Set2 color palette
    # set2_colors = px.colors.sequential.RdBu[5:]
    # set2_colors = ['#0d7d87', '#c31e23', '#99c6cc', '#ff5a5e']
    # set2_colors = px.colors.qualitative.Set2[:len(labels)]
    set2_colors = ["#2066a8", "#3594cc", "#8cc5e3"]
    set2_colors = px.colors.sequential.Blues[3:7]
    set2_colors = px.colors.sequential.Teal_r[3:7]
    color_palette = [
    'rgb(252,141,98)',   # Orange
    'rgb(255,179,150)',  # Lighter Orange (new color)
    'rgb(141,160,203)',  # Grayish-blue
    'rgb(103,122,165)',  # Darker Grayish-Blue
        # 'rgb(28,144,153)'    # Deeper Teal (accent color)
    ]
        
    color_palette = color_palette[::-1]
    set2_colors = color_palette
    # Create the pie chart with Set2 colors
    fig = go.Figure(data=[go.Pie(
        labels=labels,
        values=values,
        text=custom_text,
        textposition='inside',
        textfont_size=text_size,
        hovertemplate="%{label}: %{text}<extra></extra>",
        marker=dict(colors=set2_colors[:len(labels)])  # Use Set2 colors
    )])
    
    fig.update_traces(textinfo='text')
    fig.update_layout(
        width=width,
        height=height,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="center",
            x=0.5,
            font=dict(size=legend_font_size)
        ),
        margin=dict(t=80, b=20, l=20, r=20)
    )
    
    fig.show()
    
    if save_path:
        fig.write_image(save_path, scale=save_scale)
        print(f"Pie chart saved as '{save_path}'")
    
    # Print the percentages
    for topic, percentage in zip(labels, percentages):
        print(f"{topic}: {percentage:.0f}%")

# Usage:
plot_topic_distribution(main_df, text_size=38, legend_font_size=32, save_path="figures/general/topic_distribution_piechart.png")

Pie chart saved as 'figures/general/topic_distribution_piechart.png'
E.H.R: 63%
Clinical Images: 16%
Biomedicine: 12%
Biosignals: 8%
