In [15]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

import pandas as pd
import numpy as np

def count_nan_citations(chil_df: pd.DataFrame, ml4h_df: pd.DataFrame, mlhc_df: pd.DataFrame) -> dict:
    """
    Counts the number of rows where the 'citation_count' column is NaN for each dataframe.
    
    Args:
        chil_df, ml4h_df, mlhc_df: pandas DataFrames containing paper data
        
    Returns:
        dict: A dictionary with the counts of NaN citations for each conference
    """
    nan_citations = {
        'CHIL': chil_df['citation_count'].isna().sum(),
        'ML4H': ml4h_df['citation_count'].isna().sum(),
        'MLHC': mlhc_df['citation_count'].isna().sum()
    }
    
    # Calculate totals and percentage
    total_nan_citations = sum(nan_citations.values())
    total_papers = len(chil_df) + len(ml4h_df) + len(mlhc_df)
    
    nan_citations['Total'] = total_nan_citations
    nan_citations['Percentage'] = (total_nan_citations / total_papers) * 100
    
    return nan_citations

def count_uncleaned_titles(chil_df: pd.DataFrame, ml4h_df: pd.DataFrame, mlhc_df: pd.DataFrame) -> dict:
    """
    Counts the number of rows where the 'cleaned_title' column is empty or NaN for each dataframe.
    
    Args:
        chil_df, ml4h_df, mlhc_df: pandas DataFrames containing paper data
        
    Returns:
        dict: A dictionary with the counts of uncleaned titles for each conference
    """
    # Function to count empty or NaN titles
    def count_empty_or_nan(df):
        if 'cleaned_title' not in df.columns:
            return len(df)  # If column doesn't exist, count all rows as uncleaned
        return df['cleaned_title'].isna().sum() + (df['cleaned_title'] == '').sum()
    
    uncleaned_titles = {
        'CHIL': count_empty_or_nan(chil_df),
        'ML4H': count_empty_or_nan(ml4h_df),
        'MLHC': count_empty_or_nan(mlhc_df)
    }
    
    # Calculate totals and percentage
    total_uncleaned = sum(uncleaned_titles.values())
    total_papers = len(chil_df) + len(ml4h_df) + len(mlhc_df)
    
    uncleaned_titles['Total'] = total_uncleaned
    uncleaned_titles['Percentage'] = (total_uncleaned / total_papers) * 100
    
    return uncleaned_titles

def count_nan_titles(chil_df, ml4h_df, mlhc_df):
    """
    Counts the number of rows where the 'title' column is NaN for each dataframe.
    
    Args:
    chil_df, ml4h_df, mlhc_df: pandas DataFrames containing paper data
    
    Returns:
    dict: A dictionary with the counts of NaN titles for each conference
    """
    nan_titles = {
        'CHIL': chil_df['title'].isna().sum(),
        'ML4H': ml4h_df['title'].isna().sum(),
        'MLHC': mlhc_df['title'].isna().sum()
    }
    
    total_nan_titles = sum(nan_titles.values())
    total_papers = len(chil_df) + len(ml4h_df) + len(mlhc_df)
    
    nan_titles['Total'] = total_nan_titles
    nan_titles['Percentage'] = (total_nan_titles / total_papers) * 100
    
    return nan_titles

file_paths = {
            'ml4h': {
                'input': 'data/cleaned/ml4h/ml4h_cleaned.csv',
                'output': 'data/processed/ml4h/ml4h_citations.csv'
            },
            'chil': {
                'input': 'data/cleaned/chil/chil_cleaned.csv',
                'output': 'data/processed/chil/chil_citations.csv'
            },
            'mlhc': {
                'input': 'data/cleaned/mlhc/mlhc_cleaned.csv',
                'output': 'data/processed/mlhc/mlhc_citations.csv'
            }
        }

# Load the dataframes
chil_semantic_df = pd.read_csv(file_paths['chil']['output'])
ml4h_semantic_df = pd.read_csv(file_paths['ml4h']['output'])
mlhc_semantic_df = pd.read_csv(file_paths['mlhc']['output'])

# Count NaN citations
nan_citation_counts = count_nan_citations(chil_semantic_df, ml4h_semantic_df, mlhc_semantic_df)

# Print the NaN citation results
print("\nNaN Citation Counts:")
for conference, count in nan_citation_counts.items():
    if conference == 'Percentage':
        print(f"{conference}: {count:.2f}%")
    else:
        print(f"{conference}: {count}")

# Count uncleaned titles
uncleaned_title_counts = count_uncleaned_titles(chil_semantic_df, ml4h_semantic_df, mlhc_semantic_df)

# Print the uncleaned title results
print("\nUncleaned Title Counts:")
for conference, count in uncleaned_title_counts.items():
    if conference == 'Percentage':
        print(f"{conference}: {count:.2f}%")
    else:
        print(f"{conference}: {count}")

# Count NaN titles
nan_title_counts = count_nan_titles(chil_semantic_df, ml4h_semantic_df, mlhc_semantic_df)

# Print the NaN title results
print("\nNaN Title Counts:")
for conference, count in nan_title_counts.items():
    if conference == 'Percentage':
        print(f"{conference}: {count:.2f}%")
    else:
        print(f"{conference}: {count}")

# ... [Rest of the code remains unchanged] ...


NaN Citation Counts:
CHIL: 29
ML4H: 16
MLHC: 110
Total: 155
Percentage: 24.41%

Uncleaned Title Counts:
CHIL: 4
ML4H: 3
MLHC: 23
Total: 30
Percentage: 4.72%

NaN Title Counts:
CHIL: 4
ML4H: 3
MLHC: 23
Total: 30
Percentage: 4.72%


In [17]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import os

# ... [Previous functions remain unchanged] ...

# # Load the dataframes
# chil_semantic_df = pd.read_csv("processed_data/chil_semantic_scholar_citations.csv")
# ml4h_semantic_df = pd.read_csv("processed_data/ml4h_semantic_scholar_citations.csv")
# mlhc_semantic_df = pd.read_csv("processed_data/mlhc_semantic_scholar_citations.csv")

# Print original dataframe sizes
print("\nOriginal Dataframe Sizes:")
print(f"CHIL: {len(chil_semantic_df)}")
print(f"ML4H: {len(ml4h_semantic_df)}")
print(f"MLHC: {len(mlhc_semantic_df)}")

# Count NaN titles before removal
nan_title_counts = count_nan_titles(chil_semantic_df, ml4h_semantic_df, mlhc_semantic_df)

# Print the NaN title results before removal
print("\nNaN Title Counts (Before Removal):")
for conference, count in nan_title_counts.items():
    if conference == 'Percentage':
        print(f"{conference}: {count:.2f}%")
    else:
        print(f"{conference}: {count}")

# Remove rows with NaN titles
chil_semantic_df = chil_semantic_df.dropna(subset=['title'])
ml4h_semantic_df = ml4h_semantic_df.dropna(subset=['title'])
mlhc_semantic_df = mlhc_semantic_df.dropna(subset=['title'])

# Print new dataframe sizes
print("\nDataframe Sizes After Removing NaN Titles:")
print(f"CHIL: {len(chil_semantic_df)}")
print(f"ML4H: {len(ml4h_semantic_df)}")
print(f"MLHC: {len(mlhc_semantic_df)}")

# Recount NaN citations after removing rows with NaN titles
nan_citation_counts = count_nan_citations(chil_semantic_df, ml4h_semantic_df, mlhc_semantic_df)

# Print the new NaN citation results
print("\nNaN Citation Counts (After Removing NaN Titles):")
for conference, count in nan_citation_counts.items():
    if conference == 'Percentage':
        print(f"{conference}: {count:.2f}%")
    else:
        print(f"{conference}: {count}")

# Save cleaned dataframes to new CSV files
output_dir = "data/cleaned"
os.makedirs(output_dir, exist_ok=True)

chil_semantic_df.to_csv(file_paths['chil']['output'], index=False)
ml4h_semantic_df.to_csv(file_paths['ml4h']['output'], index=False)
mlhc_semantic_df.to_csv(file_paths['mlhc']['output'], index=False)

print("\nCleaned CSV files have been saved in the 'data/processed' directory.")

# ... [Rest of the code remains unchanged] ...


Original Dataframe Sizes:
CHIL: 155
ML4H: 134
MLHC: 346

NaN Title Counts (Before Removal):
CHIL: 4
ML4H: 3
MLHC: 23
Total: 30
Percentage: 4.72%

Dataframe Sizes After Removing NaN Titles:
CHIL: 151
ML4H: 131
MLHC: 323

NaN Citation Counts (After Removing NaN Titles):
CHIL: 25
ML4H: 13
MLHC: 87
Total: 125
Percentage: 20.66%

Cleaned CSV files have been saved in the 'processed_data/cleaned' directory.


# Getting the important titles we want to query SerpAPI for 

In [34]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import os

# ... [Previous functions remain unchanged] ...

def get_titles_with_nan_citations(df):
    """
    Returns a list of cleaned titles for papers with NaN citation counts.
    
    Args:
    df: pandas DataFrame containing paper data
    
    Returns:
    list: A list of cleaned titles for papers with NaN citation counts
    """
    return df[df['citation_count'].isna()]['cleaned_title'].tolist()

# Load the cleaned dataframes
# Get lists of titles with NaN citations for each conference
chil_nan_citation_titles = get_titles_with_nan_citations(chil_semantic_df)
ml4h_nan_citation_titles = get_titles_with_nan_citations(ml4h_semantic_df)
mlhc_nan_citation_titles = get_titles_with_nan_citations(mlhc_semantic_df)

# Print the results
print("\nTitles with NaN citations:")
print(f"\nCHIL ({len(chil_nan_citation_titles)}):")
for title in chil_nan_citation_titles:  # Print first 5 for brevity
    print(f"- {title}")
# if len(chil_nan_citation_titles) > 5:
#     print("...")

print(f"\nML4H ({len(ml4h_nan_citation_titles)}):")
for title in ml4h_nan_citation_titles:  # Print first 5 for brevity
    print(f"- {title}")
# if len(ml4h_nan_citation_titles) > 5:
#     print("...")

print(f"\nMLHC ({len(mlhc_nan_citation_titles)}):")
for title in mlhc_nan_citation_titles:  # Print first 5 for brevity
    print(f"- {title}")
# if len(mlhc_nan_citation_titles) > 5:
#     print("...")

# ... [Rest of the code remains unchanged] ...


Titles with NaN citations:

CHIL (23):
- MIC-Extract: A Data Extraction, Preprocessing, and Representation Pipeline for
- Visual Che Xbert: Addressing the Discrepancy Between Radiology Report Labels and Image Labels
- Che X-Transfer: Performance and Parameter Efficiency of Image Net Models for Chest X-Ray Interpretation
- Interpretable Missing Values in Healthcare: Figure 7 - Impact of Father's Education on Infant Mortality Risk. Appendix A: Testing for MCAR with EBM: Case Study.
- Toward the Practical Utility of Federated Learning in the Medical Domain
- Evaluating Model Performance in Medical Datasets Over Time: A Snapshot into the State of Proceedings and the First 20 Papers That Came Up in the Radiology Medical Journal When Searching for the Keyword "Machine Learning" and Filtering for Papers from To
- Understanding and Predicting Environment Effects on Individuals with T2D: Appendix A - CGM Dataset. We include information on the range of values for each external factor in our dat

In [32]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import os
import re

# ... [Previous functions remain unchanged] ...

def specific_cleaning(df):
    """
    Performs specific cleaning operations on the dataframe.
    
    Args:
    df: pandas DataFrame containing paper data
    
    Returns:
    pandas DataFrame: The cleaned dataframe
    """
    # Titles to remove
    titles_to_remove = [
        "Supplementary",
        "Motivation for What Purpose Was the Dataset Created?",
        "Conference Proceedings",
        "Variable Total Age (Mean SD) 55.14",
        "There is no title to clean",
        "Machine Learning for Healthcare August 8-10",
        "Development of a Clinical Decision Tool and Protocol for Identification and Treatment",
        "Clinical Abstract Track",
        "To 32.9 in 1,"
    ]
    
    # Remove rows with specified titles
    pattern = '|'.join(map(re.escape, titles_to_remove))
    df = df[~df['cleaned_title'].str.contains(pattern, case=False, regex=True)]
    
    # Clean specific titles
    df['cleaned_title'] = df['cleaned_title'].replace({
        "EG to Text: Learning to Write Medical Reports from G Recordings": 
        "EEG to Text: Learning to Write Medical Reports from EEG Recordings"
    })
    
    # Remove "Preprint: Under Review" and associated numbers
    df['cleaned_title'] = df['cleaned_title'].apply(lambda x: re.sub(r'Preprint: Under Review \d+:\s*\d+–\d+,', '', x).strip())
    df['cleaned_title'] = df['cleaned_title'].apply(lambda x: re.sub(r'Preprint: Under Review \d+–\d+,?', '', x).strip())
    
    # Remove rows where the cleaned title is empty after cleaning
    df = df[df['cleaned_title'] != '']
    
    return df

# Apply specific cleaning to each dataframe
print("Applying specific cleaning operations...")
chil_semantic_df = specific_cleaning(chil_semantic_df)
ml4h_semantic_df = specific_cleaning(ml4h_semantic_df)
mlhc_semantic_df = specific_cleaning(mlhc_semantic_df)

# Print the number of rows in each dataframe after cleaning
print(f"\nRows after specific cleaning:")
print(f"CHIL: {len(chil_semantic_df)}")
print(f"ML4H: {len(ml4h_semantic_df)}")
print(f"MLHC: {len(mlhc_semantic_df)}")

# Save the specifically cleaned dataframes
# output_dir = "processed_data/specifically_cleaned"
# os.makedirs(output_dir, exist_ok=True)

chil_semantic_df.to_csv(file_paths['chil']['output'], index=False)
ml4h_semantic_df.to_csv(file_paths['ml4h']['output'], index=False)
mlhc_semantic_df.to_csv(file_paths['mlhc']['output'], index=False)

print(f"\nSpecifically cleaned CSV files have been saved in the '{output_dir}' directory.")

# Function to check for "preprint" in titles
def count_preprint_titles(df):
    return df['cleaned_title'].str.lower().str.contains('preprint').sum()

# Count and print the number of titles containing "preprint"
print("\nNumber of titles containing 'preprint':")
print(f"CHIL: {count_preprint_titles(chil_semantic_df)}")
print(f"ML4H: {count_preprint_titles(ml4h_semantic_df)}")
print(f"MLHC: {count_preprint_titles(mlhc_semantic_df)}")

# Function to get titles containing "preprint"
def get_preprint_titles(df):
    return df[df['cleaned_title'].str.lower().str.contains('preprint')]['cleaned_title'].tolist()

# Print any titles that still contain "preprint"
for name, df in [("CHIL", chil_semantic_df), ("ML4H", ml4h_semantic_df), ("MLHC", mlhc_semantic_df)]:
    preprint_titles = get_preprint_titles(df)
    if preprint_titles:
        print(f"\n{name} title(s) still containing 'preprint':")
        for title in preprint_titles:
            print(f"- {title}")
    else:
        print(f"\nNo {name} titles contain 'preprint'.")

# ... [Rest of the code remains unchanged] ...

Applying specific cleaning operations...

Rows after specific cleaning:
CHIL: 148
ML4H: 127
MLHC: 283

Specifically cleaned CSV files have been saved in the 'processed_data/specifically_cleaned' directory.

Number of titles containing 'preprint':
CHIL: 0
ML4H: 0
MLHC: 0

No CHIL titles contain 'preprint'.

No ML4H titles contain 'preprint'.

No MLHC titles contain 'preprint'.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_title'] = df['cleaned_title'].replace({
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_title'] = df['cleaned_title'].apply(lambda x: re.sub(r'Preprint: Under Review \d+:\s*\d+–\d+,', '', x).strip())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_title'] = df['cle

In [36]:
import pandas as pd
import json
import os
from serpapi import GoogleSearch
from time import sleep
import re

def clean_filename(title):
    # Remove invalid characters and limit length
    clean = re.sub(r'[^\w\s-]', '', title).strip()
    return clean[:100]  # Limit filename length

def search_and_save_results(df, api_key, output_dir='serpapi_results'):
    os.makedirs(output_dir, exist_ok=True)
    
    # Filter for rows where citation_count is NaN
    df_nan_citations = df[pd.isna(df['citation_count'])]
    
    print(f"Processing {len(df_nan_citations)} titles with NaN citation counts...")
    
    for index, row in df_nan_citations.iterrows():
        cleaned_title = row['cleaned_title']
        filename = clean_filename(cleaned_title)
        filename = filename.replace(' ', '_') + '.json'
        
        params = {
            "api_key": api_key,
            "engine": "google_scholar",
            "q": cleaned_title,
            "hl": "en"
        }
        
        try:
            search = GoogleSearch(params)
            results = search.get_dict()
            
            with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=4)
            
            print(f"Saved results for: {cleaned_title}")
            sleep(2)  # To avoid hitting rate limits
        
        except Exception as e:
            print(f"Error processing {cleaned_title}: {str(e)}")
    
    print("Finished processing all titles with NaN citation counts.")

# Usage
api_key = ""  # Replace with your actual API key

# Load the cleaned dataframes
# input_dir = "processed_data/specifically_cleaned"
# chil_df = pd.read_csv(f"{input_dir}/chil_semantic_scholar_citations.csv")
# ml4h_df = pd.read_csv(f"{input_dir}/ml4h_semantic_scholar_citations.csv")
# mlhc_df = pd.read_csv(f"{input_dir}/mlhc_semantic_scholar_citations.csv")

# Process each dataframe
for name, df in [("CHIL", chil_semantic_df), ("ML4H", ml4h_semantic_df), ("MLHC", mlhc_semantic_df)]:
    print(f"\nProcessing {name} titles...")
    output_dir = f"serpapi_results_{name.lower()}_nan_citations"
    search_and_save_results(df, api_key, output_dir)

print("All processing completed.")


Processing CHIL titles...
Processing 23 titles with NaN citation counts...
Saved results for: MIC-Extract: A Data Extraction, Preprocessing, and Representation Pipeline for
Saved results for: Visual Che Xbert: Addressing the Discrepancy Between Radiology Report Labels and Image Labels
Saved results for: Che X-Transfer: Performance and Parameter Efficiency of Image Net Models for Chest X-Ray Interpretation
Saved results for: Interpretable Missing Values in Healthcare: Figure 7 - Impact of Father's Education on Infant Mortality Risk. Appendix A: Testing for MCAR with EBM: Case Study.
Saved results for: Toward the Practical Utility of Federated Learning in the Medical Domain
Saved results for: Evaluating Model Performance in Medical Datasets Over Time: A Snapshot into the State of Proceedings and the First 20 Papers That Came Up in the Radiology Medical Journal When Searching for the Keyword "Machine Learning" and Filtering for Papers from To
Saved results for: Understanding and Predicti

In [1]:
import pandas as pd
import json
import os
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Download necessary NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

def clean_filename(title):
    # Remove special characters and limit length
    clean = re.sub(r'[^\w\s-]', '', title).strip()
    clean = re.sub(r'\s+', '_', clean)  # Replace spaces with underscores
    return clean[:100]  # Limit filename length

def compare_titles(original_title, result_title):
    original_tokens = word_tokenize(original_title.lower())
    result_tokens = word_tokenize(result_title.lower())
    stop_words = set(stopwords.words('english'))
    original_tokens = [token for token in original_tokens if token.isalpha() and token not in stop_words]
    result_tokens = [token for token in result_tokens if token.isalpha() and token not in stop_words]
    overlapping_words = set(original_tokens) & set(result_tokens)
    return len(overlapping_words) >= 3

def extract_citations_from_file(file_path, original_title):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    
    if 'organic_results' in data and len(data['organic_results']) > 0:
        result = data['organic_results'][0]
        if compare_titles(original_title, result['title']):
            return result.get('inline_links', {}).get('cited_by', {}).get('total', 0)
    return None

def process_dataframe(df, input_dir):
    # Filter for rows where citation_count is NaN
    df_nan_citations = df[pd.isna(df['citation_count'])]
    
    print(f"Processing {len(df_nan_citations)} titles with NaN citation counts...")
    unmatched_count = 0
    for index, row in df_nan_citations.iterrows():
        cleaned_title = row['cleaned_title']
        filename = clean_filename(cleaned_title) + '.json'
        file_path = os.path.join(input_dir, filename)
        
        if os.path.exists(file_path):
            # Extract citations
            citations = extract_citations_from_file(file_path, cleaned_title)
            if citations is not None:
                df.loc[index, 'citation_count'] = citations
            else:
                unmatched_count += 1
        else:
            print(f"File not found for: {cleaned_title}")
            unmatched_count += 1
    
    # Drop rows with NaN citation counts
    df_cleaned = df.dropna(subset=['citation_count'])
    
    return df_cleaned, unmatched_count

# Usage

# Process each dataframe
for name, df in [("CHIL", chil_semantic_df), ("ML4H", ml4h_semantic_df), ("MLHC", mlhc_semantic_df)]:
    print(f"\nProcessing {name} titles...")
    input_dir = f"serpapi_results_{name.lower()}_nan_citations"
    df_cleaned, unmatched_count = process_dataframe(df, input_dir)
    
    # Save the cleaned dataframe
    output_file = file_paths[name.lower()]['output']
    df_cleaned.to_csv(output_file, index=False)
    print(f"Saved cleaned {name} dataframe to {output_file}")
    
    # Print statistics
    print(f"Total entries processed: {len(df)}")
    print(f"Entries with NaN citations: {len(df) - len(df_cleaned)}")
    print(f"Entries without matching SerpAPI results: {unmatched_count}")
    print(f"Entries with updated citations: {len(df_cleaned) - (len(df) - len(df[pd.isna(df['citation_count'])]))}")

print("All processing completed.")


Processing CHIL titles...
Processing 23 titles with NaN citation counts...
Saved cleaned CHIL dataframe to processed_data/cleaned_chil_citations.csv
Total entries processed: 148
Entries with NaN citations: 1
Entries without matching SerpAPI results: 1
Entries with updated citations: 0

Processing ML4H titles...
Processing 9 titles with NaN citation counts...
Saved cleaned ML4H dataframe to processed_data/cleaned_ml4h_citations.csv
Total entries processed: 127
Entries with NaN citations: 4
Entries without matching SerpAPI results: 4
Entries with updated citations: 0

Processing MLHC titles...
Processing 60 titles with NaN citation counts...
Saved cleaned MLHC dataframe to processed_data/cleaned_mlhc_citations.csv
Total entries processed: 283
Entries with NaN citations: 25
Entries without matching SerpAPI results: 25
Entries with updated citations: 0
All processing completed.
