In [39]:
import pandas as pd
import ast
import re
from collections import Counter
from difflib import SequenceMatcher


In [2]:
# Load the cleaned article/reference table from the local parquet file.
all_data_ref = pd.read_parquet('all_data_ref_clean.parquet')

In [3]:
all_data_ref.head(2)

Unnamed: 0,bibcode,my_keyword,first_author,n_authors,first_author_aff,year,read_count,citation_count,reference_journals,n_references,journal,SJR Best Quartile,grouped_keywords
0,1977JCoMa..11..395C,AI,"Christensen, R. M.",2,"California, University, Livermore, Calif",2024,0,4,"[JAM, JCoMa]",2,JCoMa,,AI
1,2000A&A...353....1R,Cosmology,"Rebhan, E.",1,"Institut für Theoretische Physik, Heinrich-Hei...",2000,17,12,"[JETP, PhRv, SvPhU, JETPL, PhLB, PhRvL, PhLB, ...",27,A&A,Q1,Cosmology


In [4]:
# Collapse each article's list of referenced journals into a
# {journal: number_of_references} dictionary.
all_data_ref['reference_journal_counts'] = [
    dict(Counter(journals)) for journals in all_data_ref['reference_journals']
]
all_data_ref['reference_journal_counts'].head(2)

0                               {'JAM': 1, 'JCoMa': 1}
1    {'JETP': 1, 'PhRv': 1, 'SvPhU': 1, 'JETPL': 2,...
Name: reference_journal_counts, dtype: object

In [None]:
def get_top_journals_by_keyword_contains(df, keyword_list, keyword_col='grouped_keywords', journal_col='journal', top_n=10):
    """
    Returns a wide-format DataFrame with the top N journals for each keyword substring match,
    including both journal names and their publication counts.

    Each keyword contributes two columns, `<keyword>_journal` and
    `<keyword>_journal_count`, ordered by descending count. Keywords with fewer
    than `top_n` matching journals leave the remaining cells as NaN.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        keyword_list (list): List of keyword substrings to match (case-insensitive,
            treated as literal text, not as regular expressions).
        keyword_col (str): Column name to search within for keyword substrings.
        journal_col (str): Column name containing journal names.
        top_n (int): Number of top journals to retrieve per keyword substring.

    Returns:
        pd.DataFrame: Wide-format DataFrame with journal names and counts for each matched keyword.
            Empty when `keyword_list` is empty.
    """
    per_keyword_tables = []

    for keyword in keyword_list:
        # Literal, case-insensitive substring matching. regex=False keeps labels
        # that contain regex metacharacters (e.g. "C++") from raising or from
        # matching unintended rows; rows with a missing keyword never match.
        mask = df[keyword_col].str.contains(keyword, case=False, na=False, regex=False)

        top_journals = df.loc[mask, journal_col].value_counts().head(top_n)

        per_keyword_tables.append(pd.DataFrame({
            f"{keyword}_journal": top_journals.index,
            f"{keyword}_journal_count": top_journals.values
        }))

    if not per_keyword_tables:
        # pd.concat refuses an empty list; keep the "no keywords -> empty table" contract.
        return pd.DataFrame()

    # Concatenate once instead of re-copying the growing table on every iteration.
    return pd.concat(per_keyword_tables, axis=1)
# Define your keyword patterns to look for (case-insensitive).
# Missing labels are dropped first: a NaN pattern cannot be passed to `str.contains`.
keywords_to_search = all_data_ref['grouped_keywords'].dropna().unique().tolist()
#keywords_to_search = ['exoplanet', 'star', 'AGN', 'Galaxy', 'Astrobiology', 'Asteroseismology']

top10_table = get_top_journals_by_keyword_contains(all_data_ref, keyword_col = 'grouped_keywords', keyword_list=keywords_to_search, top_n=10)
top10_table.head(3)


Unnamed: 0,AI_journal,AI_journal_count,Cosmology_journal,Cosmology_journal_count,Astronomy_journal,Astronomy_journal_count,Astrobiology_journal,Astrobiology_journal_count
0,NatSR,10430,PhRvD,31193,ApJ,40806,NatSR,15513
1,Senso,8964,MNRAS,18721,MNRAS,36906,PNAS,12630
2,IEEEA,8276,ApJ,12825,A&A,33715,NatCo,5954


In [6]:
# Filter only the columns that contain journal names (not counts)
journal_columns = [col for col in top10_table.columns if col.endswith('_journal')]

# Flatten all journal names into one array of distinct values
all_top_journals = pd.unique(top10_table[journal_columns].values.ravel())

# Drop NaNs and convert to a clean Python list
unique_top_journals = [j for j in all_top_journals if pd.notna(j)]

# Add the extra journals, skipping any that already made it into the top lists
# so the final list never contains duplicates.
extra_journals = ['Sci', 'NatAs']
unique_top_journals_plus_sci = unique_top_journals + [
    j for j in extra_journals if j not in unique_top_journals
]

# Now `unique_top_journals_plus_sci` is your final list
print(unique_top_journals_plus_sci)
print(len(unique_top_journals_plus_sci))



['NatSR', 'PhRvD', 'ApJ', 'Senso', 'MNRAS', 'PNAS', 'IEEEA', 'A&A', 'NatCo', 'RemS', 'JCAP', 'ApJL', 'PhRvE', 'PLoSO', 'CQGra', 'AJ', 'Bioin', 'PatRe', 'ApJS', 'Heliy', 'JHEP', 'Ap&SS', 'Natur', 'CEAgr', 'PhLB', 'PASJ', 'JMoSt', 'EPJC', 'ApEn', 'AN', 'ESPR', 'Sci', 'NatAs']
33


In [7]:
def add_reference_journal_columns(df, ref_col='reference_journal_counts', journal_list=None):
    """
    Add one `<journal>_ref_count` column per requested journal.

    Each new column holds, for every row, the value stored under that journal
    in the dictionary found in `ref_col`; rows whose cell is not a dictionary,
    or whose dictionary lacks the journal, get 0.

    Parameters:
        df (pd.DataFrame): Input DataFrame. It is modified in place.
        ref_col (str): Name of the column containing dictionaries of journal reference counts.
        journal_list (list): Journal names to extract counts for.

    Returns:
        pd.DataFrame: The same DataFrame object, with the new columns added.

    Raises:
        ValueError: If `journal_list` is not provided.
    """
    if journal_list is None:
        raise ValueError("You must provide a list of journal names to extract.")

    def make_extractor(journal):
        # Bind one journal name and return the per-cell lookup for it.
        def extract(counts):
            if not isinstance(counts, dict):
                return 0
            return counts.get(journal, 0)
        return extract

    for journal in journal_list:
        df[f"{journal}_ref_count"] = df[ref_col].apply(make_extractor(journal))

    return df


In [8]:
# Add one `<journal>_ref_count` column for every journal in
# unique_top_journals_plus_sci (33 journals in the run shown above).
all_data_ref = add_reference_journal_columns(
    all_data_ref,
    ref_col='reference_journal_counts',
    journal_list=unique_top_journals_plus_sci
)


In [11]:
all_data_ref.columns

Index(['bibcode', 'my_keyword', 'first_author', 'n_authors',
       'first_author_aff', 'year', 'read_count', 'citation_count',
       'reference_journals', 'n_references', 'journal', 'SJR Best Quartile',
       'grouped_keywords', 'reference_journal_counts', 'NatSR_ref_count',
       'PhRvD_ref_count', 'ApJ_ref_count', 'Senso_ref_count',
       'MNRAS_ref_count', 'PNAS_ref_count', 'IEEEA_ref_count', 'A&A_ref_count',
       'NatCo_ref_count', 'RemS_ref_count', 'JCAP_ref_count', 'ApJL_ref_count',
       'PhRvE_ref_count', 'PLoSO_ref_count', 'CQGra_ref_count', 'AJ_ref_count',
       'Bioin_ref_count', 'PatRe_ref_count', 'ApJS_ref_count',
       'Heliy_ref_count', 'JHEP_ref_count', 'Ap&SS_ref_count',
       'Natur_ref_count', 'CEAgr_ref_count', 'PhLB_ref_count',
       'PASJ_ref_count', 'JMoSt_ref_count', 'EPJC_ref_count', 'ApEn_ref_count',
       'AN_ref_count', 'ESPR_ref_count', 'Sci_ref_count', 'NatAs_ref_count'],
      dtype='object')

In [10]:
all_data_ref.head(2)

Unnamed: 0,bibcode,my_keyword,first_author,n_authors,first_author_aff,year,read_count,citation_count,reference_journals,n_references,...,CEAgr_ref_count,PhLB_ref_count,PASJ_ref_count,JMoSt_ref_count,EPJC_ref_count,ApEn_ref_count,AN_ref_count,ESPR_ref_count,Sci_ref_count,NatAs_ref_count
0,1977JCoMa..11..395C,AI,"Christensen, R. M.",2,"California, University, Livermore, Calif",2024,0,4,"[JAM, JCoMa]",2,...,0,0,0,0,0,0,0,0,0,0
1,2000A&A...353....1R,Cosmology,"Rebhan, E.",1,"Institut für Theoretische Physik, Heinrich-Hei...",2000,17,12,"[JETP, PhRv, SvPhU, JETPL, PhLB, PhRvL, PhLB, ...",27,...,0,3,0,0,0,0,1,0,0,0


In [22]:
def journal_fraction_in_references(
    df,
    keyword_list,
    keyword_col='grouped_keywords',
    journal_name='ApJ'
):
    """
    Calculates, for each keyword, the mean fraction of references to a given journal
    (both across all articles and articles published in that journal),
    and also includes article counts.

    Only articles with `n_references > 0` are considered. Any statistic that
    cannot be computed (no qualifying articles, missing `<journal>_ref_count`
    column, zero denominator) is reported as NaN instead of raising.

    Parameters:
        df (pd.DataFrame): The full DataFrame. Must contain `keyword_col`,
            'journal' and 'n_references'; `<journal_name>_ref_count` is optional.
        keyword_list (list): List of keyword labels to consider (exact match).
        keyword_col (str): Column name grouping articles by keyword.
        journal_name (str): The journal to compute reference fractions for.

    Returns:
        pd.DataFrame: Summary table with one row per keyword and the columns:
            - keyword
            - overall_mean_fraction: mean of ref_count / n_references over all articles
            - journal_subset_mean_fraction: same mean, restricted to articles
              published in `journal_name`
            - journal_extra_citation: journal_subset_mean_fraction / overall_mean_fraction
            - n_articles_total
            - n_articles_from_journal
            - fraction_articles_from_journal: share of articles published in
              `journal_name`, expressed as a percentage (0-100)
    """
    nan = float('nan')
    results = []

    journal_ref_col = f"{journal_name}_ref_count"
    has_ref_col = journal_ref_col in df.columns

    for keyword in keyword_list:
        sub_df = df[df[keyword_col] == keyword]
        sub_df = sub_df[sub_df['n_references'] > 0]  # Avoid division by zero

        # Total number of articles under this keyword
        n_articles_total = len(sub_df)

        # Overall mean fraction
        if has_ref_col and n_articles_total > 0:
            overall_fraction = (sub_df[journal_ref_col] / sub_df['n_references']).mean()
        else:
            overall_fraction = nan

        # Subset: articles published in the given journal
        sub_sub_df = sub_df[sub_df['journal'] == journal_name]
        n_articles_from_journal = len(sub_sub_df)

        if has_ref_col and n_articles_from_journal > 0:
            journal_subset_fraction = (sub_sub_df[journal_ref_col] / sub_sub_df['n_references']).mean()
        else:
            journal_subset_fraction = nan

        # Ratio of the two means; undefined when either is missing or the
        # overall mean is zero.
        if (
            pd.notna(overall_fraction)
            and pd.notna(journal_subset_fraction)
            and overall_fraction != 0
        ):
            journal_extra_citation = journal_subset_fraction / overall_fraction
        else:
            journal_extra_citation = nan

        # len() returns plain ints, so 0 / 0 here would raise ZeroDivisionError
        # for a keyword with no qualifying articles.
        if n_articles_total > 0:
            fraction_articles_from_journal = 100*(n_articles_from_journal/n_articles_total)
        else:
            fraction_articles_from_journal = nan

        results.append({
            'keyword': keyword,
            'overall_mean_fraction': overall_fraction,
            'journal_subset_mean_fraction': journal_subset_fraction,
            'journal_extra_citation': journal_extra_citation,
            'n_articles_total': n_articles_total,
            'n_articles_from_journal': n_articles_from_journal,
            'fraction_articles_from_journal': fraction_articles_from_journal
        })

    return pd.DataFrame(results)


In [26]:
# Per-keyword share of references going to 'A&A', overall and within
# articles published in 'A&A' itself.
result = journal_fraction_in_references(
    df=all_data_ref,
    keyword_list=['AI', 'Cosmology', 'Astronomy', 'Astrobiology'],
    journal_name='A&A'
)

print(result)


        keyword  overall_mean_fraction  journal_subset_mean_fraction  \
0            AI               0.000492                      0.171980   
1     Cosmology               0.043073                      0.199297   
2     Astronomy               0.150494                      0.266678   
3  Astrobiology               0.000560                      0.179707   

   journal_extra_citation  n_articles_total  n_articles_from_journal  \
0              349.889533            194007                       79   
1                4.626911            145483                     6737   
2                1.772023            187128                    33715   
3              320.960059            300061                      171   

   fraction_articles_from_journal  
0                        0.040720  
1                        4.630782  
2                       18.017079  
3                        0.056988  


In [28]:
all_data_ref.head(2)

Unnamed: 0,bibcode,my_keyword,first_author,n_authors,first_author_aff,year,read_count,citation_count,reference_journals,n_references,...,CEAgr_ref_count,PhLB_ref_count,PASJ_ref_count,JMoSt_ref_count,EPJC_ref_count,ApEn_ref_count,AN_ref_count,ESPR_ref_count,Sci_ref_count,NatAs_ref_count
0,1977JCoMa..11..395C,AI,"Christensen, R. M.",2,"California, University, Livermore, Calif",2024,0,4,"[JAM, JCoMa]",2,...,0,0,0,0,0,0,0,0,0,0
1,2000A&A...353....1R,Cosmology,"Rebhan, E.",1,"Institut für Theoretische Physik, Heinrich-Hei...",2000,17,12,"[JETP, PhRv, SvPhU, JETPL, PhLB, PhRvL, PhLB, ...",27,...,0,3,0,0,0,0,1,0,0,0


In [38]:
all_data_ref.columns

Index(['bibcode', 'my_keyword', 'first_author', 'n_authors',
       'first_author_aff', 'year', 'read_count', 'citation_count',
       'reference_journals', 'n_references', 'journal', 'SJR Best Quartile',
       'grouped_keywords', 'reference_journal_counts', 'NatSR_ref_count',
       'PhRvD_ref_count', 'ApJ_ref_count', 'Senso_ref_count',
       'MNRAS_ref_count', 'PNAS_ref_count', 'IEEEA_ref_count', 'A&A_ref_count',
       'NatCo_ref_count', 'RemS_ref_count', 'JCAP_ref_count', 'ApJL_ref_count',
       'PhRvE_ref_count', 'PLoSO_ref_count', 'CQGra_ref_count', 'AJ_ref_count',
       'Bioin_ref_count', 'PatRe_ref_count', 'ApJS_ref_count',
       'Heliy_ref_count', 'JHEP_ref_count', 'Ap&SS_ref_count',
       'Natur_ref_count', 'CEAgr_ref_count', 'PhLB_ref_count',
       'PASJ_ref_count', 'JMoSt_ref_count', 'EPJC_ref_count', 'ApEn_ref_count',
       'AN_ref_count', 'ESPR_ref_count', 'Sci_ref_count', 'NatAs_ref_count'],
      dtype='object')

In [None]:
def author_based_reference_bias(
    df,
    keyword,
    journal_name,
    keyword_col='grouped_keywords',
    journal_col='journal',
    ref_count_col_template="{journal}_ref_count"
):
    """
    For a given keyword and journal:
    1. Finds all first authors who published in the journal under the keyword.
       An author is identified by the exact (first_author, first_author_aff) pair.
    2. Collects all their articles (any keyword, any journal) that have at
       least one reference.
    3. Calculates the mean fraction of references to the target journal
       in papers published in the journal vs. elsewhere.

    Parameters:
        df (pd.DataFrame): Article table with 'first_author', 'first_author_aff',
            'n_references', `keyword_col`, `journal_col` and the reference-count
            column named by `ref_count_col_template`.
        keyword (str): Keyword label used to select the seed articles (exact match).
        journal_name (str): Target journal.
        keyword_col (str): Column holding the keyword label.
        journal_col (str): Column holding the publishing journal.
        ref_count_col_template (str): Format string producing the reference-count
            column name from `journal`.

    Returns:
        dict with keys 'n_articles_total', 'n_in_journal', 'n_outside_journal',
        'mean_fraction_in_journal' and 'mean_fraction_outside_journal'.
        A mean is None when its group is empty. If the reference-count column
        does not exist, all counts are 0 and both means are None.
    """
    # Build reference count column name
    journal_ref_col = ref_count_col_template.format(journal=journal_name)

    # Without the count column nothing can be computed; bail out before doing
    # any per-row work.
    if journal_ref_col not in df.columns:
        return {
            "n_articles_total": 0,
            "n_in_journal": 0,
            "n_outside_journal": 0,
            "mean_fraction_in_journal": None,
            "mean_fraction_outside_journal": None
        }

    # Filter articles by keyword + journal
    sub_df = df[(df[keyword_col] == keyword) & (df[journal_col] == journal_name)]

    # Identify unique first authors by name + affiliation
    author_keys = set(zip(sub_df['first_author'], sub_df['first_author_aff']))

    # Now find all articles by those authors (any keyword, any journal).
    # Zipping the two columns avoids building a Series per row, which is what
    # makes DataFrame.apply(axis=1) slow on large frames.
    matching_positions = [
        pos
        for pos, key in enumerate(zip(df['first_author'], df['first_author_aff']))
        if key in author_keys
    ]
    author_articles = df.iloc[matching_positions].copy()

    # Remove rows with zero or missing reference counts
    author_articles = author_articles[author_articles['n_references'] > 0]

    # Compute citation fraction: journal_ref_count / n_references
    author_articles['ref_fraction'] = author_articles[journal_ref_col] / author_articles['n_references']

    # Split articles by whether they were published in the target journal
    in_journal = author_articles[author_articles[journal_col] == journal_name]
    outside_journal = author_articles[author_articles[journal_col] != journal_name]

    return {
        "n_articles_total": len(author_articles),
        "n_in_journal": len(in_journal),
        "n_outside_journal": len(outside_journal),
        "mean_fraction_in_journal": in_journal['ref_fraction'].mean() if not in_journal.empty else None,
        "mean_fraction_outside_journal": outside_journal['ref_fraction'].mean() if not outside_journal.empty else None
    }


In [40]:
def author_based_reference_bias_fuzzy(
    df,
    keyword,
    journal_name,
    keyword_col='grouped_keywords',
    journal_col='journal',
    ref_count_col_template="{journal}_ref_count",
    threshold=0.8
):
    """
    For a given keyword and journal:
    1. Finds all first authors who published in the journal under the keyword.
    2. Collects all their articles (any keyword, any journal) that have at
       least one reference. An article belongs to a selected author when the
       first-author name is identical and its affiliation is similar
       (case-insensitive `SequenceMatcher` ratio > `threshold`) to one of the
       affiliations that same author used on the seed articles.
    3. Calculates the mean fraction of references to the target journal
       in papers published in the journal vs. elsewhere.

    Affiliations are compared per author, so a namesake working at a different
    selected author's institution is not pulled in. Missing affiliations are
    compared through their string form; seed rows with a missing author name
    are ignored.

    Parameters:
        df (pd.DataFrame): Article table with 'first_author', 'first_author_aff',
            'n_references', `keyword_col`, `journal_col` and the reference-count
            column named by `ref_count_col_template`.
        keyword (str): Keyword label used to select the seed articles (exact match).
        journal_name (str): Target journal.
        keyword_col (str): Column holding the keyword label.
        journal_col (str): Column holding the publishing journal.
        ref_count_col_template (str): Format string producing the reference-count
            column name from `journal`.
        threshold (float): Similarity ratio an affiliation must exceed to match.

    Returns:
        dict with keys 'n_articles_total', 'n_in_journal', 'n_outside_journal',
        'mean_fraction_in_journal' and 'mean_fraction_outside_journal'.
        A mean is None when its group is empty. If the reference-count column
        does not exist, all counts are 0 and both means are None.
    """
    # Build reference count column name
    journal_ref_col = ref_count_col_template.format(journal=journal_name)

    # Without the count column nothing can be computed; bail out before the
    # (comparatively expensive) fuzzy matching.
    if journal_ref_col not in df.columns:
        return {
            "n_articles_total": 0,
            "n_in_journal": 0,
            "n_outside_journal": 0,
            "mean_fraction_in_journal": None,
            "mean_fraction_outside_journal": None
        }

    def normalize_affil(affil):
        # str() keeps missing values (None/NaN) comparable instead of raising on .lower()
        return str(affil).lower()

    def fuzzy_affil_match(row_affil, target_affils):
        for target_affil in target_affils:
            matcher = SequenceMatcher(None, row_affil, target_affil)
            # real_quick_ratio() and quick_ratio() are cheap upper bounds on
            # ratio(); they reject most non-matches without the full comparison.
            if (
                matcher.real_quick_ratio() > threshold
                and matcher.quick_ratio() > threshold
                and matcher.ratio() > threshold
            ):
                return True
        return False

    # Filter articles by keyword + journal
    sub_df = df[(df[keyword_col] == keyword) & (df[journal_col] == journal_name)]

    # Identify first authors and the affiliations each of them used
    targets_by_author = {}
    for author, affil in zip(sub_df['first_author'], sub_df['first_author_aff']):
        if pd.isna(author):
            continue
        targets_by_author.setdefault(author, set()).add(normalize_affil(affil))

    # Find all articles by those authors (any keyword, any journal). Only rows
    # whose author name was selected reach the fuzzy comparison, and each
    # distinct (author, affiliation) pair is compared once.
    match_cache = {}
    matching_positions = []
    for pos, (author, affil) in enumerate(zip(df['first_author'], df['first_author_aff'])):
        target_affils = targets_by_author.get(author)
        if target_affils is None:
            continue
        cache_key = (author, normalize_affil(affil))
        if cache_key not in match_cache:
            match_cache[cache_key] = fuzzy_affil_match(cache_key[1], target_affils)
        if match_cache[cache_key]:
            matching_positions.append(pos)

    author_articles = df.iloc[matching_positions].copy()

    # Remove rows with zero or missing reference counts
    author_articles = author_articles[author_articles['n_references'] > 0]

    # Compute citation fraction: journal_ref_count / n_references
    author_articles['ref_fraction'] = author_articles[journal_ref_col] / author_articles['n_references']

    # Split articles by whether they were published in the target journal
    in_journal = author_articles[author_articles[journal_col] == journal_name]
    outside_journal = author_articles[author_articles[journal_col] != journal_name]

    return {
        "n_articles_total": len(author_articles),
        "n_in_journal": len(in_journal),
        "n_outside_journal": len(outside_journal),
        "mean_fraction_in_journal": in_journal['ref_fraction'].mean() if not in_journal.empty else None,
        "mean_fraction_outside_journal": outside_journal['ref_fraction'].mean() if not outside_journal.empty else None
    }


In [41]:
# Reference fractions to 'Natur' for first authors who published
# 'Astronomy' articles in 'Natur', using fuzzy affiliation matching.
author_stats = author_based_reference_bias_fuzzy(
    df=all_data_ref,
    keyword="Astronomy",
    journal_name="Natur"
)

print(author_stats)


KeyboardInterrupt: 

In [None]:
all_data_ref[all_data_ref['journal'] == 'Natur'].head(2)