In [122]:
import pandas as pd
import ast
import re
from collections import Counter

In [123]:
# Load the article table; the cells below read its 'author', 'aff',
# 'references' and 'journal' columns.
all_data_aff_sjr = pd.read_parquet(path='ads_data/all_data_aff_sjr.parquet')

In [None]:
def _parse_list_literal(value):
    """Parse a string holding a Python list/tuple literal.

    Returns the parsed sequence, or None when `value` is not a string, is not
    valid literal syntax, or evaluates to something other than a list/tuple.
    """
    if not isinstance(value, str):
        return None
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, TypeError, SyntaxError):
        # A malformed or truncated literal is treated as missing rather than
        # aborting the computation of the whole column.
        return None
    return parsed if isinstance(parsed, (list, tuple)) else None


def _count_authors(value):
    """Return the number of entries in a stringified author list, or None if unparseable."""
    parsed = _parse_list_literal(value)
    return len(parsed) if parsed is not None else None


def _first_affiliation(value):
    """Return the first entry of a stringified affiliation list, or None.

    None is returned when the value is not a string starting with '[',
    cannot be parsed, or parses to an empty list.
    """
    if not (isinstance(value, str) and value.startswith('[')):
        return None
    parsed = _parse_list_literal(value)
    # An empty list has no first element; indexing it would raise IndexError.
    return parsed[0] if parsed else None


def _count_references(value):
    """Count the non-empty, comma-separated entries of a reference string (0 if missing)."""
    if not isinstance(value, str):
        return 0
    # Skip blank tokens so trailing or doubled commas are not counted as references.
    return sum(1 for token in value.split(',') if token.strip())


# Count the number of authors per article by parsing the 'author' column,
# which is stored as a string representation of a list. If parsing succeeds
# we store the length of the list (i.e., number of authors); otherwise None.
all_data_aff_sjr['n_authors'] = all_data_aff_sjr['author'].apply(_count_authors)

# Extract the first author's affiliation by parsing the 'aff' column,
# which is a string representation of a list of affiliations.
# The first item in the list is assumed to correspond to the first author.
all_data_aff_sjr['first_author_aff'] = all_data_aff_sjr['aff'].map(_first_affiliation)

# Split by comma, strip whitespace, and count the non-empty entries.
all_data_aff_sjr['n_references'] = all_data_aff_sjr['references'].apply(_count_references)

def extract_journal_list(ref_string):
    """
    Extracts a list of journal codes from an ADS-style reference string.

    Each comma-separated entry is expected to start with a 4-digit year
    followed by the journal field. The journal code is read as the run of
    non-dot characters right after the year, capped at 5 characters so that
    a volume or identifier that follows the journal field without a dot
    (e.g. '2015arXiv150302584K') is not swallowed into the code.

    Parameters:
        ref_string (str): A comma-separated string of bibcodes.

    Returns:
        list: A list of journal codes (e.g., ['JETP', 'PhRvL', 'A&A']),
        one per entry that starts with a year; entries that do not match
        are skipped. Returns [] for non-string or blank input.
    """
    if not isinstance(ref_string, str) or not ref_string.strip():
        return []

    journals = []
    for ref in ref_string.split(','):
        # re.match anchors at the start: 4-digit year, then up to 5 non-dot
        # characters (the journal field is dot-padded when the code is shorter).
        match = re.match(r'\d{4}([^.]{1,5})', ref.strip())
        if match:
            journals.append(match.group(1))

    return journals


# Derive, for every article, the list of journal codes found in its references.
all_data_aff_sjr['reference_journals'] = (
    all_data_aff_sjr['references'].map(extract_journal_list)
)


In [None]:
# Columns carried over into the reference-level dataset.
relevant_columns = [
    'bibcode',
    'my_keyword',
    'first_author',
    'n_authors',
    'first_author_aff',
    'year',
    'read_count',
    'citation_count',
    'reference_journals',
    'n_references',
    'journal',
    'SJR Best Quartile',
    'grouped_keywords',
]

# Keep only those columns, and only the rows with at least one counted reference.
all_data_ref = (
    all_data_aff_sjr[relevant_columns]
    .copy()
    .loc[lambda frame: frame['n_references'] > 0]
)

# Save without the index, then preview the first five rows.
all_data_ref.to_parquet('ads_data/all_data_ref.parquet', index=False)
all_data_ref.head()

Unnamed: 0,bibcode,my_keyword,first_author,n_authors,first_author_aff,year,read_count,citation_count,reference_journals,n_references,journal,SJR Best Quartile,grouped_keywords
1,1977JCoMa..11..395C,AI,"Christensen, R. M.",2,"California, University, Livermore, Calif",2024,0,4,"[JAM, JCoMa]",2,JCoMa,,AI
2,2000A&A...353....1R,Cosmology,"Rebhan, E.",1,"Institut für Theoretische Physik, Heinrich-Hei...",2000,17,12,"[JETP, PhRv, SvPhU, JETPL, PhLB, PhRvL, PhLB, ...",27,A&A,Q1,Cosmology
3,2000A&A...353...25M,"AGN, Cosmology, Galaxy","Miyaji, Takamitsu",3,Max-Planck-Inst. für extraterrestrische Physik...,2000,18,262,"[ApJ, ApJ, ApJ, ApJ, ApJ, ApJ, MNRAS, PhR, ApJ...",56,A&A,Q1,Cosmology
4,2000A&A...353...41S,"Cosmology, Galaxy","Schneider, Peter",3,"Max-Planck-Institut für Astrophysik, Postfach ...",2000,10,88,"[ApJ, A&A, ApJ, MNRAS, ApJ, MNRAS, grle, nrfa,...",42,A&A,Q1,Cosmology
5,2000A&A...353...57G,"Cosmology, Galaxy","Girardi, Marisa",3,"Dipartimento di Astronomia, Università degli S...",2000,18,20,"[ApJ, lssu, ApJ, ApJ, ApJ, A&A, ApJ, Sci, ApJ,...",44,A&A,Q1,Cosmology


In [127]:
# Show the 50 most common journals by count
top_journals = all_data_aff_sjr['journal'].value_counts().head(50)
print(top_journals)


journal
MNRAS    56665
ApJ      54663
A&A      41292
PhRvD    33821
NatSR    28844
PNAS     14377
ApJL     13814
Senso    13007
JCAP     10555
AJ       10440
IEEEA     8755
PLoSO     8696
NatCo     8615
CQGra     8295
RemS      7127
JHEP      6786
Bioin     6619
Sust      6330
Natur     6227
PhRvL     6164
PhRvE     5941
Heliy     5610
PhLB      4880
EPJC      4541
Ap&SS     4538
ApJS      3930
ScTEn     3743
ESPR      3699
PatRe     3601
JChPh     3501
Sci       3369
IJMPD     3259
JMoSt     3089
BiolC     3050
GReGr     2892
WatRe     2770
BiTec     2710
Water     2684
PASJ      2675
CEAgr     2669
AcAC      2624
ChEnJ     2554
Plnts     2551
JEnvM     2475
AcSpA     2456
Entrp     2452
AN        2398
CBio      2283
MedPh     2262
PASP      2252
Name: count, dtype: int64
