In [1]:
import os
import json
import pickle
import importlib
from tqdm import tqdm
import time

from helper_functions import loadPKL

# Project root: the directory the Jupyter notebook is run from.
CWD = os.path.abspath("")

# All working folders sit directly under CWD; the names are unpacked in the same
# order as the folder names listed in the generator below.
(
    dir_input,
    dir_batch,  # ChatGPT related output.
    dir_TEMP,  # Intermediate files.
    dir_dict,  # Look up dictionaries such as paper2meta; main data directory.
    dir_npy,  # Data files needed for plotting figures.
    dir_output,  # Figures.
    dir_xml,  # xml files.
    dir_DEBUG,
) = (
    os.path.join(CWD, folder)
    for folder in ("input", "batch", "TEMP", "dicts", "npy", "output", "xml", "DEBUG")
)

## Load parameters json file in /input.

file_name_train_csv (str; placeholder value `TRAINING_CSV_FILE_NAME`):<br/>
Name of the csv file (without the extension) containing the human-rated sentences; this file should be in /batch.<br/>
The first row holds the column names. The file needs one column named "sentences" plus one rating column per rater; rater columns can have any name, and column order doesn't matter (for example: rater1, rater2, sentences, rater3).<br/>
The "sentences" column contains the sentences to rate, and each rating column contains that rater's ratings.

THRES_NUM_PAIR_COLLAB (int):<br/>
Sample size threshold: the minimum number of citation pairs towards collaborators.<br/>
Used to filter out departments and countries that fall below this threshold.

In [2]:
# Read the run parameters from params.json in the input folder, then display them.
params_path = os.path.join(dir_input, "params.json")
with open(params_path) as params_file:
    params = json.load(params_file)
params

{'year_range': [1998, 2023],
 'jif_thres': 3,
 'file_name_train_csv': 'TRAINING_CSV_FILE_NAME',
 'model_sentiment': 'gpt-3.5-turbo-1106',
 'hyperparams': {'learning_rate_multiplier': 0.5,
  'batch_size': 125,
  'n_epochs': 6},
 'model_benchwork': 'gpt-3.5-turbo-0125',
 'model_embed': 'text-embedding-3-small',
 'n_bs': 1000,
 'THRES_NUM_PAIR_COLLAB': 100,
 'dist_max': 6,
 'year_ranges': [[-4, -3], [-2, -1], [0, 0], [1, 2], [3, 4], [5, 6]],
 'binW': 30}

Download selected xml files using oa_file_list.csv (in the "input" folder), <br />
which can be downloaded from https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.csv.

In [2]:
import filter_papers as filt_pap

# Persist journal meta info (such as name and JIF), keeping journals per the JIF threshold in params.
filt_pap.save_jour2meta(dir_input, dir_dict, "JCR_JournalResults_05_2024", jif_thres=params["jif_thres"])

# Collect the MedAbbr of every journal in jour2meta; these are used to filter by pub year and journals.
founD = loadPKL(dir_dict, "jour2meta")
jrns = set()  # 187 journals (total 188, but 1 is absent in PMC).
for journal_meta in founD.values():
    MedAbbr = journal_meta["MedAbbr"]
    # Each abbreviation must be unique, otherwise two journals would collapse into one entry.
    assert MedAbbr not in jrns, f"Duplicate MedAbbr found for journal={MedAbbr}."
    jrns.add(MedAbbr)

# Outputs written to /TEMP by the call below:
#   - stats_dict, stats_dict_filtered: for descriptive stats.
#   - oa_file_list_filtered.csv: specifies what papers we will download next.
stats_dict, stats_dict_filtered = filt_pap.filter_file_list(params["year_range"], jrns, dir_input, dir_TEMP)

below row's 2nd col doesn't follow expected formatting behavior:
['oa_package/c2/56/PMC5871948.tar.gz', 'Diseases. 02017 Dec 22; 6(1):2']
below row's 2nd col doesn't follow expected formatting behavior:
['oa_package/c2/56/PMC5871948.tar.gz', 'Diseases. 02017 Dec 22; 6(1):2']
below row's 2nd col doesn't follow expected formatting behavior:
['oa_package/0c/83/PMC5977326.tar.gz', 'Nanomaterials (Basel). 02018 May 9; 8(5):312']
below row's 2nd col doesn't follow expected formatting behavior:
['oa_package/0c/83/PMC5977326.tar.gz', 'Nanomaterials (Basel). 02018 May 9; 8(5):312']


In [None]:
from download_xml import download_tgz_files

# We have oa_file_list_filtered.csv (saved in /TEMP by the filtering step), now we download xml from it.
# Run below to download xml files we need from PubMedCentral OpenAccess Subset.
# csv_in is given the /TEMP folder and xml_out the /xml folder; presumably the csv is read
# from the former and the downloads are placed in the latter — confirm in download_xml.
download_tgz_files(csv_in=dir_TEMP, xml_out=dir_xml)

Now that we have the xml files, we process them and do some filtering. <br/>
We use the xml files to extract the following data:
1. citations
2. citation sentences
3. first, last, all author first and last names
4. affiliations -> we will extract department and country from this later

In [None]:
# We need to download punkt for tokenization.
import nltk
import ssl

# Work around certificate errors during the nltk download: when this Python build exposes
# ssl._create_unverified_context, install it as the default HTTPS context factory.
# NOTE(review): this switches off certificate verification for anything that relies on
# ssl's default HTTPS context for the rest of the kernel session — confirm that is acceptable.
if hasattr(ssl, "_create_unverified_context"):
    ssl._create_default_https_context = ssl._create_unverified_context

# Fetch the punkt data used for sentence tokenization.
nltk.download("punkt")  # nltk 3.8.1 just needs this one.
# nltk.download('punkt_tab')  # In nltk 3.9.1 this is required.

In [None]:
import xml_parser as xp

# Reload stats_dict_filtered from its pickle in /TEMP, so this cell does not rely on
# the variable still being alive in the kernel.
stats_pkl_path = os.path.join(dir_TEMP, "stats_dict_filtered.pkl")
with open(stats_pkl_path, "rb") as stats_file:
    stats_dict_filtered = pickle.load(stats_file)

# Parse the downloaded xml files; save key_info_all.pkl in /TEMP.
journal_year_lookup = stats_dict_filtered["journal_year_lookup"]
key_info_all = xp.parse_all_xml_files(dir_xml, dir_TEMP, journal_year_lookup)

In [6]:
# Extract info from key_info_all.pkl; save ref_stats.pkl in /TEMP.
# Inputs: the parsed xml info (key_info_all), the journal/year lookup from the filtered
# stats, and the set of journal abbreviations (jrns) built in the filtering cell.
# Requires the filtering cell and the xml-parsing cell to have been run in this kernel.
ref_stats = xp.make_ref_stats(dir_TEMP, key_info_all, stats_dict_filtered["journal_year_lookup"], jrns)

100%|██████████| 125342/125342 [00:13<00:00, 9229.01it/s]


In [12]:
import get_meta_data as gmd

# Make citation edges and article_meta and save them to /TEMP.
# Built from ref_stats and key_info_all, both of which must already exist in this kernel.
gmd.make_edges_and_meta(dir_TEMP, ref_stats, key_info_all)

100%|██████████| 125342/125342 [00:07<00:00, 16229.15it/s]


### From here on we don't need key_info_all or ref_stats,
### instead we primarily use article_meta and paper2meta.



## Get some data from external sources, using various APIs:
1. sentiment scores (ChatGPT):
    * sentences and info files in /TEMP
2. benchwork score (ChatGPT)
    * we need paper content (starting from introduction, may also cover some of results or method sections) to get this one; paper content in /TEMP
3. h-Index (WoS) & gender (gender-API): contains last author name and their h-Index and estimated gender
    * last_author2gender-Neuroscience.csv (/input)
    * last_author2hIndex.pkl and last_author2gender_info.pkl(/dicts)
    * Need author names for this one.
4. power distance and individualism (https://geerthofstede.com/research-and-vsm/dimension-data-matrix/): 6-dimensions-for-website-2015-08-16.csv (/input)

5. brilliance (https://doi.org/10.1037/edu0000669): brilliance_data.csv (/input)


In [3]:
import external_methods as em
import process_field_and_country as pfc
import cite_coauthor_functions as ccf

In [2]:
# Create two files saved in /TEMP:
# "sentences2rate" txt file to be used for ChatGPT, "sentrow2edgeinfo" pkl file for bookkeeping.
# Both the input and the output directory are /TEMP here; cite_marker is the symbol passed
# to the helper for marking citations (presumably the cited position in a sentence — confirm).
em.save_CGPT_input_files(dir_cnets=dir_TEMP, dir_out=dir_TEMP, cite_marker="✪")

# Create structured citation data; no sentiment yet; later add sentiment from ChatGPT API results.
# cite2sent is reused by later cells (its keys define the set of citation pairs).
cite2sent = ccf.make_cite2sent_from_sentence_data(dir_TEMP, dir_TEMP)

weakly connected components stats:
total: 113535 articles
largest component ('subset'): 111577 articles
remaining 845 components:
	1958 articles
	mean: 2.32 articles
	std: 0.78 articles
	median: 2.0 articles


### 1. Sentiment

In [None]:
from getpass import getpass

# Run this cell and then enter your OpenAI api key.
# getpass (instead of input) keeps the key from being echoed into the cell output,
# where it would otherwise be saved together with the notebook.
api_key = getpass("OpenAI API key: ")
file_name = params["file_name_train_csv"]  # Training csv name (no extension); the file lives in /batch.

# Finetune ChatGPT model specified in params["model_sentiment"].
init_dict = em.CGPT_init(api_key)
em.save_finetune_file(init_dict, dir_batch, file_name)
em.send_finetune_job(init_dict, params["model_sentiment"], dir_batch, file_name, hyperparams=params["hyperparams"])

In [None]:
# Rate sentence sentiment by creating and uploading batches to OpenAI (ChatGPT).
# Relies on api_key from the finetuning cell above.
model = "FINETUNED_MODEL_NUMBER"  # Copy paste the finetuned model number from OpenAI.
# NOTE(review): batch_size=49999 presumably keeps each batch under OpenAI's per-batch
# request limit — confirm against the current Batch API limits.
em.creat_batch_jobs(api_key, model, dir_TEMP, dir_batch, batch_size=49999)

From OpenAI, download output files (jsonl format) to /batch folder. <br/>
We will process these jsonl files.

In [None]:
# Populate cite2sent with empirical/observed/measured sentiment from ChatGPT output in dir_batch.
# The jsonl output files must already have been downloaded from OpenAI into /batch.
ccf.update_cite2sent_from_row2rate(dir_TEMP, dir_batch)
# Apply hierarchy rule such that each pair of papers only has at most 1 sentiment.
# We also make "cite2ns" dict, which contains number of citation sentences for each citation pair.
ccf.update_cite2sent_hierarchy_rule(dir_TEMP, dir_dict)

From here on, the scope of papers is those in cite2sent (made by ccf module). <br />
All data and look up dictionaries (e.g., paper2authors) for figures are in /dicts. <br />
Below, we extract relevant metadata from paper2meta and turn them into individual lookup dictionaries.

In [30]:
import get_meta_data

# Build paper2meta for the citation keys of cite2sent, using /TEMP and /dicts.
# cite2sent must already exist in this kernel (it is created in the sentence-data cell).
paper2meta = ccf.make_paper2meta(list(cite2sent.keys()), dir_TEMP, dir_dict)
# Split paper2meta into individual author and time lookup dictionaries, saved in /dicts.
get_meta_data.save_paper_author_dicts(paper2meta, dir_dict)
get_meta_data.save_paper_time_dicts(paper2meta, dir_dict)

### Paper title parsing using a separate parser.

In [2]:
# Parse the full paper titles from the xml files (/xml), with /dicts as the second directory argument.
# If nothing is printed about titles not being found, all titles were found; proceed to the next stage.
get_meta_data.save_and_parse_full_titles(dir_xml, dir_dict)

100%|██████████| 108909/108909 [07:34<00:00, 239.85it/s]


In [None]:
from getpass import getpass

# Get title embedding from OpenAI (ETA: 8 hours), save to /dicts.
# Run this cell and then enter your OpenAI api key.
# getpass (instead of input) keeps the key from being echoed into the cell output,
# where it would otherwise be saved together with the notebook.
api_key = getpass("OpenAI API key: ")
em.get_title_embedding(dir_dict, dir_TEMP, api_key, model=params["model_embed"])
em.save_title_embedding(dir_dict, dir_TEMP)

In [None]:
# Given title embedding, we calculate title similarity for each citation pair; "cite2title_sim" dict.
# NOTE(review): the first argument here is dir_batch, whereas the title-embedding calls are
# given dir_dict and dir_TEMP — confirm dir_batch is really the directory this helper expects.
ccf.save_cite2title_sim(dir_batch, list(cite2sent.keys()), dir_TEMP)

# Create coauthorship network to calculate social distance (collaboration distance AKA CD).
ccf.save_g_coau_t(dir_dict)
# Compute the distance for every citation pair in cite2sent.
ccf.save_cite2distance(list(cite2sent.keys()), dir_dict)

# Need 4 files: cite2ns, cite2title_sim in /TEMP; cite2sent_2, paper2meta in /dicts.
# maxN and n_min_samp are passed through to the helper; their meaning is defined in ccf.
ccf.save_cite2sent_null_param(dir_dict, dir_TEMP, maxN=15, n_min_samp=500)
# Use g_coau_t and metadata to make a dict mapping citation pairs to time before first collab.
ccf.save_cite2t_collab(dir_dict)

In [None]:
# Find last author's departments and countries for each paper.
# Departments use the department_names.csv table in /input; both lookups are saved to /dicts.
# print_fail=False presumably suppresses printing of entries that could not be resolved — confirm in pfc.
pfc.save_department_dicts(paper2meta, os.path.join(dir_input, "department_names.csv"), dir_dict, print_fail=False)
pfc.save_country_dicts(paper2meta, dir_dict, print_fail=False)

### 2. Benchwork Score & Synthesis

In [3]:
# Save .txt file sent to ChatGPT to rate benchwork score, 100 for each of 28 departments.
# Paper text comes from the xml files (/xml); n_paper=100 is the per-department count mentioned above.
em.save_paper_snippet(dir_xml, dir_dict, dir_TEMP, n_paper=100)

In [3]:
from getpass import getpass

# Prepare to send to ChatGPT to rate benchwork score.

# Enter your OpenAI api key. getpass (instead of input) keeps the key from being echoed
# into the cell output, where it would otherwise be saved together with the notebook.
api_key = getpass("OpenAI API key: ")
init_dict = em.CGPT_init_benchwork(api_key)

# One text snippet per line. The file is only read here, so open it read-only ("r")
# rather than "r+", which needlessly requires write permission on the file.
with open(os.path.join(dir_TEMP, "benchwork_text_CGPT.txt"), mode="r", encoding="UTF-8") as file_in:
    txt_to_send = file_in.readlines()
# Bookkeeping lookup from row number in the txt file to the paper (PMC id) it belongs to.
with open(os.path.join(dir_TEMP, "benchwork_text_row2paper.pkl"), "rb") as f:
    benchwork_text_row2paper = pickle.load(f)

# Responses collected so far, keyed by row number.
# NOTE: re-running this cell resets the dict, discarding responses held in memory.
benchwork_text_row2response = {}

In [4]:
# Send to ChatGPT to rate benchwork score; ETA: 2 hours.
# Rows already present in benchwork_text_row2response are skipped, so this cell can be
# re-run to retry only the rows that failed.
for row, txt in enumerate(txt_to_send):
    if row not in benchwork_text_row2response:
        time.sleep(0.05)  # Prevent from hitting rate limit; in seconds.
        try:
            res = em.get_rating(txt, init_dict)
            # Print irregular/unexpected response.
            res_folded = res.casefold()
            if "no" not in res_folded and "yes" not in res_folded:
                print(f"row={row} PMC={benchwork_text_row2paper[row]}, GPT response: {res}")
            benchwork_text_row2response[row] = res
        except Exception as err:
            # Catch Exception rather than using a bare except, so that interrupting the
            # kernel (KeyboardInterrupt) still stops this long-running loop, and report
            # the cause instead of hiding it. The failed row stays unrated.
            print(f"Error encountered at row={row} PMC={benchwork_text_row2paper[row]}: {err!r}")
    if row % 50 == 0 or row == (len(txt_to_send) - 1):  # Save periodically.
        with open(os.path.join(dir_batch, "benchwork_text_row2response.pkl"), "wb") as f:
            pickle.dump(benchwork_text_row2response, f)

In [4]:
# Process ChatGPT response, save to dictionary for department-wise measures later.
# The saved responses (benchwork_text_row2response.pkl) are in /batch and the row-to-paper
# bookkeeping file is in /TEMP; presumably results are written to /dicts — confirm in pfc.
pfc.save_benchwork_count(dir_TEMP, dir_batch, dir_dict)

row=733 PMC=6518697, GPT response: ----> No <----, different from previous response to the same paper; this paper discarded.
row=2715 PMC=3682120, GPT response: ----> No <----, different from previous response to the same paper; this paper discarded.


### 3. h-Index and Gender

In [None]:
from getpass import getpass

# Get h-Index. second_try uses different data to attempt to get h-Index.
# Run this cell and then enter your WOS api key.
# getpass (instead of input) keeps the key from being echoed into the cell output,
# where it would otherwise be saved together with the notebook.
WOS_api_key = getpass("Web of Science API key: ")
em.get_author_meta_from_WOS(dir_dict, WOS_api_key, second_try=False, verbose=False)
em.save_last_author2hIndex(dir_dict)

In [5]:
# Create a table of names in .csv and manually send it to gender-api.com.
# Also use it for the Web of Science API to retrieve author metadata, from which we obtain h-Index.
# This method is preferred because it is orders of magnitude faster (only need minutes).
em.prepare_author_names(dir_dict, dir_TEMP)

# Process gender.
# Make sure to download the output file from gender-api.com and save it in /input folder, naming it gender-API.csv.
# Process gender-API.csv (in /input) and create last author to gender info lookup dict.
# NOTE(review): the manual gender-api.com step sits between the two calls in this cell, so on a
# first run gender-API.csv presumably does not exist yet when the call below is reached — confirm.
em.save_author_gender(dir_dict, dir_input)

### 4. and 5. Save Department and Country measures.

Department: benchwork, synthesis, brilliance, proportion of men <br/>
Country: Power Distance, individualism, proportion of men <br/>

The department/country has to have at least THRES_NUM_PAIR_COLLAB (100 in the current params.json) post-hierarchy citations towards collaborators.

In [5]:
# Brilliance data should be stored in dir_input (/input), named "brilliance_data.csv".
# Both calls use the same sample-size threshold, params["THRES_NUM_PAIR_COLLAB"]: the minimum
# number of citation pairs towards collaborators a department/country needs to be kept.
pfc.save_department_measures(dir_input, dir_dict, thres=params["THRES_NUM_PAIR_COLLAB"])
pfc.save_country_measures(dir_input, dir_dict, thres=params["THRES_NUM_PAIR_COLLAB"])

27 departments have 100+ post-hierarchy citations towards collaborators.
23 countries have 100+ post-hierarchy citations towards collaborators.


## Final step: Prepare data to plot
This will take about 12 hours in total (per-step estimates are noted next to each call below) because it involves random sampling that's different at each iteration.


In [None]:
import prepare_plot_data as ppd


# Prepare the data for each figure. Every call is given /dicts and /npy; presumably the
# lookup dictionaries are read from the former and plot data is written to the latter — confirm in ppd.
# n_bs, dist_max, year_ranges, binW and the collaborator-pair threshold all come from params.json.
# The trailing comment on each line is its approximate run time.
# NOTE(review): no random seed is set in this notebook; unless ppd seeds internally,
# results will differ between runs — TODO confirm.
ppd.prepare_collab_groups(dir_dict, dir_npy, n_bs=params["n_bs"])  # 2 hours.
ppd.prepare_collab_distance(dir_dict, dir_npy, dist_max=params["dist_max"], n_bs=params["n_bs"])  # 0.5 hours.
ppd.prepare_t_collab(dir_dict, dir_npy, year_ranges=params["year_ranges"], n_bs=params["n_bs"])  # 1 hour.
ppd.prepare_hindex(dir_dict, dir_npy, binW=params["binW"], n_bs=params["n_bs"])  # 1.3 hours.
ppd.prepare_country_effects(dir_dict, dir_npy, n_bs=params["n_bs"], thres=params["THRES_NUM_PAIR_COLLAB"])  # 2 hours.
ppd.prepare_department_effects(dir_dict, dir_npy, n_bs=params["n_bs"], thres=params["THRES_NUM_PAIR_COLLAB"])  # 3.5 hours.
ppd.prepare_gender_effects(dir_dict, dir_npy, n_bs=params["n_bs"], thres=params["THRES_NUM_PAIR_COLLAB"])  # 2 hours.

### We have all ingredients we need for figures. Run plot_ ipynb to make figures. <br/>