## Load parameters json file in /input.


THRES_NUM_PAIR_COLLAB (int):<br/>
Sample-size threshold: a department or country must have at least this many citation pairs to collaborators.<br/>
Departments and countries below this threshold are filtered out.

In [None]:
import os
import json
import pickle
import importlib
from tqdm import tqdm
import numpy as np
import time
from collections import Counter, defaultdict

import external_methods as em
import process_field_and_country as pfc
import cite_coauthor_functions as ccf
from helper_functions import loadPKL, savePKL, get_IRR, pval_star

CWD = os.path.abspath("")  # Jupyter notebook path.

# Project directory layout; every folder is resolved relative to the notebook directory.
dir_input = os.path.join(CWD, "input")
dir_batch = os.path.join(CWD, "batch")  # ChatGPT related output.
dir_TEMP = os.path.join(CWD, "TEMP")  # Intermediate files.
dir_dict = os.path.join(CWD, "dicts")  # Look up dictionaries such as paper2meta; main data directory.
dir_npy = os.path.join(CWD, "npy")  # Data files needed for plotting figures.
dir_output = os.path.join(CWD, "output")  # Figures.
dir_xml = os.path.join(CWD, "xml")  # xml files.
dir_DEBUG = os.path.join(CWD, "DEBUG")

# Pipeline parameters (thresholds, model names, bootstrap settings, ...) live in input/params.json.
# JSON is UTF-8 by specification, so do not rely on the platform's default encoding.
with open(os.path.join(dir_input, "params.json"), encoding="utf-8") as f:
    params = json.load(f)
print(params)

# Run this cell and then enter your OpenAI api key.
# getpass is used instead of input() so the key is not echoed into the cell output,
# which would otherwise be stored in the saved notebook.
from getpass import getpass

api_key = getpass("OpenAI API key: ")

Download select xml files using oa_file_list.csv (in "input" folder) <br />
which is downloaded from https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.csv.

In [None]:
import filter_papers as filt_pap

# Persist journal metadata (such as name and JIF) built from the JCR export into /dicts.
filt_pap.save_jour2meta(
    dir_input,
    dir_dict,
    "JCR_JournalResults_05_2024",
    jif_thres=params["jif_thres"],
)

# Collect the MedAbbr of every retained journal; the set is the journal filter used below.
founD = loadPKL(dir_dict, "jour2meta")
jrns = set()  # 187 journals (total 188, but 1 is absent in PMC).
for journal_meta in founD.values():
    abbr = journal_meta["MedAbbr"]
    # Two journals sharing one abbreviation would silently merge in the set, so fail loudly.
    assert abbr not in jrns, f"Duplicate MedAbbr found for journal={abbr}."
    jrns.add(abbr)

# Filter the PMC file list by publication year and journal.
# stats_dict and stats_dict_filtered are saved in /TEMP (descriptive stats);
# oa_file_list_filtered.csv is saved in /TEMP too and specifies which papers are downloaded next.
stats_dict, stats_dict_filtered = filt_pap.filter_file_list(
    params["year_range"],
    jrns,
    dir_input,
    dir_TEMP,
)

In [None]:
from download_xml import download_tgz_files

# oa_file_list_filtered.csv (written to /TEMP by the filtering cell) lists the papers to fetch.
# Run below to download the xml files we need from the PubMedCentral OpenAccess Subset into /xml.
# NOTE(review): this is a network download; it needs `dir_TEMP` and `dir_xml` from the first cell.
download_tgz_files(csv_in=dir_TEMP, xml_out=dir_xml)

Now that we have the xml files, we process them and do some filtering. <br/>
We use the xml files to extract the following data:
1. citations
2. citation sentences
3. first and last names of the first author, the last author, and all authors
4. affiliations -> we will extract department and country from these later

In [None]:
# We need to download punkt for tokenization.
import nltk
import ssl

# Make unverified HTTPS the default ssl context when this Python build exposes one
# (presumably to avoid certificate errors while nltk downloads its data — confirm).
# NOTE(review): this turns off certificate verification for everything that later uses the
# ssl default HTTPS context in this kernel session, not only for the nltk download.
_create_unverified_https_context = getattr(ssl, "_create_unverified_context", None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch the punkt model used for sentence tokenization.
nltk.download("punkt")  # nltk 3.8.1 just needs this one.
# nltk.download('punkt_tab')  # In nltk 3.9.1 this is required.

In [None]:
import xml_parser as xp

# Reload the filtered descriptive stats written to /TEMP by the file-list filtering step.
# NOTE(review): pickle.load executes arbitrary code from the file; only load pickles produced by this pipeline.
stats_pkl_path = os.path.join(dir_TEMP, "stats_dict_filtered.pkl")
with open(stats_pkl_path, "rb") as f:
    stats_dict_filtered = pickle.load(f)

# Parse the downloaded xml files; save key_info_all.pkl in /TEMP.
journal_year_lookup = stats_dict_filtered["journal_year_lookup"]
key_info_all = xp.parse_all_xml_files(dir_xml, dir_TEMP, journal_year_lookup)

In [None]:
# Extract info from key_info_all.pkl; save ref_stats.pkl in /TEMP.
# Needs `key_info_all`, `stats_dict_filtered` and `jrns` from the earlier cells of this kernel session.
ref_stats = xp.make_ref_stats(dir_TEMP, key_info_all, stats_dict_filtered["journal_year_lookup"], jrns)

In [None]:
import get_meta_data as gmd

# Make citation edges and article_meta and save them to /TEMP.
# Needs `ref_stats` and `key_info_all` from the two previous cells.
gmd.make_edges_and_meta(dir_TEMP, ref_stats, key_info_all)

### From here on we don't need key_info_all or ref_stats,
### instead we primarily use article_meta and paper2meta.



## Get some data from external sources, using various APIs:
1. sentiment scores (ChatGPT):
    * sentences and info files in /TEMP
2. benchwork score (ChatGPT)
    * we need paper content (starting from introduction, may also cover some of results or method sections) to get this one; paper content in /TEMP
3. h-Index (WoS) & gender (gender-API): contains last author name and their h-Index and estimated gender
    * last_author2gender-Neuroscience.csv (/input)
    * last_author2hIndex.pkl and last_author2gender_info.pkl(/dicts)
    * Need author names for this one.
4. power distance and individualism (https://geerthofstede.com/research-and-vsm/dimension-data-matrix/): 6-dimensions-for-website-2015-08-16.csv (/input)

5. brilliance (https://doi.org/10.1037/edu0000669): brilliance_data.csv (/input)


In [None]:
# Create two files saved in /TEMP:
# - "sentences2rate" txt file: all sentences to be rated by ChatGPT;
# - "sentrow2edgeinfo" pkl file: bookkeeping.
# NOTE(review): the cite_marker literal looks like a mis-decoded UTF-8 character (possibly "✪").
# Confirm it matches the marker actually used in the sentence data before changing it.
em.save_CGPT_input_files(dir_cnets=dir_TEMP, dir_out=dir_TEMP, cite_marker="âœª")

# Create structured citation data; no sentiment yet; later add sentiment from ChatGPT API results.
cite2sent = ccf.make_cite2sent_from_sentence_data(dir_TEMP, dir_TEMP)

From here on, the scope of papers is those in cite2sent (made by ccf module). <br />
All data and look up dictionaries (e.g., paper2authors) for figures are in /dicts. <br />
Below, we extract relevant metadata from paper2meta and turn them into individual lookup dictionaries.

In [None]:
import get_meta_data

# Build paper2meta for the citation pairs that are the keys of cite2sent.
paper2meta = ccf.make_paper2meta(list(cite2sent.keys()), dir_TEMP, dir_dict)
# Split paper2meta into individual author and time lookup dictionaries, saved in /dicts.
get_meta_data.save_paper_author_dicts(paper2meta, dir_dict)
get_meta_data.save_paper_time_dicts(paper2meta, dir_dict)

### Before continuing: now that the full sentence and paper2meta datasets are ready, we do the human raters' codebook rating and the ChatGPT model validation on sentence sentiment and paper benchwork.
Open `chatgpt_validation.ipynb` for more.

### 1. Sentiment

This cost about \$150 with CGPT_init_crit_5_explain() and 627108 sentences, using gpt-4.1-mini (input \$0.4/1M tokens, output \$1.6/1M tokens).

In [None]:
# file_name = params["file_name_train_csv"]

# # Finetune ChatGPT model specified in params["model_sentiment"].
# init_dict = em.CGPT_init(api_key)
# em.save_finetune_file(init_dict, dir_batch, file_name)
# em.send_finetune_job(init_dict, params["model_sentiment"], dir_batch, file_name, hyperparams=params["hyperparams"])

In [None]:
# Settings shared by the sentiment-rating cells below.
batch_path_out = os.path.join(dir_batch, "sentiment_n=627K")  # Folder holding this run's batch files.
fname = "sentences2rate-CGPT"
init_dict = em.CGPT_init_crit_5_explain(api_key)  # Needs `api_key` entered in the first cell.
print(params["model_sentiment"])  # Show which model the requests will use.
batch_size = 45000

In [None]:
# Submit sentiment-rating requests for one batch; edit `batch_num` and re-run this cell per batch.
# Needs `init_dict`, `batch_path_out`, `batch_size` and `fname` from the previous cell.
batch_num = 5
# 0,1-4(1-3 are done by nonbatch, 4 there's 2593 done by nonbatch),8-13
subbatch = 5000

# Rate-limit notes per OpenAI usage tier (RPM = requests per minute, RPD/TPD = requests/tokens per day).
# tier-1
# "gpt-3.5-turbo-1106" 200,000/250=800 (500 RPM), 0.12 interval, 10K RPD.
# "gpt-4.1-mini-2025-04-14" 200,000/250=800 (500 RPM), 0.12 interval.
# "gpt-4.1-2025-04-14" 30,000/250=120 RPM, 0.5 interval. nonbatch cost $4.96 (8755 items).

# tier-3
# "gpt-3.5-turbo" 5K RPM, 0.012 interval, 40M TPD.
# "gpt-4.1-mini-2025-04-14" 5K RPM, 0.012 interval, 40M TPD.
# "gpt-4.1-2025-04-14" 5K RPM, 0.012 interval. 100M TPD.

# ~700(input)+100(output) tokens per request. 627K sentences/requests. = 500M+
# Do it in 14 batches, 44800 sentences per batch...
em.creat_batch_jobs_fc(init_dict, params["model_sentiment"], dir_TEMP, batch_path_out, batch_size=batch_size, fname=fname, batch_num=batch_num, bench=False, token_output=100, subbatch=subbatch)
# em.send_request_nonbatch_fc(init_dict, params["model_sentiment"], dir_TEMP, batch_path_out, fname=fname, i=None, lab=None, interval=0.012, num=100000, bench=False)

Because batch processing is unpredictable, and thus slow overall, run non-batch requests at the same time to speed up the process. See generate_data_only_nonbatch_run.ipynb.

From OpenAI, download output files (jsonl format) to /batch folder. <br/>
We will process these jsonl files.

In [None]:
# Download batch output files programmatically.
# The batches_dict pickle is read from batch_path_out and the outputs are written back to that folder.
# NOTE(review): the model name is hard-coded in the file name; confirm it matches params["model_sentiment"].
batches_dict = loadPKL(batch_path_out, "gpt-4.1-mini-2025-04-14-batches_dict")
em.download_batch_output(api_key, batches_dict, batch_path_out, verbose=True)

In [None]:
# Turn the downloaded batch output files into a raw row2rate pickle inside batch_path_out.
em.process_batch_outputs_fc(batch_path_out, batch_path_out, i="batch", lab=None)

# Post-process the batch results and show how often each rating occurs.
raw_batch = loadPKL(batch_path_out, "gpt-4.1-mini-2025-04-14-row2rate_reason-batch")
row2rate = em.process_row2rate(raw_batch, verbose=False, fc=True, bench=False)
print(Counter(rating[0] for rating in row2rate.values()))

# Same post-processing for the results obtained through non-batch requests.
raw_nonbatch = loadPKL(batch_path_out, "gpt-4.1-mini-2025-04-14-row2rate_reason-1to5")
row2rate_nonba = em.process_row2rate(raw_nonbatch, verbose=False, fc=True, bench=False)
print(Counter(rating[0] for rating in row2rate_nonba.values()))

# Combine batch and non-batch results (non-batch wins on overlapping rows), and save it to dir_batch.
row2rate.update(row2rate_nonba)
print(Counter(rating[0] for rating in row2rate.values()))
# Coverage check: with integer row ids, the two numbers agree only when no row is missing.
row_ids = row2rate.keys()
print(len(row2rate), max(row_ids) - min(row_ids) + 1, sep="\n")
savePKL(dir_batch, "row2rate_reason_processed", row2rate)

In [None]:
# Populate cite2sent with empirical/observed/measured sentiment from ChatGPT output in dir_batch.
ccf.update_cite2sent_from_row2rate(dir_TEMP, dir_batch)  # cite2sent_0 -> 1
# Apply hierarchy rule such that each pair of papers only has at most 1 sentiment.
# We also make "cite2ns" dict, which contains number of citation sentences for each citation pair.
ccf.update_cite2sent_hierarchy_rule(dir_TEMP, dir_dict)  # cite2sent_1 -> 2

### Paper title parsing using a separate parser.

In [None]:
# If this prints no message about titles not being found, all titles were found; proceed to the next stage.
get_meta_data.save_and_parse_full_titles(dir_xml, dir_dict)

In [None]:
# Get title embedding from OpenAI (ETA: 8 hours), save to /dicts.
# Run this cell and then enter your OpenAI api key.
# getpass is used instead of input() so the key is not echoed into the cell output,
# which would otherwise be stored in the saved notebook.
from getpass import getpass

api_key = getpass("OpenAI API key: ")
em.get_title_embedding(dir_dict, dir_TEMP, api_key, model=params["model_embed"])
em.save_title_embedding(dir_dict, dir_TEMP)

In [None]:
# Reload the citation dict from /TEMP; only its keys (the citation pairs) are used in this cell.
cite2sent = loadPKL(dir_TEMP, "cite2sent_0")
# Given title embedding, we calculate title similarity for each citation pair; "cite2title_sim" dict.
# NOTE(review): the first argument is dir_batch, while the title embeddings are saved with
# dir_dict/dir_TEMP — confirm this is the directory the function expects.
ccf.save_cite2title_sim(dir_batch, list(cite2sent.keys()), dir_TEMP)

# Create coauthorship network to calculate social distance (collaboration distance AKA CD).
ccf.save_g_coau_t2(dir_dict, weight_nocollab=np.inf, weight_type="binary", no_mid_mid=False, lag=0)
ccf.save_cite2distance(list(cite2sent.keys()), dir_dict)

# Need 4 files: cite2ns, cite2title_sim in /TEMP; cite2sent_2, paper2meta in /dicts.
ccf.save_cite2sent_null_param(dir_dict, dir_TEMP, maxN=15, n_min_samp=500)
# Use g_coau_t and metadata to make a dict mapping citation pairs to time before first collab.
ccf.save_cite2t_collab(dir_dict, lag=0)

In [None]:
paper2meta = loadPKL(dir_dict, "paper2meta")
# Both department steps read the same list of department names from /input.
department_names_csv = os.path.join(dir_input, "department_names.csv")

# Find last author's departments and countries for each paper.
pfc.save_department_dicts(paper2meta, department_names_csv, dir_dict, print_fail=False)
pfc.save_country_dicts(paper2meta, dir_dict, print_fail=False)

# Same lookups over all authors, for department cultural capital
# (international and interdisciplinary collab measures).
pfc.save_country_dicts_all_authors(paper2meta, dir_dict, print_fail=False)
pfc.save_department_dicts_all_authors(paper2meta, department_names_csv, dir_dict, print_fail=False)

### 2. Benchwork Score & Synthesis

In [None]:
# Save the .txt file sent to ChatGPT to rate the benchwork score: 100 papers for each of 28 departments.
# Reads from /xml and /dicts; the third directory argument is /TEMP.
em.save_paper_snippet(dir_xml, dir_dict, dir_TEMP, n_paper=100)

In [None]:
# Cost notes: ~9K(input) tokens per request (with 100K context window), 2800 requests.
# $5 for 4.1-mini: Input $0.40/1M tokens, Cached input $0.10/1M tokens, Output $1.60/1M tokens.
fname = "benchwork_text_CGPT"
init_dict = em.CGPT_init_benchwork(api_key)
# batch_size=3000 exceeds the 2800 requests, so everything goes out as a single batch.
em.creat_batch_jobs(
    init_dict,
    params["model_benchwork"],
    dir_TEMP,
    dir_batch,
    batch_size=3000,
    fname=fname,
    batch_num=None,
)
# em.send_request_nonbatch(init_dict, model, dir_TEMP, dir_batch, fname=fname, i=ib, lab=lab, interval=0.012, num=1000, bench=True)

Current batch num_tokens=25,674,366
1 batches created.


In [None]:
# Parse the benchwork batch outputs in dir_batch, then post-process the raw ratings.
em.process_batch_outputs(dir_batch, dir_batch)
raw_row2rate = loadPKL(dir_batch, "gpt-4.1-mini-2025-04-14-row2rate")
row2rate0 = em.process_row2rate(raw_row2rate, verbose=False, fc=False, bench=True)
# Displayed as the cell result: how many rows received each rating.
Counter(row2rate0.values())

Go to dir_batch "bench_n=2800", manually rename "gpt-4.1-mini-2025-04-14-row2rate.pkl" to "benchwork_text_row2response.pkl", and put in dir_batch

In [4]:
# Process ChatGPT response, save to dictionary for department-wise measures later.
# Run only after "benchwork_text_row2response.pkl" has been placed in dir_batch (see the manual step in the cell above).
pfc.save_benchwork_count(dir_TEMP, dir_batch, dir_dict)

### 3. h-Index and Gender

In [None]:
# Get h-Index. second_try uses different data to attempt to get h-Index.
# Run this cell and then enter your WOS api key.
# getpass is used instead of input() so the key is not echoed into the cell output,
# which would otherwise be stored in the saved notebook.
from getpass import getpass

WOS_api_key = getpass("Web of Science API key: ")
em.get_author_meta_from_WOS(dir_dict, WOS_api_key, second_try=False, verbose=False)
em.save_last_author2hIndex(dir_dict)

In [None]:
# Create a table of names in .csv and manually send it to gender-api.com.
# It is also used for the Web of Science API to retrieve author metadata, from which we obtain the h-Index.
# This method is preferred because it is orders of magnitude faster (only need minutes).
# NOTE(review): if get_author_meta_from_WOS reads this table, this call must run before the h-Index cell — confirm.
em.prepare_author_names(dir_dict, dir_TEMP)

# Process gender.
# Make sure to download the output file from gender-api.com and save it in /input folder, naming it gender-API.csv.
# Process gender-API.csv (in /input) and create last author to gender info lookup dict.
# NOTE(review): the manual gender-api.com step sits between the two calls, so running this cell once
# end-to-end only works when gender-API.csv already exists.
em.save_author_gender(dir_dict, dir_input)

### 4. and 5. Save Department and Country measures.

Department: benchwork, synthesis, brilliance, proportion of men <br/>
Country: Power Distance, individualism, proportion of men <br/>

The department/country has to have at least 100 post-hierarchy citations towards collaborators.

In [None]:
# Brilliance data should be stored in dir_input (/input), named "brilliance_data.csv".
# All three calls use the same THRES_NUM_PAIR_COLLAB threshold from params.json.
pfc.save_department_measures(dir_input, dir_dict, thres=params["THRES_NUM_PAIR_COLLAB"])
pfc.save_country_measures(dir_input, dir_dict, thres=params["THRES_NUM_PAIR_COLLAB"])
pfc.save_department_collab_measures(dir_dict, dir_dict, thres=params["THRES_NUM_PAIR_COLLAB"])

## Final step: Prepare data to plot <br/>
This will take 11 hours because it involves random sampling that's different at each iteration.


In [None]:
import prepare_plot_data as ppd

# Every step reads from /dicts, writes plot data to /npy, and uses the same number of bootstrap samples.
n_bs = params["n_bs"]

ppd.prepare_collab_groups(dir_dict, dir_npy, n_bs=n_bs)  # 2 hours.
ppd.prepare_collab_distance(dir_dict, dir_npy, dist_max=params["dist_max"], n_bs=n_bs)  # 0.5 hours.
ppd.prepare_department_effects_collab(
    dir_dict,
    dir_npy,
    dist_max=params["dist_max"],
    n_bs=n_bs,
    thres=params["THRES_NUM_PAIR_COLLAB"],
)  # 1.5 hours.
ppd.prepare_t_collab(dir_dict, dir_npy, year_ranges=params["year_ranges"], n_bs=n_bs)  # 1 hour.
ppd.prepare_hindex(dir_dict, dir_npy, binW=params["binW"], n_bs=n_bs)  # 1.3 hours.
ppd.prepare_country_effects(dir_dict, dir_npy, n_bs=n_bs, thres=params["THRES_NUM_PAIR_COLLAB"])  # 2 hours.
ppd.prepare_department_effects(dir_dict, dir_npy, n_bs=n_bs, thres=params["THRES_NUM_PAIR_COLLAB"])  # 3.5 hours.
ppd.prepare_gender_effects(dir_dict, dir_npy, n_bs=n_bs, thres=params["THRES_NUM_PAIR_COLLAB"])  # 2 hours.

## We have all ingredients we need for figures. Run plot_ ipynb to make figures. <br/>

### The chunks below are supplementary analyses.

### Below is the multiple linear regression (moderator analysis) across multiple groups. Here the null model and the y-measure are computed at the citation level before aggregation (pre-aggregation), unlike the default analysis, which works at the citation level after aggregation (post-aggregation).

### create null for pre-agg

In [7]:
# Find the null model for the pre-aggregation analysis.
# Same maxN and n_min_samp settings as the main null-parameter call.
ccf.save_cite2sent_null_param_pre_agg(dir_dict, dir_TEMP, maxN=15, n_min_samp=500)

In bins whose sample size < 500
there are 21 bins, 6050.0 citation pairs,
taking up 0.91% of all pairs.


In [None]:
# Save y (mean and std for each sentiment) for each cite pair.
# Inputs: cite2sent_1 from /TEMP and the pre-aggregation null parameters from /dicts.
cite2sent_1 = loadPKL(dir_TEMP, "cite2sent_1")
cite2sent_n = loadPKL(dir_dict, "cite2sent_null_param_pre_agg")
ccf.find_y_rand_samp_pre_agg(cite2sent_1, cite2sent_n, n_rand_samp=params["n_bs"], full_samp=False, save_as_cite_dict=dir_dict)  # Save as cite2SRPC_ms.pkl, to be used below.

In [9]:
from helper_functions import save_Rds
import pandas as pd
import json

# Build the tables needed for the multiple linear regression in R.

# Department-level covariates: one row per department.
department2synthesis = loadPKL(dir_dict, "department2synthesis")
department2benchwork = loadPKL(dir_dict, "department2benchwork")
department_columns = dict(synthesis=department2synthesis, benchwork=department2benchwork)
df = pd.DataFrame.from_dict(department_columns)
save_Rds(dir_dict, "department2meta", df)

# Country-level covariates: one row per country.
country2power_distance = loadPKL(dir_dict, "country2power_distance")
country2individualism = loadPKL(dir_dict, "country2individualism")
country_columns = dict(power_distance=country2power_distance, individualism=country2individualism)
df = pd.DataFrame.from_dict(country_columns)
save_Rds(dir_dict, "country2meta", df)

# Paper meta data.
paper2last_author_department = loadPKL(dir_dict, "paper2last_author_department_28_dep")
paper2last_author = loadPKL(dir_dict, "paper2last_author")  # Last author name of each paper.
last_author2gender_info = loadPKL(dir_dict, "last_author2gender_info")
last_author2hIndex = loadPKL(dir_dict, "last_author2hIndex")  # Last author hIndex.

# Keep a gender label only when the estimate passes both cutoffs; otherwise store None.
# NOTE(review): presumably info[1] is the estimate's confidence and info[2] its sample count — confirm.
paper2last_author_gender = {}
for paper, author in paper2last_author.items():
    info = last_author2gender_info[author]
    passes_cutoffs = info[1] >= 0.7 and info[2] >= 20
    paper2last_author_gender[paper] = info[0] if passes_cutoffs else None
cite2SRPC_ms = loadPKL(dir_dict, "cite2SRPC_ms")

# Citation table: one row per citation pair, indexed 0..n-1 in the iteration order of cite2SRPC_ms.
# Each value `ms` holds the means in ms[0] and the stds in ms[1], ordered positive, neutral, negative.
cite_pairs = list(cite2SRPC_ms)
mean_std = list(cite2SRPC_ms.values())
tmp_data_dict = {
    "cite_pair": dict(enumerate(str(c) for c in cite_pairs)),
    "y_pos": dict(enumerate(ms[0][0] for ms in mean_std)),
    "y_neu": dict(enumerate(ms[0][1] for ms in mean_std)),
    "y_neg": dict(enumerate(ms[0][2] for ms in mean_std)),
    "dy_pos": dict(enumerate(ms[1][0] for ms in mean_std)),
    "dy_neu": dict(enumerate(ms[1][1] for ms in mean_std)),
    "dy_neg": dict(enumerate(ms[1][2] for ms in mean_std)),
    # Inverse variance weights. np.nan if std is 0.
    "wy_pos": dict(enumerate((1 / ms[1][0] ** 2 if ms[1][0] != 0 else np.nan) for ms in mean_std)),
    "wy_neu": dict(enumerate((1 / ms[1][1] ** 2 if ms[1][1] != 0 else np.nan) for ms in mean_std)),
    "wy_neg": dict(enumerate((1 / ms[1][2] ** 2 if ms[1][2] != 0 else np.nan) for ms in mean_std)),
    # c[0] is the citing paper and c[1] the cited paper of each pair.
    "last_author_department": dict(enumerate(paper2last_author_department[c[0]] for c in cite_pairs)),
    "citing_gender": dict(enumerate(paper2last_author_gender.get(c[0], "") for c in cite_pairs)),
    "cited_gender": dict(enumerate(paper2last_author_gender.get(c[1], "") for c in cite_pairs)),
}


def _nonce_get_hindex_diff(pi, pj, abs_=False):
    """Return the h-index difference between the last authors of two papers.

    Reads the notebook-level lookups ``paper2last_author`` and ``last_author2hIndex``.

    Args:
        pi: First paper (the citing paper at the call sites in this cell).
        pj: Second paper (the cited paper at the call sites in this cell).
        abs_: If True, return the absolute difference instead of the signed one.

    Returns:
        ``hi - hj`` (or its absolute value), or None when either paper has no
        known last author or that author has no h-index.
    """
    h_values = []
    for paper in (pi, pj):
        # A paper missing from paper2last_author is treated like a missing h-index,
        # matching how the gender columns tolerate missing papers, instead of raising KeyError.
        if paper not in paper2last_author:
            return None
        h = last_author2hIndex.get(paper2last_author[paper], None)
        # Compare against None explicitly so that a legitimate h-index of 0 is kept.
        if h is None:
            return None
        h_values.append(h)
    hi, hj = h_values
    return np.abs(hi - hj) if abs_ else hi - hj


# h-index gap between the last authors of the citing (c[0]) and cited (c[1]) papers:
# signed difference first, then absolute difference.
for column, use_abs in (("hindex_diff", False), ("hindex_abs", True)):
    tmp_data_dict[column] = {
        i: _nonce_get_hindex_diff(c[0], c[1], abs_=use_abs) for i, c in enumerate(cite2SRPC_ms)
    }
df = pd.DataFrame.from_dict(tmp_data_dict)
# Serialize departments as JSON text; anything that is not a list (e.g. NaN) becomes an empty list.
df["last_author_department"] = df["last_author_department"].apply(
    lambda x: json.dumps(x if isinstance(x, list) else [])
)
save_Rds(dir_dict, "cite2author_aff", df)

### below is for sensitivity of num of sentences on sentiment.

### create null for ns=10

In [None]:
import prepare_plot_data as ppd

# Sensitivity analysis: redo the null parameters and the collaboration-distance data with a cutoff of 10.
maxN = 10

# NOTE(review): the main pipeline calls the same function on the same directories with maxN=15.
# Confirm the output file names depend on maxN so this run does not overwrite the main null parameters.
ccf.save_cite2sent_null_param(dir_dict, dir_TEMP, maxN=maxN, n_min_samp=500)
print(f"null param ns-cutoff={maxN} done.")
ppd.prepare_collab_distance(dir_dict, dir_npy, dist_max=params["dist_max"], n_bs=params["n_bs"], maxN=maxN)  # 0.5 hours.

### below is for synthesis result without review/research papers.

In [None]:
# Null parameters and department effects restricted to which_only="research-article".
# Uses `ppd`, imported in an earlier cell.
ccf.save_cite2sent_null_param_only(dir_dict, dir_TEMP, maxN=15, n_min_samp=500, which_only="research-article")
ppd.prepare_department_effects_only(dir_dict, dir_npy, n_bs=params["n_bs"], thres=params["THRES_NUM_PAIR_COLLAB"], which_only="research-article")  # 2.6 hours.

In [None]:
# Null parameters and department effects restricted to which_only="review-article".
# Uses `ppd`, imported in an earlier cell.
ccf.save_cite2sent_null_param_only(dir_dict, dir_TEMP, maxN=15, n_min_samp=500, which_only="review-article")
ppd.prepare_department_effects_only(dir_dict, dir_npy, n_bs=params["n_bs"], thres=params["THRES_NUM_PAIR_COLLAB"], which_only="review-article")  # 2.6 hours.

### emergency debugging on review effects

ppdnr is now slightly outdated because of the fnames

In [None]:
import prepare_plot_data_no_review as ppdnr

# Same sequence of steps and arguments as the main plot-data cell, run through the ppdnr module instead of ppd.
# NOTE(review): both write to dir_npy; confirm ppdnr's output file names do not overwrite the main results.
ppdnr.prepare_collab_groups(dir_dict, dir_npy, n_bs=params["n_bs"])  # 2 hours.
ppdnr.prepare_collab_distance(dir_dict, dir_npy, dist_max=params["dist_max"], n_bs=params["n_bs"])  # 0.5 hours.
ppdnr.prepare_department_effects_collab(
    dir_dict, dir_npy, dist_max=params["dist_max"], n_bs=params["n_bs"], thres=params["THRES_NUM_PAIR_COLLAB"]
)  # 1.5 hours.
ppdnr.prepare_t_collab(dir_dict, dir_npy, year_ranges=params["year_ranges"], n_bs=params["n_bs"])  # 1 hour.
ppdnr.prepare_hindex(dir_dict, dir_npy, binW=params["binW"], n_bs=params["n_bs"])  # 1.3 hours.
ppdnr.prepare_country_effects(dir_dict, dir_npy, n_bs=params["n_bs"], thres=params["THRES_NUM_PAIR_COLLAB"])  # 2 hours.
ppdnr.prepare_department_effects(dir_dict, dir_npy, n_bs=params["n_bs"], thres=params["THRES_NUM_PAIR_COLLAB"])  # 3.5 hours.
ppdnr.prepare_gender_effects(dir_dict, dir_npy, n_bs=params["n_bs"], thres=params["THRES_NUM_PAIR_COLLAB"])  # 2 hours.