## Load data.

In [1]:
import numpy as np
from statsmodels.stats.proportion import multinomial_proportions_confint as mpc
import os
import pickle
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
from collections import Counter
from scipy import stats


from helper_functions import flatten_list, reverse_dict_list, reverse_dict_val, sorted_dict, loadPKL, set_xticks, set_yticks, SENT2IDX, SENT2LAB, stats, pval_star, print_sigfig


# Project folders, all resolved against the directory the notebook runs from.
CWD = os.path.abspath("")
# TEMP: intermediate files | dicts: data to plot | input | output: figures | npy: arrays needed for figures.
dir_TEMP, dir_dict, dir_input, dir_output, dir_npy = (
    os.path.join(CWD, sub) for sub in ("TEMP", "dicts", "input", "output", "npy")
)

# Plotting parameters shared by the figures below.
cm_max = 20  # Largest citation frequency (number of citation mentions) shown on an axis.
sent_colors = ["#504DB2", "#414042", "#B2504D"]  # Order: POS, NEU, NEG.
font_kw = dict(family="arial", weight="normal", size="7")  # Default font for markers, ticks and the like.
mpl.rc("font", **font_kw)

# Load data to plot.
# The sentence file is only read, so it is opened read-only; "r+" would additionally
# require write permission on the file for no benefit.
with open(os.path.join(dir_TEMP, "sentences2rate-CGPT.txt"), mode="r", encoding="UTF-8") as file_in:
    sentences2rate = file_in.readlines()  # One list entry per line of the file.
cite2sent_0 = loadPKL(dir_TEMP, "cite2sent_0")  # Pre-aggregation citation sentences indices.
cite2sent_1 = loadPKL(dir_TEMP, "cite2sent_1")  # Pre-aggregation citation sentiment.
cite2ns = loadPKL(dir_TEMP, "cite2ns")  # Citation frequency between a given citer-citee pair.
cite2sent_emp = loadPKL(dir_dict, "cite2sent_2")  # Each citation pair has just 1 sentiment; "empirical".
paper2meta = loadPKL(dir_dict, "paper2meta")
jour2meta = loadPKL(dir_dict, "jour2meta")
cite2distance = loadPKL(dir_dict, "cite2distance")  # Collaboration distance.

### h-Index stats.

In [None]:
# Report how many distinct last authors exist and what percentage of them
# has an entry in the h-index lookup.
h_index_file = os.path.join(dir_dict, "last_author2hIndex.pkl")
last_author_file = os.path.join(dir_dict, "paper2last_author.pkl")
with open(h_index_file, "rb") as fh:
    last_author2hIndex = pickle.load(fh)
with open(last_author_file, "rb") as fh:
    paper2last_author = pickle.load(fh)

last_authors = set(paper2last_author.values())
presence = {author for author in last_authors if author in last_author2hIndex}
coverage_pct = len(presence) / len(last_authors) * 100
print(f"Num of last authors: {len(last_authors)} {coverage_pct:.2f}")

### Collab groups aggregate behavior stats.

In [None]:
# First axis: the 4 collab distance types, listed as [1, inf), [1], [2, inf), [0].
# Index 1 is reported as "collab" and index 2 as "non-collab" below.
ratio_mat_rel = np.load(os.path.join(dir_npy, "ratio_mat_rel-collab_groups.npy"))

# NOTE(review): `stats` is imported from scipy and then again from helper_functions earlier
# in this file, so the helper_functions object is the one used here — confirm it provides
# scipy's `ttest_ind`.
print("two-sided Welch's $t$-test:\n")
for sent_key, sent_idx in SENT2IDX.items():
    print(f"{SENT2LAB[sent_key]}")
    collab = ratio_mat_rel[1, sent_idx, :]
    non_collab = ratio_mat_rel[2, sent_idx, :]
    collab_mean = np.mean(collab)
    collab_sd = np.std(collab, ddof=1)  # Sample standard deviation.
    non_collab_mean = np.mean(non_collab)
    non_collab_sd = np.std(non_collab, ddof=1)
    # equal_var=False selects Welch's unequal-variance test.
    res = stats.ttest_ind(collab, non_collab, equal_var=False, alternative="two-sided")
    ci = res.confidence_interval()
    report = (
        f"t({print_sigfig(res.df)})={print_sigfig(res.statistic)}, {pval_star(res.pvalue)}"
        f", collab-non-collab diff {print_sigfig(collab_mean - non_collab_mean)}, 95% CI ({print_sigfig(ci.low)},{print_sigfig(ci.high)})"
        f"\ncollab {print_sigfig(collab_mean)}±{print_sigfig(collab_sd)}\nnon-collab {print_sigfig(non_collab_mean)}±{print_sigfig(non_collab_sd)}\n"
    )
    print(report)

### Calculate pre-aggregation sentiment ratio (of citer-citee pairs) for research, review, and both types of papers.

In [None]:
# Tally sentence-level (pre-aggregation) sentiments in one pass over the citation pairs.
# Every tally is [POS, NEU, NEG], i.e. the number of labels equal to 1, 0 and -1.
# The article type is looked up for the first element of each pair key.
_by_type = {"research-article": [0, 0, 0], "review-article": [0, 0, 0]}
_overall = [0, 0, 0]
for pair, labels in cite2sent_1.items():
    pair_counts = [sum(x == lab for x in labels) for lab in (1, 0, -1)]
    article_type = paper2meta[pair[0]]["article-type"]
    # Pairs of any other article type only contribute to the overall tally.
    for bucket in (_by_type.get(article_type), _overall):
        if bucket is None:
            continue
        for pos, cnt in enumerate(pair_counts):
            bucket[pos] += cnt

research_POS, research_NEU, research_NEG = _by_type["research-article"]
research_tot = research_POS + research_NEU + research_NEG
review_POS, review_NEU, review_NEG = _by_type["review-article"]
review_tot = review_POS + review_NEU + review_NEG
x_POS, x_NEU, x_NEG = _overall
x_tot = x_POS + x_NEU + x_NEG

# One row per sentiment (POS, NEU, NEG); columns are research, review, both — in percent.
ratios = [
    [n_research / research_tot * 100, n_review / review_tot * 100, n_both / x_tot * 100]
    for n_research, n_review, n_both in zip(_by_type["research-article"], _by_type["review-article"], _overall)
]
for sent_name, (pct_research, pct_review, pct_both) in zip(["POS", "NEU", "NEG"], ratios):
    print(f"{sent_name} ratios: research {pct_research:.2f}% review {pct_review:.2f}% both {pct_both:.2f}%")

### Calculate post-aggregation sentiment ratio (of citer-citee pairs) for research, review, and both types of papers.

In [None]:
# Tally pair-level (post-aggregation) sentiments in one pass; each pair carries one label.
# Every tally is [POS, NEU, NEG], i.e. the number of pairs labelled 1, 0 and -1.
# The article type is looked up for the first element of each pair key.
_by_type = {"research-article": [0, 0, 0], "review-article": [0, 0, 0]}
_overall = [0, 0, 0]
for pair, label in cite2sent_emp.items():
    hits = [label == lab for lab in (1, 0, -1)]
    article_type = paper2meta[pair[0]]["article-type"]
    # Pairs of any other article type only contribute to the overall tally.
    for bucket in (_by_type.get(article_type), _overall):
        if bucket is None:
            continue
        for pos, hit in enumerate(hits):
            bucket[pos] += hit

research_POS, research_NEU, research_NEG = _by_type["research-article"]
research_tot = research_POS + research_NEU + research_NEG
review_POS, review_NEU, review_NEG = _by_type["review-article"]
review_tot = review_POS + review_NEU + review_NEG
x_POS, x_NEU, x_NEG = _overall
x_tot = x_POS + x_NEU + x_NEG

# One row per sentiment (POS, NEU, NEG); columns are research, review, both — in percent.
ratios = [
    [n_research / research_tot * 100, n_review / review_tot * 100, n_both / x_tot * 100]
    for n_research, n_review, n_both in zip(_by_type["research-article"], _by_type["review-article"], _overall)
]
for sent_name, (pct_research, pct_review, pct_both) in zip(["POS", "NEU", "NEG"], ratios):
    print(f"{sent_name} ratios: research {pct_research:.2f}% review {pct_review:.2f}% both {pct_both:.2f}%")

### Make journal table (only the journals that ended up in the analysis).

In [None]:
# Set of journals in the data (first element of each paper's "jyf" entry).
journals_s = set()
for meta in paper2meta.values():
    journals_s.add(meta["jyf"][0])
# jour2meta has all 185+3 considered and 184+3=187 found in Medline.
# In final data, there are 181 journals.
print(len(journals_s), len(jour2meta))

# Walk the journals that were considered but do not occur in the data.
# Nothing is reported by default; uncomment the print to list them.
for meta in jour2meta.values():
    if meta["MedAbbr"] in journals_s:
        continue
    # print(meta["MedAbbr"], meta["jourMeta"]["OAGoldPercent"])

# Group the papers by journal.
jour2paper = {journal: set() for journal in journals_s}
for paper, meta in paper2meta.items():
    journal = meta["jyf"][0]
    jour2paper[journal].add(paper)

In [None]:
# Print journal table.
# Keep only journals that occur in the data, ordered by case-insensitive title.
jour2meta_sorted = dict(
    sorted(
        ((key, meta) for key, meta in jour2meta.items() if meta["MedAbbr"] in journals_s),
        key=lambda item: item[1]["JournalTitle"].casefold(),
    )
)


# Longest title, printed as a check against the width of the title column.
print(max(len(meta["JournalTitle"]) for meta in jour2meta_sorted.values()))
print(f"{'Journal Title':<132s}{'JIF':>10s}{'% OA Gold':>12s}{'Number of Papers in Analysis':>30s}")
jif_col, oa_col, n_paper_col = [], [], []  # Column values collected for the summary rows.
for meta in jour2meta_sorted.values():
    title = meta["JournalTitle"]
    jif = meta["jourMeta"]["JIF2022"]
    oa_gold = meta["jourMeta"]["OAGoldPercent"]
    n_paper = len(jour2paper[meta["MedAbbr"]])
    jif_col.append(jif)
    oa_col.append(oa_gold)
    n_paper_col.append(n_paper)
    print(f"{title:<132s}{jif:>10}{oa_gold:>12}{n_paper:>30}")
# Summary rows: mean and sample standard deviation (ddof=1) across journals.
print(f"{'Journal Average':<132s}{np.mean(jif_col):>10.1f}{np.mean(oa_col):>12.2f}{np.mean(n_paper_col):>30.3f}")
print(
    f"{'Journal Standard Deviation':<132s}{np.std(jif_col, ddof=1):>10.1f}{np.std(oa_col, ddof=1):>12.2f}{np.std(n_paper_col, ddof=1):>30.3f}"
)

### Make department table (all 28 considered).

In [8]:
def _load_pickle(filename):
    """Unpickle `filename` from the `dir_dict` folder."""
    with open(os.path.join(dir_dict, filename), "rb") as fh:
        return pickle.load(fh)


def _pool_by_author(author2paper, paper2author, paper2units):
    """Map every author to the de-duplicated units listed on any of their papers."""
    pooled = {author: [] for author in author2paper}
    for paper, units in paper2units.items():
        pooled[paper2author[paper]] += units
    return {author: list(set(units)) for author, units in pooled.items()}


def _sizes_sorted_by_key(mapping):
    """Return {key: len(value)} with the keys in ascending order."""
    return {key: len(mapping[key]) for key in sorted(mapping)}


# Paper -> author lookups and their inverses.
paper2last_author = _load_pickle("paper2last_author.pkl")
paper2first_author = _load_pickle("paper2first_author.pkl")
last_author2paper = reverse_dict_val(paper2last_author)
first_author2paper = reverse_dict_val(paper2first_author)

# Departments of the last / first author for each paper, and the inverse lookups.
paper2last_author_department = _load_pickle("paper2last_author_department_28_dep.pkl")
paper2first_author_department = _load_pickle("paper2first_author_department_28_dep.pkl")
last_author_department2paper = reverse_dict_list(paper2last_author_department)
first_author_department2paper = reverse_dict_list(paper2first_author_department)
dep_names = sorted(last_author_department2paper)


# Last authors: department -> papers, author -> departments, department -> authors.
last_author_dep2paper = reverse_dict_list(paper2last_author_department)
last_author2dep = _pool_by_author(last_author2paper, paper2last_author, paper2last_author_department)
dep2last_author = reverse_dict_list(last_author2dep)

# First authors: same three lookups.
first_author_dep2paper = reverse_dict_list(paper2first_author_department)
first_author2dep = _pool_by_author(first_author2paper, paper2first_author, paper2first_author_department)
dep2first_author = reverse_dict_list(first_author2dep)

# Per-department paper counts and author counts, keyed in sorted department order.
last_author_dep2n_paper = _sizes_sorted_by_key(last_author_dep2paper)
first_author_dep2n_paper = _sizes_sorted_by_key(first_author_dep2paper)

dep2n_last_author = _sizes_sorted_by_key(dep2last_author)
dep2n_first_author = _sizes_sorted_by_key(dep2first_author)

In [None]:
# Department table: one row per department; a department missing from a lookup counts as 0.
print(
    f"{'Department':<45s}{'# of Last Authors':>20s}{'# of Last Author Papers':>30s}{'# of First Authors':>25s}{'# of First Author Papers':>30s}"
)
aa, bb, cc, dd = [], [], [], []  # Column values in row order (same order as dep_names).
for co in dep_names:
    n_last_authors = dep2n_last_author.get(co, 0)
    n_last_papers = last_author_dep2n_paper.get(co, 0)
    n_first_authors = dep2n_first_author.get(co, 0)
    n_first_papers = first_author_dep2n_paper.get(co, 0)
    aa.append(n_last_authors)
    bb.append(n_last_papers)
    cc.append(n_first_authors)
    dd.append(n_first_papers)
    print(f"{co.capitalize():<45s}{n_last_authors:>20}{n_last_papers:>30}{n_first_authors:>25}{n_first_papers:>30}")

### Make country table (27 that end up in the analyses).

In [10]:
from country_list import countries_for_language


def _load_pickle(filename):
    """Unpickle `filename` from the `dir_dict` folder."""
    with open(os.path.join(dir_dict, filename), "rb") as fh:
        return pickle.load(fh)


def _pool_by_author(author2paper, paper2author, paper2units):
    """Map every author to the de-duplicated units listed on any of their papers."""
    pooled = {author: [] for author in author2paper}
    for paper, units in paper2units.items():
        pooled[paper2author[paper]] += units
    return {author: list(set(units)) for author, units in pooled.items()}


def _sizes_sorted_by_key(mapping):
    """Return {key: len(value)} with the keys in ascending order."""
    return {key: len(mapping[key]) for key in sorted(mapping)}


# Country code -> English country name.
two_letter2full_name = dict(countries_for_language("en"))

# Per-country scores shown in the table.
country2power_distance = _load_pickle("country2power_distance.pkl")
country2individualism = _load_pickle("country2individualism.pkl")

# Paper -> author lookups and their inverses.
paper2last_author = _load_pickle("paper2last_author.pkl")
paper2first_author = _load_pickle("paper2first_author.pkl")
last_author2paper = reverse_dict_val(paper2last_author)
first_author2paper = reverse_dict_val(paper2first_author)

# Countries of the last / first author for each paper, and the inverse lookups.
paper2last_author_country = _load_pickle("paper2last_author_country.pkl")
paper2first_author_country = _load_pickle("paper2first_author_country.pkl")
last_author_country2paper = reverse_dict_list(paper2last_author_country)
first_author_country2paper = reverse_dict_list(paper2first_author_country)
# Table rows: the countries that have a power-distance score, ordered by full name.
country_names = sorted(country2power_distance, key=two_letter2full_name.__getitem__)


# Last authors: country -> papers, author -> countries, country -> authors.
last_author_nat2paper = reverse_dict_list(paper2last_author_country)
last_author2nat = _pool_by_author(last_author2paper, paper2last_author, paper2last_author_country)
nat2last_author = reverse_dict_list(last_author2nat)

# First authors: same three lookups.
first_author_nat2paper = reverse_dict_list(paper2first_author_country)
first_author2nat = _pool_by_author(first_author2paper, paper2first_author, paper2first_author_country)
nat2first_author = reverse_dict_list(first_author2nat)

# Per-country paper counts and author counts, keyed in sorted country-code order.
last_author_nat2n_paper = _sizes_sorted_by_key(last_author_nat2paper)
first_author_nat2n_paper = _sizes_sorted_by_key(first_author_nat2paper)

nat2n_last_author = _sizes_sorted_by_key(nat2last_author)
nat2n_first_author = _sizes_sorted_by_key(nat2first_author)

In [None]:
# Country table: one row per country; a country missing from a count lookup shows 0.
print(
    f"{'Country/Region':<25s}{'Power Distance':>20s}{'Individualism':>20s}{'# of Last Authors':>20s}{'# of Last Author Papers':>30s}{'# of First Authors':>25s}{'# of First Author Papers':>30s}"
)
for co in country_names:
    n_last_authors = nat2n_last_author.get(co, 0)
    n_last_papers = last_author_nat2n_paper.get(co, 0)
    n_first_authors = nat2n_first_author.get(co, 0)
    n_first_papers = first_author_nat2n_paper.get(co, 0)
    pwd = int(country2power_distance[co])  # Scores are shown as whole numbers.
    inv = int(country2individualism[co])
    full_name = f"{two_letter2full_name[co]} ({co})"
    print(f"{full_name:<25s}{pwd:>20}{inv:>20}{n_last_authors:>20}{n_last_papers:>30}{n_first_authors:>25}{n_first_papers:>30}")

## Make figure: sentiment ratio (post-hierarchy) as a function of number of sentences in pre-hierarchy.

#### First is the Monte Carlo simulation using the pre-hierarchy sentiment ratio.
#### Second is the empirical result.

In [None]:
# Sentiment ratio prevalence (pre-hierarchy)
p_neg = 0.0602
p_pos = 0.2488
p_neu = 0.6910
n_mc = 1000  # Number of Monte Carlo repetitions.

rng = np.random.default_rng()  # Not seeded, so numbers differ slightly between runs.
ratio_mat_rel = np.zeros((cm_max, 3, n_mc))  # (citation frequency - 1, sentiment [POS, NEU, NEG], repetition).
n_cp = np.zeros(cm_max)  # Num of citation pairs.

# For a given ns (num of sentences):
# 1. In a citation pair we have ns sentences; for each sentence we draw a sentiment from a
#    categorical distribution using the overall prevalence data above, i.e. we draw ns times.
#    Do hierarchy (AKA aggregation) to calculate the post-hierarchy sentiment for this pair.
# 2. Do that for all citation pairs with that ns; we use the empirical number of pairs.
# 3. Calculate the post-hierarchy sentiment ratio for each sentiment among these pairs.
# 4. Repeat steps 1-3 n_mc times.
#
# We get a distribution of post-hierarchy sentiment ratio for a given ns.
# Use mean+sem or whatever and plot.

# Count the pairs per citation frequency once instead of rescanning cite2ns on every iteration.
ns_counts = Counter(cite2ns.values())

for idx in tqdm(range(cm_max)):
    # Row `idx` describes citation frequency idx + 1, so idx + 1 sentences must be drawn per
    # pair. Drawing only `idx` sentences would simulate one sentence too few at every
    # frequency (and zero sentences, hence 100% NEU, at frequency 1).
    n_sentence = idx + 1
    n_pair = ns_counts[n_sentence]
    n_cp[idx] = n_pair
    # continue  # Skip simulation.
    # NOTE: this array holds n_pair * n_sentence * n_mc draws and can be large for low frequencies.
    sent_ = rng.choice([1, 0, -1], p=[p_pos, p_neu, p_neg], size=(n_pair, n_sentence, n_mc), replace=True)
    # Perform hierarchy aggregation along n_sentence dim: -1 > 1 > 0.
    sent_ = np.where((sent_ == -1).any(1), -1, np.where((sent_ == 1).any(1), 1, 0))
    # Calculate post-hierarchy sentiment ratio along n_pair dim.
    ratio_mat_rel[idx, :, :] = np.vstack([(sent_ == v).mean(0) for v in [1, 0, -1]])  # dim=(3, n_mc).

In [None]:
# Figure production: number of citation pairs at each citation frequency (log y axis).
fig, ax = plt.subplots(figsize=(3.41, 3.41))
xticklabels = [str(tick) for tick in range(cm_max + 1)]  # Citation frequency label.
x_arr = np.arange(len(xticklabels))
ns_values = np.array(list(cite2ns.values()))
med = np.median(ns_values)
p95 = np.percentile(ns_values, 95)


# Vertical markers: dotted at the median, solid at the 95th percentile.
for x_pos, line_style in ((med, ":"), (p95, "-")):
    ax.vlines(x_pos, ymin=1, ymax=4e5, color="red", alpha=0.5, zorder=1, linestyle=line_style)
# n_cp[k] belongs to citation frequency k + 1, hence position 0 is skipped on the x axis.
ax.plot(x_arr[1:], n_cp, color="grey", alpha=0.5)

ax.set_xlabel("Citation Frequency", size=10)  # x-large
ax.set_ylabel("Number of Citation Pairs", size=10)  # x-large
ax.set_xticks(x_arr, xticklabels)
for grid_axis in ("x", "y"):
    ax.grid(which="major", axis=grid_axis, alpha=0.2)
ax.set_yscale("log")
ax.set_ylim([1, 4e5])
set_xticks(ax)
set_yticks(ax)

fig.tight_layout()
plt.show()
out_path = os.path.join(dir_output, "SUPP Sample Size vs. NS.svg")
fig.savefig(out_path, bbox_inches="tight", transparent=True)
fig.clf()  # Clear figure.
plt.close(fig=fig)  # Close figure.

print(f"citation frequency median: {med}")
print(f"citation frequency 95 percentile: {p95}")

In [None]:
# Figure production: simulated sentiment ratio as a function of citation frequency.
fig, ax = plt.subplots(figsize=(3.41, 3.41))
xticklabels = [str(tick) for tick in range(cm_max + 1)]  # Citation frequency label.
x_arr = np.arange(len(xticklabels))
# Dotted horizontal reference line at y = 0.
ax.plot(x_arr, [0] * len(x_arr), color="grey", alpha=0.5, zorder=1, linestyle=":")

x_freq = x_arr[1:]  # Row k of ratio_mat_rel is drawn at x = k + 1.
for i, color in zip(range(3), sent_colors):  # One curve for each of the 3 sentiments.
    per_repetition = ratio_mat_rel[:, i, :]
    # Mean and sample standard deviation over the last axis (the Monte Carlo repetitions), ignoring NaNs.
    mean_curve = np.nanmean(per_repetition, axis=-1)
    sd_curve = np.nanstd(per_repetition, axis=-1, ddof=1)
    ax.plot(x_freq, mean_curve, color=color)
    ax.fill_between(x_freq, mean_curve - sd_curve, mean_curve + sd_curve, color=color, alpha=0.3, edgecolor=None)

ax.set_xlabel("Citation Frequency", size=10)  # x-large
ax.set_ylabel("Sentiment Ratio", size=10)  # x-large
ax.set_xticks(x_arr, xticklabels)
for grid_axis in ("x", "y"):
    ax.grid(which="major", axis=grid_axis, alpha=0.2)

fig.tight_layout()
plt.show()
out_path = os.path.join(dir_output, "SUPP SR vs. NS (Monte Carlo Simulation).svg")
fig.savefig(out_path, bbox_inches="tight", transparent=True)
fig.clf()  # Clear figure.
plt.close(fig=fig)  # Close figure.

### Below is empirical plot.

In [15]:
# row: Citation frequency; col: 3 sentiments
def _is_goodman_invalid(counts, th=5):
    for c in counts:
        if c < th:
            return True
    return False


def _label_counts(labels):
    """Count the entries of the array `labels` equal to 1, 0 and -1, returned as a float array."""
    return np.array([np.sum(labels == s) for s in [1, 0, -1]], dtype=float)


# Row k describes citation frequency k + 1; columns are POS, NEU, NEG.
# Without suffix: pair-level (post-aggregation) labels; "_pre": sentence-level labels.
count_mat_rel = np.zeros((cm_max, 3))
ratio_mat_rel = np.zeros((cm_max, 3))
count_mat_rel_pre = np.zeros((cm_max, 3))
ratio_mat_rel_pre = np.zeros((cm_max, 3))
for row in range(cm_max):
    freq = row + 1
    pairs_at_freq = [pair for pair, ns in cite2ns.items() if ns == freq]

    # One label per citation pair.
    post_counts = _label_counts(np.array([cite2sent_emp[pair] for pair in pairs_at_freq]))
    count_mat_rel[row, :] = post_counts
    ratio_mat_rel[row, :] = post_counts / np.sum(post_counts)

    # All sentence labels of those pairs, pooled.
    pre_counts = _label_counts(np.array(flatten_list([cite2sent_1[pair] for pair in pairs_at_freq])))
    count_mat_rel_pre[row, :] = pre_counts
    ratio_mat_rel_pre[row, :] = pre_counts / np.sum(pre_counts)

# Shared y-axis limits for the two empirical figures.
ymin, ymax = -0.02, 0.82

In [None]:
# Figure production: empirical post-aggregation sentiment ratio vs. citation frequency,
# with a confidence band per sentiment and dashed markers where counts are small.
fig, ax = plt.subplots(figsize=(3.41 * 1.5, 3.41))
xticklabels = [f"{x}" for x in range(0, cm_max + 1)]  # Citation frequency label.
# Citation frequencies 1..cm_max; frequency j is stored in row j - 1 of the matrices.
x_arr = np.arange(1, len(xticklabels))

for i in range(3):  # One curve for each of the 3 sentiments.
    ax.plot(x_arr, ratio_mat_rel[:, i], color=sent_colors[i])
    # Goodman multinomial confidence interval (alpha=0.05) computed per citation frequency;
    # row i of the result is kept, and its two columns become the band's lower and upper edge.
    cfy = np.array([mpc(count_mat_rel[j - 1, :], alpha=0.05, method="goodman")[i, :] for j in x_arr])
    ax.fill_between(x_arr, cfy[:, 0], cfy[:, 1], color=sent_colors[i], alpha=0.3, edgecolor=None)
# Dashed marker at y=0.11 over every frequency where at least one sentiment count is below 10.
for xi in x_arr[np.array([_is_goodman_invalid(count_mat_rel[j - 1, :], 10) for j in x_arr])]:
    xmin = xi - 0.5
    xmax = xi + 0.5 if xi != x_arr[-1] else xi  # The segment stops at the last frequency instead of overshooting it.
    ax.hlines(y=0.11, xmin=xmin, xmax=xmax, color="black", linewidth=2, linestyle=(0, (4, 4)))
# Same marker at y=0.18 for the stricter threshold of 5.
for xi in x_arr[np.array([_is_goodman_invalid(count_mat_rel[j - 1, :], 5) for j in x_arr])]:
    xmin = xi - 0.5
    xmax = xi + 0.5 if xi != x_arr[-1] else xi
    ax.hlines(y=0.18, xmin=xmin, xmax=xmax, color="black", linewidth=2, linestyle=(0, (4, 4)))
# Text labels for the two marker rows, at fixed data coordinates.
ax.text(18, 0.112, "n < 10", ha="center", va="bottom", fontsize=8)
ax.text(19, 0.182, "n < 5", ha="center", va="bottom", fontsize=8)

# From here on x_arr also contains 0; it is used for ticks, guide lines and x limits only.
x_arr = np.arange(len(xticklabels))

ax.set_xlabel("Citation Frequency", size=10)  # x-large
ax.set_ylabel("Sentiment Ratio", size=10)  # x-large
# Tick and label every 5th position (0, 5, 10, ...).
ax.set_xticks([t2 for t1, t2 in enumerate(x_arr) if t1 % 5 == 0])  # Citation frequency label.
ax.set_xticklabels([t2 for t1, t2 in enumerate(xticklabels) if t1 % 5 == 0])  # Citation frequency label.

# ymin, ymax = ax.get_ylim()
# Hand-drawn light-gray guide lines behind the data (zorder=0).
for xi in x_arr:
    ax.axvline(x=xi, color="lightgray", linestyle="-", linewidth=0.5, zorder=0)
# The y ticks are read BEFORE set_ylim below, so the horizontal guides follow the tick
# positions as they stand at this point; keep this statement order.
for yi in ax.get_yticks():
    ax.axhline(y=yi, color="lightgray", linestyle="-", linewidth=0.5, zorder=0)
ax.set_ylim(ymin, ymax)
ax.set_xlim(x_arr[0] - 0.5, x_arr[-1] + 0.5)

# Tick geometry; minor_len and minor_width are not used in this cell.
major_len = 6.5
minor_len = 4
major_width = 1.5
minor_width = 1
# ax.tick_params(axis="y", which="major", length=major_len, width=major_width, labelsize=10)
ax.tick_params(axis="x", which="major", length=major_len, width=major_width, labelsize=10)
set_yticks(ax, alt=1)

fig.tight_layout()
plt.show()
fig.savefig(os.path.join(dir_output, "SUPP SR vs. NS.svg"), bbox_inches="tight", transparent=True)
fig.clf()  # Clear figure.
plt.close(fig=fig)  # Close figure.

In [None]:
# Figure production: empirical pre-aggregation (sentence-level) sentiment ratio vs. citation
# frequency, with a confidence band per sentiment and dashed markers where counts are small.
fig, ax = plt.subplots(figsize=(3.41 * 1.5, 3.41))
xticklabels = [f"{x}" for x in range(0, cm_max + 1)]  # Citation frequency label.
# Citation frequencies 1..cm_max; frequency j is stored in row j - 1 of the matrices.
x_arr = np.arange(1, len(xticklabels))

for i in range(3):  # One curve for each of the 3 sentiments.
    ax.plot(x_arr, ratio_mat_rel_pre[:, i], color=sent_colors[i])
    # Goodman multinomial confidence interval (alpha=0.05) computed per citation frequency;
    # row i of the result is kept, and its two columns become the band's lower and upper edge.
    cfy = np.array([mpc(count_mat_rel_pre[j - 1, :], alpha=0.05, method="goodman")[i, :] for j in x_arr])
    ax.fill_between(x_arr, cfy[:, 0], cfy[:, 1], color=sent_colors[i], alpha=0.3, edgecolor=None)
# Dashed marker at y=0.11 over every frequency where at least one sentiment count is below 10.
for xi in x_arr[np.array([_is_goodman_invalid(count_mat_rel_pre[j - 1, :], 10) for j in x_arr])]:
    xmin = xi - 0.5
    xmax = xi + 0.5 if xi != x_arr[-1] else xi  # The segment stops at the last frequency instead of overshooting it.
    ax.hlines(y=0.11, xmin=xmin, xmax=xmax, color="black", linewidth=2, linestyle=(0, (4, 4)))
# Same marker at y=0.18 for the stricter threshold of 5.
for xi in x_arr[np.array([_is_goodman_invalid(count_mat_rel_pre[j - 1, :], 5) for j in x_arr])]:
    xmin = xi - 0.5
    xmax = xi + 0.5 if xi != x_arr[-1] else xi
    ax.hlines(y=0.18, xmin=xmin, xmax=xmax, color="black", linewidth=2, linestyle=(0, (4, 4)))
# The text labels for the marker rows are disabled in this figure.
# ax.text(18, 0.112, "n < 10", ha="center", va="bottom", fontsize=8)
# ax.text(19, 0.182, "n < 5", ha="center", va="bottom", fontsize=8)

# From here on x_arr also contains 0; it is used for ticks, guide lines and x limits only.
x_arr = np.arange(len(xticklabels))

ax.set_xlabel("Citation Frequency", size=10)  # x-large
ax.set_ylabel("Sentiment Ratio", size=10)  # x-large
# Tick and label every 5th position (0, 5, 10, ...).
ax.set_xticks([t2 for t1, t2 in enumerate(x_arr) if t1 % 5 == 0])  # Citation frequency label.
ax.set_xticklabels([t2 for t1, t2 in enumerate(xticklabels) if t1 % 5 == 0])  # Citation frequency label.

# ymin, ymax = ax.get_ylim()
# Hand-drawn light-gray guide lines behind the data (zorder=0).
for xi in x_arr:
    ax.axvline(x=xi, color="lightgray", linestyle="-", linewidth=0.5, zorder=0)
# The y ticks are read BEFORE set_ylim below, so the horizontal guides follow the tick
# positions as they stand at this point; keep this statement order.
for yi in ax.get_yticks():
    ax.axhline(y=yi, color="lightgray", linestyle="-", linewidth=0.5, zorder=0)
ax.set_ylim(ymin, ymax)
ax.set_xlim(x_arr[0] - 0.5, x_arr[-1] + 0.5)

# Tick geometry; minor_len and minor_width are not used in this cell.
major_len = 6.5
minor_len = 4
major_width = 1.5
minor_width = 1
# ax.tick_params(axis="y", which="major", length=major_len, width=major_width, labelsize=10)
ax.tick_params(axis="x", which="major", length=major_len, width=major_width, labelsize=10)
set_yticks(ax, alt=1)

fig.tight_layout()
plt.show()
fig.savefig(os.path.join(dir_output, "SUPP SR vs. NS (pre-agg).svg"), bbox_inches="tight", transparent=True)
fig.clf()  # Clear figure.
plt.close(fig=fig)  # Close figure.

In [None]:
# Histogram (log y axis) of how many distinct papers each citing paper cites, for values 1..20.
cite2sent = loadPKL(dir_dict, "cite2sent_2")

# Collect the distinct cited papers per citing paper, then keep only their number.
paper2num_paper = defaultdict(set)
for citer, citee in cite2sent:
    paper2num_paper[citer].add(citee)
paper2num_paper = {k: len(v) for k, v in paper2num_paper.items()}

arr = np.array(list(paper2num_paper.values()))
print(f"sample size: {len(paper2num_paper)} | range(x): {min(arr)}, {max(arr)}")
values = np.arange(1, 21)
# minlength=21 guarantees 20 bars even when no value reaches 20; without it the slice
# would be shorter than `values` and plt.bar would fail on the length mismatch.
counts = np.bincount(arr, minlength=21)[1:21]  # skip index 0
plt.bar(values, counts)
# Mean and median are taken over all values, including those above 20 that are not drawn.
plt.axvline(np.mean(arr), color="black", linestyle="--", linewidth=2, label="Mean")
plt.axvline(np.median(arr), color="black", linestyle=":", linewidth=2, label="Median")
plt.xlabel("Number of Unique Papers Cited")
plt.ylabel("Frequency")
plt.yscale("log")
plt.legend()
plt.xticks(values)
plt.savefig(os.path.join(dir_output, "SUPP Number of Unique Papers Cited.svg"), format="svg", bbox_inches="tight")
plt.show()

In [None]:
# Histogram (log y axis) of the number of citation sentences per citation pair, for values 1..20.
cite2ns = loadPKL(dir_TEMP, "cite2ns")

arr = np.array(list(cite2ns.values()))
print(f"sample size: {len(cite2ns)} | range(x): {min(arr)}, {max(arr)}")
values = np.arange(1, 21)
# minlength=21 guarantees 20 bars even when no value reaches 20; without it the slice
# would be shorter than `values` and plt.bar would fail on the length mismatch.
counts = np.bincount(arr, minlength=21)[1:21]  # skip index 0
plt.bar(values, counts)
# Mean and median are taken over all values, including those above 20 that are not drawn.
plt.axvline(np.mean(arr), color="black", linestyle="--", linewidth=2, label="Mean")
plt.axvline(np.median(arr), color="black", linestyle=":", linewidth=2, label="Median")
plt.xlabel("Number of Citation Sentences for a Given Citation Pair")
plt.ylabel("Frequency")
plt.yscale("log")
plt.legend()
plt.xticks(values)
plt.savefig(os.path.join(dir_output, "SUPP Number of Citation Sentences for a Given Citation Pair.svg"), format="svg", bbox_inches="tight")
plt.show()