In [None]:
import csv
import getpass
import json
import os
import pickle
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
from irrCAC.raw import CAC
from openai import OpenAI
from scipy.stats import pearsonr
from sklearn.metrics import cohen_kappa_score
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters

import external_methods as em
from helper_functions import loadPKL, get_IRR, pval_star, savePKL, sorted_dict, print_sigfig

# Every working directory is resolved relative to the notebook's location.
CWD = os.path.abspath("")

# Sub-folders of CWD, in order:
#   input  - input files (params.json is read from here)
#   batch  - ChatGPT related output
#   TEMP   - intermediate files
#   dicts  - look-up dictionaries such as paper2meta; main data directory
#   npy    - data files needed for plotting figures
#   output - figures
#   xml    - xml files
#   DEBUG  - debugging output
(dir_input, dir_batch, dir_TEMP, dir_dict, dir_npy, dir_output, dir_xml, dir_DEBUG) = (
    os.path.join(CWD, sub)
    for sub in ("input", "batch", "TEMP", "dicts", "npy", "output", "xml", "DEBUG")
)

# Layout used by the current round of GPT requests.
dir_new = os.path.join(dir_batch, "iter_all")
dir_new_files = os.path.join(dir_new, "files")

# `i` selects the sentence-rating run folder and `ib` the benchwork run folder
# (presumably the index of the random sample to process - TODO confirm).
i = ib = 3
lab = None
dir_new_i = os.path.join(dir_new, f"i={i}")
dir_new_ib = os.path.join(dir_new, "bench", f"i={ib}")

# Model identifier passed to the request helpers below.
model = "gpt-4.1-mini-2025-04-14"


# Read the run parameters from input/params.json and echo them for the record.
params_path = os.path.join(dir_input, "params.json")
with open(params_path) as params_file:
    params = json.load(params_file)
print(params)

# OpenAI API key. Taken from the OPENAI_API_KEY environment variable when it is
# set, so the notebook can run unattended; otherwise prompt for it.
# getpass (unlike input) does not echo the key, so it never ends up in the
# saved notebook output.
api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

## Before proceeding

`external_methods.save_CGPT_input_files()` should already have been run, generating two files:

`sentences2rate-CGPT.txt` and `sentrow2edgeinfo.pkl` (in the `TEMP` folder).

`cite_coauthor_functions.make_paper2meta()` should also have been run, generating `paper2meta.pkl` in the `dicts` folder.

### Randomly sample from sentence dataset w/o replacement
### Randomly sample from paper xml w/o replacement

In [2]:
# One-off run: small samples used for the initial prompt guide.
# NOTE(review): three directories are passed here, whereas the full-sampling
# call in the next cell passes four to the same function - confirm this call
# still matches the current signature.
bk = em.sample_sentence_snippet(
    dir_TEMP,
    dir_xml,
    dir_batch,
    n_iter=3,
    n_samp=100,
    seed=123,
)

In [None]:
# One-off run: generate all random samples (without replacement) and pickle
# the return value as "bk_all" in dir_new_files.
bk_all = em.sample_sentence_snippet(
    dir_TEMP,
    dir_xml,
    dir_dict,
    dir_new_files,
    n_iter=100,
    n_samp=3000,
    n_iter2=10,
    n_samp2=1000,
    seed=10,
)
savePKL(dir_new_files, "bk_all", bk_all)

### Send requests to gpt.

In [None]:
# Sentence-rating requests: roughly 700 input + 100 output tokens each.
fname = f"sentences2rate-CGPT_{i}"
init_dict = em.CGPT_init_crit_5_explain(api_key)
print(model)
em.creat_batch_jobs_fc(
    init_dict,
    model,
    dir_new_files,
    dir_new_i,
    batch_size=3000,
    fname=fname,
    batch_num=None,
    bench=False,
)
# Non-batch alternative, kept for reference:
# em.send_request_nonbatch_fc(init_dict, model, dir_new_files, dir_new_i, fname=fname, i=i, lab=lab, interval=0.012, num=3000, bench=False)

In [None]:
# Benchwork requests: roughly 9K input tokens each (with a 100K context window).
fname = f"benchwork_text_CGPT_{ib}"
init_dict = em.CGPT_init_benchwork(api_key)
print(model)
em.creat_batch_jobs(
    init_dict,
    model,
    dir_new_files,
    dir_new_ib,
    batch_size=1000,
    fname=fname,
    batch_num=None,
)
# Non-batch alternative, kept for reference:
# em.send_request_nonbatch(init_dict, model, dir_new_files, dir_new_ib, fname=fname, i=ib, lab=lab, interval=0.012, num=1000, bench=True)

### Process gpt outputs.

In [None]:
# Process the batch outputs for sentence run `i`, then tally the ratings.
em.process_batch_outputs_fc(dir_new_i, dir_new_i, i=i)

# The pickle name is built from the configured `model` rather than a repeated
# literal, so changing the model in the config cell cannot make this cell load
# results that belong to a different model.
row2rate0 = em.process_row2rate(
    loadPKL(dir_new_i, f"{model}-row2rate_reason-{i}"),
    verbose=False,
    fc=True,
    bench=False,
)

# Count the first element of each value (presumably the rating, with the
# reason after it - verify against process_row2rate).
Counter(v[0] for v in row2rate0.values())

In [None]:
# Process the batch outputs for benchwork run `ib`, then tally the ratings.
em.process_batch_outputs(dir_new_ib, dir_new_ib, i=ib)

# The pickle name is built from the configured `model` rather than a repeated
# literal, so changing the model in the config cell cannot make this cell load
# results that belong to a different model.
# NOTE: this rebinds `row2rate0`, replacing the value assigned by the
# sentence-output cell above.
row2rate0 = em.process_row2rate(
    loadPKL(dir_new_ib, f"{model}-row2rate-{ib}"),
    verbose=False,
    fc=False,
    bench=True,
)

# Distribution of the values stored in row2rate0.
Counter(row2rate0.values())

## Subsample balanced dataset to rate by humans.

In [None]:
# Draw 20 items per LLM rating (-1, 1, 0) for human raters; the seed is tied
# to the run index `i`.
em.subsample_save_sentence2rate_reason_csv(
    dir_new_i,
    dir_new_files,
    dir_new_i,
    i,
    model,
    n_samps={-1: 20, 1: 20, 0: 20},
    seed=i,
)

In [None]:
# Draw 20 items per LLM rating (1, 0) for human raters; the seed is offset by
# 1000 from the benchwork run index `ib`.
em.subsample_save_snippet2rate_reason_csv(
    dir_new_ib,
    dir_new_files,
    dir_new_ib,
    ib,
    model,
    n_samps={1: 20, 0: 20},
    seed=1000 + ib,
    fc=False,
)

## Aggregate human judgment.

In [None]:
# Inter-rater statistics for the three human raters (columns r1, r2, r3) on the
# sentence ("Sentiment") and snippet ("Benchwork") rating sheets.
df1 = pd.read_excel(os.path.join(dir_new_i, f"sentence2rate_reason_{i}_rate_em_px.xlsx"), nrows=60)
df2 = pd.read_excel(os.path.join(dir_new_ib, f"snippet2rate_{ib}_rate_em_px.xlsx"), nrows=40)

rater_cols = ["r1", "r2", "r3"]

# Each task writes its consensus column into its own run directory, so the
# second task no longer overwrites the first task's consensus.csv.
for t, df, dir_out in zip(["Sentiment", "Benchwork"], [df1, df2], [dir_new_i, dir_new_ib]):
    # Upper-case string answers so that e.g. "yes" and "YES" compare equal.
    df[rater_cols] = df[rater_cols].map(lambda x: x.upper() if isinstance(x, str) else x)
    # 1 when all three raters gave the same answer for the row, else 0.
    df["consensus"] = (df[rater_cols].nunique(axis=1) == 1).astype(int)

    # Pairwise Cohen's kappa between raters.
    ck_12 = cohen_kappa_score(df["r1"], df["r2"])
    ck_13 = cohen_kappa_score(df["r1"], df["r3"])
    ck_23 = cohen_kappa_score(df["r2"], df["r3"])

    # Chance-corrected agreement across all three raters: Gwet's coefficient
    # and Fleiss' kappa, both computed from the same CAC object.
    cac = CAC(df[rater_cols], confidence_level=0.95)
    a1 = cac.gwet()["est"]
    fk = cac.fleiss()["est"]

    df[["consensus"]].to_csv(os.path.join(dir_out, "consensus.csv"))

    # Share of rows each rater marked "YES", in percent.
    pct_r1, pct_r2, pct_r3 = (np.mean(df[col] == "YES") * 100 for col in rater_cols)
    avg = np.mean([pct_r1, pct_r2, pct_r3])

    print(f"\n{t} n={len(df)}:")
    print(f"Cohen's Kappa: {ck_12:.3f}", f"{ck_13:.3f}", f"{ck_23:.3f}")
    print(f"Rater agreement % (avg={print_sigfig(avg)}): {print_sigfig(pct_r1)} {print_sigfig(pct_r2)} {print_sigfig(pct_r3)}")
    print(a1)
    print(fk)

In [None]:
# Inter-rater statistics for the sentence rating sheet, broken down by the
# value in the "LLM rating" column (1, 0, -1).
df1 = pd.read_excel(os.path.join(dir_new_i, f"sentence2rate_reason_{i}_rate_em_px.xlsx"), nrows=60)

t = "Sentiment"
rater_cols = ["r1", "r2", "r3"]

# Upper-case string answers so that e.g. "yes" and "YES" compare equal.
df1[rater_cols] = df1[rater_cols].map(lambda x: x.upper() if isinstance(x, str) else x)
# 1 when all three raters gave the same answer for the row, else 0.
df1["consensus"] = (df1[rater_cols].nunique(axis=1) == 1).astype(int)

for sent in [1, 0, -1]:
    df = df1.loc[df1["LLM rating"] == sent]

    # Pairwise Cohen's kappa between raters.
    ck_12 = cohen_kappa_score(df["r1"], df["r2"])
    ck_13 = cohen_kappa_score(df["r1"], df["r3"])
    ck_23 = cohen_kappa_score(df["r2"], df["r3"])

    # Chance-corrected agreement across all three raters: Gwet's coefficient
    # and Fleiss' kappa, both computed from the same CAC object.
    cac = CAC(df[rater_cols], confidence_level=0.95)
    a1 = cac.gwet()["est"]
    fk = cac.fleiss()["est"]

    # One file per sentiment value, so a later subset does not overwrite an
    # earlier one (or the full-table consensus.csv in the same directory).
    df[["consensus"]].to_csv(os.path.join(dir_new_i, f"consensus_sent={sent}.csv"))

    # Share of rows each rater marked "YES", in percent.
    pct_r1, pct_r2, pct_r3 = (np.mean(df[col] == "YES") * 100 for col in rater_cols)
    avg = np.mean([pct_r1, pct_r2, pct_r3])

    print(f"\n{t} {sent} n={len(df)}:")
    print(f"Cohen's Kappa: {ck_12:.3f}", f"{ck_13:.3f}", f"{ck_23:.3f}")
    print(f"Rater agreement % (avg={print_sigfig(avg)}): {print_sigfig(pct_r1)} {print_sigfig(pct_r2)} {print_sigfig(pct_r3)}")
    print(a1)
    print(fk)

## Human agreed WET/DRY samples.

In [9]:
# Build a lookup from PMC id (the integer after "PMC" in the URL column) to
# the "true_rating" value in the human-rated benchwork test sheet.
dir_nonce = os.path.join(dir_new, "bench", "i=1-n=40-benchtest")
df = pd.read_excel(os.path.join(dir_nonce, "snippet2rate_reason_1_rate_em_px.xlsx"), nrows=40)
pmc2wet = {}
for url, rating in zip(df["URL"], df["true_rating"]):
    pmc_id = int(url.split("PMC")[1])
    pmc2wet[pmc_id] = rating

In [21]:
# Build a lookup of PMC ids whose "Column1" entry equals "R"
# (presumably marking review papers - TODO confirm).
dir_nonce = os.path.join(dir_new, "bench", "i=3")
df = pd.read_excel(os.path.join(dir_nonce, "snippet2rate_3_rate_em_is.xlsx"), nrows=40)
pmc2rev = {}
for url, flag in zip(df["URL"], df["Column1"]):
    if flag == "R":
        pmc2rev[int(url.split("PMC")[1])] = flag