In [None]:
import os
from openai import OpenAI, OpenAIError
import os
import json
import configparser
import random
from tqdm.notebook import tqdm
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

# Read the OpenAI API key from config.ini (section "API", option "openai")
# and build the client that the rest of the notebook uses.
config = configparser.ConfigParser()
config.read('config.ini')
openai_api_key = config.get('API', 'openai')
client = OpenAI(api_key=openai_api_key)

# Seed Python's RNG with 42 so every random draw below is reproducible,
# and use a training sample of N examples.
random.seed(42)
N = 300

### Fine-Tune GPT-3.5

In [None]:
# Validation split: one parsed JSON record per non-blank line of the JSONL file.
# The file is decoded explicitly as UTF-8 so the result does not depend on the
# platform's locale encoding.
with open("biolaysumm2024_data/eLife_val.jsonl", encoding="utf-8") as f:
    lines = [json.loads(line) for line in f if line.strip()]

# Training split, loaded the same way.
with open("biolaysumm2024_data/eLife_train.jsonl", encoding="utf-8") as f:
    train_lines = [json.loads(line) for line in f if line.strip()]

# Draw N distinct training records (random.sample samples without replacement).
sample = random.sample(train_lines, k=N)

In [None]:
# Load the precomputed TextRank extracts used as the fine-tuning input text.
# Decoded explicitly as UTF-8 so the result does not depend on the platform's
# locale encoding.
textrank_summ_file = "textrank_train_40"
with open(f"predictions/{textrank_summ_file}.json", encoding="utf-8") as f:
    textrank_preds = json.load(f)

In [None]:
# Pair each sampled record's TextRank extract with its reference lay summary.
# The full article text is not used here: the model input is the extract only.
# NOTE(review): textrank_preds[i] is assumed to be the extract for sample[i],
# i.e. the predictions file was generated in the same sampled order — confirm.
sample_processed = [
    {
        'text': textrank_preds[i],
        'summary': s['lay_summary']
    }
    for i, s in enumerate(sample)
]

In [None]:
# Write the examples as JSONL in the chat format used for GPT 3.5 fine-tuning:
# one {"messages": [system, user, assistant]} object per line.
with open("fine_tune_data_elife_300_textrank40.jsonl", "w") as f:
    for example in sample_processed:
        system_msg = {"role": "system", "content": "Generate a lay summary of this biomedical article"}
        user_msg = {"role": "user", "content": "### Article: \n" + example['text']}
        assistant_msg = {"role": "assistant", "content": "### Summary: \n" + example['summary']}
        record = json.dumps({"messages": [system_msg, user_msg, assistant_msg]})
        f.write(record + "\n")

In [None]:
# Create training file
# Upload the JSONL written by the data-preparation cell. The name must match
# the file that cell produces ("..._300_..."); the context manager closes the
# handle once the upload call has returned. The client created in the setup
# cell is reused rather than rebuilt.
with open("fine_tune_data_elife_300_textrank40.jsonl", "rb") as training_data:
    training_file = client.files.create(
        file=training_data,
        purpose="fine-tune"
    )

# Create & run training job
# Use the id of the file that was just uploaded instead of a hand-pasted one.
client.fine_tuning.jobs.create(
    training_file=training_file.id,
    model="gpt-3.5-turbo-1106"
)

In [None]:
# Retrieve the status of the training job.
# "ftjob-XXX" is a placeholder: substitute the id of the job created above.
job_id = "ftjob-XXX"
client.fine_tuning.jobs.retrieve(job_id)

### Inference

In [None]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def completion_with_backoff(prompt, model="gpt-3.5-turbo-1106", temperature=0.1, max_tokens=512):
    """Request a lay summary for ``prompt`` from the chat completions API.

    The function is wrapped in tenacity's ``retry``: a failed call is attempted
    again after a random exponential wait (``min=1``, ``max=60``), and retrying
    stops after 6 attempts.

    Args:
        prompt: Content of the user message (the article text to summarise).
        model: Name of the chat model to query. Defaults to the model name the
            notebook previously hard-coded; pass a fine-tuned model id here to
            run inference with a fine-tuned model.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Maximum number of tokens to generate.

    Returns:
        The message content of the first choice in the response.
    """
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "Generate a lay summary of this biomedical article",
            },
            {
                "role": "user",
                "content": prompt,
            },
        ],
        model=model,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return chat_completion.choices[0].message.content

In [None]:
import tiktoken
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that ``string`` encodes to under the tiktoken encoding
    named ``encoding_name``."""
    return len(tiktoken.get_encoding(encoding_name).encode(string))

# Validation records; below they are only used to decide how many predictions
# to generate. Decoded explicitly as UTF-8 so the result does not depend on the
# platform's locale encoding.
with open("biolaysumm2024_data/eLife_val.jsonl", encoding="utf-8") as f:
    lines = [json.loads(line) for line in f if line.strip()]

# Precomputed TextRank extracts that serve as the model input.
# NOTE(review): assumed to be index-aligned with eLife_val.jsonl — confirm.
with open("predictions/textrank_custom_40.json", encoding="utf-8") as f:
    textrank_preds = json.load(f)

# Fail before any API call is made: otherwise a short predictions file would
# only surface as an IndexError part-way through the loop, discarding every
# completion already generated.
if len(textrank_preds) < len(lines):
    raise ValueError(
        f"Expected at least {len(lines)} TextRank extracts, found {len(textrank_preds)}"
    )

predictions = []
for i in tqdm(range(len(lines))):
    # NOTE(review): the fine-tuning examples prefix the article with
    # "### Article: \n", which differs from this prompt — confirm that the
    # difference is intended.
    prompt = f"""\n### Article\n{textrank_preds[i]}"""
    predictions.append(completion_with_backoff(prompt))

# Remove a "### Summary" header and any asterisks from each completion, then
# trim surrounding whitespace.
# NOTE(review): if a completion starts with "### Summary:", the colon is left
# in place by this clean-up — confirm whether it should be stripped as well.
predictions = [
    p.replace("### Summary", "").replace("*", "").strip()
    for p in predictions
]

with open("predictions_gpt35_elife_textrank_40.json", "w") as f:
    json.dump(predictions, f)