**Practice**
the functions of run_gpt3.py

In [1]:
import os
import json
import argparse

In [2]:
def load_data(args):
    """
    Load and process data from different sources and formats.
    
    Args:
        args: Command line arguments containing data paths and processing options
    
    Returns:
        problems: Dictionary of all problems (test + train)
        test_pids: List of test problem IDs
        cand_pids: List of candidate problem IDs for retrieval
    
    The function handles different data formats:
    1. MedQA format 
    2. MATH problems
    3. Various NLP tasks (squad, tweet_eval, etc.)
    4. Generic JSON format
    """
    # Load test and train data
    problems_test = json.load(open(args.data_root_test))
    problems_train = json.load(open(args.data_root_train))
    
    # Since the data is in list format, we'll concatenate the lists
    problems = problems_test + problems_train
    
    # Create indices for test and train sets
    test_pids = list(range(len(problems_test)))
    train_pids = list(range(len(problems_test), len(problems_test) + len(problems_train)))
    
    cand_pids = train_pids
    for i in cand_pids:
        if i in test_pids:
            print(f"yq: cand id is in test ids")

    return problems, test_pids, cand_pids

In [3]:
def parse_args(args_list):
    """
    Parse command line arguments.
    
    Returns:
        args: Parsed command line arguments
        
    The arguments include:
    1. Data paths and settings
    2. Model configuration
    3. RL algorithm settings
    4. GPT-3/4 settings
    5. Training and evaluation parameters
    """
    parser = argparse.ArgumentParser()
    
    # Data paths
    parser.add_argument('--data_root_train', type=str, default='../data/tabmwp/problems_train.json')
    parser.add_argument('--data_root_test', type=str, default='../data/tabmwp/problems_test.json')
    parser.add_argument('--data_root_vali', type=str, default=None)
    parser.add_argument('--output_root', type=str, default='../results')
    
    # Model settings
    parser.add_argument('--model', type=str, default='gpt3_rl')
    parser.add_argument('--option_inds', type=list,
                        default=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q",
                                 "R", "S", "T", "U", "V", "W", "X", "Y", "Z"])
    parser.add_argument('--batch_size', type=int, default=50)
    
    # User options
    parser.add_argument('--label', type=str, default='exp0')
    parser.add_argument('--test_split', type=str, default='test')
    parser.add_argument('--test_number', type=int, default=500)
    parser.add_argument('--save_every', type=int, default=10)
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--prompt_format', type=str, default='Q-A')
    parser.add_argument('--shot_number', type=int, default=2)
    parser.add_argument('--seed', type=int, default=1)
    
    # GPT-3 settings
    parser.add_argument('--engine', type=str, default='gpt4')
    parser.add_argument('--temperature', type=float, default=0.0)
    parser.add_argument('--max_tokens', type=int, default=512)
    parser.add_argument('--top_p', type=float, default=1.0)
    parser.add_argument('--frequency_penalty', type=float, default=0.0)
    parser.add_argument('--presence_penalty', type=float, default=0.0)
    
    # Policy Model settings
    parser.add_argument('--gpu', type=str, default='0')
    parser.add_argument('--model_config', type=str, default='bert-base-uncased')
    parser.add_argument('--cand_number', type=int, default=100)
    parser.add_argument('--embedding_size', type=int, default=128)
    parser.add_argument('--ckpt_root', type=str, default='../checkpoints')
    parser.add_argument('--ckpt', type=str, default=None)
    parser.add_argument('--ckpt_context', type=str, default=None)
    parser.add_argument('--ckpt_lossnet', type=str, default=None)
    parser.add_argument('--cand_ckpt', type=str, default=None)
    parser.add_argument('--test_pids_ckpt', type=str, default=None)
    parser.add_argument('--train_ckpt', type=str, default=None)
    parser.add_argument('--val_ckpt', type=str, default=None)
    
    # RL Algorithm settings
    parser.add_argument('--lr', type=float, default=0.0001)
    parser.add_argument('--algorithm', type=str, default='ERM')
    parser.add_argument('--adapt_bn', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.0)
    parser.add_argument('--score_th', type=float, default=-1)
    
    # Data augmentation settings
    parser.add_argument('--cluster_type', type=str, default=None)
    parser.add_argument('--n_clusters', type=int, default=5)
    parser.add_argument('--aug_method', type=str, default=None)
    parser.add_argument('--preselection', action='store_true')
    parser.add_argument('--select_number', type=int, default=50)
    parser.add_argument('--get_false_test_pids', action='store_true')
    
    args = parser.parse_args(args=args_list)
    args.meta_batch_size = args.batch_size
    return args

In [4]:
import sys

args_list = [
    '--data_root_train', './data/nq_tmp/train.json',
    '--data_root_test', './data/nq_tmp/test.json',
    '--output_root', './data/nq_tmp/results',
]

# Parse arguments
args = parse_args(args_list)
print('====Input Arguments====')
print(json.dumps(vars(args), indent=2, sort_keys=False))


====Input Arguments====
{
  "data_root_train": "./data/nq_tmp/train.json",
  "data_root_test": "./data/nq_tmp/test.json",
  "data_root_vali": null,
  "output_root": "./data/nq_tmp/results",
  "model": "gpt3_rl",
  "option_inds": [
    "A",
    "B",
    "C",
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "J",
    "K",
    "L",
    "M",
    "N",
    "O",
    "P",
    "Q",
    "R",
    "S",
    "T",
    "U",
    "V",
    "W",
    "X",
    "Y",
    "Z"
  ],
  "batch_size": 50,
  "label": "exp0",
  "test_split": "test",
  "test_number": 500,
  "save_every": 10,
  "debug": false,
  "prompt_format": "Q-A",
  "shot_number": 2,
  "seed": 1,
  "engine": "gpt4",
  "temperature": 0.0,
  "max_tokens": 512,
  "top_p": 1.0,
  "frequency_penalty": 0.0,
  "presence_penalty": 0.0,
  "gpu": "0",
  "model_config": "bert-base-uncased",
  "cand_number": 100,
  "embedding_size": 128,
  "ckpt_root": "../checkpoints",
  "ckpt": null,
  "ckpt_context": null,
  "ckpt_lossnet": null,
  "cand_ckpt": nul

In [5]:
# Load data
problems, pids, cand_pids = load_data(args)
print(problems[0]["ctxs"])
print(len(problems), len(pids), len(cand_pids))
print(pids[:10], cand_pids[:10])

[{'id': '20734671', 'title': 'Back to God', 'text': 'Back to God "Back to God" is a song performed by American singer, Reba McEntire. It was released as the second single from her 2017 album, "", on January 20, 2017. A duet version with Lauren Daigle was released on April 2, 2017. The song became McEntire first Hot Christian Songs No. 1, and Daigle\'s second. The track held the No. 1 position for one week. "Back to God" was originally released on January 20, 2017 as the second single from her twenty-ninth studio album ""Sing It Now: Songs of Faith & Hope"." The song was written by Dallas Davidson'}, {'id': '1670079', 'title': 'Reba McEntire', 'text': 'features The Isaacs. Jay DeMarcus of the Rascal Flatts produced the album. The first single off the album is "Back to God". In January 2018, McEntire won the Grammy Award for Best Roots Gospel Album, her first nomination since 2007, and her first Grammy Award win in more than twenty years, since 1994. She also headlined the festival in th

In [6]:
def get_result_file(args):
    """
    Generate the path for saving results.
    
    Args:
        args: Command line arguments containing output settings
        
    Returns:
        result_file: Path to the result file
    """
    result_path = f"{args.output_root}/{args.model}"
    os.makedirs(result_path, exist_ok=True)

    result_file = "{}/{}_{}_{}_{}_seed_{}.json".format(
        result_path,
        args.label,
        args.test_split,
        args.prompt_format,
        args.shot_number,
        args.seed
    )

    return result_file

In [7]:
import numpy as np
import torch
import random

from algorithm import init_algorithm

# Set random seeds for reproducibility
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
torch.backends.cudnn.benchmark = True

# # Initialize RL algorithm
algorithm = init_algorithm(args)

# Get result file path
result_file = get_result_file(args)

model_config: bert-base-uncased


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
print(result_file)

./data/nq_tmp/results/gpt3_rl/exp0_test_Q-A_2_seed_1.json


## Initialize RL algorithm

algorithm.py



In [None]:
from base_prompt import get_table_text, get_question_text, get_answer, get_solution_text, create_one_example

problems, pids, cand_pids = load_data(args)

for i, pid in enumerate(pids):
    if i >= 2:
        break
    problem = problems[pid]
    table = get_table_text(problem)
    question = get_question_text(problem, args.option_inds)
    answer = get_answer(problem)
    solution = get_solution_text(problem)
    
    print(f"{problem=}\n, {table=}\n, {question=}\n, {answer=}\n, {solution=}\n")
    
    example = create_one_example(args.prompt_format, table, question, answer, solution,
                                                 test_example=True)
    
    print(f"{example=}\n\n")

problem={'question': 'when did reba mcentire record back to god', 'answers': ['February 3, 2017', '2017'], 'ctxs': [{'id': '20734671', 'title': 'Back to God', 'text': 'Back to God "Back to God" is a song performed by American singer, Reba McEntire. It was released as the second single from her 2017 album, "", on January 20, 2017. A duet version with Lauren Daigle was released on April 2, 2017. The song became McEntire first Hot Christian Songs No. 1, and Daigle\'s second. The track held the No. 1 position for one week. "Back to God" was originally released on January 20, 2017 as the second single from her twenty-ninth studio album ""Sing It Now: Songs of Faith & Hope"." The song was written by Dallas Davidson'}, {'id': '1670079', 'title': 'Reba McEntire', 'text': 'features The Isaacs. Jay DeMarcus of the Rascal Flatts produced the album. The first single off the album is "Back to God". In January 2018, McEntire won the Grammy Award for Best Roots Gospel Album, her first nomination sinc

In [9]:

from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

from openai import OpenAI

client = OpenAI()

In [10]:
user_prompt = "Follow the given examples and answer the question following the same format."

question = "What is the capital of France?"

prompt = "You are a helpful assistant."

response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_prompt + ""}
    ],
    temperature=0.7,
    max_tokens=1000,
)

assistant_response = response.choices[0].message.content
print(assistant_response)

Sure, please provide the examples and the question you would like me to answer, and I will follow the same format.
