In [1]:
%pip install textgrad

Collecting textgradNote: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'c:\\users\\wmichele\\appdata\\local\\anaconda3\\envs\\nw3\\lib\\site-packages\\numpy-1.23.4.dist-info\\METADATA'




  Downloading textgrad-0.1.5.tar.gz (65 kB)
     ---------------------------------------- 0.0/65.7 kB ? eta -:--:--
     ---------------------------------------- 65.7/65.7 kB 3.7 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting openai>=1.23.6 (from textgrad)
  Obtaining dependency information for openai>=1.23.6 from https://files.pythonhosted.org/packages/5e/4d/affea11bd85ca69d9fdd15567495bb9088ac1c37498c95cb42d9ecd984ed/openai-1.43.0-py3-none-any.whl.metadata
  Downloading openai-1.43.0-py3-none-any.whl.metadata (22 kB)
Collecting tenacity>=8.2.3 (from textgrad)
  Obtaining dependency information for tenacity>=8.2.3 from https://files.pythonhosted.org/packages/b6/cb/b86984bed139586d01532a587464b5805f12e397594f19f931c4c2fbfa61/tenacity-9.0.0-py3-none-any.whl.metadata
  Downloading tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting python-dotenv>=1.0.0 (from textgrad)
  Obtaining dependency infor

In [None]:
import argparse
import concurrent
# `import concurrent` alone does not load the `futures` submodule; import it
# explicitly so `concurrent.futures.ThreadPoolExecutor` is guaranteed to resolve.
import concurrent.futures
import csv
import random

import numpy as np
import pandas as pd
import textgrad as tg
from dotenv import load_dotenv
from textgrad.autograd import StringBasedFunction
from textgrad.tasks import load_task, DataLoader
from textgrad.variable import Variable
from tqdm import tqdm

from loss import MultiFieldTokenParsedEvaluation
 
def set_seed(seed):
    """Seed NumPy's global RNG and Python's ``random`` module with ``seed``."""
    # NumPy first, then the stdlib generator.
    for seeder in (np.random.seed, random.seed):
        seeder(seed)
 
def load_dataset(filepath, evaluation_api, train_size = 0.2, val_size = 0.4, test_size = 0.4):
    """Read a two-column CSV and split it, in file order, into train/val/test sets.

    Args:
        filepath: Path to a UTF-8 CSV file. The first row is treated as a header
            and skipped; every other non-empty row must hold exactly two fields.
        evaluation_api: Engine passed to ``MultiFieldTokenParsedEvaluation``.
        train_size: Fraction of the rows (rounded down) used for the training split.
        val_size: Fraction of the rows (rounded down) used for the validation split.
        test_size: Kept for backward compatibility but not used; the test split
            is simply every row left after the train and validation splits.

    Returns:
        ``(train_data, val_data, test_data, eval_fn)`` where the three data
        splits are tuples of ``(sentence, number)`` string pairs and ``eval_fn``
        is the ``MultiFieldTokenParsedEvaluation`` built below.

    Raises:
        ValueError: If a non-empty row does not contain exactly two fields.
    """
    tuples_list = []
    # newline='' is what the csv module requires so that line breaks inside
    # quoted fields are parsed correctly.
    with open(filepath, mode='r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header row; a completely empty file yields no rows
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline)
                continue
            sentence, number = row
            tuples_list.append((sentence, number))

    data = tuple(tuples_list)
    total = len(data)
    train_end = int(total * train_size)
    val_end = train_end + int(total * val_size)

    # No shuffling: the splits are contiguous slices in file order.
    train_data = data[:train_end]
    val_data = data[train_end:val_end]
    test_data = data[val_end:]

    role_descriptions = [
            "Transcript for the task",
            "Ground truth summarization",
            "Summarization from the language model"
        ]

    evaluation_instruction = "Below is a question from a transcript summarization task, the ground truth summarization, and final summarization prediction. Is the final summarization prediction correct, i.e. the same as the ground truth answer? Say only 1 (yes) or 0 (no). Return your response within <ACCURACY> </ACCURACY> tags. e.g.<ACCURACY> 0 </ACCURACY> or <ACCURACY> 1 </ACCURACY>."
    eval_instruction = tg.Variable(evaluation_instruction, requires_grad=False, role_description="evaluation instruction for the task")

    eval_fn = MultiFieldTokenParsedEvaluation(
        eval_instruction,
        role_descriptions=role_descriptions,
        engine=evaluation_api,
        parse_tags=["<ACCURACY>", "</ACCURACY>"]
    )
    return train_data, val_data, test_data, eval_fn
 
def load_blog_dataset(filepath, train_size=0.2, val_size=0.4):
    """Read a two-column CSV and return train/val/test splits plus the full data.

    The original body returned four names that were never assigned, so every
    call raised ``NameError``. The split below follows the same contiguous,
    file-order scheme and default fractions as ``load_dataset``.

    NOTE(review): the fourth return value (``eval_data``) is assumed to be the
    complete, unsplit dataset -- confirm this matches the intended meaning.

    Args:
        filepath: Path to a UTF-8 CSV file. The first row is treated as a header
            and skipped; every other non-empty row must hold exactly two fields.
        train_size: Fraction of the rows (rounded down) used for training.
        val_size: Fraction of the rows (rounded down) used for validation.

    Returns:
        ``(train_data, val_data, test_data, eval_data)``, each a tuple of
        ``(sentence, number)`` string pairs. ``test_data`` holds every row left
        after the train and validation splits.

    Raises:
        ValueError: If a non-empty row does not contain exactly two fields.
    """
    tuple_list = []
    # newline='' is what the csv module requires so that line breaks inside
    # quoted fields are parsed correctly.
    with open(filepath, mode='r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header row; tolerate an empty file
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines
                continue
            sentence, number = row
            tuple_list.append((sentence, number))

    eval_data = tuple(tuple_list)
    total = len(eval_data)
    train_end = int(total * train_size)
    val_end = train_end + int(total * val_size)

    train_data = eval_data[:train_end]
    val_data = eval_data[train_end:val_end]
    test_data = eval_data[val_end:]

    return train_data, val_data, test_data, eval_data
 
def eval_sample(item, eval_fn, model):
    """Run ``model`` on one ``(query, answer)`` pair and score the response.

    Args:
        item: A two-element ``(x, y)`` pair. Both elements are wrapped in
            ``tg.Variable`` objects with ``requires_grad=False``.
        eval_fn: Evaluation callable. It is first called with the keyword
            argument ``inputs``; if that call, or converting its ``.value`` to
            ``int``, raises, it is called again with the positional list
            ``[x, y, response]`` and the result goes through
            ``eval_fn.parse_output``.
        model: Callable applied to the query variable to produce the response.

    Returns:
        The evaluation result converted to ``int``.
    """
    x, y = item
    x = tg.Variable(x, requires_grad=False, role_description="query to the language model")
    y = tg.Variable(y, requires_grad=False, role_description="correct answer for the query")
    response = model(x)

    try:
        eval_output_variable = eval_fn(inputs=dict(prediction=response, ground_truth_answer=y))
        return int(eval_output_variable.value)

    # `Exception` rather than a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate instead of triggering the fallback call.
    except Exception:
        eval_output_variable = eval_fn([x, y, response])
        eval_output_parsed = eval_fn.parse_output(eval_output_variable)

        return int(eval_output_parsed)
 
 
def eval_dataset(test_set, eval_fn, model, max_samples=None):
    """Score up to ``max_samples`` items of ``test_set`` with ``eval_sample``.

    Samples are evaluated on a two-worker thread pool while a tqdm bar shows
    the running mean accuracy.

    Args:
        test_set: Iterable of items accepted by ``eval_sample``. It must support
            ``len()`` when ``max_samples`` is ``None``.
        eval_fn: Evaluation callable forwarded to ``eval_sample``.
        model: Model callable forwarded to ``eval_sample``.
        max_samples: Maximum number of samples to evaluate. ``None`` means the
            whole set; ``0`` or a negative value evaluates nothing.

    Returns:
        List of per-sample results in completion order, which is not
        necessarily dataset order.
    """
    if max_samples is None:
        max_samples = len(test_set)

    accuracy_list = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = []
        # Guard up front: the cap check inside the loop only runs after a
        # submit, so without this a cap of 0 would still evaluate one sample.
        if max_samples > 0:
            for sample in test_set:
                futures.append(executor.submit(eval_sample, sample, eval_fn, model))
                if len(futures) >= max_samples:
                    break
        tqdm_loader = tqdm(concurrent.futures.as_completed(futures), total=len(futures), position=0)
        for future in tqdm_loader:
            # .result() re-raises any exception raised inside the worker
            accuracy_list.append(future.result())
            tqdm_loader.set_description(f"Accuracy: {np.mean(accuracy_list)}")

    return accuracy_list
 
 
def run_validation_revert(system_prompt: tg.Variable, results, model, eval_fn, val_set):
    """Validate the current prompt and roll it back if accuracy dropped.

    The mean accuracy on ``val_set`` is compared with the mean of the last
    entry in ``results["validation_acc"]``. When the new score is strictly
    lower, ``system_prompt`` is reset to the last entry of ``results["prompt"]``
    and the previous score is kept. The score that is kept is appended to
    ``results["validation_acc"]`` and returned.
    """
    candidate_score = np.mean(eval_dataset(val_set, eval_fn, model))
    baseline_score = np.mean(results["validation_acc"][-1])
    print("val_performance: ", candidate_score)
    print("previous_performance: ", baseline_score)
    baseline_prompt = results["prompt"][-1]

    if candidate_score < baseline_score:
        # Worse than before: report the rejected prompt and restore the old one.
        print(f"rejected prompt: {system_prompt.value}")
        system_prompt.set_value(baseline_prompt)
        kept_score = baseline_score
    else:
        kept_score = candidate_score

    results["validation_acc"].append(kept_score)
    return kept_score

In [None]:
# Seed NumPy and `random` before anything else in this cell runs.
set_seed(12)

# Engine passed to load_dataset as the evaluation engine and registered below
# as the backward engine.
llm_api_eval = tg.get_engine(engine_name="gpt-4o")

# Second engine; the model under optimization is built on it in a later cell.
llm_api_test = tg.get_engine(engine_name="gpt-3.5-turbo-0125")

tg.set_backward_engine(llm_api_eval, override=True)

# The splits are contiguous slices of the CSV in file order (see load_dataset);
# eval_fn is the evaluation function built on llm_api_eval.
train_set, val_set, test_set, eval_fn = load_dataset("./mkt_data.csv", llm_api_eval)

print("Train/Val/Test Set Lengths: ", len(train_set), len(val_set), len(test_set))

In [None]:
# Shuffled mini-batches of 3 training samples.
train_loader = DataLoader(train_set, batch_size=3, shuffle=True)

# NOTE(review): STARTING_SYSTEM_PROMPT is not defined in any of the cells
# visible above. It must be defined before this cell runs, otherwise a fresh
# kernel raises NameError here.
system_prompt = tg.Variable(STARTING_SYSTEM_PROMPT,
                            requires_grad=True,
                            role_description="system prompt to the language model")  

# Model on the evaluation engine, bound to the first `system_prompt` variable.
# NOTE(review): `model_evaluation` is not referenced again in the visible cells.
model_evaluation = tg.BlackboxLLM(llm_api_eval, system_prompt)

# Rebind `system_prompt` to a new variable: this is the one that `model` and
# the optimizer below operate on.
system_prompt = tg.Variable(STARTING_SYSTEM_PROMPT,
                            requires_grad=True,  
                            role_description="structured system prompt to a somewhat capable language model that specifies the behavior and strategies for the QA task")  # Role description of the variable

model = tg.BlackboxLLM(llm_api_test, system_prompt)

optimizer = tg.TextualGradientDescent(engine=llm_api_eval, parameters=[system_prompt])

# Baseline before any optimization step. The first "test_acc" and
# "validation_acc" entries are the per-sample lists returned by eval_dataset.
results = {"test_acc": [], "prompt": [], "validation_acc": []}
results["test_acc"].append(eval_dataset(test_set, eval_fn, model))
results["validation_acc"].append(eval_dataset(val_set, eval_fn, model))
results["prompt"].append(system_prompt.get_value())
 
 


In [None]:
# Prompt-optimization loop: 3 epochs, at most 4 optimizer steps per epoch
# (the inner loop breaks once `steps` reaches 3, after that step has run).
for epoch in range(3):
    for steps, (batch_x, batch_y) in enumerate((pbar := tqdm(train_loader, position=0))):
        pbar.set_description(f"Training step {steps}. Epoch {epoch}")
        optimizer.zero_grad()
        losses = []
        for (x, y) in zip(batch_x, batch_y):
            x = tg.Variable(x, requires_grad=False, role_description="query to the language model")
            y = tg.Variable(y, requires_grad=False, role_description="correct answer for the query")
            response = model(x)
            try:
                eval_output_variable = eval_fn(inputs=dict(prediction=response, ground_truth_answer=y))
            # `Exception` rather than a bare `except:` so interrupting the
            # kernel (KeyboardInterrupt) stops training instead of silently
            # triggering the fallback call.
            except Exception:
                eval_output_variable = eval_fn([x, y, response])
            losses.append(eval_output_variable)
        total_loss = tg.sum(losses)
        total_loss.backward()
        optimizer.step()

        # Restores the previous prompt if validation accuracy dropped.
        run_validation_revert(system_prompt, results, model, eval_fn, val_set)

        print("sys prompt: ", system_prompt)
        # The full test set is re-evaluated after every step; each entry is the
        # per-sample list returned by eval_dataset.
        test_acc = eval_dataset(test_set, eval_fn, model)
        results["test_acc"].append(test_acc)
        results["prompt"].append(system_prompt.get_value())
        if steps == 3:
            break
