In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#| default_exp src

In [3]:
#| export
import pandas as pd

# Core functions

In [9]:
#| export
def load_data(fp: str='../in/utterances_requests_50k_labeled.csv'):
	"""Read the labeled utterance CSV at `fp` and return it with an integer `label` column.

	Missing labels are treated as `0`; the remaining values are cast to `int`
	(so `1.0` becomes `1`).
	"""
	utterances = pd.read_csv(fp)
	filled_labels = utterances['label'].fillna(0)
	utterances['label'] = filled_labels.astype(int)
	return utterances

In [10]:
# Load the full labeled dataset from load_data's default path.
df = load_data()

In [11]:
# Class balance: how many utterances carry each label.
df.value_counts('label')

label
0    49617
1      383
Name: count, dtype: int64

### get data sample

In [5]:
#| export
def get_sample(df, requests_n: int, random_state: int = 23):
	"""Draw a class-balanced sample: `requests_n` requests plus `requests_n` non-requests.

	Args:
		df: frame with a `label` column where `1` marks a request and `0` a non-request.
		requests_n: number of rows to draw from *each* class.
		random_state: seed handed to `DataFrame.sample` for both draws; the default
			of 23 reproduces the previously hard-coded behaviour.

	Returns:
		A DataFrame of `2 * requests_n` rows, requests first, keeping the
		index labels the rows had in `df`.

	Raises:
		ValueError: raised by pandas when either class holds fewer than
			`requests_n` rows (sampling is done without replacement).
	"""
	requests = df.query('label == 1').sample(requests_n, random_state=random_state)
	non_requests = df.query('label == 0').sample(requests_n, random_state=random_state)
	return pd.concat([requests, non_requests])

## set up GPT

### get API key

In [6]:
#| export
from dotenv import load_dotenv
import os

In [7]:
#| notest
#| export
def get_api_key():
	"""Return the `OPENAI_API_KEY` environment variable, or `None` when it is unset.

	`load_dotenv()` runs first so that variables kept in a `.env` file are
	loaded into the process environment before the lookup.
	"""
	load_dotenv()
	return os.environ.get("OPENAI_API_KEY")

### specify output format

In [9]:
#| export
from pydantic import BaseModel, Field

class output_request(BaseModel):
    """output schema for request"""
    # Structured result of one classification; passed to the model call as
    # `output_schema=output_request`.
    # NOTE(review): the class docstring and the `description` strings below are
    # presumably forwarded to the model as part of the schema — treat them as
    # prompt text and confirm before rewording any of them.
    # Binary classification result: 1 = request, 0 = not a request.
    label: int = Field(description='`0` if not a request, `1` if a request')
    # Free-text rationale for the chosen label.
    justification: str = Field(description='justification for why or why not the given utterance was classified as a request')
    # Self-reported confidence from 1 (not confident) to 5 (very confident); the
    # range is only stated in the description text, no validator enforces it.
    confidence: int = Field(description='''
        give a score for how confident you are in your classification of this utterance as a request/non-request,
        on a scale of 1 to 5, where 1 is not confident at all and 5 is very confident''')

### run model

In [10]:
#| export
from tqdm import tqdm

def run_model(ai, df):
	"""Classify every utterance in `df['text']` with `ai` and append the answers as `gpt_*` columns.

	`ai` is called once per row as `ai(str(text), output_schema=output_request)`.
	Each returned record is expanded into one column per key, every new column
	is prefixed with `gpt_`, and the result is joined column-wise onto `df`.
	"""
	def classify(utterance):
		return ai(str(utterance), output_schema=output_request)

	# Registers `progress_apply` on pandas objects so the calls show a progress bar.
	tqdm.pandas()
	responses = df['text'].progress_apply(classify)
	gpt_columns = responses.apply(pd.Series).add_prefix('gpt_')
	return pd.concat([df, gpt_columns], axis=1)

#### incrementally

In [11]:
from simpleaichat import AIChat

In [None]:
#| notest
# Chat model used for the classification; swap in the commented line to use the alternative.
model = 'gpt-3.5-turbo'
# model = 'gpt-4'

# NOTE(review): `prompt` is not defined in any cell visible in this notebook —
# it must already exist in the kernel before this cell runs; confirm where it is set.
ai = AIChat(
    model,
    system=prompt,
    console=False,
    save_messages=False,
    params={"temperature": 0.0},
)

In [65]:
# Number of utterances labeled as requests (label == 1); reused below as the per-class sample size.
requests_n = len(df.query('label == 1'))
requests_n

383

In [66]:
# Balanced evaluation sample: requests_n requests plus requests_n non-requests.
df_test = get_sample(df, requests_n)
len(df_test)

766

In [69]:
from tqdm import tqdm

In [78]:
import os
import pandas as pd
from tqdm import tqdm

def run_model_incremental(ai, df, batch_size=100, results_file="intermediate_results.csv"):
    """Classify `df['text']` batch by batch, checkpointing to `results_file` after every batch.

    If `results_file` already exists, its rows are taken to be the results for
    the first rows of `df` and processing resumes with the next row.
    NOTE(review): nothing verifies that the checkpoint was produced from the
    same `df` in the same row order — the caller has to guarantee that.

    Args:
        ai: callable invoked as `ai(str(text), output_schema=output_request)`;
            each return value becomes one record of the `gpt_*` columns.
        df: frame with a `text` column; rows are processed in positional order.
        batch_size: rows classified between two checkpoint writes (must be >= 1).
        results_file: CSV checkpoint path; rewritten in full after each batch.

    Returns:
        DataFrame holding all processed rows of `df` followed by the model
        output columns prefixed with `gpt_`, indexed 0..n-1.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Pick up results of an earlier, interrupted run if there are any.
    processed_df = pd.DataFrame()
    if os.path.exists(results_file):
        try:
            processed_df = pd.read_csv(results_file)
        except pd.errors.EmptyDataError:
            # A zero-byte checkpoint holds no progress; start from scratch.
            processed_df = pd.DataFrame()
    n_done = 0 if processed_df.empty else len(processed_df)

    # Nothing left to do: everything is already in the checkpoint.
    remaining = len(df) - n_done
    if remaining <= 0:
        return processed_df

    # Ceiling division, so an exact multiple of batch_size is not over-counted by one.
    total_batches = (remaining + batch_size - 1) // batch_size

    with tqdm(total=total_batches, desc="Overall Progress") as pbar_outer:
        for start in range(n_done, len(df), batch_size):
            # Positional index so the model output lines up row by row below.
            batch = df.iloc[start:start + batch_size].reset_index(drop=True)

            # Inner progress bar for the individual model calls of this batch.
            ai_labels_list = [
                ai(str(text), output_schema=output_request)
                for text in tqdm(batch['text'], desc="Batch Progress", leave=False)
            ]
            ai_labels_df = pd.DataFrame(ai_labels_list).add_prefix('gpt_')
            result_batch = pd.concat([batch, ai_labels_df], axis=1)

            # Keep one continuous 0..n-1 index, matching what a reload of the
            # checkpoint yields, and avoid concatenating onto an empty frame.
            if processed_df.empty:
                processed_df = result_batch
            else:
                processed_df = pd.concat([processed_df, result_batch], axis=0, ignore_index=True)
            processed_df.to_csv(results_file, index=False)

            pbar_outer.update(1)

    return processed_df

In [None]:
# Classify the balanced sample; the checkpoint CSV in ../out is rewritten after every batch of 10 utterances.
results = run_model_incremental(ai, df_test, batch_size=10, results_file="../out/intermediate_results.csv")

In [None]:
# Display the classified sample (original columns plus the gpt_* model outputs).
results

## find few-shot examples

In [None]:
# Reload the full labeled dataset to browse it for example utterances.
df = load_data()

In [None]:
# Eyeball 7 randomly drawn non-request utterances; `.sample` is unseeded here,
# so every run prints a different set of rows.
for t in df.query('label == 0').sample(7)['text']:
	print(t)

okay Barcelona to Brussels I didn't do
mm
hmm
satellite
mm
Cos how do you go down to Devon usually?
and they they'd cut they snipped it?
