In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp src

# Core functions

## load data

In [None]:
#| export
import pandas as pd

In [None]:
#| export
def load_data(fp='../in/utterances_requests_50k_labeled.csv'):
	"""Read the labeled utterances CSV and normalize its `label` column.

	Args:
		fp: Path of the CSV file to read. Defaults to the 50k labeled
			utterances file under `../in/`.

	Returns:
		A DataFrame with the file's contents, where `label` has been
		converted to integers: missing values become `0` and float
		labels such as `1.0` become `1`.
	"""
	# Read from the path the caller passed, not a hard-coded location
	df = pd.read_csv(fp)
	# Fill missing labels with 0 first so the integer cast cannot fail on NaN
	df["label"] = df["label"].fillna(0).astype(int)
	return df

### get data sample

In [None]:
#| export
def get_sample(df, n, random_state=None):
	"""Draw an equally sized random sample of rows for each label (0 and 1).

	Args:
		df: DataFrame with a `label` column.
		n: Size parameter; `int(n / 3) + 1` rows are drawn per label, so the
			result has `2 * (int(n / 3) + 1)` rows rather than exactly `n`.
		random_state: Optional seed (or random state object) forwarded to
			`DataFrame.sample` so that a sample can be reproduced. The default
			`None` keeps the sampling unseeded.

	Returns:
		A DataFrame with the label-0 rows first, followed by the label-1 rows.
		Original index values are preserved.

	Raises:
		ValueError: raised by `DataFrame.sample` when a label has fewer rows
			than the requested per-label sample size.
	"""
	# NOTE(review): divides by 3 although only two labels are sampled —
	# confirm whether n / 2 was intended; left unchanged to keep sample sizes.
	rows_per_label = int(n / 3) + 1
	return pd.concat([
		df.query(f'label == {label}').sample(rows_per_label, random_state=random_state)
		for label in [0, 1]
	])

## set up GPT

### get API key

In [None]:
#| export
from dotenv import load_dotenv
import os

In [None]:
#| notest
#| export
def get_api_key():
	"""Return the value of the `OPENAI_API_KEY` environment variable.

	`load_dotenv()` is called first so that variables from the `.env` file
	are loaded before the lookup. Returns `None` when the variable is not set.
	"""
	load_dotenv()
	return os.environ.get("OPENAI_API_KEY")

### specify output format

In [None]:
#| export
from pydantic import BaseModel, Field

# Schema passed as `output_schema=output_request` when the model is called in `run_model`.
# NOTE(review): pydantic includes the class docstring and each Field `description`
# in the generated JSON schema, so this text is presumably sent to the model —
# treat it as prompt text rather than plain documentation; confirm before rewording.
class output_request(BaseModel):
    """output schema for request"""
    # Single integer field; the meaning of each value is given in its description.
    label: int = Field(description="`0` if not a request, `1` if a request")

### run model

In [None]:
#| export
from tqdm import tqdm

def run_model(ai, df):
	"""Apply `ai` to every entry of `df["text"]` and append the results as `gpt_*` columns.

	Args:
		ai: Callable invoked as `ai(str(text), output_schema=output_request)`
			for each row. Presumably returns a dict-like result matching
			`output_request` — TODO confirm against the client in use.
		df: DataFrame with a `text` column.

	Returns:
		`df` concatenated column-wise with the per-row results, where each
		result has been expanded through `pd.Series` and every resulting
		column name is prefixed with `gpt_`.
	"""
	# Registers `progress_apply` on pandas objects so a progress bar is shown
	tqdm.pandas()

	def _ask(text):
		return ai(str(text), output_schema=output_request)

	responses = df["text"].progress_apply(_ask)
	# One column per field of the response, named `gpt_<field>`
	response_columns = responses.apply(pd.Series).add_prefix('gpt_')
	return pd.concat([df, response_columns], axis=1)