# utterance classification

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp classification

In [None]:
#| export
import pandas as pd

from langchain.pydantic_v1 import BaseModel, Field
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains.openai_functions import create_structured_output_runnable

from tqdm.notebook import tqdm

# read data

In [None]:
#| export
def read_utterances(fpath: str='../in/utterances_train.csv') -> pd.DataFrame:
	"""Load the utterance table from a CSV file and normalise its `label` column.

	Args:
		fpath: Path of the CSV file to read. It must contain a `label` column.

	Returns:
		The parsed DataFrame, with missing `label` values replaced by `0`
		and the column cast to `int`.
	"""
	# An empty `label` cell marks a non-request, which is encoded as `0`.
	return pd.read_csv(fpath).assign(
		label=lambda frame: frame['label'].fillna(0).astype(int)
	)

In [None]:
# Load the utterances from the default CSV path and display the resulting frame.
utterances = read_utterances()
utterances

Unnamed: 0,text_id,u_n,u_who,text,label
0,SC2T,252,S0392,we're all thinking of them now aren't we?,0
1,ST82,298,S0617,very new very modern,0
2,S5QR,10,S0325,I did I loved it,0
3,SA2J,2365,S0622,consign charge the people who made the pipes f...,0
4,SJLF,429,S0202,can I just point out this is for Cambridge Uni...,1
...,...,...,...,...,...
49995,S6W8,1065,S0496,oh,0
49996,SUVQ,600,S0198,okay,0
49997,SYHP,2909,S0262,I think so,0
49998,S6W8,1065,S0496,oh,0


In [None]:
# Count how many rows carry each `label` value (0 = non-request after the fill in `read_utterances`).
utterances.value_counts('label')

label
0    49617
1      383
Name: count, dtype: int64

# set up model

In [None]:
class UtteranceClassification(BaseModel):
    """Identifying whether the utterance is a request or not."""
    # NOTE(review): the docstring above and the Field descriptions below are
    # presumably forwarded to the model as the structured-output schema, so
    # they are prompt text rather than plain documentation; edit with care.

    # Predicted class. The coding must match the gold `label` column of the
    # utterance data, where 1 marks a request and 0 marks a non-request.
    gpt_label: int = Field(..., description="Whether this utterance is a request (1) or not a request (0)")
    # Free-text rationale the model gives for its predicted label.
    gpt_justification: str = Field(..., description="Why you decided that it is a request or not a request.")
    # Self-reported confidence on a 1-5 ordinal scale.
    gpt_confidence: int = Field(..., description="How confident you (ChatGPT) are in your decision, on an ordinal scale from 1 (very inconfident) to 5 (very confident)")

In [None]:
def read_prompt(fpath: str, encoding: str = 'utf-8') -> str:
	"""Return the full text of a prompt file.

	Args:
		fpath: Path of the prompt file to read.
		encoding: Text encoding used to decode the file. Defaults to UTF-8 so
			the result does not depend on the platform's locale encoding.

	Returns:
		The complete file content as a single string.

	Raises:
		OSError: If the file cannot be opened.
		UnicodeDecodeError: If the file is not valid in the given encoding.
	"""
	# An explicit encoding keeps non-ASCII prompt text (quotes, dashes, accents)
	# intact on systems whose default locale encoding is not UTF-8.
	with open(fpath, encoding=encoding) as f:
		return f.read()

In [None]:
# Chat model used for every classification call; temperature is fixed at 0.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# Three system messages are loaded from disk, in this order: task description,
# few-shot examples, chain-of-thought instructions. The final human message
# carries the utterance through the `{input}` template variable.
prompt = ChatPromptTemplate.from_messages(
    [
        *(
            ("system", read_prompt(prompt_path))
            for prompt_path in (
                '../in/prompts/prompt_task.md',
                '../in/prompts/prompt_few-shot-examples.md',
                '../in/prompts/prompt_chain-of-thought.md',
            )
        ),
        ("human", 'Please classify the following utterance: {input}'),
    ]
)

# Combine schema, model and prompt; invoking the result yields an
# `UtteranceClassification`-shaped object (its fields are read in the loop below).
runnable = create_structured_output_runnable(
    UtteranceClassification,
    llm,
    prompt,
)


# run classification

In [None]:
#| notest

# One record per classified utterance, kept in memory for the final DataFrame.
results_list = []

for _, row in tqdm(utterances.iterrows(), total=len(utterances)):
    # Classify the utterance text with the structured-output runnable.
    result = runnable.invoke({"input": row['text']})

    # Source columns come first, then the model's fields, so every CSV line
    # written below has the same column order.
    row_data = {
        column: row[column]
        for column in ("text_id", "u_n", "u_who", "text", "label")
    }
    row_data.update(
        gpt_label=result.gpt_label,
        gpt_justification=result.gpt_justification,
        gpt_confidence=result.gpt_confidence,
    )
    results_list.append(row_data)

    # Checkpoint each record as soon as it is available. The file is opened in
    # append mode and no header is written, so rows from earlier runs stay in
    # the file and the column names are not stored in it.
    pd.DataFrame([row_data]).to_csv(
        '../out/results_temp.csv',
        mode='a',
        header=False,
        index=False,
    )

In [None]:
# Assemble the per-utterance records gathered in `results_list` into one DataFrame.
results_df = pd.DataFrame(results_list)