In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from requestgpt.src import *

In [2]:
import pandas as pd
from IPython.display import display, Markdown
from tqdm.notebook import tqdm

# read data

In [33]:
# Load the labeled utterances and show how many rows each label value has.
labeled_csv = '../in/utterances_requests_50k_labeled.csv'
df = load_data(fp=labeled_csv)

df.value_counts('label')

label
0    49617
1      383
Name: count, dtype: int64

## Use a smaller, balanced DataFrame: 50% requests, 50% non-requests

In [4]:
# Count the rows with label == 1, then keep the first 2 * n_requests rows
# after sorting by label in descending order (label-1 rows come first).
is_request = df['label'] == 1
n_requests = int(is_request.sum())

df_sorted = df.sort_values('label', ascending=False)
df = df_sorted.iloc[:n_requests * 2]

df.value_counts('label')

label
0    383
1    383
Name: count, dtype: int64

# set up Model

In [37]:
from typing import Optional

from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
    create_openai_fn_runnable,
    create_structured_output_runnable,
)
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

In [38]:
from langchain.pydantic_v1 import BaseModel, Field

class UtteranceClassification(BaseModel):
    """Identifying whether the utterance is a request or not."""
    # NOTE(review): the docstring above is kept verbatim on purpose; it is
    # presumably forwarded to the model as the function description, so
    # editing it could change the model's output.

    # Label encoding must match the gold `label` column, where 1 marks a
    # request and 0 a non-request (the balancing cell counts requests with
    # `label == 1`, and the evaluation compares `label` to `gpt_label` directly).
    # The previous description had the two values swapped.
    gpt_label: int = Field(..., description="Whether this utterance is a request (1) or not a request (0)")
    # Free-text rationale for the decision.
    gpt_justification: str = Field(..., description="Why you decided that it is a request or not a request.")
    # Self-reported confidence on a 1-5 ordinal scale.
    gpt_confidence: int = Field(..., description="How confident you (ChatGPT) are in your decision, on an ordinal scale from 1 (very inconfident) to 5 (very confident)")

In [46]:
# Chat model used for classification (temperature fixed at 0).
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    # model="gpt-4",
    temperature=0,
)

# The system messages are read from the prompt files in this order; the
# utterance to classify fills the {input} slot of the final human message.
system_prompt_paths = [
    '../in/prompts/prompt_task.md',
    '../in/prompts/prompt_few-shot-examples.md',
    '../in/prompts/prompt_chain-of-thought.md',
]
messages = [("system", read_prompt(path)) for path in system_prompt_paths]
messages.append(("human", 'Please classify the following utterance: {input}'))

prompt = ChatPromptTemplate.from_messages(messages)

# Chain the prompt and model so that the answer is parsed into the
# UtteranceClassification schema.
runnable = create_structured_output_runnable(
    UtteranceClassification, llm, prompt)


In [120]:
# All utterances, plus the classifications already written by earlier runs.
utterances_all = load_data()

# Column layout of results_temp.csv. The classification loop appends rows in
# exactly this order and without a header row, so the file must already
# start with this header for read_csv to name the columns correctly.
results_temp_columns = [
    'text_id', 'u_n', 'u_who', 'text', 'label',
    'gpt_label', 'gpt_justification', 'gpt_confidence',
]

try:
    utterances_done = pd.read_csv('../out/results_temp.csv')
except (FileNotFoundError, pd.errors.EmptyDataError):
    # First run (no file yet, or a zero-byte file): nothing is done so far.
    # Write a header-only file so that rows appended later are readable
    # when the notebook is resumed.
    utterances_done = pd.DataFrame(columns=results_temp_columns)
    utterances_done.to_csv('../out/results_temp.csv', index=False)

In [121]:
# Anti-join: keep the utterances whose (text_id, u_n) key has no result yet.
# Only the de-duplicated key columns of the finished results take part in the
# merge. This way no `_x` / `_y` suffixes are created (so no column names
# have to be patched afterwards), no all-NaN gpt_* columns leak into the todo
# frame, and duplicate result rows cannot multiply rows during the merge.
key_cols = ['text_id', 'u_n']
done_keys = utterances_done[key_cols].drop_duplicates()

utterances_todo = (utterances_all
 .merge(done_keys, on=key_cols, how='left', indicator=True)
 .loc[lambda merged: merged['_merge'] == 'left_only']
 .drop(columns='_merge')
)

In [122]:
# Progress overview. The `todo` count is taken from the frame the
# classification loop actually iterates over; subtracting the two lengths
# would be off whenever the results file holds duplicate rows or rows that
# are not part of `utterances_all`.
pd.DataFrame(
    data=[
        ['all', len(utterances_all)],
        ['done', len(utterances_done)],
        ['todo', len(utterances_todo)],
    ],
    columns=['subset', 'n_rows'],
)

Unnamed: 0,0,1
0,all,50000
1,done,49160
2,todo,840


In [None]:
# Classify every pending utterance. Each finished row is kept in memory and
# also appended to the temp results file immediately (no header, no index),
# presumably so that an interrupted run can be resumed from that file.
results_list = []
passthrough_cols = ["text_id", "u_n", "u_who", "text", "label"]

for _, row in tqdm(utterances_todo.iterrows(), total=len(utterances_todo)):
    classification = runnable.invoke({"input": row['text']})

    # Original columns first, then the model's structured answer.
    record = {col: row[col] for col in passthrough_cols}
    record.update(
        gpt_label=classification.gpt_label,
        gpt_justification=classification.gpt_justification,
        gpt_confidence=classification.gpt_confidence,
    )

    results_list.append(record)
    pd.DataFrame([record]).to_csv('../out/results_temp.csv', mode='a', header=False, index=False)

In [None]:
# Turn the rows collected by the classification loop into one frame and show it.
results_df = pd.DataFrame(data=results_list)
results_df

In [17]:
# Write the collected classifications to disk, leaving out the index column.
results_csv = '../out/langchain_gpt3.5_766.csv'
results_df.to_csv(results_csv, index=False)

In [18]:
# Spot-check: print the justification text of five randomly sampled results.
sampled_justifications = results_df.sample(5)['gpt_justification']
for justification in sampled_justifications:
    print(justification)

The speaker is prompting the listener to provide another Coke, which brings advantages for the speaker.
The speaker is prompting the listener to put something on the plates, which implies a future action. This action could bring advantages for the speaker, such as convenience or organization.
The utterance 'and stuff' does not prompt the listener to a future action and does not bring advantages for the speaker. Therefore, it is not a request.
The utterance does not prompt the listener to a future action or bring advantages for the speaker.
The speaker is prompting the listener to provide something ('what') in the future.


In [None]:
# Concatenate the template of every prompt message (each followed by a
# newline) and render the result as Markdown.
prompt_txt = ''.join(
    message.prompt.template + '\n' for message in prompt.messages
)

display(Markdown(prompt_txt))

# Evaluation

In [19]:
from sklearn.metrics import precision_recall_fscore_support

In [20]:
def get_metrics(fname, model):
    """Compute macro-averaged precision, recall and F1 for a results CSV.

    Parameters
    ----------
    fname : str or path-like
        CSV file that contains at least the gold ``label`` column and the
        predicted ``gpt_label`` column.
    model : str
        Name stored in the ``model`` column of the returned frame.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with the columns ``model``, ``precision``,
        ``recall`` and ``f1``.
    """
    results = pd.read_csv(fname)
    # Only rows without a gold or predicted label are unusable. A plain
    # dropna() would also discard rows that are missing any other field
    # (for example an utterance text that read_csv parses as NaN), which
    # silently removes labeled rows from the evaluation.
    results = results.dropna(subset=["label", "gpt_label"])
    precision, recall, f1, _ = precision_recall_fscore_support(
        results["label"], results["gpt_label"], average="macro"
    )
    return pd.DataFrame({
        'model': model,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }, index=[0])

In [21]:
# Score the saved GPT-3.5 classifications against the gold labels.
metrics_gpt = get_metrics(
    fname='../out/langchain_gpt3.5_766.csv',
    model='gpt3.5',
)

In [22]:
# Hard-coded scores for the BERT baseline (their source is not shown in this
# notebook), stored as a one-row frame with the same layout as get_metrics().
bert_scores = {'model': 'BERT', 'precision': 0.21, 'recall': 0.43, 'f1': 0.28}
metrics_bert = pd.DataFrame([bert_scores], index=[0])

In [None]:
# Stack the scores of both models, then reshape to long format so that each
# row holds one (model, metric, score) triple.
metrics_wide = pd.concat([metrics_bert, metrics_gpt])
metrics = pd.melt(
    metrics_wide,
    id_vars="model",
    var_name="metric",
    value_name="score",
)
metrics

## plot metrics

In [24]:
import altair as alt

In [None]:
# Bar chart of the scores per model with the value printed above each bar,
# faceted into one column per metric.
import altair as alt

# Both layers draw from the same long-format metrics table.
base = alt.Chart(metrics)

bars = base.mark_bar().encode(
    x=alt.X("model", title=''),
    y=alt.Y("score", title=''),
    color="model",
)

text = base.mark_text(dx=0, dy=-5, color='blue').encode(
    x="model",
    y="score",
    detail='score',
    text=alt.Text('score', format='.2f'),
    color='model'
)

# Layer the labels over the bars; every facet gets its own y scale.
layered = alt.layer(bars, text)
chart = layered.facet(
    column=alt.Column("metric", sort=["precision", "recall", "f1"])
).resolve_scale(y='independent')

chart

In [26]:
# chart.save('../out/metrics_langchain_gpt3.5.png', scale_factor=2.0)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [31]:
# Rows where the GPT label differs from the gold label.
label_differs = results_df['label'] != results_df['gpt_label']
mismatches = results_df[label_differs]
mismatches

Unnamed: 0,text_id,u_n,u_who,text,label,gpt_label,gpt_justification,gpt_confidence
1,S6BR,401,S0474,just press unlock first and then up yeah hold ...,1,0,"The speaker is giving instructions, but there ...",0
6,S632,5575,S0220,you keep that shit away from my shit though,1,0,This utterance is not a request because the sp...,0
14,SAZX,1895,S0600,pardon?,1,0,The utterance 'pardon?' is not a request becau...,0
15,S632,3087,S0202,please don't,1,0,The utterance does not prompt the listener to ...,1
16,S632,4983,S0202,let me see,1,0,The utterance 'let me see' is not a request be...,0
...,...,...,...,...,...,...,...,...
678,SYHP,2781,S0261,has it got er cinnamon in it?,0,1,The speaker is asking if the item has cinnamon...,0
680,SDS7,53,S0439,oh okay so what do we have going on? we have n...,0,1,The speaker is prompting the listener to provi...,0
711,SY5K,598,S0651,actually some government ought to come in and ...,0,1,The speaker is suggesting that a government sh...,0
722,S6W8,491,S0492,you got her?,0,1,The speaker is asking if the listener has some...,0


In [32]:
# Save the disagreeing rows; the frame's index is written as the first column.
mismatches_csv = '../out/mismatches.csv'
mismatches.to_csv(mismatches_csv)