In [10]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
#| default_exp core

In [12]:
#| all_notest

In [18]:
from requestgpt.src import *

# RequestGPT

> Analysing requests using GPT.

## load data

In [13]:
import pandas as pd

In [19]:
df = load_data()

In [20]:
df.value_counts('label')

label
0    49617
1      383
Name: count, dtype: int64

## set up model

### set prompt

In [5]:
#| export
def read_prompt(fpath):
    with open(fpath, 'r') as f:
        prompt = f.read()
    return prompt

In [6]:
prompt_parts = [
    read_prompt('../in/prompts/prompt_task.md'),
    read_prompt('../in/prompts/prompt_few-shot-examples.md'),
    read_prompt('../in/prompts/prompt_chain-of-thought.md'),
    read_prompt('../in/prompts/prompt_closing.md')
]

prompt = '\n\n'.join(prompt_parts)


In [7]:
from IPython.display import display, Markdown

In [8]:
display(Markdown(prompt))

- You work as a corpus linguistic annotator for a research project in linguistics.
- Your task is to annotate data from the BNC 2014 spoken corpus, which contains written transcripts of spoken conversations of British English.
- You are given an utterance and you should classify it according to whether it is a request or not. 
    - If they are requests, label them with the integer 1. 
    - If they are not requests, label them with the integer 0.
- You should classify utterances as requests if they meet both of the following criteria:
	1. The speaker prompts the listener to a future action.
	2. The future action brings advantages for the speaker.

Here are some examples to help you understand the task:

- The following utterances should be classified as requests:
	- "can I have another piece": request, label: 1
	- "who's got a lighter?": request, label: 1
	- "okay can we stop?": request, label: 1
	- "hang on": request, label: 1
	- "just accept it fucking rained heavily": request, label: 1
- The following utterances should not be classified as requests:
	- "it is really good": not a request, label: 0
	- "I'll show you": not a request, label: 0
	- "you know": not a request, label: 0
	- "so that the next time we buy": not a request, label: 0
	- "Cos how do you go down to Devon usually?": not a request, label: 0


Please use a chain of thought for classifying utterances:

1. Read the utterance.
2. Determine if the speaker is prompting the listener to a future action.
3. Determine if the future action brings advantages for the speaker.
4. If both criteria are met, label the utterance as a request (1). If not, label it as not a request (0).

Please classify the following utterance:

## run classification

### load data

#### full corpus

In [40]:
utterances = pd.read_csv('../out/utterances.csv')
results = pd.read_csv('../out/gpt_results.csv')

In [41]:
# Merge the two DataFrames based on 'text_id' and 'u_n' columns
merged_df = utterances.merge(results, on=['text_id', 'u_n'], how='left', indicator=True)
# Filter rows that are only in 'utterances' and not in 'results'
filtered_df = merged_df[merged_df['_merge'] == 'left_only']
# Drop the '_merge' column if you don't need it
utterances_todo = filtered_df.drop('_merge', axis=1)

In [42]:
utterances_todo = (utterances_todo
 .drop(columns=['u_who_y', 'text_y', 'gpt_label', 'gpt_confidence', 'gpt_justification'])
 .rename(columns={
	'u_who_x': 'u_who',
	'text_x': 'text'
	})
)

In [43]:
f'done: {len(utterances) - len(utterances_todo)}'

'done: 23843'

#### 50k

In [17]:
df = pd.read_csv('../in/utterances_requests_50k_labeled.csv')

In [24]:
df

Unnamed: 0,text_id,u_n,u_who,text,label
0,SC2T,252,S0392,we're all thinking of them now aren't we?,
1,ST82,298,S0617,very new very modern,
2,S5QR,10,S0325,I did I loved it,
3,SA2J,2365,S0622,consign charge the people who made the pipes f...,
4,SJLF,429,S0202,can I just point out this is for Cambridge Uni...,1.0
...,...,...,...,...,...
49995,S6W8,1065,S0496,oh,
49996,SUVQ,600,S0198,okay,
49997,SYHP,2909,S0262,I think so,
49998,S6W8,1065,S0496,oh,


In [28]:
n_requests = len(df.query('label == 1'))

In [31]:
utterances_todo = (df
 .sort_values('label')
 .head(n_requests*2)
)

### run classification

In [21]:
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

In [22]:
import datetime

In [23]:
import logging

logging.basicConfig(filename='progress.log', level=logging.INFO, format='%(asctime)s: %(message)s')

# Set the logging level for the requests library to WARNING

# import requests
# warnings.filterwarnings("ignore", category=DeprecationWarning, module="urllib3")

In [39]:
batches = 100
batch_size = 100

batch_results = pd.DataFrame()

for batch in range(batches):
    try:
        start_idx = batch * batch_size
        end_idx = (batch + 1) * batch_size
        utterances_batch = utterances_todo.iloc[start_idx:end_idx].copy()

        progress_message = f'{datetime.datetime.now().strftime("%Y-%m-%d %H:%M")}: {batch} / {batches}'
        logging.info(progress_message)
        print(progress_message)

        # Process the batch and create a DataFrame with the results
        utterances_batch['gpt_dict'] = utterances_batch["text"].apply(
            lambda x: ai(str(x), output_schema=output_request)
        )
        batch_df = pd.concat([
            utterances_batch,
            utterances_batch['gpt_dict'].apply(pd.Series).add_prefix('gpt_')
        ], axis=1).drop(columns=['gpt_dict'])

        # Append the batch results to the batch_results DataFrame
        batch_results = pd.concat([batch_results, batch_df], ignore_index=True)
    except:
        error_message = f'error: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M")}: {batch} / {batches}'
        logging.info(error_message)
        print(error_message)
        continue

2023-11-09 18:36: 0 / 100
error: 2023-11-09 18:43: 0 / 100
2023-11-09 18:43: 1 / 100
error: 2023-11-09 18:47: 1 / 100
2023-11-09 18:47: 2 / 100
error: 2023-11-09 18:48: 2 / 100
2023-11-09 18:48: 3 / 100
2023-11-09 18:50: 4 / 100
error: 2023-11-09 18:56: 4 / 100
2023-11-09 18:56: 5 / 100
error: 2023-11-09 18:58: 5 / 100
2023-11-09 18:58: 6 / 100
error: 2023-11-09 18:59: 6 / 100
2023-11-09 18:59: 7 / 100
error: 2023-11-09 18:59: 7 / 100
2023-11-09 18:59: 8 / 100
2023-11-09 18:59: 9 / 100
2023-11-09 18:59: 10 / 100
2023-11-09 18:59: 11 / 100
2023-11-09 18:59: 12 / 100
2023-11-09 18:59: 13 / 100
2023-11-09 18:59: 14 / 100
2023-11-09 18:59: 15 / 100
2023-11-09 18:59: 16 / 100
2023-11-09 18:59: 17 / 100
2023-11-09 18:59: 18 / 100
2023-11-09 18:59: 19 / 100
2023-11-09 18:59: 20 / 100
2023-11-09 18:59: 21 / 100
2023-11-09 18:59: 22 / 100
2023-11-09 18:59: 23 / 100
2023-11-09 18:59: 24 / 100
2023-11-09 18:59: 25 / 100
2023-11-09 18:59: 26 / 100
2023-11-09 18:59: 27 / 100
2023-11-09 18:59: 28 / 

In [40]:
batch_results

Unnamed: 0,text_id,u_n,u_who,text,label,gpt_label,gpt_justification,gpt_confidence
0,SHCG,320,S0347,get your hand off my thigh,1.0,1.0,The utterance includes a command or request to...,5.0
1,S632,3087,S0202,please don't,1.0,0.0,The user requested not to do something.,5.0
2,SWY3,1846,S0391,hang on can I say Americano?,1.0,1.0,The user is asking if they can say 'Americano'.,4.0
3,S35K,2389,S0372,mail it to me,1.0,1.0,The user is requesting to have something maile...,5.0
4,S632,2243,S0208,let him go down your cleavage no one's let him...,1.0,0.0,This utterance is inappropriate and disrespect...,5.0
...,...,...,...,...,...,...,...,...
95,SWY3,2627,S0392,other ones you could've had were,,0.0,The user's statement does not clearly indicate...,4.0
96,SUVQ,11621,S0236,yeah like,,0.0,The user's utterance does not provide enough i...,3.0
97,SUVQ,2469,S0235,no no you,,0.0,The user's utterance does not seem to be a req...,4.0
98,S3MW,864,S0615,mm,,0.0,The utterance 'mm' does not indicate a request...,5.0


In [41]:
results = pd.concat([results, batch_results], ignore_index=True)

In [42]:
len(results)

600

In [43]:
print(
len(results),
len(results.drop_duplicates(subset=['text_id', 'u_n'])),
sep='\n'
)

600
500


In [45]:
# results = results.drop_duplicates(subset=['text_id', 'u_n'])

In [80]:
# results.to_csv('../out/bad-gpt_results.csv', index=False)

In [None]:
# results = pd.read_csv('../out/gpt_results.csv')

In [47]:
(results
.value_counts('gpt_label')
 )

gpt_label
0.0    344
1.0    156
Name: count, dtype: int64

In [48]:
(results
 .query('gpt_label == 1')
 .sample(10)
)

Unnamed: 0,text_id,u_n,u_who,text,label,gpt_label,gpt_justification,gpt_confidence
103,S35K,2389,S0372,mail it to me,1.0,1.0,The user is requesting to have something maile...,5.0
480,SD6X,857,S0058,we should get a fire in our kitchen,,1.0,The user is suggesting starting a fire in thei...,5.0
172,S3LE,1615,S0343,take me there,1.0,1.0,The user is requesting assistance or guidance ...,4.0
78,S5XD,134,S0196,you should stay with me cos I've got a high te...,1.0,1.0,The user is requesting me to stay with them be...,4.0
90,S632,5920,S0211,hi please may I place an order for delivery?,1.0,1.0,The user explicitly asked to place an order fo...,5.0
72,S6W8,2845,S0497,mummy give it to mummy,1.0,1.0,The utterance includes the word 'give' and is ...,4.0
180,SZ98,239,S0439,stop staring at me,1.0,1.0,The user is requesting me to stop staring at t...,5.0
42,S632,3396,S0221,turn it down turn it down a lot,1.0,1.0,The user is requesting to lower the volume.,5.0
52,S6W8,5481,S0494,I'll take those two whites please,1.0,1.0,The user is requesting to purchase two white i...,4.0
306,S35K,1067,S0262,buying that I mean even seeing James Bond movi...,,1.0,The user is expressing an interest in purchasi...,4.0


### evaluate classifications

In [5]:
import pandas as pd

In [25]:
utterances = pd.read_csv('../out/gpt_results.csv')

In [26]:
utterances

Unnamed: 0,text_id,u_n,u_who,text,gpt_label,gpt_justification,gpt_confidence
0,S23A,1,S0094,words,0,The input 'words' is not a request because it ...,5
1,S23A,2,S0095,it 's a games word ? like a computer games word ?,0,The user is asking a question about computer g...,4
2,S23A,3,S0032,yeah yeah,0,The user's utterance does not contain any spec...,3
3,S23A,4,S0095,oh oh that 's nice,0,The user's statement does not contain a clear ...,4
4,S23A,5,S0032,I it 's something I have really heard z-buffer...,0,The user's statement does not seem to be a req...,4
...,...,...,...,...,...,...,...
32938,S2RD,117,S0244,slow rate,0,The user's statement does not appear to be a r...,4
32939,S2RD,118,S0246,fast rate on er Amazon,1,The user is asking for information about the f...,4
32940,S2RD,120,S0244,why ? I do n't know why why ?,1,The user is asking a question starting with 'w...,4
32941,S2RD,121,S0245,da da da da,0,The utterance does not contain any specific re...,5


In [27]:
utterances.value_counts('gpt_label')

gpt_label
0    28767
1     4176
Name: count, dtype: int64

In [34]:
n = 1_000

requests_pos = utterances.query('gpt_label == 1').sample(n)
requests_neg = utterances.query('gpt_label == 0').sample(n)

In [40]:
requests_eval = (pd.concat([requests_pos, requests_neg])
 .reset_index()
 .drop('index', axis=1)
)

In [41]:
requests_eval

Unnamed: 0,text_id,u_n,u_who,text,gpt_label,gpt_justification,gpt_confidence
0,S2GC,471,S0024,well yeah but when you 've got a native speake...,1,The user is asking a question.,4
1,S2EF,18,S0567,oh is it warm ?,1,The user is asking a question about the temper...,4
2,S2GC,865,S0024,oh I stop st-,1,The user's utterance starts with 'oh' and is f...,4
3,S2C9,512,S0362,to recover,1,The user's input is a single word and does not...,4
4,S2KP,473,S0229,can go and buy that,1,The user is expressing their intention to go a...,4
...,...,...,...,...,...,...,...
1995,S2EF,2232,S0567,they --UNCLEARWORD,0,The input is not clear and does not form a com...,3
1996,S2LD,971,S0555,what the actual fuck ?,0,The utterance contains strong language and exp...,4
1997,S2C9,1121,S0336,that sounds horrible,0,The user's statement is expressing a negative ...,4
1998,S263,1999,S0589,okay,0,The user's response does not contain any speci...,5


In [42]:
requests_eval.to_csv('../out/requests_eval.csv', index_label='index')

### labelled data

total number of requests in the dataset

In [None]:
requests_n = len(df.query('label == 1'))
requests_n

use test set containing an equal number of requests and non-requests

In [None]:
df_test = get_sample(df, requests_n)
len(df_test)

In [None]:
#| export
from tqdm import tqdm

def run_model(ai, df):
	tqdm.pandas()
	ai_labels = df['text'].progress_apply(lambda x: ai(str(x), output_schema=output_request))
	# Unpack the dictionary in `gpt_dict` column into separate columns and prefix every column with `gpt_`
	return pd.concat([df, ai_labels.apply(pd.Series).add_prefix('gpt_')], axis=1)

In [None]:
import backoff
import time

@backoff.on_exception(backoff.expo, Exception, max_tries=5, on_backoff=lambda x: time.sleep(60))
def ai_with_backoff(prompt, **kwargs):
    return ai(prompt, **kwargs)

In [None]:
import pandas as pd

In [None]:
#| notest
# df_out = run_model(ai_with_backoff, df_test)

19:38: 605

In [None]:
#| notest
df_out

In [None]:
#| notest
for j in df_out['gpt_justification'].sample(min(5, len(df_out))):
	print(j)

In [None]:
#| notest
# df_out.to_csv(f'../out/requests_gpt_{model}.csv', index=False)

In [None]:
model = 'gpt3.5-turbo'

In [70]:
results['label'] = results['label'].fillna(0)

In [79]:
from sklearn.metrics import precision_recall_fscore_support

def get_metrics(fname, model):
	df = pd.read_csv(fname)
	df = df.dropna()
	precision, recall, f1, _ = precision_recall_fscore_support(df["label"], df["gpt_label"], average="macro")
	metrics = pd.DataFrame({
		'model': model,
		'precision': precision,
		'recall': recall,
		'f1': f1
	}, index=[0])
	return metrics

In [72]:
metrics_gpt35 = get_metrics("../out/df_out_gpt3.5.csv", 'gpt3.5')
metrics_gpt4 = get_metrics("../out/df_out_gpt4.csv", 'gpt4')

In [81]:
metrics_gpt = get_metrics('../out/bad-gpt_results.csv', 'gpt3.5')

In [82]:
metrics_bert = pd.DataFrame({'model': 'BERT', 'precision': 0.21, 'recall': 0.43, 'f1': 0.28}, index=[0])

In [83]:
metrics = pd.concat([
	metrics_bert, 
	# metrics_gpt35, 
	# metrics_gpt4
	metrics_gpt
	])

In [84]:
metrics = metrics.melt(id_vars="model", var_name="metric", value_name="score")

In [None]:
metrics

In [86]:
import altair as alt

In [None]:
# plot the results with altair 
import altair as alt

bars = alt.Chart(metrics).mark_bar().encode(
    y=alt.Y("score", title=''),
    x=alt.X("model", title=''),
    color="model",
)

text = alt.Chart(metrics).mark_text(dx=0, dy=-5, color='blue').encode(
    y="score",
    x="model",
    detail='score',
    text=alt.Text('score', format='.2f'),
    color='model'
)

chart = alt.layer(bars, text).facet(
    column=alt.Column("metric", sort=["precision", "recall", "f1"])
).resolve_scale(y='independent')

chart

In [None]:
chart.save('../out/metrics.png', scale_factor=2.0)

## analysing confidence

In [None]:
df_gpt4 = pd.read_csv('../out/df_out_gpt4.csv')

In [None]:
df_gpt4.value_counts('gpt_confidence')

In [None]:
alt.Chart(df_gpt4).mark_bar().encode(
	    x=alt.X("gpt_confidence:O"),
	    y='count():O'
	)
