In [None]:
#| default_exp core

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from ergativegpt.src import *

In [None]:
import pandas as pd

from langchain.chains.openai_functions import create_structured_output_runnable
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

from tqdm.notebook import tqdm

from sklearn.metrics import precision_recall_fscore_support,  accuracy_score

import altair as alt

from datetime import datetime

In [None]:
# Read the input workbook; later cells use its 'verbRealization', 'Token',
# 'Transitivity', 'Construction', 'subjectAnimacy' and 'subjectRole' columns.
df = pd.read_excel(io='../in/Data_GPT.xlsx')

In [None]:
# Name of the chat model to query; swap the two assignments to switch models.
# model = "gpt-4"
model = "gpt-3.5-turbo"

# Chat model wrapper used for every classification request (temperature fixed at 0).
llm = ChatOpenAI(temperature=0, model=model)

In [None]:
# Load the system prompt text. The encoding is pinned so the prompt is decoded
# the same way on every platform instead of depending on the locale default.
# NOTE(review): assumes prompt.md is saved as UTF-8 -- confirm.
with open('../in/prompt.md', 'r', encoding='utf-8') as f:
    prompt_txt = f.read()


In [None]:
# Two-message chat prompt: the instructions read from prompt.md as the system
# message, followed by a human message whose `{input}` slot is filled per utterance.
system_message = ("system", prompt_txt)
human_message = ("human", 'Please classify the following sentence: {input}')

prompt = ChatPromptTemplate.from_messages([system_message, human_message])

In [None]:
#| export
from langchain.pydantic_v1 import BaseModel, Field

# Structured-output schema passed to `create_structured_output_runnable`.
# Every field is a required string (`Field(...)`); the first four hold the
# predicted label for one evaluated variable, the last three hold the subject,
# verb and object the model based its decision on.
# NOTE(review): the class docstring and the Field descriptions are presumably
# forwarded to the model as part of the schema, so they are kept verbatim; the
# only change is the `inanimate` spelling, which now matches the gold label
# that the evaluation cells compare against.
class UtteranceClassification(BaseModel):
    """Classifying the utterances according to linguistic criteria."""
    gpt_transitivity: str = Field(..., description="Whether this use of the verb should be labelled either `transitive` or `intransitive`.")
    gpt_causativity: str = Field(..., description="Whether this use of the verb should be labelled as `causative` or `anticausative`.")
    gpt_subject_animacy: str = Field(..., description='Whether the subject of the verb should be labelled as `animate` or `inanimate`.')
    gpt_subject_role: str = Field(..., description='Whether the semantic role of the subject of the verb should be labelled as either `agent` or `patient`.')
    gpt_subject: str = Field(..., description='The subject in this utterance that you considered for your classification.')
    gpt_verb: str = Field(..., description='The verb in this utterance that you considered for your classification.')
    gpt_object: str = Field(..., description='The object in this utterance that you considered for your classification.')

In [None]:
# Combine the schema, the chat model and the prompt into one runnable; the loop
# below reads the schema's fields as attributes of each `invoke` result.
runnable = create_structured_output_runnable(
    UtteranceClassification,
    llm,
    prompt,
)

In [None]:
# Classify each sampled utterance and collect gold labels next to GPT labels.
results_list = []
# (row index, repr of the exception) for every utterance that could not be processed.
failed_rows = []

# Development subset: a single randomly drawn row.
df_dev = df.sample(1)

for row_index, row in tqdm(df_dev.iterrows(), total=len(df_dev)):
    try:
        # The model input is "<verb realization> in: <full utterance>".
        result = runnable.invoke({"input": row['verbRealization'] + ' in: ' + row['Token']})
        row_data = {
            'text': row['Token'],
            'transitivity': row['Transitivity'],
            'gpt_transitivity': result.gpt_transitivity,
            'causativity': row['Construction'],
            'gpt_causativity': result.gpt_causativity,
            'subject_animacy': row['subjectAnimacy'],
            'gpt_subject_animacy': result.gpt_subject_animacy,
            'subject_role': row['subjectRole'],
            'gpt_subject_role': result.gpt_subject_role,

            'gpt_subject': result.gpt_subject,
            'gpt_verb': result.gpt_verb,
            'gpt_object': result.gpt_object
        }
        results_list.append(row_data)
    except Exception as exc:
        # Still best-effort (one bad row must not abort the run), but the failure
        # is recorded instead of vanishing. Catching `Exception` rather than using
        # a bare `except` lets KeyboardInterrupt stop the loop.
        failed_rows.append((row_index, repr(exc)))
        continue

# Surface skipped rows so an empty or shortened result set is not a mystery.
if failed_rows:
    print(f'{len(failed_rows)} of {len(df_dev)} rows could not be classified:')
    for row_index, error in failed_rows:
        print(f'  row {row_index}: {error}')

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# One row per successfully classified utterance; columns come from the dict keys.
results = pd.DataFrame(data=results_list)

In [None]:
# Recode the gold annotations to the lowercase labels used for scoring.
# Values without an entry in a mapping are left untouched by `replace`.
gold_label_maps = {
    'transitivity': {'Transitive': 'transitive', 'Intransitive': 'intransitive'},
    'causativity': {0: 'causative', 1: 'anticausative'},
    'subject_animacy': {'Animate': 'animate', 'Inanimate': 'inanimate'},
    'subject_role': {'Agent': 'agent', 'Patient': 'patient'},
}

for column, mapping in gold_label_maps.items():
    results[column] = results[column].replace(mapping)

In [None]:
# Evaluated variable -> label handed to `get_metrics` as its `pos_label`.
# NOTE(review): this name shadows the built-in `vars`; it is kept because the
# metrics cell iterates over `vars.items()`.
vars = dict(
	transitivity='intransitive',
	causativity='anticausative',
	subject_animacy='inanimate',
	subject_role='patient',
)

In [None]:
def get_metrics(results, variable, pos_label, ndigits=2):
	"""Score the GPT labels of one variable against its gold labels.

	Parameters
	----------
	results : pandas.DataFrame
		Must contain the gold column ``variable`` and the prediction column
		``gpt_<variable>``.
	variable : str
		Name of the gold column to evaluate.
	pos_label : str
		Label treated as the positive class for precision, recall and F1.
	ndigits : int, default 2
		Number of decimals every score is rounded to.

	Returns
	-------
	pandas.DataFrame
		Long-format table with the columns ``variable``, ``metric`` and
		``score`` and one row each for precision, recall, accuracy and F1
		(in that order).
	"""
	gold = results[variable]
	predicted = results[f'gpt_{variable}']

	# zero_division=0 reports an undefined precision/recall as 0 (the value the
	# default setting also yields) without raising a warning on every call.
	# The fourth return value (support) is not needed here.
	precision, recall, f1, _ = precision_recall_fscore_support(
		gold, predicted,
		pos_label=pos_label,
		average='binary',
		zero_division=0,
	)
	accuracy = accuracy_score(gold, predicted)

	# All four scores are rounded the same way so the table is consistent.
	scores = [
		('precision', precision),
		('recall', recall),
		('accuracy', accuracy),
		('F1', f1),
	]
	return pd.DataFrame(
		columns=['variable', 'metric', 'score'],
		data=[[variable, name, round(value, ndigits)] for name, value in scores],
	)


In [None]:
# One metrics table per evaluated variable, stacked into a single long frame.
metrics_vars = [
	get_metrics(results, variable, pos_label)
	for variable, pos_label in vars.items()
]

metrics = pd.concat(metrics_vars)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
metrics

Unnamed: 0,variable,metric,score
0,transitivity,precision,0.0
1,transitivity,recall,0.0
2,transitivity,accuracy,1.0
3,transitivity,F1,0.0
0,causativity,precision,0.0
1,causativity,recall,0.0
2,causativity,accuracy,1.0
3,causativity,F1,0.0
0,subject_animacy,precision,0.0
1,subject_animacy,recall,0.0


In [None]:
# Bar chart of every score, one facet column per variable. The x axis takes
# its sort order from the `metric` column; the colour legend is hidden.
metric_order = metrics['metric'].tolist()

bars = alt.Chart(metrics).mark_bar().encode(
	x=alt.X('metric:N', sort=metric_order),
	y='score:Q',
	color=alt.Color('metric', legend=None),
)
chart = bars.facet(column='variable:N')

chart

## save results

In [None]:
# current_datetime = datetime.now()
# formatted_date = current_datetime.strftime("%Y-%m-%d_%H:%M")

In [None]:
# with open(f'../out/{formatted_date}_prompt.md', 'w') as f_prompt:
# 	f_prompt.write(prompt_txt)

In [None]:
# with open(f'../out/{formatted_date}_model.md', 'w') as f_prompt:
# 	f_prompt.write(model)

In [None]:
# chart.save(f'../out/{formatted_date}_metrics_plot.png', scale=2.0)

In [None]:
# results.to_csv(f'../out/{formatted_date}_results.csv', index=False)