In [333]:
from pathlib import Path
from dotenv import load_dotenv
from IPython.display import Markdown

import pandas as pd
import altair as alt

from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.pydantic_v1 import BaseModel, Field
from typing import Literal
from langchain.chains.openai_functions import create_structured_output_runnable

from tqdm.notebook import tqdm

from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# read data

In [240]:
fps = list(Path('../in/2024-06-27/data').glob('*'))
fps

[PosixPath('../in/2024-06-27/data/Data_GPT.xlsx'),
 PosixPath('../in/2024-06-27/data/Data_GPT_V2.xlsx'),
 PosixPath('../in/2024-06-27/data/NOW_Upload.xlsx'),
 PosixPath('../in/2024-06-27/data/NOW_Install.xlsx'),
 PosixPath('../in/2024-06-27/data/NOW_Film.xlsx'),
 PosixPath('../in/2024-06-27/data/COHA_Air.xlsx'),
 PosixPath('../in/2024-06-27/data/COHA_Stream.xlsx'),
 PosixPath('../in/2024-06-27/data/NOW_Download.xlsx'),
 PosixPath('../in/2024-06-27/data/COHA_Screen.xlsx'),
 PosixPath('../in/2024-06-27/data/NOW_Update.xlsx')]

In [214]:
fp = Path('../in/2024-06-27/data/NOW_Update.xlsx') 

In [215]:
df = pd.read_excel(fp)

In [280]:
column_spec = { 
	'causativity': ['causative', 'anticausative'],
	'transitivity': ['transitive', 'intransitive'],
	'semantic_transitivity': ['transitive', 'intransitive'],
	'subject_animacy': ['animate', 'inanimate'],
	'subject_role': ['agent', 'patient'],
}

In [369]:
def read_data(fp):

    df = pd.read_excel(fp) 

    column_spec = { 
        'causativity': ['causative', 'anticausative'],
        'transitivity': ['transitive', 'intransitive'],
        'semantic_transitivity': ['transitive', 'intransitive'],
        'subject_animacy': ['animate', 'inanimate'],
        'subject_role': ['agent', 'patient'],
    }

    # fix wrong values
    df['transitivity']= df['transitivity'].str.replace('transitive ', 'transitive')
    df['subject_animacy']= df['subject_animacy'].str.replace('animate ', 'animate')
    df['subject_role']= (df['subject_role'].str
                         .replace('agent ', 'agent')
                         .replace('patient ', 'patient'))
    df['causativity']= df['causativity'].str.replace('causative ', 'causative')
    df['semantic_transitivity']= df['semantic_transitivity'].str.replace('transitive ', 'transitive')

    for col in column_spec.keys():
        # check if all relevant columns are present
        assert col in df.columns, f'{fp}: column `{col}` not found in columns'
        assert 'text' in df.columns, f'{fp}: column `text` not found in columns'
        assert 'verb_realization' in df.columns, f'{fp}: column `verb_real` not found in columns'
        # check if only target values are present
        for val in df[col].unique():
            assert val in column_spec[col], f'{fp} -- colummn `{col}` -- value `{val}` not found'
    
    return df

In [370]:
for fp in fps:
    read_data(fp)

In [329]:
read_data(fp)

Unnamed: 0,text,source,year,verb_realization,verb_lemma,transitivity,gpt_transitivity,semantic_transitivity,causativity,gpt_causativity,subject_animacy,gpt_subject_animacy,subject_role,gpt_subject_role
0,"Updated on Mar 24, 2022 10:57 AM IST",https://hindustantimes.com/photos/lifestyle/ma...,2022,updated,update,intransitive,,transitive,causative,,inanimate,,patient,
1,"Originally, the National Archives said 12,879 ...",https://abcnews.go.com/Politics/trove-jfk-assa...,2022,updated,update,transitive,,transitive,causative,,animate,,agent,
2,The NCAA announced earlier this week that it h...,https://www.nationalreview.com/news/upenn-may-...,2022,updated,update,transitive,,transitive,causative,,animate,,agent,
3,"But he says, when you update software, you can...",https://www.fool.com/investing/2022/01/26/ebay...,2022,update,update,transitive,,transitive,causative,,animate,,agent,
4,A community meeting will be held in December t...,https://sports.yahoo.com/several-years-camden-...,2022,update,update,transitive,,transitive,causative,,animate,,agent,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,"TSA PreCheck, a program that allows trusted tr...",https://www.nydailynews.com/news/national/ny-t...,2022,updated,update,intransitive,,transitive,causative,,inanimate,,patient,
96,My thesis adviser suggested I think about upda...,https://www.washingtonpost.com/lifestyle/2022/...,2022,updating,update,transitive,,transitive,causative,,animate,,agent,
97,We'd double-check with the company before maki...,https://www.fool.com/the-ascent/insurance/auto...,2022,updated,update,intransitive,,transitive,causative,,inanimate,,patient,
98,We have updated the story to make clear why Zh...,https://www.propublica.org/article/editors-not...,2022,updated,update,transitive,,transitive,causative,,animate,,agent,


# set up model

In [334]:
load_dotenv()

True

In [335]:
model="gpt-4o"
# model="gpt-4-turbo"

llm = ChatOpenAI(model=model, temperature=0)

In [336]:
with open('../in/2024-06-27/prompt.md', 'r') as f:
        prompt_txt = f.read()

In [340]:
# Markdown(prompt_txt)

In [341]:
prompt = ChatPromptTemplate.from_messages([
        ('system', prompt_txt),
        ('human', 'Please classify the following sentence: {input}')
])

In [343]:
class UtteranceClassification(BaseModel):
    """Classifying verbs according to several linguistic criteria."""
    gpt_transitivity: Literal['transitive', 'intransitive'] = Field(..., description="Whether this use of the verb should be labelled either `transitive` or `intransitive`.")
    gpt_causativity: Literal['causative', 'anticausative'] = Field(..., description="Whether this use of the verb should be labelled as `causative` or `anticausative`.")
    gpt_subject_role: Literal['agent', 'patient'] = Field(..., description='Whether the semantic role of the subject of the verb should be labelled as either `agent` or `patient`.')
    gpt_subject_animacy: Literal['animate', 'inanimate'] = Field(..., description='Whether the subject of the verb should be labelled as `animate` or `inanimate`.')
    gpt_subject: str = Field(..., description='The subject in this utterance that you considered for your classification.')
    gpt_verb: str = Field(..., description='The verb in this utterance that you considered for your classification.')
    gpt_object: str = Field(..., description='The object in this utterance that you considered for your classification.')

In [344]:
var_descs = []
for field_name, field in UtteranceClassification.__fields__.items():
	var_descs.append({"field_name": field_name, "description": field.field_info.description})

var_descs_df = pd.DataFrame(var_descs)

In [345]:
runnable = create_structured_output_runnable(UtteranceClassification, llm, prompt)

# run classification

In [371]:
fps

[PosixPath('../in/2024-06-27/data/Data_GPT.xlsx'),
 PosixPath('../in/2024-06-27/data/Data_GPT_V2.xlsx'),
 PosixPath('../in/2024-06-27/data/NOW_Upload.xlsx'),
 PosixPath('../in/2024-06-27/data/NOW_Install.xlsx'),
 PosixPath('../in/2024-06-27/data/NOW_Film.xlsx'),
 PosixPath('../in/2024-06-27/data/COHA_Air.xlsx'),
 PosixPath('../in/2024-06-27/data/COHA_Stream.xlsx'),
 PosixPath('../in/2024-06-27/data/NOW_Download.xlsx'),
 PosixPath('../in/2024-06-27/data/COHA_Screen.xlsx'),
 PosixPath('../in/2024-06-27/data/NOW_Update.xlsx')]

In [377]:
verb = 'upload'

In [378]:
df = read_data('../in/2024-06-27/data/NOW_Upload.xlsx')

In [379]:
results_list = []

df_dev = (df
    #   .sample(10)
      )

for _, row in tqdm(df_dev.iterrows(), total=len(df_dev)):
    try:
        result = runnable.invoke({"input": row['verb_realization'] + ' in: ' + row['text']})
        row_data = {
            'text': row['text'],
            'transitivity': row['transitivity'],
            'gpt_transitivity': result.gpt_transitivity,
            'causativity': row['causativity'],
            'gpt_causativity': result.gpt_causativity,
            'subject_animacy': row['subject_animacy'],
            'gpt_subject_animacy': result.gpt_subject_animacy,
            'subject_role': row['subject_role'],
            'gpt_subject_role': result.gpt_subject_role,
            'gpt_subject': result.gpt_subject,
            'gpt_verb': result.gpt_verb,
            'gpt_object': result.gpt_object
        }
        results_list.append(row_data)
    except:
        continue

results = pd.DataFrame(results_list)

  0%|          | 0/100 [00:00<?, ?it/s]

In [376]:
results

Unnamed: 0,text,transitivity,gpt_transitivity,causativity,gpt_causativity,subject_animacy,gpt_subject_animacy,subject_role,gpt_subject_role,gpt_subject,gpt_verb,gpt_object
0,"EA couldn't celebrate its win for long, though...",transitive,transitive,causative,causative,animate,animate,agent,agent,the Netherlands,update,its legal definition of gambling
1,The shots in use currently are mostly from a s...,transitive,transitive,causative,causative,animate,animate,agent,agent,I,updated,them
2,Created by former bankers Mickey Down and Konr...,transitive,transitive,causative,causative,inanimate,animate,agent,agent,the series,updates,them
3,"We update when possible, but deals expire and ...",intransitive,intransitive,causative,causative,animate,animate,agent,agent,We,update,
4,"However, if no rating change occurs, the data ...",intransitive,intransitive,causative,anticausative,inanimate,inanimate,patient,patient,the data on this page,update,
5,Now updated for Insider Preview Build 22000.46...,intransitive,intransitive,causative,anticausative,inanimate,inanimate,patient,patient,Now,updated,
6,"Also, at the same time, they can track and upd...",transitive,transitive,causative,causative,animate,animate,agent,agent,they,update,their actions and behaviors
7,While it was partially updated with new inform...,intransitive,transitive,causative,causative,inanimate,inanimate,patient,patient,it,updated,with new information about Russia's war on Ukr...
8,(ADP data courtesy of MyFantasyLeague.com; las...,intransitive,intransitive,causative,anticausative,inanimate,inanimate,patient,patient,ADP data,updated,
9,"Updated on Apr 03, 2022 04:13 AM IST",intransitive,intransitive,causative,anticausative,inanimate,inanimate,patient,patient,(implicit subject),updated,
