In [1]:
from pathlib import Path
from dotenv import load_dotenv
from IPython.display import Markdown

import pandas as pd
import altair as alt

from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.pydantic_v1 import BaseModel, Field
from typing import Literal
from langchain.chains.openai_functions import create_structured_output_runnable

from tqdm.notebook import tqdm

from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# read data

In [2]:
# Lemma of the verb analysed in this run; reused below for the metrics rows, output file names and chart title.
verb = 'air'

In [3]:
# Location of the Excel workbook that is read into `df` in the next cell.
fp = Path('../in/random-samples/COHA_Air.xlsx')

In [4]:
# Read the workbook into a DataFrame (no sheet is named, so pandas' default sheet is used).
df = pd.read_excel(fp)

In [5]:
df

Unnamed: 0,Token,Year,Period,Genre,verbRealization,verbLemma,syntacticTransitivity,subjectRole,subjectAnimacy,Construction
0,Over MBS it will continue to be aired from the...,1938,1930-39,MAG,aired,air,Intransitive,Patient,Inanimate,0
1,One other consideration dampened Congressional...,1938,1930-39,MAG,aired,air,Intransitive,Patient,Inanimate,0
2,"In the course of its existence, the Radio Thea...",1940,1940-49,MAG,aired,air,Transitive,Agent,Animate,0
3,Last week for the first time in six months CBS...,1940,1940-49,MAG,aired,air,Intransitive,Patient,Inanimate,0
4,Designed to tell inside stories about programs...,1941,1940-49,MAG,aired,air,Intransitive,Patient,Inanimate,0
...,...,...,...,...,...,...,...,...,...,...
539,The show had previously aired on Spectrum Spor...,2019,2010-19,NEWS,aired,air,Intransitive,Patient,Inanimate,1
540,It aired nationally on ABC in January 1968.,2019,2010-19,NEWS,aired,air,Intransitive,Patient,Inanimate,1
541,His show aired in the Detroit area at midnight...,2019,2010-19,NEWS,aired,air,Intransitive,Patient,Inanimate,1
542,"Meanwhile, the Arabic adaptation of the Dutch ...",2019,2010-19,NF/ACAD,air,air,Intransitive,Patient,Inanimate,1


In [6]:
df.value_counts('subjectRole')

subjectRole
Patient    368
Agent      163
Name: count, dtype: int64

In [7]:
# Replace `NA` / `Na` cells and real missing values in `subjectRole` with `NoSubject`.
# The pattern is anchored (`^...$`) so only cells that consist solely of "NA"
# (any case, optional surrounding whitespace) are rewritten. With regex=True an
# unanchored pattern substitutes every matching *substring*, which would corrupt
# any longer label that happens to contain "na".
df["subjectRole"] = (df["subjectRole"]
 .replace(to_replace=r'(?i)^\s*NA\s*$', value="NoSubject", regex=True)
 .fillna("NoSubject")
)

In [8]:
df.value_counts('subjectRole')

subjectRole
Patient      368
Agent        163
NoSubject     13
Name: count, dtype: int64

In [9]:
# Sanity check after cleaning: exactly three distinct labels and no missing values left.
assert df["subjectRole"].nunique() == 3
assert not df["subjectRole"].isna().any()

In [10]:
df.value_counts('subjectAnimacy')

subjectAnimacy
Inanimate    381
Animate      150
Name: count, dtype: int64

In [11]:
# Replace `NA` / `Na` cells and real missing values in `subjectAnimacy` with `NoSubject`.
# The pattern is anchored (`^...$`) so only cells that consist solely of "NA"
# (optional surrounding whitespace) are rewritten. Anchoring is what makes the
# case-insensitive flag safe here: unanchored, `(?i)NA` would also match the
# "na" inside "Inanimate" and mangle that label.
df["subjectAnimacy"] = (df["subjectAnimacy"]
 .replace(to_replace=r'(?i)^\s*NA\s*$', value="NoSubject", regex=True)
 .fillna("NoSubject")
)

In [12]:
df.value_counts('subjectAnimacy')

subjectAnimacy
Inanimate    381
Animate      150
NoSubject     13
Name: count, dtype: int64

In [13]:
# Sanity check after cleaning: exactly three distinct labels and no missing values left.
assert df["subjectAnimacy"].nunique() == 3
assert not df["subjectAnimacy"].isna().any()

In [14]:
df

Unnamed: 0,Token,Year,Period,Genre,verbRealization,verbLemma,syntacticTransitivity,subjectRole,subjectAnimacy,Construction
0,Over MBS it will continue to be aired from the...,1938,1930-39,MAG,aired,air,Intransitive,Patient,Inanimate,0
1,One other consideration dampened Congressional...,1938,1930-39,MAG,aired,air,Intransitive,Patient,Inanimate,0
2,"In the course of its existence, the Radio Thea...",1940,1940-49,MAG,aired,air,Transitive,Agent,Animate,0
3,Last week for the first time in six months CBS...,1940,1940-49,MAG,aired,air,Intransitive,Patient,Inanimate,0
4,Designed to tell inside stories about programs...,1941,1940-49,MAG,aired,air,Intransitive,Patient,Inanimate,0
...,...,...,...,...,...,...,...,...,...,...
539,The show had previously aired on Spectrum Spor...,2019,2010-19,NEWS,aired,air,Intransitive,Patient,Inanimate,1
540,It aired nationally on ABC in January 1968.,2019,2010-19,NEWS,aired,air,Intransitive,Patient,Inanimate,1
541,His show aired in the Detroit area at midnight...,2019,2010-19,NEWS,aired,air,Intransitive,Patient,Inanimate,1
542,"Meanwhile, the Arabic adaptation of the Dutch ...",2019,2010-19,NF/ACAD,air,air,Intransitive,Patient,Inanimate,1


In [15]:
# Map the spreadsheet's column headers onto the names used in the rest of the notebook.
column_renames = {
	'subjectRole': 'subject_role',
	'subjectAnimacy': 'subject_animacy',
	'syntacticTransitivity': 'transitivity',
	'Construction': 'causativity',
	'Token': 'text',
}
df = df.rename(columns=column_renames)

In [16]:
df

Unnamed: 0,text,Year,Period,Genre,verbRealization,verbLemma,transitivity,subject_role,subject_animacy,causativity
0,Over MBS it will continue to be aired from the...,1938,1930-39,MAG,aired,air,Intransitive,Patient,Inanimate,0
1,One other consideration dampened Congressional...,1938,1930-39,MAG,aired,air,Intransitive,Patient,Inanimate,0
2,"In the course of its existence, the Radio Thea...",1940,1940-49,MAG,aired,air,Transitive,Agent,Animate,0
3,Last week for the first time in six months CBS...,1940,1940-49,MAG,aired,air,Intransitive,Patient,Inanimate,0
4,Designed to tell inside stories about programs...,1941,1940-49,MAG,aired,air,Intransitive,Patient,Inanimate,0
...,...,...,...,...,...,...,...,...,...,...
539,The show had previously aired on Spectrum Spor...,2019,2010-19,NEWS,aired,air,Intransitive,Patient,Inanimate,1
540,It aired nationally on ABC in January 1968.,2019,2010-19,NEWS,aired,air,Intransitive,Patient,Inanimate,1
541,His show aired in the Detroit area at midnight...,2019,2010-19,NEWS,aired,air,Intransitive,Patient,Inanimate,1
542,"Meanwhile, the Arabic adaptation of the Dutch ...",2019,2010-19,NF/ACAD,air,air,Intransitive,Patient,Inanimate,1


In [17]:
# Bring the gold labels in line with the label strings the model schema uses:
# lower-case category names, and the 0/1 codes spelled out as causative/anticausative.
label_maps = {
    'transitivity': {'Transitive': 'transitive', 'Intransitive': 'intransitive'},
    'causativity': {0: 'causative', 1: 'anticausative'},
    'subject_animacy': {'Animate': 'animate', 'Inanimate': 'inanimate'},
    'subject_role': {'Agent': 'agent', 'Patient': 'patient'},
}
for column, mapping in label_maps.items():
    df[column] = df[column].replace(mapping)

# set up model

In [18]:
# Load environment variables from a `.env` file (presumably the OpenAI credentials used by `ChatOpenAI` — confirm).
load_dotenv()

True

In [19]:
# Name of the chat model used for the annotation run.
model="gpt-4o"
# model="gpt-4-turbo"

# Chat model handle; temperature=0 is requested, presumably to keep the labels as repeatable as possible.
llm = ChatOpenAI(model=model, temperature=0)

In [20]:
# Read the annotation guidelines that serve as the system prompt.
# The encoding is stated explicitly so the text that is read does not depend on
# the platform's default locale encoding.
with open('../in/prompt.md', 'r', encoding='utf-8') as f:
    prompt_txt = f.read()

In [21]:
Markdown(prompt_txt)

# Task

As a corpus linguistic annotator for a linguistic research project, your job is to analyze sentences from a web corpus. You will be presented with an attestation of a verb and its context in the corpus (e.g. "'uploading' in: 'He was uploading the files'".) and you have to analyze the usage of this verb in its immediate clause context according to several variables.

## Variables

### Subject Animacy

**Task**: Identify whether the subject of the verb is `animate` or `inanimate`. Label the attestation as `NoSubject` if there is no overt syntactical subject.

**Definitions**:

- `animate` subjects are living entities, typically people or animals.
- `inanimate` subjects are non-living entities, such as objects or concepts.

**Examples**:

- `animate`: "'barked' in: 'The dog barked loudly.'" (Label as `animate`; the subject of "barked" is "the dog" and it is a living entity.)
- `animate`: "'supported' in: 'My manager supported my promotion.'" (Label as `animate`; the subject of "supported" is "my manager" and it is a living entity.)
- `inanimate`: "'struck' in: 'The clock struck midnight.'" (Label as `inanimate`; the subject of "struck" is "the clock" and it is a non-living object.)
- `inanimate`: "'left' in: 'Her vanity left her heavily in debt.'" (Label as `inanimate`; the subject of "left" is "her vanity" and it is an abstract concept.)
- `NoSubject`: "Download this app!" (Label as `NoSubject`, since this imperative sentence does not have an overt subject.)
- `NoSubject`: "Filming that scene was very difficult." (Label as `NoSubject`, since "filming" does not have an overt subject.)


### Subject Role

**Task**: Analyse the semantic role of the syntactic subject of the target verb.

**Definitions**: 

- `NoSubject`: The target verb has no overt syntactic subject.
- `agent`: The verb has a subject and this subject performs the action. It is typically a doer or a causative entity in the sentence.
- `patient`: The target verb has a subject and this subject receives the action. It is usually the entity that is acted upon or affected by the action.

**Examples**:

- `NoSubject`: "'download' in: 'To download the admit card, candidates can take the following steps.'" (Label as `NoSubject`; the non-finite to-clause has no subject.)
- `NoSubject`: "'install' in: 'Don't install this software.'" (Label as `NoSubject`; this imperative sentence has no subject.)
- `agent`: "'chased' in: 'The cat chased the mouse.'" (Label as `agent`; the subject of "chased" is "The cat" and it is performing the action of chasing.)
- `agent`: "'films' in: 'My husband films our dog's every move." (Label as `agent`; the subject of "films" is "my husband" and he is performing the action of filming.)
- `patient`: "'chased' in: 'The mouse was chased by the cat.'" (Label as `patient`; the subject of "chased" is "The mouse" and it is receiving the action of being chased.)
- `patient`: "'filming' in: 'The movie is currently filming in Prague.'" (Label as `patient`; the subject of "filming" is "the movie" and it is undergoing the action of filming.)


### Transitivity

**Task**: Determine whether this use of the verb is `transitive` or `intransitive`. 

**Definitions**:

- `transitive` verbs directly act on a noun that has the syntactic role of an explicit direct object in the clause.
- `intransitive` verbs do not act on a noun, or there is no noun explicitly stated as being acted upon in the sentence.
- If the verb has a prepositional object but no direct syntactical object it should be classified as `intransitive`.

**Examples**:

- `transitive`: "'approved' in: 'The committee approved the new policy.'" (Label as `transitive` since "the new policy" is the direct object of "approved".)
- `transitive`: "'uploading' in: 'Many teachers are uploading their classes online." (Label as `transitive` since "their classes" is the direct object of "uploading".)
- `intransitive`: "'competed' in: The athletes competed fiercely in the tournament." (Label as `intransitive`; no direct object for "competed".)
- `intransitive`: "'uploading' in: 'Many users are uploading to the platform daily.'" (Label as `intransitive`; no direct object for "uploading".)


### Causativity

**Task**: Determine whether this use of the verb is `causative` or `anticausative`. 

**Definitions**:

- `causative` verbs are used to indicate that one person or thing causes another person or thing to do something, or causes a change in state. They typically have a clear, deliberate agent who instigates the action.
- `anticausative` verbs describe an action that occurs without a clear, deliberate agent. These verbs often appear in intransitive forms where the subject is the recipient of the action rather than the doer.

**Examples**:

- `anticausative`: "'opened' in: 'The door opened.'" (Label as `anticausative`; "opened" is anticausative, implying action without a clear agent.)
- `anticausative`: "aired' in: 'The show first aired in 1993.'" (Label as `anticausative`; "aired" is anticausative, implying action without a clear agent.)
- `causative`: "'opened' in: 'She opened the door.'" (Label as `causative`; "opened" has a clear agent, "she".)
- `causative`: "'airs' in: 'The BBC airs a weekly 15-minute political commentary.'" (Label as `causative`; "airs" has a clear agent, "the BBC".)

In [22]:
# Two-message chat template: the guidelines as the system message, then the
# human turn that carries the attestation through the `{input}` placeholder.
system_message = ('system', prompt_txt)
human_message = ('human', 'Please classify the following sentence: {input}')
prompt = ChatPromptTemplate.from_messages([system_message, human_message])

In [23]:
class UtteranceClassification(BaseModel):
    """Classifying verbs according to several linguistic criteria."""

    # --- categorical labels (compared against the gold annotation later on) ---
    gpt_transitivity: Literal['transitive', 'intransitive'] = Field(
        ...,
        description="Whether this use of the verb should be labelled either `transitive` or `intransitive`.",
    )
    gpt_causativity: Literal['causative', 'anticausative'] = Field(
        ...,
        description="Whether this use of the verb should be labelled as `causative` or `anticausative`.",
    )
    gpt_subject_role: Literal['agent', 'patient', 'NoSubject'] = Field(
        ...,
        description='Whether the semantic role of the subject of the verb should be labelled as either `agent` or `patient`. If there is no overt syntactic subject related to the verb, mark with `NoSubject`.',
    )
    gpt_subject_animacy: Literal['animate', 'inanimate', 'NoSubject'] = Field(
        ...,
        description='Whether the subject of the verb should be labelled as `animate` or `inanimate`; If there is no overt syntactic subject, mark with `NoSubject`.',
    )

    # --- free-text fields: the spans the model reports having based its labels on ---
    gpt_subject: str = Field(
        ...,
        description='The subject in this utterance that you considered for your classification.',
    )
    gpt_verb: str = Field(
        ...,
        description='The verb in this utterance that you considered for your classification.',
    )
    gpt_object: str = Field(
        ...,
        description='The object in this utterance that you considered for your classification.',
    )

In [24]:
# Tabulate the schema: one record per field with its name and description text.
var_descs = [
	{"field_name": name, "description": model_field.field_info.description}
	for name, model_field in UtteranceClassification.__fields__.items()
]

var_descs_df = pd.DataFrame(var_descs)

In [25]:
# Combine output schema, chat model and prompt into one runnable; the loop below
# invokes it per row and reads the schema's fields off the returned object.
runnable = create_structured_output_runnable(UtteranceClassification, llm, prompt)

# run classification

In [33]:
# Classify every row with the LLM and collect gold labels and predictions side by side.
results_list = []
# Rows for which the call (or reading its result) raised; kept so that rows
# missing from `results` are visible instead of disappearing silently.
failed_rows = []

df_dev = (df
    #   .sample(10)
      )

for idx, row in tqdm(df_dev.iterrows(), total=len(df_dev)):
    try:
        # NOTE(review): the prompt's examples quote both verb and sentence
        # ("'uploading' in: '...'"), this input does not — confirm that is intended.
        result = runnable.invoke({"input": row['verbRealization'] + ' in: ' + row['text']})
        row_data = {
            'text': row['text'],
            'transitivity': row['transitivity'],
            'gpt_transitivity': result.gpt_transitivity,
            'causativity': row['causativity'],
            'gpt_causativity': result.gpt_causativity,
            'subject_animacy': row['subject_animacy'],
            'gpt_subject_animacy': result.gpt_subject_animacy,
            'subject_role': row['subject_role'],
            'gpt_subject_role': result.gpt_subject_role,
            'gpt_subject': result.gpt_subject,
            'gpt_verb': result.gpt_verb,
            'gpt_object': result.gpt_object
        }
        results_list.append(row_data)
    except Exception as err:
        # Still best-effort (one bad row must not abort the run), but only
        # ordinary errors are caught — a KeyboardInterrupt now stops the loop —
        # and every skipped row is recorded.
        failed_rows.append({'index': idx, 'text': row['text'], 'error': repr(err)})
        continue

results = pd.DataFrame(results_list)

if failed_rows:
    print(f"{len(failed_rows)} of {len(df_dev)} rows could not be classified and are missing from `results`:")
    for failure in failed_rows:
        print(f"  row {failure['index']}: {failure['error']}")

  0%|          | 0/544 [00:00<?, ?it/s]

In [34]:
results

Unnamed: 0,text,transitivity,gpt_transitivity,causativity,gpt_causativity,subject_animacy,gpt_subject_animacy,subject_role,gpt_subject_role,gpt_subject,gpt_verb,gpt_object
0,Over MBS it will continue to be aired from the...,intransitive,intransitive,causative,anticausative,inanimate,NoSubject,patient,NoSubject,,aired,
1,One other consideration dampened Congressional...,intransitive,intransitive,causative,anticausative,inanimate,NoSubject,patient,NoSubject,,aired,
2,"In the course of its existence, the Radio Thea...",transitive,transitive,causative,causative,animate,inanimate,agent,agent,the Radio Theatre,aired,272 different shows
3,Last week for the first time in six months CBS...,intransitive,intransitive,causative,anticausative,inanimate,NoSubject,patient,NoSubject,,aired,
4,Designed to tell inside stories about programs...,intransitive,intransitive,causative,anticausative,inanimate,NoSubject,patient,NoSubject,NoSubject,aired,NoObject
...,...,...,...,...,...,...,...,...,...,...,...,...
538,The show had previously aired on Spectrum Spor...,intransitive,intransitive,anticausative,anticausative,inanimate,inanimate,patient,agent,The show,aired,
539,It aired nationally on ABC in January 1968.,intransitive,intransitive,anticausative,anticausative,inanimate,inanimate,patient,agent,It,aired,
540,His show aired in the Detroit area at midnight...,intransitive,intransitive,anticausative,anticausative,inanimate,inanimate,patient,agent,His show,aired,
541,"Meanwhile, the Arabic adaptation of the Dutch ...",intransitive,intransitive,anticausative,anticausative,inanimate,NoSubject,patient,NoSubject,,air,


In [35]:
# Output directory for this run. It is created up front (including parents) so
# that the later `to_csv` / `chart.save` calls do not fail on a missing folder.
out_dir = Path('../out/2024-06-12_air')
out_dir.mkdir(parents=True, exist_ok=True)

In [36]:
# Persist gold labels and predictions for this verb as CSV (row index not written).
results_fp = out_dir / f'{verb}_results.csv'
results.to_csv(results_fp, index=False)

# calculate metrics

In [37]:
# Reload the saved predictions — presumably so the metrics section can be run without repeating the model calls.
results = pd.read_csv(results_fp)

Set the averaging mode and the positive label used when scoring each variable.

In [38]:
# Per-variable settings handed to `precision_recall_fscore_support` in the metrics loops below.
# The subject variables (three labels each) use a weighted average; the two-label
# variables use binary averaging with the listed positive label.
# NOTE(review): per the scikit-learn docs `pos_label` only matters for average="binary",
# so the value 1 on the weighted entries should have no effect — confirm.
# NOTE(review): the name `vars` shadows Python's built-in `vars()` for the rest of the session.
vars = {
	"subject_role": {"average": "weighted", "pos_label": 1},
	"subject_animacy": {"average": "weighted", "pos_label": 1},
	"transitivity": {"average": "binary", "pos_label": "intransitive"},
	"causativity": {"average": "binary", "pos_label": "anticausative"}
	}

In [39]:
# Guard against stray labels: each gold/predicted column must contain exactly
# the expected number of distinct values before metrics are computed.
expected_n_labels = {
	3: ['subject_role', 'gpt_subject_role', 'subject_animacy', 'gpt_subject_animacy'],
	2: ['transitivity', 'gpt_transitivity', 'causativity', 'gpt_causativity'],
}

for n_labels, columns in expected_n_labels.items():
	for var in columns:
		assert results[var].nunique() == n_labels

In [40]:
# Score the predicted labels against the gold labels: one record per variable
# holding precision, recall, F-score and accuracy.
metrics = []
# The loop variable is called `settings`, not `dict`: binding `dict` at notebook
# level would shadow the built-in type for every cell executed afterwards.
for var, settings in vars.items():
	y_true = results[var]
	y_pred = results[f"gpt_{var}"]
	precision, recall, fscore, _ = precision_recall_fscore_support(
		y_true=y_true, y_pred=y_pred,
		average=settings["average"],
		pos_label=settings["pos_label"]
	)
	metrics.append({
		"verb": verb,
		"var": var,
		"precision": precision,
		"recall": recall,
		"fscore": fscore,
		"accuracy": accuracy_score(y_true=y_true, y_pred=y_pred),
	})


In [41]:
# Long format for plotting: one row per (verb, variable, metric), values rounded to two decimals.
metrics_df = (
	pd.DataFrame(metrics)
	.melt(id_vars=['verb', 'var'], var_name='metric', value_name='value')
	.round(2)
	.sort_values('var')
)

metrics_df

Unnamed: 0,verb,var,metric,value
3,air,causativity,precision,0.64
7,air,causativity,recall,1.0
11,air,causativity,fscore,0.78
15,air,causativity,accuracy,0.76
1,air,subject_animacy,precision,0.54
5,air,subject_animacy,recall,0.26
9,air,subject_animacy,fscore,0.34
13,air,subject_animacy,accuracy,0.26
0,air,subject_role,precision,0.87
4,air,subject_role,recall,0.32


In [42]:
# Save the long-format metrics table for this verb (row index not written).
metrics_df.to_csv(out_dir / f'{verb}_metrics.csv', index=False)

In [43]:
# Bar chart of the metrics: one panel (column facet) per variable, one bar per metric.
# `alt` is imported once in the notebook's import cell, so it is not re-imported here.
chart = alt.Chart(metrics_df).mark_bar().encode(
	x=alt.X('metric:N', title=''),
	y=alt.Y('value:Q', title=''),
	column=alt.Column('var:N', title=''),
	color=alt.Color('metric:N', legend=None)
).properties(title=f"Chart for verb: “{verb}”")

chart

In [45]:
# Export the chart as a PNG into the run's output directory, passing scale_factor=3.0 to the renderer.
chart.save(out_dir / f'{verb}_metrics_chart.png', scale_factor=3.0)

In [46]:
# Gold-label frequencies: one record per (variable, class) with its count in `results`.
counts_vars = [
	{"verb": verb, "var": var, "class": label, "freq": freq}
	for var in vars.keys()
	for label, freq in results[var].value_counts().to_dict().items()
]

counts_df = pd.DataFrame(counts_vars)
counts_df

Unnamed: 0,verb,var,class,freq
0,air,subject_role,patient,367
1,air,subject_role,agent,163
2,air,subject_role,NoSubject,13
3,air,subject_animacy,inanimate,380
4,air,subject_animacy,animate,150
5,air,subject_animacy,NoSubject,13
6,air,transitivity,intransitive,369
7,air,transitivity,transitive,174
8,air,causativity,causative,313
9,air,causativity,anticausative,230


In [47]:
# Save the per-class gold-label frequencies for this verb (row index not written).
counts_df.to_csv(out_dir / f'{verb}_counts.csv', index=False)

# analysis across all verbs

In [90]:
# Verbs whose previously saved per-verb result and count files are pooled below.
# Note that `air`, analysed above, is not part of this list.
verbs = ['download', 'film', 'install', 'update', 'upload']

## metrics

In [127]:
# Pool the saved per-verb prediction tables into a single frame.
# A separate loop name is used so the notebook-level `verb` is left untouched.
dfs = []
for verb_name in verbs:
	dfs.append(pd.read_csv(f'../out/2024-05-14_random-samples/{verb_name}_results.csv'))
results = pd.concat(dfs)

In [131]:
# Score the pooled predictions against the gold labels: one record per variable
# holding precision, recall, F-score and accuracy.
metrics = []
# The loop variable is called `settings`, not `dict`: binding `dict` at notebook
# level would shadow the built-in type for every cell executed afterwards.
for var, settings in vars.items():
	y_true = results[var]
	y_pred = results[f"gpt_{var}"]
	precision, recall, fscore, _ = precision_recall_fscore_support(
		y_true=y_true, y_pred=y_pred,
		average=settings["average"],
		pos_label=settings["pos_label"]
	)
	metrics.append({
		"var": var,
		"precision": precision,
		"recall": recall,
		"fscore": fscore,
		"accuracy": accuracy_score(y_true=y_true, y_pred=y_pred),
	})


In [134]:
# Long format for plotting: one row per (variable, metric), values rounded to two decimals.
metrics_df = (
	pd.DataFrame(metrics)
	.melt(id_vars=['var'], var_name='metric', value_name='value')
	.round(2)
	.sort_values('var')
)

metrics_df

Unnamed: 0,var,metric,value
3,causativity,precision,0.08
7,causativity,recall,1.0
11,causativity,fscore,0.14
15,causativity,accuracy,0.68
1,subject_animacy,precision,0.6
5,subject_animacy,recall,0.61
9,subject_animacy,fscore,0.6
13,subject_animacy,accuracy,0.61
0,subject_role,precision,0.68
4,subject_role,recall,0.62


In [137]:
# Bar chart of the pooled metrics: one panel (column facet) per variable, one bar per metric.
# `alt` is imported once in the notebook's import cell, so it is not re-imported here;
# the title is a plain string because it contains no placeholders.
chart = alt.Chart(metrics_df).mark_bar().encode(
	x=alt.X('metric:N', title=''),
	y=alt.Y('value:Q', title=''),
	column=alt.Column('var:N', title=''),
	color=alt.Color('metric:N', legend=None)
).properties(title="Metrics across all verbs")

chart

In [138]:
# Export the pooled-metrics chart as a PNG, passing scale_factor=2.0 to the renderer.
chart.save('../out/2024-05-14_random-samples/all_metrics_chart.png', scale_factor=2.0)

## counts

In [119]:
# Pool the saved per-verb class-frequency tables into a single frame.
# A separate loop name is used so the notebook-level `verb` is left untouched.
dfs = []
for verb_name in verbs:
	dfs.append(pd.read_csv(f'../out/2024-05-14_random-samples/{verb_name}_counts.csv'))
counts = pd.concat(dfs)

In [124]:
# Add up the per-verb frequencies so that each (variable, class) pair has one total.
freq_by_class = counts.groupby(['var', 'class'])['freq'].sum()
counts = freq_by_class.reset_index()
counts

Unnamed: 0,var,class,freq
0,causativity,anticausative,13
1,causativity,causative,484
2,subject_animacy,NoSubject,172
3,subject_animacy,animate,162
4,subject_animacy,inanimate,163
5,subject_role,NoSubject,172
6,subject_role,agent,171
7,subject_role,patient,154
8,transitivity,intransitive,209
9,transitivity,transitive,288


In [125]:
# Save the class frequencies summed over all pooled verbs (row index not written).
counts.to_csv('../out/2024-05-14_random-samples/all_counts.csv', index=False)