In [27]:
from pathlib import Path
import pandas as pd
import numpy as np
from IPython.display import Markdown
from tqdm.notebook import tqdm

from typing import Optional, Literal
from pydantic import BaseModel, Field
from langchain.chat_models import init_chat_model
from langchain_core.prompts import ChatPromptTemplate

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import altair as alt

# Import our custom library functions
from ergativegpt.data import read_linguistic_data, load_experiment_data
from ergativegpt.config import load_config

# read data

In [2]:
# Load experiment configuration
config = load_config('experiments/config/2025-06-23.yaml')
print(f"Experiment: {config.name}")
print(f"Description: {config.description}")
print(f"Model: {config.model_name}")
print(f"Target: {config.target_verb}")

# Create output directory
Path(config.output_dir).mkdir(parents=True, exist_ok=True)

In [4]:
# Load data using library function
fp = Path(config.input_dir) / config.target_file
print(f"Loading data from: {fp}")

df = read_linguistic_data(fp)
print(f"Loaded {len(df)} samples")
print(f"Columns: {list(df.columns)}")

In [None]:
# for fp in fps:
#     read_data(fp)

# Model setup

In [6]:
llm = init_chat_model(
    config.model_name, 
    model_provider=config.model_provider
    )

In [7]:
from typing import Optional, Literal

class UtteranceClassification(BaseModel):
    """Classifying verbs according to several linguistic criteria."""
    gpt_transitivity: Literal['transitive', 'intransitive'] = Field(..., description="Whether this use of the verb should be labelled either `transitive` or `intransitive`.")
    gpt_causativity: Literal['causative', 'anticausative'] = Field(..., description="Whether this use of the verb should be labelled as `causative` or `anticausative`.")
    gpt_subject_role: Literal['agent', 'patient'] = Field(..., description='Whether the semantic role of the subject of the verb should be labelled as either `agent` or `patient`.')
    gpt_subject_animacy: Literal['animate', 'inanimate'] = Field(..., description='Whether the subject of the verb should be labelled as `animate` or `inanimate`.')
    gpt_verb_pos: Literal['verb', 'other'] = Field(..., description='Whether the attestation is a verb or not. Label it `verb` or `other`.')
    gpt_verb_voice: Literal['active', 'passive'] = Field(..., description='Whether the verb is used in active or passive voice.')
    gpt_subject: str = Field(..., description='The subject in this utterance that you considered for your classification.')
    gpt_verb: str = Field(..., description='The verb in this utterance that you considered for your classification.')
    gpt_object: str = Field(..., description='The object in this utterance that you considered for your classification.')

structured_llm = llm.with_structured_output(UtteranceClassification)

In [8]:
with open(config.prompt_file, 'r') as f:
        prompt_txt = f.read()

prompt = ChatPromptTemplate.from_messages([
        ('system', prompt_txt),
        ('human', 'Please classify the following sentence: {input}')
])

# Markdown(prompt_txt)

In [9]:
chain = prompt | structured_llm

- done
    - develop
    - dry
    - open
    - wake
    - freeze
    - fill
    - close
    - dissolve
    - split
    - improve
- doing
- todo

In [33]:
verb = config.target_verb
print(f"Processing verb: {verb}")
print(f"Data shape: {df.shape}")

../in/2025-04-14/COHA_develop.xlsx
develop


(180, 15)

In [34]:
df_dev = (df
    #   .head(5)
      )

results_list = []

for _, row in tqdm(df_dev.iterrows(), total=len(df_dev)):
    try:
        result = chain.invoke({"input": verb + ' in: ' + row['text']})
        row_data = {
            'ID': row['ID'],
            'text': row['text'],

            'year': row['year'],
            'period': row['period'],
            'genre': row['genre'],
            'source': row['source'],
            'ambiguous_target_verb': row['ambiguous_target_verb'],

            'transitivity': row['transitivity'],
            'gpt_transitivity': result.gpt_transitivity,
            'causativity': row['causativity'],
            'gpt_causativity': result.gpt_causativity,
            'subject_animacy': row['subject_animacy'],
            'gpt_subject_animacy': result.gpt_subject_animacy,
            'subject_role': row['subject_role'],
            'gpt_subject_role': result.gpt_subject_role,
            'pos': row['pos'],
            'gpt_pos': result.gpt_verb_pos,

            'gpt_verb_voice': result.gpt_verb_voice,
            'gpt_subject': result.gpt_subject,
            'gpt_verb': result.gpt_verb,
            'gpt_object': result.gpt_object
        }
        results_list.append(row_data)
    except Exception as e:
        print("Error processing row:", e)
        continue

results = pd.DataFrame(results_list)

  0%|          | 0/180 [00:00<?, ?it/s]

In [35]:
out_dir = Path(config.output_dir)

In [36]:
verb

'develop'

In [37]:
results.shape

(180, 18)

In [38]:
results.to_excel(out_dir / f'{verb}_results.xlsx', index=False)

# inspect results

In [39]:
results_inspect = (results
    # .query('causativity != gpt_causativity')
    .filter(['text', 
            'causativity', 'gpt_causativity',
            'transitivity', 'gpt_transitivity',
            'gpt_verb_voice',
            'gpt_subject', 'gpt_verb', 'gpt_object'
            ])
)

# calculate metrics

In [40]:
metrics = []
vars = {
    'transitivity': {'average': None, 'pos_label': 'intransitive'},
    'causativity': {'average': None, 'pos_label': 'anticausative'},
    'subject_role': {'average': None, 'pos_label': 'patient'},
    'subject_animacy': {'average': None, 'pos_label': 'inanimate'}
}

# Keep a reference to the original results before the loop
original_results = results.copy()

for var, settings in vars.items():
    # --- Filtering Step ---
    # Filter the *original* DataFrame for the current variable, creating a temporary view
    filtered_results = original_results.dropna(subset=[var, f'gpt_{var}'])

    # --- Data Extraction ---
    y_true = filtered_results[var]
    y_pred = filtered_results[f'gpt_{var}']

    # --- Handle Empty Data ---
    if len(y_true) == 0:
        print(f"Skipping {var}: No valid samples after removing NaNs.")
        metrics.append({
            'var': var,
            'accuracy': np.nan,
            'precision': np.nan,
            'recall': np.nan,
            'f1': np.nan,
            'support_pos': 0,
        })
        continue

    # --- Label Handling ---
    # Determine the unique labels present in the *filtered* data and sort them
    present_labels = sorted(list(pd.unique(y_true.astype(str).tolist() + y_pred.astype(str).tolist())))

    # --- Metric Calculation ---
    # Calculate overall accuracy
    acc = accuracy_score(y_true, y_pred)

    # Calculate metrics per class, ensuring zero_division handling
    # Pass the sorted list of labels actually present in the filtered data
    p, r, f, s = precision_recall_fscore_support(
        y_true,
        y_pred,
        average=None, # Crucial: Get metrics per class
        labels=present_labels, # Ensure metrics align with these labels
        zero_division=0 # Handle cases with no true/predicted samples for a class
    )

    # --- Extract Metrics for Positive Class ---
    # Check if the desired positive label exists in the filtered data
    if settings['pos_label'] in present_labels:
        # Find the index of the positive label in the *sorted* list
        pos_label_index = present_labels.index(settings['pos_label'])
        precision_pos = p[pos_label_index]
        recall_pos = r[pos_label_index]
        f1_pos = f[pos_label_index]
        support_pos = s[pos_label_index]
    else:
        # Positive label not found in this subset after filtering NaNs
        print(f"Warning for {var}: Positive label '{settings['pos_label']}' not found in filtered ground truth. Precision/Recall/F1/Support for this class are undefined.")
        precision_pos, recall_pos, f1_pos, support_pos = np.nan, np.nan, np.nan, 0

    # --- Append Results ---
    metrics.append({
        'var': var,
        'accuracy': acc,
        'precision': precision_pos,
        'recall': recall_pos,
        'f1': f1_pos,
        'support_pos': int(support_pos), # Ensure support is an integer
    })

# --- Final DataFrame ---
metrics_df = pd.DataFrame(metrics).round(2)
metrics_df # Display the resulting DataFrame

  present_labels = sorted(list(pd.unique(y_true.astype(str).tolist() + y_pred.astype(str).tolist())))
  present_labels = sorted(list(pd.unique(y_true.astype(str).tolist() + y_pred.astype(str).tolist())))
  present_labels = sorted(list(pd.unique(y_true.astype(str).tolist() + y_pred.astype(str).tolist())))
  present_labels = sorted(list(pd.unique(y_true.astype(str).tolist() + y_pred.astype(str).tolist())))


Unnamed: 0,var,accuracy,precision,recall,f1,support_pos
0,transitivity,0.95,0.94,0.96,0.95,77
1,causativity,0.93,0.81,0.96,0.88,48
2,subject_role,0.9,0.87,0.89,0.88,74
3,subject_animacy,0.93,0.92,0.92,0.92,77


# plot

In [41]:
metrics_df = pd.DataFrame(metrics)

metrics_df = (metrics_df
	.drop('support_pos', axis=1)
	.melt(id_vars=['var'], var_name='metric', value_name='value')
	.round(2)
	.sort_values(['var', 'metric'])
)

In [42]:
chart = alt.Chart(metrics_df).mark_bar().encode(
	x=alt.X('metric:N', title='', sort=['accuracy', 'precision', 'recall', 'f1']),
	y=alt.Y('value:Q', title=''),
	column=alt.Column('var:N', title=''),
	color=alt.Color('metric:N', legend=None)
).properties(title=f"Metrics for “{verb}”")

chart

In [43]:
chart.save(out_dir / f'{verb}_metrics_plot.png', scale_factor=3.0)