In [1]:
import sys
import os
import json
from pathlib import Path

import pandas as pd

In [2]:
root_dir = Path.cwd().parent

In [3]:
sys.path.append(str(root_dir))

In [4]:
from src.extractor.pos_extractor import PosExtractor
from src.metrics.rouge_like import mean_rouge_like_metric

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
val = pd.read_csv(root_dir / "data" / "val.csv")
val.head()

Unnamed: 0,text,target_spacy,target_gpt_3.5
0,Man in apron standing on front of oven with pa...,"{""objects"": {""Man"": [], ""apron"": [], ""front"": ...","{""objects"": {""man"": [""apron""], ""oven"": [], ""pa..."
1,A small kitchen has various appliances and a t...,"{""objects"": {""kitchen"": [""small""], ""appliances...","{""objects"": {""kitchen"": [""small""], ""appliances..."
2,The kitchen is clean and ready for us to see.,"{""objects"": {""kitchen"": []}}","{""objects"": {""kitchen"": [""clean""], ""us"": []}}"
3,a homeless man holding a cup and standing next...,"{""objects"": {""man"": [""homeless""], ""cup"": [], ""...","{""objects"": {""man"": [""homeless""], ""cup"": [], ""..."
4,a blue bike parked on a side walk,"{""objects"": {""bike"": [""blue""], ""side"": [], ""wa...","{""objects"": {""bike"": [""blue""], ""side walk"": []}}"


### POS extractor:

#### spacy target:

In [6]:
extractor = PosExtractor()
val["pos_pred"] = val["text"].apply(extractor.extract)
val.head()

Unnamed: 0,text,target_spacy,target_gpt_3.5,pos_pred
0,Man in apron standing on front of oven with pa...,"{""objects"": {""Man"": [], ""apron"": [], ""front"": ...","{""objects"": {""man"": [""apron""], ""oven"": [], ""pa...","{'objects': {'man': [], 'front': [], 'pans': [..."
1,A small kitchen has various appliances and a t...,"{""objects"": {""kitchen"": [""small""], ""appliances...","{""objects"": {""kitchen"": [""small""], ""appliances...","{'objects': {'table': [], 'kitchen': ['small']..."
2,The kitchen is clean and ready for us to see.,"{""objects"": {""kitchen"": []}}","{""objects"": {""kitchen"": [""clean""], ""us"": []}}","{'objects': {'kitchen': ['clean', 'ready']}}"
3,a homeless man holding a cup and standing next...,"{""objects"": {""man"": [""homeless""], ""cup"": [], ""...","{""objects"": {""man"": [""homeless""], ""cup"": [], ""...","{'objects': {'cup': [], 'shopping': [], 'cart'..."
4,a blue bike parked on a side walk,"{""objects"": {""bike"": [""blue""], ""side"": [], ""wa...","{""objects"": {""bike"": [""blue""], ""side walk"": []}}","{'objects': {'side': [], 'walk': [], 'bike': [..."


In [10]:
pred = val["pos_pred"]
target = val["target_spacy"].apply(json.loads)

metric = mean_rouge_like_metric(pred_col=pred, target_col=target)
print(f"{metric=:.3f}")

metric_with_penalty = mean_rouge_like_metric(pred_col=pred, target_col=target, penalty=True)
print(f"{metric_with_penalty=:.3f}")

metric=0.932
metric_with_penalty=0.930


#### gpt-3.5 target

In [22]:
print(len(val))
val = val[val["target_gpt_3.5"].apply(json.loads) != {"objects": {}}]
print(len(val))

12697
12694


In [23]:
pred = val["pos_pred"]
target = val["target_gpt_3.5"].apply(json.loads)

metric = mean_rouge_like_metric(pred_col=pred, target_col=target)
print(f"{metric=:.3f}")

metric_with_penalty = mean_rouge_like_metric(pred_col=pred, target_col=target, penalty=True)
print(f"{metric_with_penalty=:.3f}")

metric=0.687
metric_with_penalty=0.108


### BERT extractor: average first 6 attention layers

#### spacy target:

In [6]:
from src.extractor import BertExtractor
from src.core.settings import BertExtractorSettings

In [7]:
extractor = BertExtractor(BertExtractorSettings.from_yaml(root_dir / "configs" / "bert_extractor_settings.yaml"))
extractor.config

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertExtractorSettings(pretrained_model_name='google-bert/bert-base-uncased', process_attentions=<ProcessAttentions.get_all_mean: 'get_all_mean'>, n_blocks_to_average=6)

In [8]:
# pred = []

# for text in val["text"]:
#     pred.append(extractor.extract(text))

In [8]:
pred = val["text"].apply(extractor.extract)
target = val["target_spacy"].apply(json.loads)

metric = mean_rouge_like_metric(pred_col=pred, target_col=target)
print(f"{metric=:.3f}")

metric_with_penalty = mean_rouge_like_metric(pred_col=pred, target_col=target, penalty=True)
print(f"{metric_with_penalty=:.3f}")

metric=0.995
metric_with_penalty=0.991


In [11]:
# 

metric=0.340
metric_with_penalty=0.339


#### gpt-3.5 target

In [9]:
val = val[val["target_gpt_3.5"].apply(json.loads) != {"objects": {}}]

target = val["target_gpt_3.5"].apply(json.loads)

metric = mean_rouge_like_metric(pred_col=pred, target_col=target)
print(f"{metric=:.3f}")

metric_with_penalty = mean_rouge_like_metric(pred_col=pred, target_col=target, penalty=True)
print(f"{metric_with_penalty=:.3f}")

metric=0.227
metric_with_penalty=-0.965


In [13]:
# val = val[val["target_gpt_3.5"].apply(json.loads) != {"objects": {}}]

# target = val["target_gpt_3.5"].apply(json.loads)

# metric = mean_rouge_like_metric(pred_col=pred, target_col=target)
# print(f"{metric=:.3f}")

# metric_with_penalty = mean_rouge_like_metric(pred_col=pred, target_col=target, penalty=True)
# print(f"{metric_with_penalty=:.3f}")

metric=0.083
metric_with_penalty=-0.266


In [17]:
val["text"][:3]

0    Man in apron standing on front of oven with pa...
1    A small kitchen has various appliances and a t...
2        The kitchen is clean and ready for us to see.
Name: text, dtype: object

In [18]:
val["text"].iloc[0]

'Man in apron standing on front of oven with pans and bakeware'

In [15]:
pred[:3]

0                                      {'objects': {}}
1    {'objects': {'kitchen': ['small'], 'appliances...
2         {'objects': {'kitchen': ['clean', 'ready']}}
Name: text, dtype: object

In [16]:
val["target_spacy"][:3]

0    {"objects": {"Man": [], "apron": [], "front": ...
1    {"objects": {"kitchen": ["small"], "appliances...
2                         {"objects": {"kitchen": []}}
Name: target_spacy, dtype: object

### BERT extractor: first attention layer

In [10]:
config = BertExtractorSettings(process_attentions="get_first")
extractor = BertExtractor(config)

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
pred = val["text"].apply(extractor.extract)
target = val["target_spacy"].apply(json.loads)

metric = mean_rouge_like_metric(pred_col=pred, target_col=target)
print(f"{metric=:.3f}")

metric_with_penalty = mean_rouge_like_metric(pred_col=pred, target_col=target, penalty=True)
print(f"{metric_with_penalty=:.3f}")

metric=0.995
metric_with_penalty=0.991


In [13]:
pred[:3]

0    {'objects': {'apron': [], 'front': [], 'man': ...
1    {'objects': {'kitchen': ['small'], 'appliances...
2         {'objects': {'kitchen': ['clean', 'ready']}}
Name: text, dtype: object