In [1]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots

pio.templates.default = "simple_white"

from scienceworld import ScienceWorldEnv


from sources.scienceworld import load_step_function, parse_observation
from sources.agent import BDIAgent
from sources.bdi_components.inference import NLIModel
from sources.bdi_components.belief import State

import numpy as np
from os import listdir
from os.path import isfile, join
import re


In [44]:
def preprocess_df(results_df):
    """Return a cleaned copy of a raw results frame.

    Two transformations are applied:

    * ``plans_pct`` values 1, 2, 5 and 7 are recoded to 100, 25, 50 and 75
      (presumably the stored value is a truncated percentage -- confirm
      against the code that writes the result files). Any other value is
      left untouched.
    * ``rl_score``, ``bdi_score`` and ``final_score`` are divided by 100.

    The input frame is NOT modified; the work is done on a copy so that
    re-running a cell on the same object cannot rescale the scores twice.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain the columns ``plans_pct``, ``rl_score``, ``bdi_score``
        and ``final_score``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the recoded ``plans_pct`` and rescaled scores.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    plans_pct_codes = {1: 100, 2: 25, 5: 50, 7: 75}
    score_columns = ['rl_score', 'bdi_score', 'final_score']

    processed_df = results_df.copy()
    # A single mapping is applied simultaneously, so a recoded value can
    # never be picked up again by a later rule.
    processed_df["plans_pct"] = processed_df["plans_pct"].replace(plans_pct_codes)
    for column in score_columns:
        processed_df[column] = processed_df[column] / 100
    return processed_df

In [50]:
import pandas as pd


# Plan-library statistics; joined onto the results on (plans_pct, task) below.
plan_statistics = pd.read_csv("plan_statistics.csv")

# Result directories to compare (the directory names carry the NLI model name).
dirs = [
    "../results/v2-gchhablani-bert-base-cased-finetuned-mnli/",
    "../results/v2-MoritzLaurer-MiniLM-L6-mnli/",
    "../results/v2-ynie-roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli/",
]
#dirs = ["../results/v2-minilm/"]
tasks = ['melt', 'find-non-living-thing']
print(tasks)

all_overall_dfs = []
all_nli_dfs = []
# The loop variable is deliberately not called `dir`: that name would shadow
# the builtin for every cell executed afterwards.
for results_dir in dirs:
    for task in tasks:
        # `join` copes with directories given with or without a trailing slash.
        results_df = pd.read_csv(join(results_dir, f"results_{task}.csv"))
        results_df['task'] = task
        all_overall_dfs.append(results_df)

        nli_results_df = pd.read_csv(join(results_dir, f"results_nli_{task}.csv"))
        nli_results_df['task'] = task
        all_nli_dfs.append(nli_results_df)

overall_results_df = pd.concat(all_overall_dfs)
overall_results_df = preprocess_df(overall_results_df)
# pd.merge defaults to an inner join: result rows whose (plans_pct, task) pair
# has no entry in plan_statistics are dropped.
overall_results_df = pd.merge(overall_results_df, plan_statistics, on=['plans_pct', 'task'])
overall_results_df

['melt', 'find-non-living-thing']


Unnamed: 0,num_bdi_actions,num_rl_actions,plan_found,variation,error,bdi_score,rl_score,final_score,complete,num_plans,plan_library_size,plans_pct,eps,drrn_model_file,nli_model,task,num_total_plans,num_common_plans,num_specific_plans
0,0,50,0,21,True,0.0,0.03,0.03,False,0,24,100,457,models/model_task1melt/model-steps56000-eps457.pt,gchhablani/bert-base-cased-finetuned-mnli,melt,193,180,13
1,5,50,1,22,True,0.0,0.05,0.05,False,2,24,100,457,models/model_task1melt/model-steps56000-eps457.pt,gchhablani/bert-base-cased-finetuned-mnli,melt,193,180,13
2,3,50,1,23,True,0.0,0.05,0.05,False,2,24,100,457,models/model_task1melt/model-steps56000-eps457.pt,gchhablani/bert-base-cased-finetuned-mnli,melt,193,180,13
3,5,50,1,24,True,0.0,0.05,0.05,False,3,24,100,457,models/model_task1melt/model-steps56000-eps457.pt,gchhablani/bert-base-cased-finetuned-mnli,melt,193,180,13
4,2,50,1,25,True,0.0,0.03,0.03,False,2,24,100,457,models/model_task1melt/model-steps56000-eps457.pt,gchhablani/bert-base-cased-finetuned-mnli,melt,193,180,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,9,0,1,295,False,1.0,0.00,1.00,True,3,41,100,242,models/models_task13/model-steps32000-eps242.pt,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,find-non-living-thing,210,180,30
248,11,0,1,296,False,1.0,0.00,1.00,True,3,41,100,242,models/models_task13/model-steps32000-eps242.pt,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,find-non-living-thing,210,180,30
249,11,0,1,297,False,1.0,0.00,1.00,True,3,41,100,242,models/models_task13/model-steps32000-eps242.pt,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,find-non-living-thing,210,180,30
250,11,0,1,298,False,1.0,0.00,1.00,True,3,41,100,242,models/models_task13/model-steps32000-eps242.pt,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,find-non-living-thing,210,180,30


In [51]:
# Columns that together identify one experimental configuration.
projected_cols = ['task', 'plans_pct', 'eps', 'num_specific_plans', 'nli_model']
# `variation` is counted (number of runs per configuration); every other
# metric is averaged over those runs.
aggregations = {
    'variation': 'count',
    **{
        metric: 'mean'
        for metric in ['final_score', 'rl_score', 'bdi_score', 'num_bdi_actions',
                       'num_rl_actions', 'error', 'num_plans']
    },
}

grouped_df = (
    overall_results_df
    .groupby(projected_cols)
    .agg(aggregations)
    .reset_index()
)

# Rank configurations by mean final score inside each (plans_pct, task, nli_model) group.
grouped_df['dense_rank'] = (
    grouped_df
    .groupby(['plans_pct', 'task', "nli_model"])['final_score']
    .rank(method='dense', ascending=False)
    .astype(int)
)

# Keep the top-ranked configuration(s) of each group. With dense ranking,
# configurations tied for the best score all have rank 1 and are all kept.
grouped_df = (
    grouped_df
    .loc[grouped_df['dense_rank'].eq(1)]
    .sort_values(["task", "num_specific_plans", "nli_model"])
)

# Display only: best score first. `grouped_df` itself keeps the `dense_rank` column.
(
    grouped_df
    .drop(columns=['dense_rank'])
    .sort_values(by=['final_score', 'nli_model'], ascending=[False, False])
)

Unnamed: 0,task,plans_pct,eps,num_specific_plans,nli_model,variation,final_score,rl_score,bdi_score,num_bdi_actions,num_rl_actions,error,num_plans
2,find-non-living-thing,100,242,30,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,75,0.98,0.0,0.98,9.186667,4.0,0.08,2.84
1,find-non-living-thing,100,242,30,gchhablani/bert-base-cased-finetuned-mnli,75,0.9372,0.0944,0.8428,9.613333,10.0,0.2,2.72
0,find-non-living-thing,100,242,30,MoritzLaurer/MiniLM-L6-mnli,75,0.885333,0.196,0.689333,7.653333,18.666667,0.373333,2.573333
5,melt,100,457,13,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,9,0.676667,0.003333,0.673333,20.888889,16.666667,0.333333,5.666667
4,melt,100,457,13,gchhablani/bert-base-cased-finetuned-mnli,9,0.362222,0.028889,0.333333,11.444444,33.333333,0.666667,3.555556
3,melt,100,457,13,MoritzLaurer/MiniLM-L6-mnli,9,0.262222,0.028889,0.233333,8.777778,50.0,1.0,3.555556


In [52]:
# Project the best-configuration table down to the columns selected for the write-up.
write_df = grouped_df[
    ['task', 'nli_model', 'final_score', 'bdi_score', 'num_bdi_actions', 'error', 'num_plans']
]
write_df

Unnamed: 0,task,nli_model,final_score,bdi_score,num_bdi_actions,error,num_plans
0,find-non-living-thing,MoritzLaurer/MiniLM-L6-mnli,0.885333,0.689333,7.653333,0.373333,2.573333
1,find-non-living-thing,gchhablani/bert-base-cased-finetuned-mnli,0.9372,0.8428,9.613333,0.2,2.72
2,find-non-living-thing,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,0.98,0.98,9.186667,0.08,2.84
3,melt,MoritzLaurer/MiniLM-L6-mnli,0.262222,0.233333,8.777778,1.0,3.555556
4,melt,gchhablani/bert-base-cased-finetuned-mnli,0.362222,0.333333,11.444444,0.666667,3.555556
5,melt,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,0.676667,0.673333,20.888889,0.333333,5.666667


In [20]:
# Toy sentences used to sanity-check the word-set difference below.
sentence_a = "In your inventory, you see: an orange"
sentence_b, sentence_c = "you are in the hallway", "you are in the bathroom"

# Whitespace-tokenised vocabularies of the two location sentences.
words_doc1, words_doc2 = (set(text.split()) for text in (sentence_b, sentence_c))



def lexical_overlap(a, b):
    """Count the distinct words of ``a`` that do not occur in ``b``.

    Both strings are tokenised with ``str.split()`` (whitespace only), and
    tokens are compared exactly: the comparison is case-sensitive and
    punctuation stays attached to the word it touches.

    Note that, despite the name, the value is the size of the set
    difference ``words(a) - words(b)``: a SMALLER result means the two
    sentences share MORE vocabulary.

    Parameters
    ----------
    a, b : str
        The two sentences to compare. The measure is not symmetric.

    Returns
    -------
    int
        Number of distinct tokens of ``a`` missing from ``b``.
    """
    tokens_a, tokens_b = set(a.split()), set(b.split())
    return len(tokens_a - tokens_b)

# Words of the first example sentence that are missing from the second one.
dff = words_doc1 - words_doc2
dff, len(dff)

({'hallway'}, 1)

In [8]:
# https://seaborn.pydata.org/examples/grouped_barplot.html

# load nli csv
# compute lexical overlap (see code below)
# plot the mean for each NLI model as a grouped bar plot (https://seaborn.pydata.org/examples/grouped_barplot.html) to show quantities


#

In [53]:
# Stack every NLI log frame collected in `all_nli_dfs` into one table.
nli_results_df = pd.concat(all_nli_dfs, axis=0)
# Total number of logged (premise, hypothesis) rows.
print(nli_results_df.shape[0])
nli_results_df.head()

5397


Unnamed: 0,p,h,output,model,task
0,"In your inventory, you see: an orange",you are in hallway,2,gchhablani/bert-base-cased-finetuned-mnli,melt
1,This room is called the living room.,you are in hallway,2,gchhablani/bert-base-cased-finetuned-mnli,melt
2,You see the agent,you are in hallway,2,gchhablani/bert-base-cased-finetuned-mnli,melt
3,You see a substance called air,you are in hallway,2,gchhablani/bert-base-cased-finetuned-mnli,melt
4,You see a book shelf (containing A book (Frank...,you are in hallway,2,gchhablani/bert-base-cased-finetuned-mnli,melt


In [54]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

all_groups_df = []
for model, group_df in nli_results_df.groupby("model"):
    print(model)
    # Each model has its own id -> label mapping, so the raw `output` ids must
    # be decoded with the configuration of the model that produced them.
    config = AutoConfig.from_pretrained(model)
    # `assign` builds a new frame instead of writing a column onto the slice
    # handed out by groupby (which is a chained-assignment hazard).
    group_df = group_df.assign(
        inference=group_df['output'].apply(lambda label_id: config.id2label[label_id].lower())
    )
    all_groups_df.append(group_df)

filtered_nli_df = pd.concat(all_groups_df)
# Pair the premise/hypothesis columns positionally; the concatenated frame has
# a non-unique index, so index-based alignment is best avoided here.
filtered_nli_df['lexical_overlap'] = [
    lexical_overlap(premise, hypothesis)
    for premise, hypothesis in zip(filtered_nli_df['p'], filtered_nli_df['h'])
]
# Sentence lengths in whitespace-separated tokens.
filtered_nli_df['length_p'] = filtered_nli_df['p'].apply(lambda premise: len(premise.split()))
filtered_nli_df['length_h'] = filtered_nli_df['h'].apply(lambda hypothesis: len(hypothesis.split()))

filtered_nli_df.head()

MoritzLaurer/MiniLM-L6-mnli
gchhablani/bert-base-cased-finetuned-mnli
ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli


Unnamed: 0,p,h,output,model,task,inference,lexical_overlap,length_p,length_h
0,"In your inventory, you see: an orange",you are in hallway,2,MoritzLaurer/MiniLM-L6-mnli,melt,contradiction,6,7,4
1,This room is called the living room.,you are in hallway,2,MoritzLaurer/MiniLM-L6-mnli,melt,contradiction,7,7,4
2,You see the agent,you are in hallway,1,MoritzLaurer/MiniLM-L6-mnli,melt,neutral,4,4,4
3,You see a substance called air,you are in hallway,2,MoritzLaurer/MiniLM-L6-mnli,melt,contradiction,6,6,4
4,You see a book shelf (containing A book (Frank...,you are in hallway,2,MoritzLaurer/MiniLM-L6-mnli,melt,contradiction,14,15,4


In [56]:
# One summary record per (model, task) pair.
all_statistics = []
for (model, task), group_df in filtered_nli_df.groupby(["model", "task"]):
    # Rows the model labelled as entailment; all three means below are
    # computed over this subset only.
    entailed_df = group_df[group_df['inference'] == 'entailment']
    # Non-entailment counts exactly the 'neutral' and 'contradiction' labels.
    non_entailed_count = int(group_df['inference'].isin(['neutral', 'contradiction']).sum())

    all_statistics.append(dict(
        model=model,
        num_entailment=len(entailed_df),
        num_nonentailment=non_entailed_count,
        mean_entailment_lexical_overlap=entailed_df['lexical_overlap'].mean(),
        mean_h=entailed_df['length_h'].mean(),
        mean_p=entailed_df['length_p'].mean(),
        task=task,
    ))

statistics_df = pd.DataFrame(all_statistics)
# NOTE: this is an alias, not a copy -- both names refer to the same frame.
full_models_df = statistics_df
full_models_df.head()

Unnamed: 0,model,num_entailment,num_nonentailment,mean_entailment_lexical_overlap,mean_h,mean_p,task
0,MoritzLaurer/MiniLM-L6-mnli,25,1051,6.96,4.2,7.84,find-non-living-thing
1,MoritzLaurer/MiniLM-L6-mnli,23,668,8.913043,4.826087,10.956522,melt
2,gchhablani/bert-base-cased-finetuned-mnli,25,1050,7.56,4.16,8.4,find-non-living-thing
3,gchhablani/bert-base-cased-finetuned-mnli,25,665,9.28,4.8,11.12,melt
4,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,54,1021,8.962963,4.074074,10.0,find-non-living-thing


In [None]:
"""
We compute the lexical overlap to measure the difficulty of the inference between the belief base and the plan contexts that we manually developed in order to evaluate our approach.
Specifically, given a sentence pair consisting of a belief and a plan context, we count the number of words contained in the belief that are absent from the plan context (so a lower count means a higher overlap).
In cases where the lexical overlap between the premise and the hypothesis is high, a model tends to infer an entailment relation easily, since both sentences are similar and may express the same idea.
Hence, in such cases, even sophisticated language models can exploit shallow syntactic heuristics to infer logical entailment between sentences.

Note that most beliefs contain more words than the plan context, which illustrates the
"""

In [34]:
# Presentation names for the exported table. `rename` silently ignores keys
# that are not columns of the frame, so entries for columns that are absent
# are harmless; columns without an entry keep their original name.
#
# The result is rebound instead of renaming in place: `full_models_df` was
# created as an alias of `statistics_df`, so an in-place rename would also
# rename the columns of `statistics_df`.
full_models_df = full_models_df.rename(columns={
    'model': "Model",
    'bdi_score': 'BDI Score',
    'num_bdi_actions': 'Num BDI actions',
    'num_entailment': "Pairs (E)",
    'num_nonentailment': "Pairs (NE)",
    'mean_entailment_lexical_overlap': 'LO',
    'num_inferences': "Pairs (Total)"

})

# Floats are written with two decimals; the row index is not exported.
full_models_df.to_csv("overall_nli_table.csv", index=False, float_format='%.2f')

In [24]:
# Export the predictions of one reference model as a binary
# entailment / non_entailment table.
# NOTE(review): the model names printed when decoding the labels are
# MoritzLaurer/MiniLM-L6-mnli, gchhablani/bert-base-cased-finetuned-mnli and
# ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli; 'roberta-large-mnli'
# is not among them, so this filter looks like it selects no rows -- confirm.
gt_df = (
    filtered_nli_df
    .loc[filtered_nli_df['model'] == 'roberta-large-mnli', ['p', 'h', 'inference']]
    .sort_values(['h', 'inference'])
)
# Collapse every label other than 'entailment' into a single class.
gt_df['inference'] = gt_df['inference'].where(gt_df['inference'] == 'entailment', 'non_entailment')
gt_df['y'] = 1 # temp
# Sort again: the relabelling above changes the order within each hypothesis.
gt_df.sort_values(['h', 'inference']).to_csv("ground_truth.csv", index=False)

In [13]:
# It is not worth measuring against the ground truth, because MiniLM did not generate all the plans (it failed earlier, since it did not proceed to the following subgoals)

# actually, it is worth counting how many false entailments and how many false non-entailments there are