In [75]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots

pio.templates.default = "simple_white"

from scienceworld import ScienceWorldEnv

from Levenshtein import distance as levenshtein_distance

from sources.scienceworld import load_step_function, parse_observation
from sources.agent import BDIAgent
from sources.bdi_components.inference import NLIModel
from sources.bdi_components.belief import State

import numpy as np
from os import listdir
from os.path import isfile, join
import re


In [35]:
def preprocess_df(results_df):
    results_df.loc[results_df["plans_pct"] == 1, "plans_pct"] = 100
    results_df.loc[results_df["plans_pct"] == 2, "plans_pct"] = 25
    results_df.loc[results_df["plans_pct"] == 5, "plans_pct"] = 50
    results_df.loc[results_df["plans_pct"] == 7, "plans_pct"] = 75
    results_df['task'] = 'melt'
    results_df['rl_score'] = results_df['rl_score'] / 100
    results_df['bdi_score'] = results_df['bdi_score'] / 100
    results_df['final_score'] = results_df['final_score'] / 100
    return results_df

In [87]:
import pandas as pd

all_task = [""]

plan_statistics = pd.read_csv("plan_statistics.csv")

dirs = ["../results/v2-roberta_large_mnli/", "../results/v2-roberta_large_anli/", "../results/v2-minilm/", "../results/v2-bert/"]
#dirs = ["../results/v2-minilm/"]
files_overall = "results_melt.csv"
files_nli = "results_nli_melt.csv"

all_overall_dfs = []
for dir in dirs:
    file = dir+files_overall
    results_df = pd.read_csv(file)
    all_overall_dfs.append(results_df)

overall_results_df = pd.concat(all_overall_dfs)
overall_results_df = preprocess_df(overall_results_df)
overall_results_df = pd.merge(overall_results_df, plan_statistics, on=['plans_pct', 'task'])

print(len(overall_results_df))
overall_results_df.head()

1359


Unnamed: 0,num_bdi_actions,num_rl_actions,plan_found,variation,error,bdi_score,rl_score,final_score,complete,num_plans_x,plan_library_size,plans_pct,eps,drrn_model_file,nli_model,task,num_plans_y,num_common_plans,num_specific_plans
0,0,50,0,21,True,0.0,0.03,0.03,False,0,11,0,117,models/model_task1melt/model-steps8000-eps117.pt,roberta-large-mnli,melt,180,180,0
1,0,50,0,22,True,0.0,0.03,0.03,False,0,11,0,117,models/model_task1melt/model-steps8000-eps117.pt,roberta-large-mnli,melt,180,180,0
2,0,50,0,23,True,0.0,0.03,0.03,False,0,11,0,117,models/model_task1melt/model-steps8000-eps117.pt,roberta-large-mnli,melt,180,180,0
3,0,50,0,24,True,0.0,0.03,0.03,False,0,11,0,117,models/model_task1melt/model-steps8000-eps117.pt,roberta-large-mnli,melt,180,180,0
4,0,50,0,25,True,0.0,0.03,0.03,False,0,11,0,117,models/model_task1melt/model-steps8000-eps117.pt,roberta-large-mnli,melt,180,180,0


In [88]:
projected_cols = ['task', 'plans_pct', 'eps', 'num_specific_plans', 'nli_model']
aggregations = {'variation': 'count', 'final_score': 'mean', 'rl_score': 'mean', 'bdi_score': 'mean',
                'num_bdi_actions': 'mean', 'num_rl_actions': 'mean', 'error': 'sum'}

grouped_df = overall_results_df.groupby(projected_cols).agg(aggregations).reset_index()
grouped_df['dense_rank'] = (
    grouped_df.groupby(['plans_pct', 'task', "nli_model"])['final_score'].rank(method='dense', ascending=False).astype(int))

#grouped_df = grouped_df.sort_values(['plans_pct', 'dense_rank'], ascending=[True, True]).reset_index()
grouped_df = grouped_df[(grouped_df['dense_rank'] == 1)].sort_values(["task", "num_specific_plans", "nli_model"])
# avoiding tied rows
grouped_df = grouped_df[grouped_df['plans_pct'] == 100]
grouped_df.drop(columns=['dense_rank']).sort_values(by=['final_score', 'nli_model'], ascending=[False, False])

Unnamed: 0,task,plans_pct,eps,num_specific_plans,nli_model,variation,final_score,rl_score,bdi_score,num_bdi_actions,num_rl_actions,error
147,melt,100,565,13,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,9,0.68,0.006667,0.673333,20.888889,16.666667,3
150,melt,100,614,13,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,9,0.68,0.006667,0.673333,20.888889,16.666667,3
146,melt,100,565,13,roberta-large-mnli,9,0.68,0.006667,0.673333,20.888889,16.666667,3
149,melt,100,614,13,roberta-large-mnli,9,0.68,0.006667,0.673333,20.888889,16.666667,3
139,melt,100,457,13,gchhablani/bert-base-cased-finetuned-mnli,9,0.362222,0.028889,0.333333,11.444444,33.333333,6
138,melt,100,457,13,MoritzLaurer/MiniLM-L6-mnli,9,0.262222,0.028889,0.233333,8.777778,50.0,9
142,melt,100,512,13,MoritzLaurer/MiniLM-L6-mnli,9,0.262222,0.028889,0.233333,8.777778,50.0,9


In [108]:
sentence_a = "In your inventory, you see: an orange"
sentence_b = "you are in hallway"
sentence_c = "you are in the bathroom"

words_doc1 = set(sentence_a.split())
words_doc2 = set(sentence_b.split())



def lexical_overlap(a, b):
    words_doc1 = set(a.split())
    words_doc2 = set(b.split())

    diff = words_doc1.difference(words_doc2)
    return len(diff)

intersection = words_doc1.difference(words_doc2)
intersection, len(intersection)

({'In', 'an', 'inventory,', 'orange', 'see:', 'your'}, 6)

In [None]:
# https://seaborn.pydata.org/examples/grouped_barplot.html

# load nli csv
# compute lexical overlap (see code below)
# mean of each nli using a https://seaborn.pydata.org/examples/grouped_barplot.html to show quantities


#

In [113]:
all_nli_dfs = []
for dir in dirs:
    file = dir + files_nli
    results_df = pd.read_csv(file)
    all_nli_dfs.append(results_df)

nli_results_df = pd.concat(all_nli_dfs)
print(len(nli_results_df))
nli_results_df.head()

2982


Unnamed: 0,p,h,output,model
0,"In your inventory, you see: an orange",you are in hallway,0,roberta-large-mnli
1,This room is called the bathroom.,you are in hallway,0,roberta-large-mnli
2,You see the agent,you are in hallway,1,roberta-large-mnli
3,You see a substance called air,you are in hallway,1,roberta-large-mnli
4,"You see a bathtub, which is turned off. In the...",you are in hallway,1,roberta-large-mnli


In [114]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

all_groups_df = []
for model, group_df in nli_results_df.groupby("model"):
    print(model)
    config = AutoConfig.from_pretrained(model)
    #print(config.label2id)
    #print(config.id2label)
    group_df['inference'] = group_df['output'].apply(lambda id: config.id2label[id].lower())
    all_groups_df.append(group_df)

filtered_nli_df = pd.concat(all_groups_df)
filtered_nli_df['levenshtein_distance'] = filtered_nli_df.apply(lambda row: levenshtein_distance(row['p'], row['h']), axis=1)
filtered_nli_df['lexical_overlap'] = filtered_nli_df.apply(lambda row: lexical_overlap(row['p'], row['h']), axis=1)
filtered_nli_df.head()

MoritzLaurer/MiniLM-L6-mnli
gchhablani/bert-base-cased-finetuned-mnli
roberta-large-mnli
ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli


Unnamed: 0,p,h,output,model,inference,levenshtein_distance,lexical_overlap
0,"In your inventory, you see: an orange",you are in hallway,2,MoritzLaurer/MiniLM-L6-mnli,contradiction,29,6
1,This room is called the bathroom.,you are in hallway,2,MoritzLaurer/MiniLM-L6-mnli,contradiction,25,6
2,You see the agent,you are in hallway,1,MoritzLaurer/MiniLM-L6-mnli,neutral,12,4
3,You see a substance called air,you are in hallway,2,MoritzLaurer/MiniLM-L6-mnli,contradiction,19,6
4,"You see a bathtub, which is turned off. In the...",you are in hallway,2,MoritzLaurer/MiniLM-L6-mnli,contradiction,56,13


In [117]:
for model, group_df in filtered_nli_df.groupby("model"):
    print(model)
    print(len(group_df[group_df['inference'] == 'entailment']))

    print(group_df[group_df['inference'] == 'entailment']['lexical_overlap'].mean())
    #print(group_df[group_df['inference'] == 'entailment']['lexical_overlap'].std())
    print(len(group_df[group_df['inference'] == 'neutral']))
    print(len(group_df[group_df['inference'] == 'contradiction']))
    print(len(group_df))
    print("-")

MoritzLaurer/MiniLM-L6-mnli
23
8.91304347826087
111
556
690
-
gchhablani/bert-base-cased-finetuned-mnli
25
9.28
171
494
690
-
roberta-large-mnli
33
9.515151515151516
408
360
801
-
ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli
43
9.023255813953488
452
306
801
-


In [99]:
df = filtered_nli_df[filtered_nli_df['model'].isin(['ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli', 'roberta-large-mnli'])]
for _, group_df in df[df['inference'] == 'entailment'].groupby("model"):
    display(group_df.sort_values(['h', 'p']))

Unnamed: 0,p,h,output,model,inference,levenshtein_distance
132,"In your inventory, you see: a metal pot (conta...",you have metal pot in your inventory,2,roberta-large-mnli,entailment,109
96,You see a cupboard. The cupboard door is closed.,you see a cupboard,2,roberta-large-mnli,entailment,32
645,This room is called the art studio.,you are in art studio,2,roberta-large-mnli,entailment,19
358,This room is called the workshop.,you are in art studio,2,roberta-large-mnli,entailment,25
81,This room is called the bathroom.,you are in bathroom,2,roberta-large-mnli,entailment,19
84,"You see a bathtub, which is turned off. In the...",you are in bathroom,2,roberta-large-mnli,entailment,53
88,"You see a toilet. In the toilet is: A drain, w...",you are in bathroom,2,roberta-large-mnli,entailment,74
607,This room is called the foundry.,you are in foundry,2,roberta-large-mnli,entailment,19
409,This room is called the greenhouse.,you are in greenhouse,2,roberta-large-mnli,entailment,19
190,A door to the hallway (that is open),you are in hallway,2,roberta-large-mnli,entailment,26


Unnamed: 0,p,h,output,model,inference,levenshtein_distance
132,"In your inventory, you see: a metal pot (conta...",you have metal pot in your inventory,0,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,entailment,109
96,You see a cupboard. The cupboard door is closed.,you see a cupboard,0,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,entailment,32
645,This room is called the art studio.,you are in art studio,0,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,entailment,19
336,A door to the bathroom (that is open),you are in bathroom,0,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,entailment,26
81,This room is called the bathroom.,you are in bathroom,0,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,entailment,19
507,A door to the foundry (that is open),you are in foundry,0,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,entailment,26
607,This room is called the foundry.,you are in foundry,0,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,entailment,19
476,A door to the greenhouse (that is open),you are in greenhouse,0,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,entailment,26
409,This room is called the greenhouse.,you are in greenhouse,0,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,entailment,19
190,A door to the hallway (that is open),you are in hallway,0,ynie/roberta-large-snli_mnli_fever_anli_R1_R2_...,entailment,26


In [86]:
gt_df = filtered_nli_df[filtered_nli_df['model'] == 'roberta-large-mnli'][['p', 'h', 'inference']].sort_values(['h', 'inference'])
gt_df.loc[gt_df['inference'] != 'entailment' ,'inference'] = 'non_entailment'
gt_df['y'] = 1 # temp
gt_df.sort_values(['h', 'inference']).to_csv("ground_truth.csv", index=False)

In [None]:
# Não vale a pena medir ground thruth, pois minilm não gerou todos planos (falhou antes pois não deu sequência nos subgoals seguintes)