In [147]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots

# Use plotly's "simple_white" template as the default template for figures.
pio.templates.default = "simple_white"

from scienceworld import ScienceWorldEnv

from sources.scienceworld import load_step_function, parse_observation
from sources.agent import BDIAgent
from sources.bdi_components.inference import NLIModel
from sources.bdi_components.belief import State

import numpy as np
from os import listdir
from os.path import isfile, join
import re


In [148]:
def preprocess_df(results_df):
    """Return a normalized copy of a raw results table.

    The ``plans_pct`` column is recoded from the codes 1, 2, 5 and 7 to the
    values 100, 25, 50 and 75 respectively (other values are left as they
    are), and the ``rl_score``, ``bdi_score`` and ``final_score`` columns are
    divided by 100.

    The input frame is not modified, so calling this function again on the
    same raw frame gives the same result instead of rescaling the scores a
    second time.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain the columns ``plans_pct``, ``rl_score``, ``bdi_score``
        and ``final_score``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the recoded ``plans_pct`` and rescaled scores.
    """
    plans_pct_codes = {1: 100, 2: 25, 5: 50, 7: 75}
    score_cols = ('rl_score', 'bdi_score', 'final_score')

    # Work on a copy so the caller's frame keeps its raw values.
    processed = results_df.copy()
    # A single mapping recodes all values at once; no code can be hit twice.
    processed['plans_pct'] = processed['plans_pct'].replace(plans_pct_codes)
    for col in score_cols:
        processed[col] = processed[col] / 100
    return processed

In [149]:
import pandas as pd

# Plan statistics keyed by (plans_pct, task); merged onto the results below.
plan_statistics = pd.read_csv("plan_statistics.csv")

# Result directories to load; each directory name embeds an NLI model identifier.
dirs = ["../results/v2-gchhablani-bert-base-cased-finetuned-mnli/", "../results/v2-MoritzLaurer-MiniLM-L6-mnli/",
        "../results/v2-roberta-large-mnli/"]  #, "../results/v2-ynie-roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli/"]
tasks = ['melt', 'find-non-living-thing']
print(tasks)

all_overall_dfs = []
all_nli_dfs = []
# Named `results_dir` (not `dir`) so the builtin dir() is not shadowed for the
# rest of the kernel session.
for results_dir in dirs:
    for task in tasks:
        # os.path.join works whether or not the directory ends with a slash.
        results_df = pd.read_csv(join(results_dir, f"results_{task}.csv"))
        results_df['task'] = task
        all_overall_dfs.append(results_df)

        nli_results_df = pd.read_csv(join(results_dir, f"results_nli_{task}.csv"))
        nli_results_df['task'] = task
        all_nli_dfs.append(nli_results_df)

overall_results_df = pd.concat(all_overall_dfs)
overall_results_df = preprocess_df(overall_results_df)
# Default (inner) merge: rows without a matching (plans_pct, task) entry in
# plan_statistics are dropped.
overall_results_df = pd.merge(overall_results_df, plan_statistics, on=['plans_pct', 'task'])
overall_results_df

In [150]:
# Columns that identify one experimental configuration.
projected_cols = ['task', 'plans_pct', 'eps', 'num_specific_plans', 'nli_model']
# Per-configuration summary: row count of `variation` plus the mean of each metric.
aggregations = {
    'variation': 'count',
    'final_score': 'mean',
    'rl_score': 'mean',
    'bdi_score': 'mean',
    'num_bdi_actions': 'mean',
    'num_rl_actions': 'mean',
    'error': 'mean',
    'num_plans': 'mean',
}

grouped_df = overall_results_df.groupby(projected_cols).agg(aggregations).reset_index()

# Dense-rank the configurations by mean final score inside every
# (plans_pct, task, nli_model) group; rank 1 is the highest score.
score_by_setting = grouped_df.groupby(['plans_pct', 'task', "nli_model"])['final_score']
grouped_df['dense_rank'] = score_by_setting.rank(method='dense', ascending=False).astype(int)

# Keep the top-ranked rows only. Dense ranking gives tied best scores the same
# rank, so every configuration tied for first place is kept.
is_best = grouped_df['dense_rank'] == 1
grouped_df = grouped_df[is_best].sort_values(["task", "num_specific_plans", "nli_model"])

# Cell output: best rows ordered by score, without the helper rank column.
grouped_df.drop(columns=['dense_rank']).sort_values(by=['final_score', 'nli_model'], ascending=[False, False])

In [151]:
# Build the performance table: pick the reported columns, swap the model
# identifiers for short display names, then apply the column headers.
write_df = (
    grouped_df.loc[:, ['task', 'nli_model', 'final_score', 'bdi_score', 'num_bdi_actions', 'error', 'num_plans']]
    .replace('MoritzLaurer/MiniLM-L6-mnli', 'MiniLM L6')
    .replace('roberta-large-mnli', 'Roberta Large')
    .replace('gchhablani/bert-base-cased-finetuned-mnli', 'Bert Base')
    .rename(columns={
        'task': 'Task',
        'nli_model': 'Model',
        'bdi_score': 'BDI Score',
        'final_score': 'Score',
        'error': 'Errors',
        'num_plans': 'Num Plans',
        'num_bdi_actions': 'Num Actions'
    })
)

# The CSV holds a subset of the columns ('Num Actions' is not exported).
performance_csv_columns = ['Task', 'Model', 'Score', 'BDI Score', 'Errors', 'Num Plans']
write_df[performance_csv_columns].to_csv("nli_performance_results.csv", index=False, float_format='%.3f')
write_df

In [153]:
def lexical_overlap(a, b):
    """Count the distinct whitespace-separated tokens present in both strings.

    Tokens are compared exactly as written (case-sensitive, punctuation kept).
    """
    shared_tokens = set(a.split()) & set(b.split())
    return len(shared_tokens)

lexical_overlap("you see a pot", "you see a container")

In [154]:
# Stack every NLI log collected in `all_nli_dfs` into one frame and report its row count.
nli_results_df = pd.concat(all_nli_dfs)
print(nli_results_df.shape[0])
nli_results_df.head()

In [155]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

# Decode each model's raw output ids into lower-cased label names, using the
# id2label mapping from that model's own config.
all_groups_df = []
for model, group_df in nli_results_df.groupby("model"):
    print(model)
    config = AutoConfig.from_pretrained(model)
    # assign() returns a new frame instead of writing a column onto the
    # groupby sub-frame, which avoids pandas' SettingWithCopyWarning.
    group_df = group_df.assign(
        inference=group_df['output'].apply(lambda label_id: config.id2label[label_id].lower())
    )
    all_groups_df.append(group_df)

filtered_nli_df = pd.concat(all_groups_df)
# Per-pair features: number of shared words, and word counts of `p` and `h`.
filtered_nli_df['lexical_overlap'] = filtered_nli_df.apply(lambda row: lexical_overlap(row['p'], row['h']), axis=1)
filtered_nli_df['length_p'] = filtered_nli_df['p'].apply(lambda p: len(p.split()))
filtered_nli_df['length_h'] = filtered_nli_df['h'].apply(lambda h: len(h.split()))

filtered_nli_df.head()

In [156]:
# One summary record per (model, task): inference counts plus, over the
# entailment pairs only, the mean lexical overlap and mean sentence lengths.
all_statistics = []
for (model, task), group_df in filtered_nli_df.groupby(["model", "task"]):
    labels = group_df['inference']
    entailed_df = group_df[labels == 'entailment']
    neutral_count = len(group_df[labels == 'neutral'])
    contradiction_count = len(group_df[labels == 'contradiction'])

    record = {
        'model': model,
        'num_entailment': len(entailed_df),
        'num_nonentailment': neutral_count + contradiction_count,
        'num_inferences': len(group_df),
        'mean_entailment_lexical_overlap': entailed_df['lexical_overlap'].mean(),
        'mean_h': entailed_df['length_h'].mean(),
        'mean_p': entailed_df['length_p'].mean(),
        "task": task,
    }
    all_statistics.append(record)

statistics_df = pd.DataFrame(all_statistics)
statistics_df.head()

In [157]:
# Build the inference table: pick the reported columns, swap the model
# identifiers for short display names, then apply the column headers.
write_nli_df = (
    statistics_df.loc[:, ['task', 'model', 'mean_entailment_lexical_overlap', 'mean_h', 'mean_p', 'num_inferences']]
    .replace('MoritzLaurer/MiniLM-L6-mnli', 'MiniLM L6')
    .replace('roberta-large-mnli', 'Roberta Large')
    .replace('gchhablani/bert-base-cased-finetuned-mnli', 'Bert Base')
    .rename(columns={
        'task': 'Task',
        'model': 'Model',
        'mean_entailment_lexical_overlap': "LO(E)",
        'mean_h': 'Mean Plan Context',
        'mean_p': 'Mean Beliefs',
        'num_inferences': 'Inferences'
    })
)

# Export sorted by task and model; the displayed frame keeps its original order.
inference_csv_columns = ['Task', 'Model', 'LO(E)', 'Mean Beliefs', 'Mean Plan Context', 'Inferences']
(
    write_nli_df[inference_csv_columns]
    .sort_values(['Task', 'Model'])
    .to_csv("nli_inference_results.csv", index=False, float_format='%.3f')
)
write_nli_df

In [146]:
# Model size per display name (presumably millions of parameters -- TODO confirm the unit).
num_params = {
    'Bert Base': 110,
    'Roberta Large': 355,
    'MiniLM L6': 22
}

# MNLI-m score per display name (presumably matched-set accuracy in % -- TODO confirm the source).
mnli_results = {
    'Bert Base': 84.6,
    'Roberta Large': 90.8,
    'MiniLM L6': 82.2
}

# Inner join: only (Model, Task) pairs present in both summary tables are kept.
all_write_df = pd.merge(write_df, write_nli_df, on=['Model', 'Task'], how='inner')
columns = ['Model', 'Params', 'MNLI-m', 'Task', 'Score', 'BDI Score', 'Num Actions', 'Errors',
           'Num Plans', 'LO(E)', 'Mean Beliefs', 'Mean Plan Context',
           'Inferences']
# Direct dict indexing raises KeyError for an unknown model name instead of
# silently writing NaN into the table.
all_write_df['Params'] = all_write_df['Model'].apply(lambda model: num_params[model])
all_write_df['MNLI-m'] = all_write_df['Model'].apply(lambda model: mnli_results[model])
all_write_df = all_write_df[columns].sort_values(['Params', 'Model', 'Task'])
all_write_df.to_csv("nli_results.csv", index=False, float_format='%.3f')
all_write_df

In [None]:
"""
Não sei se faz sentido manter a qtd de entailment/non entailment, não tem muito o que falar disso. talvez falar alto nivel em future work

We compute the lexical overlap to measure how difficult inference is between the belief base and the plan contexts that we manually developed in order to evaluate our approach.
Specifically, given a sentence pair consisting of a belief and a plan context, we count the number of distinct words that appear in both the belief and the plan context.
When the lexical overlap between the premise and the hypothesis is high, the model tends to infer an entailment relation easily, since both sentences are similar and may express the same idea.
Hence, in such cases, even sophisticated language models can exploit shallow syntactic heuristics to infer logical entailment between sentences. (TODO: cite the HANS paper)
We show that the lexical overlap is high compared to the average number of words in both sentences.
The average lexical overlap in entailment sentence pairs is higher than the average number of plan context words since most beliefs contain more words.

The number of entailed pairs is significantly low since the Cartesian product between the belief base and the plan contexts tends to generate many unrelated pairs.
As future work, we plan to prune very dissimilar sentence pairs in order to reduce the NLI model computation.
"""

In [None]:
# It is not worth measuring the ground truth, because MiniLM did not generate all plans (it failed earlier, since it did not continue to the following subgoals).

# Actually, it is worth counting how many false entailments and how many false non-entailments there are.

In [None]:
# Write one ground-truth annotation sheet per (task, model) pair.
for (task, model), model_df in filtered_nli_df.groupby(["task", "model"]):
    # Explicit copy: the columns added below must not be written onto a slice
    # of `model_df` (chained assignment / SettingWithCopyWarning).
    gt_df = model_df[['p', 'h', 'inference']].copy()
    gt_df['y'] = 1  # temporary placeholder value
    # Collapse every label other than 'entailment' into a single class.
    gt_df.loc[gt_df['inference'] != 'entailment', 'inference'] = 'non_entailment'
    gt_df['model'] = model
    gt_df['task'] = task
    # '/' in the model id is replaced so it cannot act as a path separator in the file name.
    gt_df.sort_values(['h', 'inference']).to_csv(f"ground_truth_{task}_{model.replace('/', '-')}.csv", index=False)
