## Evaluate

Create an evaluator class and associated subclasses for my custom evaluators
- "CPC name checker" - is_same_pred_name()
- Supps name checker: 
- cog status checker 
- variable binding checker 
- valid json checker
- valid cpc checker
- valid supplemental semantics checker




  

## Evaluate

- for each item in the dataset
  
    - run the model to produce a json_semantics_predicted
    - check if valid json
    - check intent (t/f)
    - check cpc name (t/f)
    - if an INSTRUCT, was the CPC selected from action repertoire? (t/f)
    - count how many of the supps are present in the properties repertoire
    - check supps precision/recall
    - check cpc args length (t/f)
    - variable assignment check: check if variables are present in the right descriptors
      - there is a precision and recall aspect here: each variable has two lists (predicted and truth)
      - For each variable, there is predicted and truth set of supps that it is associated with
      - For each variable there is a cpc position it is associated with
      - E.g., VAR0: {blue, ball, on}, pickup(),   "Pick up the blue ball on the table."
    - variable ordering, e.g., on(VAR0, VAR1), table(VAR1), ball(VAR0), pickup(_, VAR0)
    - 

In [32]:
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
import json
import networkx as nx
import matplotlib.pyplot as plt
import networkx.algorithms.isomorphism as iso
import ast

def evaluate(predicted, truth):
    """
    Returns an evaluation json that contains various metrics 
    input: strings obtained from the language model
    input: json of the ground truth
    """
    item = {}
    item['truth'] = truth
    if isinstance(predicted, str):
        # Validate json 
        predicted_json = {}
        try:
            predicted_json = ast.rom langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
import json
import networkx as nx
import matplotlib.pyplot as plt
import networkx.algorithms.isomorphism as iso
import ast

def _parse_semantics(text):
    """Parse a model output string into a dict, or raise ValueError."""
    if not isinstance(text, str):
        raise ValueError("expected a string, got %s" % type(text).__name__)
    try:
        # Strict JSON first, so true/false/null are understood.
        parsed = json.loads(text)
    except ValueError:
        try:
            # Fall back to Python-literal syntax (e.g. single-quoted keys).
            parsed = ast.literal_eval(text.strip())
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as err:
            raise ValueError("could not parse semantics: %s" % err) from err
    if not isinstance(parsed, dict):
        raise ValueError("parsed semantics is not a dict")
    return parsed


def evaluate(predicted, truth):
    """
    Returns an evaluation json that contains various metrics.

    input: predicted - either the raw string obtained from the language model
           or an already-parsed dict with the keys 'intent',
           'central_proposition' and 'supplemental_semantics'
    input: truth - json (dict) of the ground truth, with the same keys

    output: dict with the keys
        'truth'              - the ground truth, as passed in
        'valid_json'         - True if `predicted` was a dict or parsed as-is;
                               False if it needed repair via fix_json()
        'prediction'         - the parsed prediction (absent if repair failed)
        'intent_correct'     - intents are equal
        'cpc_name_correct'   - central proposition names agree
        'spc_length_correct' - same number of supplemental semantics
        'spc_accuracy'       - {'precision', 'recall'} over supplemental names;
                               a score is 0.0 when its denominator is empty
        'is_isomorphic'      - result of is_isomorphic()
        'is_matched'         - result of is_matched()

    If the string cannot be parsed even after repair, the partially filled
    dict (only 'truth' and 'valid_json') is returned early.
    Raises KeyError if the parsed prediction lacks one of the expected keys.
    """
    item = {}
    item['truth'] = truth

    if isinstance(predicted, str):
        # Validate json
        try:
            item['prediction'] = _parse_semantics(predicted)
            item['valid_json'] = True
        except ValueError:
            item['valid_json'] = False

        # try to fix the json with gpt
        if not item['valid_json']:
            new_json_str = fix_json(predicted)
            try:
                item['prediction'] = _parse_semantics(new_json_str)
            except ValueError as e:
                print(e)
                print("ERROR: Could not fix input json")
                return item
    else:
        item['prediction'] = predicted
        item['valid_json'] = True

    prediction = item['prediction']

    # check intent
    item['intent_correct'] = prediction['intent'] == truth['intent']

    # check cpc name
    item['cpc_name_correct'] = bool(
        is_same_pred_name(prediction['central_proposition'], truth['central_proposition'])
    )

    # check if correct number of sups
    spc_name_prediction = [pred_name(i) for i in prediction['supplemental_semantics']]
    spc_name_truth = [pred_name(i) for i in truth['supplemental_semantics']]
    item['spc_length_correct'] = len(spc_name_prediction) == len(spc_name_truth)

    # Evaluate accuracy of spc. Guard the divisions: an empty prediction or
    # truth list would otherwise raise ZeroDivisionError.
    spc_intersection = set(spc_name_prediction).intersection(spc_name_truth)
    item['spc_accuracy'] = {
        'precision': (len(spc_intersection) / len(spc_name_prediction)
                      if spc_name_prediction else 0.0),
        'recall': (len(spc_intersection) / len(spc_name_truth)
                   if spc_name_truth else 0.0),
    }

    # check for variable assignment and mapping.
    item['is_isomorphic'] = bool(is_isomorphic(prediction, truth))
    item['is_matched'] = bool(is_matched(prediction, truth))

    return item

#### graph matching

def build_semantic_graph(parse):
    """
    Build a directed graph of a semantic parse.

    Every predicate name (the central proposition and each supplemental
    semantic) becomes a 'pred_name' node, every argument becomes a 'pred_arg'
    node, and each argument gets an edge to the predicate it appears in,
    labelled with its zero-based position ('pos').

    input: parse - dict with 'central_proposition' (a predicate string) and
           'supplemental_semantics' (a list of predicate strings)
    output: networkx.DiGraph
    """
    graph = nx.DiGraph()

    def attach(predicate, source):
        # Nodes are keyed by name, so a name that repeats is the same node.
        name = pred_name(predicate)
        arguments = pred_args(predicate)
        graph.add_node(name, name=name, source=source, type='pred_name')
        for position, argument in enumerate(arguments):
            graph.add_node(argument, name=argument, source='args', type='pred_arg')
            graph.add_edge(argument, name, pos=position)

    # The central proposition goes in first, then the supplementals in order.
    attach(parse['central_proposition'], 'cpc')
    for supplemental in parse['supplemental_semantics']:
        attach(supplemental, 'spc')

    return graph

def is_isomorphic(predicted, truth):
    """
    Check whether the predicted and truth parses have the same graph shape.

    Both parses are turned into graphs with build_semantic_graph() and
    compared with networkx's isomorphism test. Edges are matched on their
    'pos' attribute (default 1 when absent). No node matcher is supplied, so
    node names themselves are not compared here.

    output: True if the two graphs are isomorphic, else False
    """
    predicted_graph = build_semantic_graph(predicted)
    truth_graph = build_semantic_graph(truth)
    return nx.is_isomorphic(
        truth_graph,
        predicted_graph,
        edge_match=iso.categorical_edge_match("pos", 1),
    )

def is_matched(predicted, truth):
    """
    Checks if each variable is correctly connected to exactly the same set of cpc and spcs.

    Argument nodes of the two graphs are paired in the order the graphs
    yield them, and each pair must point at the same set of predicate names.

    input: predicted, truth - parses accepted by build_semantic_graph()
    output: True if both graphs have the same number of argument nodes and
            every pair has identical successor sets, else False
    """
    G_predicted = build_semantic_graph(predicted)
    G_truth = build_semantic_graph(truth)

    # get all the nodes that are variables
    args_predicted = [n for n, data in G_predicted.nodes(data=True) if data['type'] == 'pred_arg']
    args_truth = [n for n, data in G_truth.nodes(data=True) if data['type'] == 'pred_arg']

    # zip() stops at the shorter list, so without this check a prediction
    # with missing (or extra) arguments could still be reported as matched.
    if len(args_predicted) != len(args_truth):
        return False

    return all(
        set(G_predicted.successors(p)) == set(G_truth.successors(t))
        for p, t in zip(args_predicted, args_truth)
    )
    

### UTILITIES ####

def fix_json(json_str):
    """
    Ask the chat model to rewrite a malformed json string as valid json.

    input: json_str - the string to repair
    output: whatever the LLM chain returns for the repair prompt
    """
    chat_model = ChatOpenAI(model_name="gpt-4", temperature=0.0)
    repair_prompt = PromptTemplate(
        input_variables=["json_str"],
        template="""
        Fix the input json string to produce an output that has a valid json format. Only change things like the parenthesis, commas etc.
        
        json_input: \n{json_str}\n
        rewritten valid json:
        """,
    )
    return LLMChain(llm=chat_model, prompt=repair_prompt).run(json_str=json_str)

def is_same_pred_name(predicted_pred, truth_pred):
    """
    Return True if two predicate strings have the same predicate name.

    The name is everything before the first "(". The comparison ignores
    case, in line with how pred_name() normalises names, so that this check
    agrees with the other name-based checks in this file.
    """
    predicted_name = predicted_pred.split("(")[0].lower()
    truth_name = truth_pred.split("(")[0].lower()
    return predicted_name == truth_name

def pred_name(pred):
    """Return the lower-cased text of `pred` that precedes its first "("."""
    head, _, _ = pred.partition("(")
    return head.lower()

def pred_args(pred):
    """
    Return the list of argument strings of a flat predicate such as
    "on(VAR0, VAR1)".

    Whitespace around each argument is stripped so that "on(VAR0, VAR1)" and
    "on(VAR0,VAR1)" give the same arguments. A predicate with an empty
    argument list ("foo()") or with no parentheses at all ("foo") yields [].
    Nested predicates are not supported.
    """
    _, paren, rest = pred.partition("(")
    if not paren:
        return []
    inner = rest.split(")")[0]
    if not inner.strip():
        return []
    return [arg.strip() for arg in inner.split(",")]

In [33]:
# Well-formed prediction as a dict; same structure as `truth` below, but with
# variables named VAR0/VAR1 instead of X/D.
predicted = {
  "intent": "INSTRUCT",
  "central_proposition": "putleftof(self:agent,VAR0,VAR1)",
  "supplemental_semantics": [
    "hammer(VAR0)",
    "diningtable(VAR1)",
    "INDEFINITE(VAR0)"
  ]
}

# The same prediction as a raw string, as a language model would return it.
predicted_str = """
{
  "intent": "INSTRUCT",
  "central_proposition": "putleftof(self:agent,VAR0,VAR1)",
  "supplemental_semantics": [
    "hammer(VAR0)",
    "diningtable(VAR1)",
    "INDEFINITE(VAR0)"
  ]
}
"""

# Malformed string: the comma after "hammer(VAR0)" and the closing "]" of the
# list are missing.
bad_predicted_str = """
{
  "intent": "INSTRUCT",
  "central_proposition": "putleftof(self:agent,VAR0,VAR1)",
  "supplemental_semantics": [
    "hammer(VAR0)"
    "diningtable(VAR1)",
    "INDEFINITE(VAR0)"
}
"""

# Ground-truth parse.
truth = {
  "intent": "INSTRUCT",
  "central_proposition": "putleftof(self:agent,X,D)",
  "supplemental_semantics": [
    "hammer(X)",
    "diningtable(D)",
    "INDEFINITE(X)"
  ]
}

# Well-formed prediction with the variables attached to the wrong predicates:
# hammer() takes the third cpc argument (W) where the truth uses the second (X),
# and diningtable() takes the second (K) where the truth uses the third (D).
bad_predicted = {
  "intent": "INSTRUCT",
  "central_proposition": "putleftof(self:agent,K,W)",
  "supplemental_semantics": [
    "hammer(W)",
    "diningtable(K)",
    "INDEFINITE(K)"
  ]
}

In [38]:
# Score the dict prediction whose variables are attached to the wrong
# predicates against the ground truth.
out = evaluate(bad_predicted, truth)

In [39]:
out

{'truth': {'intent': 'INSTRUCT',
  'central_proposition': 'putleftof(self:agent,X,D)',
  'supplemental_semantics': ['hammer(X)', 'diningtable(D)', 'INDEFINITE(X)']},
 'prediction': {'intent': 'INSTRUCT',
  'central_proposition': 'putleftof(self:agent,K,W)',
  'supplemental_semantics': ['hammer(W)', 'diningtable(K)', 'INDEFINITE(K)']},
 'valid_json': True,
 'intent_correct': True,
 'cpc_name_correct': True,
 'spc_length_correct': True,
 'spc_accuracy': {'precision': 1.0, 'recall': 1.0},
 'is_isomorphic': True,
 'is_matched': False}