In [1]:
from dotenv import load_dotenv
import rich.progress
from llm_utils import return_azure_llm
import pandas as pd
import rich
from ipdb import set_trace as st
from typing import List, Tuple
from typing import List

from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

import rich

# Load environment variables from the local .env file. Presumably this supplies
# the AZURE_OPENAI_* / EMBEDDING_* settings read via os.getenv later on — confirm.
load_dotenv('.env')

True

In [2]:
# Ground-truth annotations; the distinct project IDs are kept in order of
# first appearance and converted to plain Python values.
annotations_df = pd.read_csv('annotations.csv')
project_ids = annotations_df['ProjectID'].drop_duplicates().tolist()
project_ids

[20, 35, 34, 19]

In [3]:
# Load the generated workflow graphs from CSV into a DataFrame.
output_df = pd.read_csv("generated_graphs.csv")

In [4]:
# Distinct project IDs present in the generated graphs (order of first appearance).
output_df['ProjectID'].drop_duplicates().tolist()

[20, 35, 34, 19]

In [5]:
# Preview the first rows of the generated graphs (one from_node -> to_node pair per row).
output_df.head()

Unnamed: 0,ProjectID,from_node,to_node
0,20,Start,Access System
1,20,Access System,View Menu
2,20,View Menu,Exclusive Gateway: Need to log in?
3,20,Exclusive Gateway: Need to log in?,User Login
4,20,Exclusive Gateway: Need to log in?,End


### Evaluation
Compare the ground-truth workflows against the generated workflows for each project.

In [6]:
def calculate_node_metrics(ground_truth_nodes, generated_nodes):
    """Count exact-match true positives, false positives and false negatives.

    Every entry of both inputs is converted to a tuple and de-duplicated via a
    set, so entries must be iterables of hashable values and repeated entries
    count once.

    Args:
        ground_truth_nodes: Iterable of reference entries.
        generated_nodes: Iterable of generated entries.

    Returns:
        A ``(true_positives, false_positives, false_negatives)`` tuple, where
        true positives appear in both inputs, false positives only in
        ``generated_nodes`` and false negatives only in ``ground_truth_nodes``.
    """
    expected = {tuple(entry) for entry in ground_truth_nodes}
    produced = {tuple(entry) for entry in generated_nodes}

    shared = expected & produced      # present in both
    spurious = produced - expected    # generated but not expected
    missed = expected - produced      # expected but not generated

    return len(shared), len(spurious), len(missed)

In [7]:
import pickle

def dump_cache(cache):
    """Pickle ``cache`` into ``cache.pkl`` in the current working directory.

    The file is opened in ``'wb'`` mode, so any existing file is overwritten.
    """
    with open('cache.pkl', mode='wb') as sink:
        pickle.Pickler(sink).dump(cache)

def retrieve_cache():
    """Load the pickled cache from ``cache.pkl``, falling back to an empty dict.

    Returns:
        The object stored in ``cache.pkl`` when the file can be read and
        unpickled; otherwise an empty dict (a message explaining why is
        printed).
    """
    try:
        # NOTE: pickle.load can execute arbitrary code while deserialising;
        # only load a cache.pkl that was written by this notebook.
        with open('cache.pkl', 'rb') as f:
            loaded_cache = pickle.load(f)
    except FileNotFoundError:
        print("Not loading cache as it does not exist")
        return {}
    except (OSError, EOFError, pickle.UnpicklingError) as err:
        # Unreadable, truncated or corrupt file: start fresh, but say so instead
        # of claiming the file is missing. Other exceptions (KeyboardInterrupt,
        # programming errors) are deliberately allowed to propagate.
        print(f"Not loading cache as it could not be read: {err!r}")
        return {}
    print("Loaded cache from disk")
    return loaded_cache

In [8]:
from langchain_openai import AzureOpenAIEmbeddings
import os
from dotenv import load_dotenv
# Load the .env settings before the Azure configuration below is read.
load_dotenv('.env')

# Bind the client's embed_query method directly, so `embedding_function` only
# ever refers to the callable that embeds a single string.
embedding_function = AzureOpenAIEmbeddings(
    azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
    openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    openai_api_version=os.getenv('AZURE_OPENAI_VERSION'),
    deployment=os.getenv('EMBEDDING_DEPLOYMENT_NAME'),
    model=os.getenv('EMBEDDING_MODEL'),
    chunk_size=1,
).embed_query

In [9]:
def check_cache(query, cache):
    """Return True when ``query`` is already a key/member of ``cache``, else False."""
    return query in cache

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_node_metrics_semantic(ground_truth_nodes, generated_nodes, embed_query, cache, threshold=0.8):
    """Count TP / FP / FN using embedding similarity instead of exact equality.

    Both inputs are de-duplicated as sets of tuples. Two entries match when the
    cosine similarity of the embeddings of their FIRST elements is strictly
    greater than ``threshold``.

    NOTE(review): only element ``[0]`` of each entry is compared, so when
    entries are (from, to) pairs the second label never takes part in the
    match — confirm this is intended.

    Args:
        ground_truth_nodes: Iterable of reference entries (indexable, hashable items).
        generated_nodes: Iterable of generated entries (same structure).
        embed_query: Callable mapping a label to its embedding vector.
        cache: Mutable mapping label -> embedding. It is updated in place so
            each distinct label is embedded at most once.
        threshold: Similarity a pair must exceed to count as a match
            (defaults to the previously hard-coded 0.8).

    Returns:
        ``(true_positives, false_positives, false_negatives, cache)`` where
        true positives are generated entries with at least one matching
        ground-truth entry, false positives are generated entries with none,
        and false negatives are ground-truth entries with no matching
        generated entry. ``cache`` is the same object that was passed in.
    """
    def embedding_for(label):
        # Embed lazily and remember the result for later pairs and later runs.
        if label not in cache:
            cache[label] = embed_query(label)
        return cache[label]

    def is_match(generated_label, ground_truth_label):
        score = cosine_similarity(
            [embedding_for(generated_label)], [embedding_for(ground_truth_label)]
        )[0, 0]
        return score > threshold

    ground_truth_set = {tuple(node) for node in ground_truth_nodes}
    generated_set = {tuple(node) for node in generated_nodes}

    # any() stops at the first match, mirroring an early `break`.
    true_positives = sum(
        1
        for gen_node in generated_set
        if any(is_match(gen_node[0], gt_node[0]) for gt_node in ground_truth_set)
    )
    false_positives = len(generated_set) - true_positives

    false_negatives = sum(
        1
        for gt_node in ground_truth_set
        if not any(is_match(gen_node[0], gt_node[0]) for gen_node in generated_set)
    )

    return true_positives, false_positives, false_negatives, cache

In [15]:
columns = ['ProjectID', 'TP', 'FP', 'FN', 'TP with Semantic', 'FP with Semantic', 'FN with Semantic']
cache = retrieve_cache()

# Collect one record per project and build the DataFrame once at the end;
# concatenating inside the loop is quadratic and gave every row index 0.
metric_rows = []

for project_id in project_ids:

    # Retrieve the ground truth and generated nodes to compare against each other
    generated_nodes = output_df.loc[output_df['ProjectID'] == project_id, ['from_node', 'to_node']].values.tolist()
    ground_truth_nodes = annotations_df.loc[annotations_df['ProjectID'] == project_id, ['Step', 'Next Step']].values.tolist()

    # TP, FP, FN for this project WITHOUT semantic comparison
    tp, fp, fn = calculate_node_metrics(ground_truth_nodes, generated_nodes)

    # TP, FP, FN for this project WITH semantic comparison
    semantic_tp, semantic_fp, semantic_fn, cache = calculate_node_metrics_semantic(
        ground_truth_nodes, generated_nodes, embedding_function, cache
    )

    # TODO: calculate the edge scores for each project WITHOUT semantic comparison

    # TODO: calculate the edge scores for each project WITH semantic comparison

    metric_rows.append({
        'ProjectID': project_id,
        'TP': tp,
        'FP': fp,
        'FN': fn,
        'TP with Semantic': semantic_tp,
        'FP with Semantic': semantic_fp,
        'FN with Semantic': semantic_fn,
    })

    # Persist after every project so embeddings survive an interrupted run.
    dump_cache(cache)

metrics_df = pd.DataFrame(metric_rows, columns=columns)


Loaded cache from disk


In [16]:
# Display the per-project TP / FP / FN counts for both comparison modes.
metrics_df

Unnamed: 0,ProjectID,TP,FP,FN,TP with Semantic,FP with Semantic,FN with Semantic
0,20,30,0,0,30,0,0
0,35,8,7,8,15,0,1
0,34,1,22,20,22,1,0
0,19,3,6,10,8,1,2
