In [53]:
file_paths = [
    "C:/ProjectCodeBase/ms/kpi_extraction_results_instruction_tuned_command_light.csv",
    "C:/ProjectCodeBase/ms/kpi_extraction_results_gpt-4.1_1.csv",
    "C:/ProjectCodeBase/ms/kpi_extraction_results_gpt-4.1-mini.csv",
    "C:/ProjectCodeBase/ms/kpi_extraction_results_haiku_3.5_1.csv",
    "C:/ProjectCodeBase/ms/kpi_extraction_results_gemma3.csv",
    "C:/ProjectCodeBase/ms/kpi_extraction_results_llama4_scout.csv",
    "C:/ProjectCodeBase/ms/kpi_extraction_results_deepseek_r1.csv",
    "C:/ProjectCodeBase/ms/kpi_extraction_results_self_consistency_llama_scout.csv",
    "C:/ProjectCodeBase/ms/kpi_extraction_results_llama4_scout_few_shot.csv",
    "C:/ProjectCodeBase/ms/kpi_extraction_results_llama4_scout_cot.csv",
    "C:/ProjectCodeBase/ms/kpi_extraction_results_mistral_7B.csv"
    ]

In [6]:
import pandas as pd

In [27]:
from typing import List, Tuple
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
def calculate_similarity_scores(df):
    similarities = []
    for _, row in df.iterrows():
        gold_list = eval(row['Gold'])
        pred_list = eval(row['Predicted'])
        
        # Combine all elements of gold and pred into strings
        gold_str = " ".join([" ".join(item) for item in gold_list])
        pred_str = " ".join([" ".join(item) for item in pred_list])
        
        # Convert strings to vectors using TfidfVectorizer
        vectorizer = TfidfVectorizer()
        if gold_str and pred_str:  # Check if strings are not empty
            vectors = vectorizer.fit_transform([gold_str, pred_str])
            similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
        else:
            similarity = 0.0
            
        similarities.append(similarity)
    
    # Add similarity scores as a new column
    df['Cosine Similarity'] = similarities
    return df

# Read and process the file
file_path = "C:/ProjectCodeBase/ms/kpi_extraction_results_instruction_tuned_command_light.csv"
df = pd.read_csv(file_path)
df = calculate_similarity_scores(df)
df.to_csv(file_path, index=False)

In [55]:
def evaluate_results(file_path):
    # Read the CSV file
    result_df = pd.read_csv(file_path)
    
    # Extract model name from file path
    model_name = file_path.split('\\')[-1].replace('kpi_extraction_results_', '').replace('.csv', '')
    print(f"\nEvaluating {model_name}:")
    
    # Calculate cosine similarity metrics
    threshold = 0.65
    cosine_similarity_scores = result_df['Cosine Similarity'].tolist()
    
    true_positives = sum(1 for score in cosine_similarity_scores if score > threshold)
    false_positives = len(result_df) - true_positives

    precision = true_positives / (true_positives + false_positives + 1e-5)

    print(f"Precision (Cosine Similarity > {threshold}): {precision:.2f}")

    # Calculate similarity statistics
    max_similarity = max(cosine_similarity_scores)
    min_similarity = min(cosine_similarity_scores)
    avg_similarity = sum(cosine_similarity_scores) / len(cosine_similarity_scores)
    print(f"Max Cosine Similarity: {max_similarity:.2f}")
    print(f"Min Cosine Similarity: {min_similarity:.2f}")
    print(f"Avg Cosine Similarity: {avg_similarity:.2f}")

# Evaluate each file
for file_path in file_paths:
    evaluate_results(file_path)


Evaluating C:/ProjectCodeBase/ms/instruction_tuned_command_light:
Precision (Cosine Similarity > 0.65): 0.47
Max Cosine Similarity: 1.00
Min Cosine Similarity: 0.00
Avg Cosine Similarity: 0.58

Evaluating C:/ProjectCodeBase/ms/gpt-4.1_1:
Precision (Cosine Similarity > 0.65): 0.44
Max Cosine Similarity: 1.00
Min Cosine Similarity: 0.00
Avg Cosine Similarity: 0.58

Evaluating C:/ProjectCodeBase/ms/gpt-4.1-mini:
Precision (Cosine Similarity > 0.65): 0.23
Max Cosine Similarity: 1.00
Min Cosine Similarity: 0.00
Avg Cosine Similarity: 0.26

Evaluating C:/ProjectCodeBase/ms/haiku_3.5_1:
Precision (Cosine Similarity > 0.65): 0.50
Max Cosine Similarity: 1.00
Min Cosine Similarity: 0.00
Avg Cosine Similarity: 0.61

Evaluating C:/ProjectCodeBase/ms/gemma3:
Precision (Cosine Similarity > 0.65): 0.48
Max Cosine Similarity: 1.00
Min Cosine Similarity: 0.00
Avg Cosine Similarity: 0.61

Evaluating C:/ProjectCodeBase/ms/llama4_scout:
Precision (Cosine Similarity > 0.65): 0.48
Max Cosine Similarity: 1.

In [37]:
from typing import List, Tuple

def evaluate_partial_predictions(
    gold: List[Tuple[str, str, str]], 
    pred: List[Tuple[str, str, str]]
) -> Tuple[float, float, float]:
    """
    Partial match evaluation for kpi names and fixed match on value and type.
    """
    tp = 0  
    fp = 0 
    fn = 0 

    gold_set = set(gold)
    pred_set = set(pred)

    for pred_tuple in pred_set:
        pred_kpi, pred_value, pred_type = pred_tuple
        match_found = False

        # Check for partial match in gold tuples
        for gold_tuple in gold_set:
            gold_kpi, gold_value, gold_type = gold_tuple

            # do partial match on KPI name and exact match on value and type
            if pred_value == gold_value and pred_type == gold_type and (pred_kpi in gold_kpi) or (gold_kpi in pred_kpi):
                tp += 1
                match_found = True
                break

        if not match_found:
            fp += 1 

    # Count false negatives (gold tuples not matched by any prediction)
    for gold_tuple in gold_set:
        gold_kpi, gold_value, gold_type = gold_tuple
        match_found = False

        for pred_tuple in pred_set:
            pred_kpi, pred_value, pred_type = pred_tuple

            # Partial match on KPI name and exact match on value and type
            if pred_value == gold_value and pred_type == gold_type and (pred_kpi in gold_kpi) or (gold_kpi in pred_kpi):
                match_found = True
                break

        if not match_found:
            fn += 1  # False negative if no match is found

    # precision, recall, and F1 score
    precision = tp / (tp + fp + 1e-5)
    recall = tp / (tp + fn + 1e-5)
    f1 = 2 * precision * recall / (precision + recall + 1e-5)

    return precision, recall, f1, tp, fp, fn

In [54]:
def prepare_and_evaluate_file(file_path):
    # Read CSV file
    df = pd.read_csv(file_path)
    
    # Get model name from file path
    model_name = file_path.split('/')[-1].replace('kpi_extraction_results_', '').replace('.csv', '')
    # Initialize lists to store metrics for each row
    precisions = []
    recalls = []
    f1s = []
    tp = 0
    fp = 0
    fn = 0
    tp_r = 0 
    fp_r = 0 
    fn_r = 0
    
    # Process each row in the DataFrame individually
    for _, row in df.iterrows():
        gold_list = eval(row['Gold'])
        pred_list = eval(row['Predicted'])
        
        # Evaluate single row
        precision, recall, f1, tp_r, fp_r, fn_r = evaluate_partial_predictions(gold_list, pred_list)
        tp += tp_r
        fp += fp_r
        fn += fn_r
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    overall_precision = tp / (tp + fp)
    overall_recall = tp / (tp + fn)
    overall_f1 = 2 * overall_precision * overall_recall / (overall_precision + overall_recall)

    print(f"\nTrue Positives: {tp}, False Positives: {fp}, False Negatives: {fn}")
    print(f"Overall metrics for {model_name}:")
    print(f"Overall Precision: {overall_precision:.3f}")
    print(f"Overall Recall: {overall_recall:.3f}")
    print(f"Overall F1 Score: {overall_f1:.3f}")

# Evaluate each file
for file_path in file_paths:
    prepare_and_evaluate_file(file_path)


True Positives: 225, False Positives: 99, False Negatives: 150
Overall metrics for instruction_tuned_command_light:
Overall Precision: 0.694
Overall Recall: 0.600
Overall F1 Score: 0.644

True Positives: 1351, False Positives: 741, False Negatives: 699
Overall metrics for gpt-4.1_1:
Overall Precision: 0.646
Overall Recall: 0.659
Overall F1 Score: 0.652

True Positives: 765, False Positives: 355, False Negatives: 1276
Overall metrics for gpt-4.1-mini:
Overall Precision: 0.683
Overall Recall: 0.375
Overall F1 Score: 0.484

True Positives: 1306, False Positives: 819, False Negatives: 736
Overall metrics for haiku_3.5_1:
Overall Precision: 0.615
Overall Recall: 0.640
Overall F1 Score: 0.627

True Positives: 1293, False Positives: 814, False Negatives: 746
Overall metrics for gemma3:
Overall Precision: 0.614
Overall Recall: 0.634
Overall F1 Score: 0.624

True Positives: 1338, False Positives: 723, False Negatives: 707
Overall metrics for llama4_scout:
Overall Precision: 0.649
Overall Recal

In [56]:
from typing import List, Tuple
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def evaluate_partial_similarity_predictions(
    gold: List[Tuple[str, str, str]], 
    pred: List[Tuple[str, str, str]]
) -> Tuple[float, float, float]:
    """
    Partial match evaluation for kpi names and fixed match on value and type.
    """
    tp = 0  
    fp = 0 
    fn = 0 

    gold_set = set(gold)
    pred_set = set(pred)

    for pred_tuple in pred_set:
        pred_kpi, pred_value, pred_type = pred_tuple
        match_found = False

        # Check for partial match in gold tuples
        for gold_tuple in gold_set:
            gold_kpi, gold_value, gold_type = gold_tuple
            # Initialize TF-IDF vectorizer
            vectorizer = TfidfVectorizer()

            # Transform the KPI names into TF-IDF vectors
            kpi_names = [pred_kpi, gold_kpi]
            tfidf_matrix = vectorizer.fit_transform(kpi_names)

            # Calculate cosine similarity
            similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
            # do partial match on KPI name and exact match on value and type
            if pred_value == gold_value and pred_type == gold_type and (similarity > 0.5):
                tp += 1
                match_found = True
                break

        if not match_found:
            fp += 1 

    # Count false negatives (gold tuples not matched by any prediction)
    for gold_tuple in gold_set:
        gold_kpi, gold_value, gold_type = gold_tuple
        match_found = False

        for pred_tuple in pred_set:
            pred_kpi, pred_value, pred_type = pred_tuple

            vectorizer = TfidfVectorizer()

            # Transform the KPI names into TF-IDF vectors
            kpi_names = [gold_kpi, pred_kpi]
            tfidf_matrix = vectorizer.fit_transform(kpi_names)

            # Calculate cosine similarity
            similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

            # Partial match on KPI name and exact match on value and type
            if pred_value == gold_value and pred_type == gold_type and (similarity > 0.5):
                match_found = True
                break

        if not match_found:
            fn += 1  # False negative if no match is found

    # precision, recall, and F1 score
    precision = tp / (tp + fp + 1e-5)
    recall = tp / (tp + fn + 1e-5)
    f1 = 2 * precision * recall / (precision + recall + 1e-5)

    return precision, recall, f1, tp, fp, fn

In [58]:
def prepare_and_evaluate_file(file_path):
    # Read CSV file
    df = pd.read_csv(file_path)
    
    # Get model name from file path
    model_name = file_path.split('/')[-1].replace('kpi_extraction_results_', '').replace('.csv', '')
    # Initialize lists to store metrics for each row
    precisions = []
    recalls = []
    f1s = []
    tp = 0
    fp = 0
    fn = 0
    
    # Process each row in the DataFrame individually
    for _, row in df.iterrows():
        gold_list = eval(row['Gold'])
        pred_list = eval(row['Predicted'])
        
        # Evaluate single row
        precision, recall, f1, tp_r, fp_r, fn_r = evaluate_partial_similarity_predictions(gold_list, pred_list)
        tp += tp_r
        fp += fp_r
        fn += fn_r
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
    
    overall_precision = tp / (tp + fp + 1e-5)
    overall_recall = tp / (tp + fn + 1e-5)
    overall_f1 = 2 * overall_precision * overall_recall / (overall_precision + overall_recall + 1e-5)

    print(f"\nTrue Positives: {tp}, False Positives: {fp}, False Negatives: {fn}")
    print(f"Overall metrics for {model_name}:")
    print(f"Overall Precision: {overall_precision:.3f}")
    print(f"Overall Recall: {overall_recall:.3f}")
    print(f"Overall F1 Score: {overall_f1:.3f}")

# Evaluate each file
for file_path in file_paths:
    prepare_and_evaluate_file(file_path)


True Positives: 157, False Positives: 167, False Negatives: 214
Overall metrics for instruction_tuned_command_light:
Overall Precision: 0.485
Overall Recall: 0.423
Overall F1 Score: 0.452

True Positives: 815, False Positives: 1277, False Negatives: 1245
Overall metrics for gpt-4.1_1:
Overall Precision: 0.390
Overall Recall: 0.396
Overall F1 Score: 0.393

True Positives: 621, False Positives: 499, False Negatives: 1440
Overall metrics for gpt-4.1-mini:
Overall Precision: 0.554
Overall Recall: 0.301
Overall F1 Score: 0.390

True Positives: 883, False Positives: 1242, False Negatives: 1177
Overall metrics for haiku_3.5_1:
Overall Precision: 0.416
Overall Recall: 0.429
Overall F1 Score: 0.422

True Positives: 948, False Positives: 1159, False Negatives: 1113
Overall metrics for gemma3:
Overall Precision: 0.450
Overall Recall: 0.460
Overall F1 Score: 0.455

True Positives: 863, False Positives: 1198, False Negatives: 1197
Overall metrics for llama4_scout:
Overall Precision: 0.419
Overall 