### Get model f1 scores

In [4]:
import json
import csv
import re
from typing import Dict, List, Set
from sklearn.metrics import f1_score

class AnnotationEvaluator:
    def __init__(self, ontology_path: str):
        """
        Initialize the evaluator with the privacy ontology.
        """
        with open(ontology_path, 'r') as f:
            self.ontology = json.load(f)
    
    def _normalize_label(self, label: str) -> List[str]:
        """
        Normalize label by removing extra whitespace, converting to lowercase,
        and splitting comma-separated values.
        """
        # Split by comma, strip whitespace, convert to lowercase
        return [
            l.strip().lower() 
            for l in label.split(',') 
            if l.strip()
        ]
    
    def _preprocess_text(self, text: str) -> str:
        """
        Preprocess text by removing content within or after <R> tags.
        
        Args:
            text (str): Input text to preprocess
        
        Returns:
            str: Preprocessed text
        """
        # Find all <R> tags
        r_tags = list(re.finditer(r'<R>(.*?)</R>', text, re.DOTALL))
        
        if not r_tags:
            # If no </R> tags, remove everything after a single <R> tag
            single_r_match = re.search(r'<R>', text)
            if single_r_match:
                return text[:single_r_match.start()]
            return text
        
        # Take the content before the first <R> tag
        preprocessed = text[:r_tags[0].start()]
        
        # Return preprocessed text
        return preprocessed.strip()
    
    def _match_labels(self, text: str, category_labels: List[str]) -> Set[str]:
        """
        Find matching labels in the text.
        
        Args:
            text (str): Text to search
            category_labels (List[str]): Labels to match
        
        Returns:
            Set[str]: Matched labels
        """
        # Preprocess text to remove R-tag content
        preprocessed_text = self._preprocess_text(text)
        
        # Normalize text
        normalized_text = preprocessed_text.lower()
        
        # Match labels
        matched_labels = set()
        
        for label in category_labels:
            # Various matching strategies
            # 1. Whole word match
            word_pattern = r'\b' + re.escape(label) + r'\b'
            # 2. Partial match
            partial_pattern = re.escape(label)
            
            if (re.search(word_pattern, normalized_text) or 
                re.search(partial_pattern, normalized_text)):
                matched_labels.add(label)
        
        return matched_labels
    
    def calculate_comprehensive_f1_scores(self, ground_truth: Dict, text: str) -> Dict[str, float]:
        """
        Calculate F1 scores for all categories.
        
        Args:
            ground_truth (Dict): Ground truth annotations
            text (str): Text to evaluate
        
        Returns:
            Dict[str, float]: F1 scores
        """
        f1_scores = {}
        categories = ['actions', 'data_types', 'purposes']
        
        for category in categories:
            # Extract labels from ground truth
            gt_labels = self._normalize_label(', '.join(ground_truth.get(category, [])))
            
            # Find matching labels in text
            pred_labels = self._match_labels(text, gt_labels)
            
            # Calculate F1 score
            y_true = [1 if label in gt_labels else 0 for label in gt_labels]
            y_pred = [1 if label in pred_labels else 0 for label in gt_labels]
            
            try:
                # Macro average F1 score
                f1 = f1_score(y_true, y_pred, average='binary')
            except Exception as e:
                print(f"F1 Score calculation error for {category}: {e}")
                f1 = 0
            
            f1_scores[category] = f1
        
        # Calculate overall F1 score
        f1_scores['Overall'] = sum(f1_scores.values()) / len(f1_scores)
        
        # Diagnostic print
        print(f"\nCategory Diagnostics:")
        for category in categories:
            print(f"{category.capitalize()}:")
            print(f"  Ground Truth Labels: {gt_labels}")
            print(f"  Matched Labels: {pred_labels}")
            print(f"  F1 Score: {f1_scores[category]:.4f}")
        
        # Print preprocessed text for verification
        print("\nPreprocessed Text:")
        print(self._preprocess_text(text))
        
        return f1_scores

def process_annotation_csv(csv_path: str, ontology_path: str) -> List[Dict]:
    """
    Process the annotation CSV and calculate F1 scores.
    """
    # Initialize the evaluator
    evaluator = AnnotationEvaluator(ontology_path)
    
    # Results storage
    results = []
    
    # Read the CSV file
    with open(csv_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        
        for row in reader:
            try:
                # Parse target annotations
                ground_truth = json.loads(row['Target Annotations'])
                
                # Process both model responses
                for response_col in ['Model Response 1', 'Model Response 2']:
                    # Extract full text
                    full_text = row[response_col]
                    
                    # Calculate F1 scores
                    f1_scores = evaluator.calculate_comprehensive_f1_scores(
                        ground_truth['metadata'], 
                        full_text
                    )
                    
                    # Prepare result dictionary
                    result = {
                        'File': ground_truth.get('file_name', 'Unknown'),
                        'Model': row['Model'],
                        'Response': response_col,
                        'Metrics': f1_scores
                    }
                    
                    results.append(result)
            
            except Exception as e:
                print(f"Error processing row: {e}")
                continue
    
    return results

def main():
    # Paths to your files
    csv_path = 'LLMAnnotation_groqGemma.csv'
    ontology_path = 'privacy_ontology_simple.json'
    
    # Process the CSV and calculate F1 scores
    results = process_annotation_csv(csv_path, ontology_path)
    
    # Print results
    print("\nAnnotation F1 Scores:")
    for result in results:
        print(f"File: {result['File']}")
        print(f"Model: {result['Model']}")
        print(f"Response: {result['Response']}")
        for category, score in result['Metrics'].items():
            print(f"{category} F1 Score: {score:.4f}")
        print("-" * 40)
    
    # Save results to a CSV
    if results:
        output_keys = ['File', 'Model', 'Response', 'Actions F1', 'Data Types F1', 'Purposes F1', 'Overall F1']
        with open('annotation_f1_scores.csv', 'w', newline='', encoding='utf-8') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=output_keys)
            writer.writeheader()
            
            for result in results:
                # Prepare row for CSV
                csv_row = {
                    'File': result['File'],
                    'Model': result['Model'],
                    'Response': result['Response'],
                    'Actions F1': result['Metrics'].get('actions', 0),
                    'Data Types F1': result['Metrics'].get('data_types', 0),
                    'Purposes F1': result['Metrics'].get('purposes', 0),
                    'Overall F1': result['Metrics'].get('Overall', 0)
                }
                writer.writerow(csv_row)
        
        print("Results saved to annotation_f1_scores.csv")

if __name__ == "__main__":
    main()


Category Diagnostics:
Actions:
  Ground Truth Labels: ['personal analytics', 'functionality', 'account management']
  Matched Labels: {'functionality', 'account management'}
  F1 Score: 1.0000
Data_types:
  Ground Truth Labels: ['personal analytics', 'functionality', 'account management']
  Matched Labels: {'functionality', 'account management'}
  F1 Score: 0.4000
Purposes:
  Ground Truth Labels: ['personal analytics', 'functionality', 'account management']
  Matched Labels: {'functionality', 'account management'}
  F1 Score: 0.8000

Preprocessed Text:
"## Annotations:\n\n**Actions:** Collect, Use \n**Data Types:**  Account Information:\n    Account Balance,\n    User id\n**Purposes:** Account management, Functionality\n\n**Stories:** 1. We collect Account Balance, User id for account management. We use Account Balance, User id for functionality. \n\n

Category Diagnostics:
Actions:
  Ground Truth Labels: ['personal analytics', 'functionality', 'account management']
  Matched Labels: 

In [6]:
import json
import csv
import re
from typing import Dict, List, Set
from sklearn.metrics import f1_score, precision_score, recall_score
import os

class AnnotationEvaluator:
    def __init__(self, ontology_path: str):
        with open(ontology_path, 'r') as f:
            self.ontology = json.load(f)
    
    def _normalize_label(self, label: str) -> List[str]:
        return [
            l.strip().lower() 
            for l in label.split(',') 
            if l.strip()
        ]
    
    def _preprocess_text(self, text: str) -> str:
        r_tags = list(re.finditer(r'<R>(.*?)</R>', text, re.DOTALL))
        if not r_tags:
            single_r_match = re.search(r'<R>', text)
            if single_r_match:
                return text[:single_r_match.start()]
            return text
        preprocessed = text[:r_tags[0].start()]
        return preprocessed.strip()
    
    def _match_labels(self, text: str, category_labels: List[str]) -> Set[str]:
        preprocessed_text = self._preprocess_text(text)
        normalized_text = preprocessed_text.lower()
        matched_labels = set()
        for label in category_labels:
            word_pattern = r'\b' + re.escape(label) + r'\b'
            partial_pattern = re.escape(label)
            if (re.search(word_pattern, normalized_text) or 
                re.search(partial_pattern, normalized_text)):
                matched_labels.add(label)
        return matched_labels
    
    def calculate_comprehensive_scores(self, ground_truth: Dict, text: str) -> Dict[str, float]:
        scores = {}
        categories = ['actions', 'data_types', 'purposes']
        
        for category in categories:
            gt_labels = self._normalize_label(', '.join(ground_truth.get(category, [])))
            pred_labels = self._match_labels(text, gt_labels)
            y_true = [1 if label in gt_labels else 0 for label in gt_labels]
            y_pred = [1 if label in pred_labels else 0 for label in gt_labels]
            
            try:
                f1 = f1_score(y_true, y_pred, average='binary')
                precision = precision_score(y_true, y_pred, average='binary')
                recall = recall_score(y_true, y_pred, average='binary')
            except Exception as e:
                print(f"Error calculating metrics for {category}: {e}")
                f1, precision, recall = 0, 0, 0
            
            scores[category] = {
                'f1': f1,
                'precision': precision,
                'recall': recall
            }
        
        overall_f1 = sum([scores[cat]['f1'] for cat in categories]) / len(categories)
        overall_precision = sum([scores[cat]['precision'] for cat in categories]) / len(categories)
        overall_recall = sum([scores[cat]['recall'] for cat in categories]) / len(categories)
        
        scores['Overall'] = {
            'f1': overall_f1,
            'precision': overall_precision,
            'recall': overall_recall
        }
        
        return scores

def process_annotation_csv(csv_path: str, evaluator: AnnotationEvaluator):
    results = []
    combined_y_true = {'actions': [], 'data_types': [], 'purposes': []}
    combined_y_pred = {'actions': [], 'data_types': [], 'purposes': []}
    
    with open(csv_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            try:
                ground_truth = json.loads(row['Target Annotations'])
                for response_col in ['Model Response 1', 'Model Response 2']:
                    full_text = row[response_col]
                    scores = evaluator.calculate_comprehensive_scores(
                        ground_truth['metadata'], 
                        full_text
                    )
                    result = {
                        'File': ground_truth.get('file_name', 'Unknown'),
                        'Model': row['Model'],
                        'Response': response_col,
                        'Metrics': scores
                    }
                    results.append(result)
                    
                    # Aggregate data for overall combined metrics
                    for category in ['actions', 'data_types', 'purposes']:
                        gt_labels = evaluator._normalize_label(', '.join(ground_truth['metadata'].get(category, [])))
                        pred_labels = evaluator._match_labels(full_text, gt_labels)
                        combined_y_true[category].extend([1 if label in gt_labels else 0 for label in gt_labels])
                        combined_y_pred[category].extend([1 if label in pred_labels else 0 for label in gt_labels])
            except Exception as e:
                print(f"Error processing row: {e}")
                continue
    
    # Calculate combined overall metrics
    combined_scores = {}
    for category in ['actions', 'data_types', 'purposes']:
        try:
            combined_scores[category] = {
                'f1': f1_score(combined_y_true[category], combined_y_pred[category], average='binary'),
                'precision': precision_score(combined_y_true[category], combined_y_pred[category], average='binary'),
                'recall': recall_score(combined_y_true[category], combined_y_pred[category], average='binary'),
            }
        except Exception as e:
            print(f"Error calculating combined metrics for {category}: {e}")
            combined_scores[category] = {'f1': 0, 'precision': 0, 'recall': 0}
    
    overall_f1 = sum([combined_scores[cat]['f1'] for cat in combined_scores]) / len(combined_scores)
    overall_precision = sum([combined_scores[cat]['precision'] for cat in combined_scores]) / len(combined_scores)
    overall_recall = sum([combined_scores[cat]['recall'] for cat in combined_scores]) / len(combined_scores)
    
    combined_scores['Overall'] = {
        'f1': overall_f1,
        'precision': overall_precision,
        'recall': overall_recall
    }
    
    return results, combined_scores

def main():
    # Input multiple CSV paths
    csv_paths = ['LLMAnnotation_groqGemma.csv','LLMAnnotation_groqLlama.csv','LLMAnnotation_gpt4o-latest.csv']
    ontology_path = 'privacy_ontology_simple.json'
    
    evaluator = AnnotationEvaluator(ontology_path)
    all_results = {}
    
    for csv_path in csv_paths:
        csv_path = csv_path.strip()
        if not os.path.exists(csv_path):
            print(f"File not found: {csv_path}")
            continue
        print(f"Processing file: {csv_path}")
        results, combined_scores = process_annotation_csv(csv_path, evaluator)
        all_results[os.path.basename(csv_path)] = combined_scores
    
    # Display results side by side
    print("\nComparison of Combined Metrics Across Files:")
    categories = ['actions', 'data_types', 'purposes', 'Overall']
    header = ["File"] + [f"{category} {metric}" for category in categories for metric in ["F1", "Precision", "Recall"]]
    print("\t".join(header))
    for file_name, scores in all_results.items():
        row = [file_name]
        for category in categories:
            metrics = scores.get(category, {})
            row.extend([f"{metrics.get(metric, 0):.4f}" for metric in ["f1", "precision", "recall"]])
        print("\t".join(row))

if __name__ == "__main__":
    main()


Processing file: LLMAnnotation_groqGemma.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Processing file: LLMAnnotation_groqLlama.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Processing file: LLMAnnotation_gpt4o-latest.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Comparison of Combined Metrics Across Files:
File	actions F1	actions Precision	actions Recall	data_types F1	data_types Precision	data_types Recall	purposes F1	purposes Precision	purposes Recall	Overall F1	Overall Precision	Overall Recall
LLMAnnotation_groqGemma.csv	0.7818	1.0000	0.6418	0.6667	1.0000	0.5000	0.7143	1.0000	0.5556	0.7209	1.0000	0.5658
LLMAnnotation_groqLlama.csv	0.8246	1.0000	0.7015	0.7680	1.0000	0.6234	0.7899	1.0000	0.6528	0.7942	1.0000	0.6592
LLMAnnotation_gpt4o-latest.csv	0.8348	1.0000	0.7164	0.6550	1.0000	0.4870	0.7368	1.0000	0.5833	0.7422	1.0000	0.5956


### Prepare huggingface data

In [26]:
import os
import csv
import json
from collections import defaultdict
from sklearn.metrics import f1_score, precision_score, recall_score

def categorize_file(file_name: str) -> str:
    """Categorize the file based on its name with more robust matching."""
    file_name_lower = file_name.lower()

    print(f"Checking file: {file_name}")  # Debug log

    architecture_keywords = ["inotification", "database", "data-struct", "architecture"]
    code_spec_keywords = ["password", "account", "analytics", "configuration", "threepids"]
    readme_keywords = ["readme"]

    if any(keyword in file_name_lower for keyword in architecture_keywords):
        return "Architecture and Database Design Documents"
    elif any(keyword in file_name_lower for keyword in code_spec_keywords):
        return "Code Specification Documents"
    elif any(keyword in file_name_lower for keyword in readme_keywords):
        return "README Files"
    else:
        return "User and Developer Guides"

def process_annotation_csv_with_file_types(csv_path: str, evaluator, start_row: int = None, end_row: int = None):
    """
    Process the annotation CSV and add file type categorization.
    
    Args:
    csv_path (str): Path to the CSV file
    evaluator (AnnotationEvaluator): Annotation evaluator object
    start_row (int, optional): Starting row to process (1-indexed)
    end_row (int, optional): Ending row to process (1-indexed)
    
    Returns:
    tuple: Results, combined scores, and file type scores
    """
    results = []
    file_type_metrics = defaultdict(lambda: {
        'actions': {'y_true': [], 'y_pred': []},
        'data_types': {'y_true': [], 'y_pred': []},
        'purposes': {'y_true': [], 'y_pred': []}
    })
    combined_y_true = {'actions': [], 'data_types': [], 'purposes': []}
    combined_y_pred = {'actions': [], 'data_types': [], 'purposes': []}
    
    with open(csv_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        
        # Convert to list to enable row index tracking
        rows = list(reader)
        
        # Adjust row selection if not specified
        if start_row is None:
            start_row = 1
        if end_row is None:
            end_row = len(rows)
        
        # Validate row selection
        start_row = max(1, start_row)
        end_row = min(end_row, len(rows))
        
        # Adjust for 0-indexing
        start_index = start_row - 1
        end_index = end_row
        
        for row_index in range(start_index, end_index):
            row = rows[row_index]
            try:
                ground_truth = json.loads(row['Target Annotations'])
                file_name = ground_truth.get('file_name', 'Unknown')
                file_type = categorize_file(file_name)
                
                for response_col in ['Model Response 1', 'Model Response 2']:
                    full_text = row[response_col]
                    scores = evaluator.calculate_comprehensive_scores(
                        ground_truth['metadata'], 
                        full_text
                    )
                    result = {
                        'File': file_name,
                        'File Type': file_type,
                        'Model': row['Model'],
                        'Response': response_col,
                        'Metrics': scores
                    }
                    results.append(result)
                    
                    # Aggregate data for file type and overall metrics
                    for category in ['actions', 'data_types', 'purposes']:
                        gt_labels = evaluator._normalize_label(', '.join(ground_truth['metadata'].get(category, [])))
                        pred_labels = evaluator._match_labels(full_text, gt_labels)
                        
                        # Prepare binary labels
                        y_true = [1 if label in gt_labels else 0 for label in gt_labels]
                        y_pred = [1 if label in pred_labels else 0 for label in gt_labels]
                        
                        # Aggregate for file type
                        file_type_metrics[file_type][category]['y_true'].extend(y_true)
                        file_type_metrics[file_type][category]['y_pred'].extend(y_pred)
                        
                        # Aggregate for overall metrics
                        combined_y_true[category].extend(y_true)
                        combined_y_pred[category].extend(y_pred)
            except Exception as e:
                print(f"Error processing row {row_index + 1}: {e}")
                continue
    
    # Calculate metrics
    combined_scores = _calculate_metrics(combined_y_true, combined_y_pred)
    
    # Calculate file type metrics
    file_type_scores = {}
    for file_type, type_data in file_type_metrics.items():
        file_type_scores[file_type] = _calculate_metrics(
            {cat: type_data[cat]['y_true'] for cat in type_data},
            {cat: type_data[cat]['y_pred'] for cat in type_data}
        )
    
    return results, combined_scores, file_type_scores

def _calculate_metrics(y_true, y_pred):
    """Calculate metrics for different categories."""
    combined_scores = {}
    for category, true_labels in y_true.items():
        try:
            combined_scores[category] = {
                'f1': f1_score(true_labels, y_pred[category], average='binary'),
                'precision': precision_score(true_labels, y_pred[category], average='binary'),
                'recall': recall_score(true_labels, y_pred[category], average='binary'),
            }
        except Exception as e:
            print(f"Error calculating combined metrics for {category}: {e}")
            combined_scores[category] = {'f1': 0, 'precision': 0, 'recall': 0}
    
    # Calculate overall scores
    overall_f1 = sum([combined_scores[cat]['f1'] for cat in combined_scores]) / len(combined_scores)
    overall_precision = sum([combined_scores[cat]['precision'] for cat in combined_scores]) / len(combined_scores)
    overall_recall = sum([combined_scores[cat]['recall'] for cat in combined_scores]) / len(combined_scores)
    
    combined_scores['Overall'] = {
        'f1': overall_f1,
        'precision': overall_precision,
        'recall': overall_recall
    }
    
    return combined_scores

def main():
    csv_paths = ['LLMAnnotation_groqGemma.csv', 'LLMAnnotation_groqLlama.csv', 'LLMAnnotation_gpt4o-latest.csv','LLMAnnotation_grokLLama3_2.csv','newllama_responses.csv']
    ontology_path = 'privacy_ontology_simple.json'
    
    evaluator = AnnotationEvaluator(ontology_path)
    all_results = {}
    
    for csv_path in csv_paths:
        csv_path = csv_path.strip()
        if not os.path.exists(csv_path):
            print(f"File not found: {csv_path}")
            continue
        print(f"Processing file: {csv_path}")
        
        # Example of using row selection (rows 20-25)
        # You can modify these values or pass them as arguments
        start_row = 21  # Optional: specify start row (1-indexed)
        end_row = 26    # Optional: specify end row (1-indexed)
        
        _, combined_scores, file_type_scores = process_annotation_csv_with_file_types(
            csv_path, 
            evaluator, 
            start_row=start_row,  # Optional
            end_row=end_row       # Optional
        )
        
        all_results[os.path.basename(csv_path)] = {
            'overall': combined_scores,
            'file_types': file_type_scores
        }
    
    # Display results side by side
    print("\nComparison of Metrics:")
    
    # Print overall metrics
    print("\nOverall Metrics:")
    categories = ['actions', 'data_types', 'purposes', 'Overall']
    header = ["File"] + [f"{category} {metric}" for category in categories for metric in ["F1", "Precision", "Recall"]]
    print("\t".join(header))
    
    for file_name, results in all_results.items():
        row = [file_name]
        for category in categories:
            metrics = results['overall'].get(category, {})
            row.extend([f"{metrics.get(metric, 0):.4f}" for metric in ["f1", "precision", "recall"]])
        print("\t".join(row))
    
    # Print file type metrics
    print("\nFile Type Metrics:")
    file_type_header = ["File", "File Type"] + [f"{category} {metric}" for category in categories for metric in ["F1", "Precision", "Recall"]]
    print("\t".join(file_type_header))
    
    for file_name, results in all_results.items():
        for file_type, type_scores in results['file_types'].items():
            row = [file_name, file_type]
            for category in categories:
                metrics = type_scores.get(category, {})
                row.extend([f"{metrics.get(metric, 0):.4f}" for metric in ["f1", "precision", "recall"]])
            print("\t".join(row))

if __name__ == "__main__":
    main()

Processing file: LLMAnnotation_groqGemma.csv
Checking file: DatabaseSchema.txt
Checking file: INotificationDistributor.txt
Checking file: INotificationSubscriber.txt
Checking file: file.txt
Processing file: LLMAnnotation_groqLlama.csv
Checking file: DatabaseSchema.txt
Checking file: INotificationDistributor.txt
Checking file: INotificationSubscriber.txt
Checking file: file.txt
Processing file: LLMAnnotation_gpt4o-latest.csv
Checking file: DatabaseSchema.txt
Checking file: INotificationDistributor.txt
Checking file: INotificationSubscriber.txt
Checking file: file.txt
Processing file: LLMAnnotation_grokLLama3_2.csv
Checking file: DatabaseSchema.txt
Checking file: INotificationDistributor.txt
Checking file: INotificationSubscriber.txt


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Checking file: file.txt
Processing file: newllama_responses.csv
Checking file: DatabaseSchema.txt
Error processing row 21: 'Model Response 2'
Checking file: INotificationDistributor.txt
Error processing row 22: 'Model Response 2'
Checking file: INotificationSubscriber.txt
Error processing row 23: 'Model Response 2'
Checking file: file.txt
Error processing row 24: 'Model Response 2'

Comparison of Metrics:

Overall Metrics:
File	actions F1	actions Precision	actions Recall	data_types F1	data_types Precision	data_types Recall	purposes F1	purposes Precision	purposes Recall	Overall F1	Overall Precision	Overall Recall
LLMAnnotation_groqGemma.csv	1.0000	1.0000	1.0000	0.7907	1.0000	0.6538	0.8148	1.0000	0.6875	0.8685	1.0000	0.7804
LLMAnnotation_groqLlama.csv	0.9231	1.0000	0.8571	0.8182	1.0000	0.6923	0.7692	1.0000	0.6250	0.8368	1.0000	0.7248
LLMAnnotation_gpt4o-latest.csv	1.0000	1.0000	1.0000	0.5556	1.0000	0.3846	0.6939	1.0000	0.5312	0.7498	1.0000	0.6386
LLMAnnotation_grokLLama3_2.csv	1.0000	1.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
import pandas as pd

def process_csv(input_csv_path, output_csv_path):
    """
    Processes the input CSV and outputs a new CSV formatted for Hugging Face datasets.
    
    Args:
        input_csv_path (str): Path to the input CSV file.
        output_csv_path (str): Path to save the formatted output CSV.
    """
    # Read the input CSV
    data = pd.read_csv(input_csv_path)

    # Initialize lists for the new dataset
    prompts = []
    chosen = []
    rejected = []

    # Process the data
    for _, row in data.iterrows():
        prompts.append(row['Prompt'])

        if row['Preferred_Response'] == 1:
            chosen.append(row['Model Response 1'])
            rejected.append(row['Model Response 2'])
        elif row['Preferred_Response'] == 2:
            chosen.append(row['Model Response 2'])
            rejected.append(row['Model Response 1'])

    # Create a new DataFrame with the desired format
    formatted_data = pd.DataFrame({
        'prompt': prompts,
        'chosen': chosen,
        'rejected': rejected
    })

    # Save the formatted DataFrame to a new CSV
    formatted_data.to_csv(output_csv_path, index=False)
    print(f"Formatted dataset saved to: {output_csv_path}")

# Example usage
input_csv_path = "dpo_data_gpt4o_llama.csv"
output_csv_path = "privacy_analysis_dpo.csv"
process_csv(input_csv_path, output_csv_path)


Formatted dataset saved to: privacy_analysis_dpo.csv


### Create response prefference game 

In [23]:
import pandas as pd

# Function to process and create pairs from CSV files
def create_model_comparison(csv_files):
    # Read all CSVs into pandas DataFrames
    dataframes = [pd.read_csv(file) for file in csv_files]
    
    # Ensure that all DataFrames have the necessary columns: 'Model', 'Preferred_Response', 'Prompt', and 'Target Annotations'
    required_columns = {'Model', 'Preferred_Response', 'Model Response 1', 'Model Response 2', 'Prompt', 'Target Annotations'}
    
    for df in dataframes:
        if not required_columns.issubset(df.columns):
            raise ValueError(f"Each CSV must contain the following columns: {', '.join(required_columns)}")
    
    # Create a new DataFrame to hold the model pairs and their preferences
    model_pairs = []

    # Iterate through all possible pairs of DataFrames (4 files = 6 unique pairs)
    for i in range(len(dataframes)):
        for j in range(i+1, len(dataframes)):
            # Get the models and Preferred_Responses from the i-th and j-th CSV
            df1 = dataframes[i]
            df2 = dataframes[j]
            
            # Make sure the lengths of the DataFrames are equal, otherwise adjust
            min_len = min(len(df1), len(df2))
            df1 = df1.head(min_len)
            df2 = df2.head(min_len)
            
            # Compare model names and create a new row with paired models, preferences, Prompt, and Target Annotations
            for k in range(min_len):
                model1 = df1.iloc[k]['Model']
                model2 = df2.iloc[k]['Model']
                pref_choice_1 = df1.iloc[k]['Preferred_Response']
                pref_choice_2 = df2.iloc[k]['Preferred_Response']
                
                # Determine the preferred response based on 'Preferred_Response'
                response_1 = df1.iloc[k]['Model Response 1'] if pref_choice_1 == 1 else df1.iloc[k]['Model Response 2']
                response_2 = df2.iloc[k]['Model Response 1'] if pref_choice_2 == 1 else df2.iloc[k]['Model Response 2']
                
                # Retain the Prompt and Target Annotations
                prompt = df1.iloc[k]['Prompt']
                target_annotation = df1.iloc[k]['Target Annotations']
                
                # Create a comparison row: model names separated by '?' and their responses side by side, also include Prompt and Target Annotations
                comparison_row = {
                    'Model Pair': f"{model1}?{model2}",
                    'Preferred_Response 1': response_1,
                    'Preferred_Response 2': response_2,
                    'Prompt': prompt,
                    'Target Annotations': target_annotation
                }
                model_pairs.append(comparison_row)
    
    # Convert the list of model pairs into a DataFrame
    comparison_df = pd.DataFrame(model_pairs)
    
    # Save the new CSV with the comparisons, preserving the structure
    comparison_df.to_csv("model_comparison.csv", index=False)

# Example usage:
csv_files = [
    'LLMAnnotation_GroqLlama_shashank.csv',
    'LLMAnnotation_GroqGemma_Shashank.csv',
    'LLMAnnotation-gpt40-shashank.csv',
    'LLM_AnnotationgrokLLAMA3_2-Shreyash.csv'
]
create_model_comparison(csv_files)


### Calculate ELO 

In [33]:
import pandas as pd

def calculate_elo(output_csv, initial_rating=1000, k=32):
    """
    Calculate Elo ratings based on pairwise model comparisons.
    
    Parameters:
        output_csv (str): Path to the CSV file containing model pair evaluations.
        initial_rating (int): Initial Elo rating for all models.
        k (int): K-factor for Elo calculation, determines rating sensitivity.
    
    Returns:
        pd.DataFrame: DataFrame containing models and their Elo ratings.
    """
    # Read the annotated output CSV
    df = pd.read_csv(output_csv)
    
    # Initialize Elo ratings dictionary
    elo_ratings = {}
    
    # Define a function to calculate the probability of winning
    def win_probability(rating_a, rating_b):
        return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    
    # Define a function to update Elo ratings
    def update_ratings(rating_a, rating_b, result_a):
        """
        Updates Elo ratings based on the match result.
        
        Parameters:
            rating_a (float): Current rating of model A.
            rating_b (float): Current rating of model B.
            result_a (float): Result of model A (1 for win, 0 for loss).
        
        Returns:
            (float, float): Updated ratings for model A and model B.
        """
        prob_a = win_probability(rating_a, rating_b)
        prob_b = 1 - prob_a
        new_rating_a = rating_a + k * (result_a - prob_a)
        new_rating_b = rating_b + k * ((1 - result_a) - prob_b)
        return new_rating_a, new_rating_b
    
    # Process each row in the DataFrame
    for _, row in df.iterrows():
        # Extract models and their annotated preference
        model_pair = row[2]
        preferred_response = row['Preferred_Response']
        model1, model2 = model_pair.split('?')
        
        # Initialize Elo ratings for models if not already present
        if model1 not in elo_ratings:
            elo_ratings[model1] = initial_rating
        if model2 not in elo_ratings:
            elo_ratings[model2] = initial_rating
        
        # Determine match result
        result_model1 = 1 if preferred_response == 1 else 0
        result_model2 = 1 - result_model1
        
        # Update Elo ratings
        elo_ratings[model1], elo_ratings[model2] = update_ratings(
            elo_ratings[model1],
            elo_ratings[model2],
            result_model1
        )
    
    # Convert Elo ratings dictionary to a DataFrame
    elo_df = pd.DataFrame(list(elo_ratings.items()), columns=['Model', 'Elo Rating'])
    
    # Sort models by Elo rating in descending order
    elo_df = elo_df.sort_values(by='Elo Rating', ascending=False).reset_index(drop=True)
    
    # Save Elo ratings to a new CSV
    elo_df.to_csv("elo_ratings.csv", index=False)
    print("Elo ratings have been saved to 'elo_ratings.csv'")
    return elo_df

# Example usage:
output_csv = "LLMannotated_model_comparision-Shreyash.csv"  # The annotated output file
elo_df = calculate_elo(output_csv)
print(elo_df)


Elo ratings have been saved to 'elo_ratings.csv'
                          Model   Elo Rating
0  groq:llama-3.3-70b-versatile  1144.200270
1      openai:gpt-4o-2024-11-20  1010.766813
2             groq:gemma2-9b-it   937.671544
3     groq:llama-3.2-3b-preview   907.361373


  model_pair = row[2]
