### Get model f1 scores

In [4]:
import json
import csv
import re
from typing import Dict, List, Set
from sklearn.metrics import f1_score

class AnnotationEvaluator:
    def __init__(self, ontology_path: str):
        """
        Initialize the evaluator with the privacy ontology.
        """
        with open(ontology_path, 'r') as f:
            self.ontology = json.load(f)
    
    def _normalize_label(self, label: str) -> List[str]:
        """
        Normalize label by removing extra whitespace, converting to lowercase,
        and splitting comma-separated values.
        """
        # Split by comma, strip whitespace, convert to lowercase
        return [
            l.strip().lower() 
            for l in label.split(',') 
            if l.strip()
        ]
    
    def _preprocess_text(self, text: str) -> str:
        """
        Preprocess text by removing content within or after <R> tags.
        
        Args:
            text (str): Input text to preprocess
        
        Returns:
            str: Preprocessed text
        """
        # Find all <R> tags
        r_tags = list(re.finditer(r'<R>(.*?)</R>', text, re.DOTALL))
        
        if not r_tags:
            # If no </R> tags, remove everything after a single <R> tag
            single_r_match = re.search(r'<R>', text)
            if single_r_match:
                return text[:single_r_match.start()]
            return text
        
        # Take the content before the first <R> tag
        preprocessed = text[:r_tags[0].start()]
        
        # Return preprocessed text
        return preprocessed.strip()
    
    def _match_labels(self, text: str, category_labels: List[str]) -> Set[str]:
        """
        Find matching labels in the text.
        
        Args:
            text (str): Text to search
            category_labels (List[str]): Labels to match
        
        Returns:
            Set[str]: Matched labels
        """
        # Preprocess text to remove R-tag content
        preprocessed_text = self._preprocess_text(text)
        
        # Normalize text
        normalized_text = preprocessed_text.lower()
        
        # Match labels
        matched_labels = set()
        
        for label in category_labels:
            # Various matching strategies
            # 1. Whole word match
            word_pattern = r'\b' + re.escape(label) + r'\b'
            # 2. Partial match
            partial_pattern = re.escape(label)
            
            if (re.search(word_pattern, normalized_text) or 
                re.search(partial_pattern, normalized_text)):
                matched_labels.add(label)
        
        return matched_labels
    
    def calculate_comprehensive_f1_scores(self, ground_truth: Dict, text: str) -> Dict[str, float]:
        """
        Calculate F1 scores for all categories.
        
        Args:
            ground_truth (Dict): Ground truth annotations
            text (str): Text to evaluate
        
        Returns:
            Dict[str, float]: F1 scores
        """
        f1_scores = {}
        categories = ['actions', 'data_types', 'purposes']
        
        for category in categories:
            # Extract labels from ground truth
            gt_labels = self._normalize_label(', '.join(ground_truth.get(category, [])))
            
            # Find matching labels in text
            pred_labels = self._match_labels(text, gt_labels)
            
            # Calculate F1 score
            y_true = [1 if label in gt_labels else 0 for label in gt_labels]
            y_pred = [1 if label in pred_labels else 0 for label in gt_labels]
            
            try:
                # Macro average F1 score
                f1 = f1_score(y_true, y_pred, average='binary')
            except Exception as e:
                print(f"F1 Score calculation error for {category}: {e}")
                f1 = 0
            
            f1_scores[category] = f1
        
        # Calculate overall F1 score
        f1_scores['Overall'] = sum(f1_scores.values()) / len(f1_scores)
        
        # Diagnostic print
        print(f"\nCategory Diagnostics:")
        for category in categories:
            print(f"{category.capitalize()}:")
            print(f"  Ground Truth Labels: {gt_labels}")
            print(f"  Matched Labels: {pred_labels}")
            print(f"  F1 Score: {f1_scores[category]:.4f}")
        
        # Print preprocessed text for verification
        print("\nPreprocessed Text:")
        print(self._preprocess_text(text))
        
        return f1_scores

def process_annotation_csv(csv_path: str, ontology_path: str) -> List[Dict]:
    """
    Process the annotation CSV and calculate F1 scores.
    """
    # Initialize the evaluator
    evaluator = AnnotationEvaluator(ontology_path)
    
    # Results storage
    results = []
    
    # Read the CSV file
    with open(csv_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        
        for row in reader:
            try:
                # Parse target annotations
                ground_truth = json.loads(row['Target Annotations'])
                
                # Process both model responses
                for response_col in ['Model Response 1', 'Model Response 2']:
                    # Extract full text
                    full_text = row[response_col]
                    
                    # Calculate F1 scores
                    f1_scores = evaluator.calculate_comprehensive_f1_scores(
                        ground_truth['metadata'], 
                        full_text
                    )
                    
                    # Prepare result dictionary
                    result = {
                        'File': ground_truth.get('file_name', 'Unknown'),
                        'Model': row['Model'],
                        'Response': response_col,
                        'Metrics': f1_scores
                    }
                    
                    results.append(result)
            
            except Exception as e:
                print(f"Error processing row: {e}")
                continue
    
    return results

def main():
    # Paths to your files
    csv_path = 'LLMAnnotation_groqGemma.csv'
    ontology_path = 'privacy_ontology_simple.json'
    
    # Process the CSV and calculate F1 scores
    results = process_annotation_csv(csv_path, ontology_path)
    
    # Print results
    print("\nAnnotation F1 Scores:")
    for result in results:
        print(f"File: {result['File']}")
        print(f"Model: {result['Model']}")
        print(f"Response: {result['Response']}")
        for category, score in result['Metrics'].items():
            print(f"{category} F1 Score: {score:.4f}")
        print("-" * 40)
    
    # Save results to a CSV
    if results:
        output_keys = ['File', 'Model', 'Response', 'Actions F1', 'Data Types F1', 'Purposes F1', 'Overall F1']
        with open('annotation_f1_scores.csv', 'w', newline='', encoding='utf-8') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=output_keys)
            writer.writeheader()
            
            for result in results:
                # Prepare row for CSV
                csv_row = {
                    'File': result['File'],
                    'Model': result['Model'],
                    'Response': result['Response'],
                    'Actions F1': result['Metrics'].get('actions', 0),
                    'Data Types F1': result['Metrics'].get('data_types', 0),
                    'Purposes F1': result['Metrics'].get('purposes', 0),
                    'Overall F1': result['Metrics'].get('Overall', 0)
                }
                writer.writerow(csv_row)
        
        print("Results saved to annotation_f1_scores.csv")

if __name__ == "__main__":
    main()


Category Diagnostics:
Actions:
  Ground Truth Labels: ['personal analytics', 'functionality', 'account management']
  Matched Labels: {'functionality', 'account management'}
  F1 Score: 1.0000
Data_types:
  Ground Truth Labels: ['personal analytics', 'functionality', 'account management']
  Matched Labels: {'functionality', 'account management'}
  F1 Score: 0.4000
Purposes:
  Ground Truth Labels: ['personal analytics', 'functionality', 'account management']
  Matched Labels: {'functionality', 'account management'}
  F1 Score: 0.8000

Preprocessed Text:
"## Annotations:\n\n**Actions:** Collect, Use \n**Data Types:**  Account Information:\n    Account Balance,\n    User id\n**Purposes:** Account management, Functionality\n\n**Stories:** 1. We collect Account Balance, User id for account management. We use Account Balance, User id for functionality. \n\n

Category Diagnostics:
Actions:
  Ground Truth Labels: ['personal analytics', 'functionality', 'account management']
  Matched Labels: 

In [1]:
import json
import csv
import re
from typing import Dict, List, Set, Tuple
from sklearn.metrics import f1_score, precision_score, recall_score
import os

class AnnotationEvaluator:
    def __init__(self, ontology_path: str):
        with open(ontology_path, 'r') as f:
            self.ontology = json.load(f)
    
    def _normalize_label(self, label: str) -> List[str]:
        return [
            l.strip().lower() 
            for l in label.split(',') 
            if l.strip()
        ]
    
    def _preprocess_text(self, text: str) -> str:
        r_tags = list(re.finditer(r'<R>(.*?)</R>', text, re.DOTALL))
        if not r_tags:
            single_r_match = re.search(r'<R>', text)
            if single_r_match:
                return text[:single_r_match.start()]
            return text
        preprocessed = text[:r_tags[0].start()]
        return preprocessed.strip()
    
    def _match_labels(self, text: str, category_labels: List[str]) -> Set[str]:
        preprocessed_text = self._preprocess_text(text)
        normalized_text = preprocessed_text.lower()
        matched_labels = set()
        for label in category_labels:
            word_pattern = r'\b' + re.escape(label) + r'\b'
            partial_pattern = re.escape(label)
            if (re.search(word_pattern, normalized_text) or 
                re.search(partial_pattern, normalized_text)):
                matched_labels.add(label)
        return matched_labels
    
    def calculate_comprehensive_scores(self, ground_truth: Dict, text: str) -> Dict[str, float]:
        scores = {}
        categories = ['actions', 'data_types', 'purposes']
        
        for category in categories:
            gt_labels = self._normalize_label(', '.join(ground_truth.get(category, [])))
            pred_labels = self._match_labels(text, gt_labels)
            y_true = [1 if label in gt_labels else 0 for label in gt_labels]
            y_pred = [1 if label in pred_labels else 0 for label in gt_labels]
            
            try:
                f1 = f1_score(y_true, y_pred, average='binary')
                precision = precision_score(y_true, y_pred, average='binary')
                recall = recall_score(y_true, y_pred, average='binary')
            except Exception as e:
                print(f"Error calculating metrics for {category}: {e}")
                f1, precision, recall = 0, 0, 0
            
            scores[category] = {
                'f1': f1,
                'precision': precision,
                'recall': recall
            }
        
        overall_f1 = sum([scores[cat]['f1'] for cat in categories]) / len(categories)
        overall_precision = sum([scores[cat]['precision'] for cat in categories]) / len(categories)
        overall_recall = sum([scores[cat]['recall'] for cat in categories]) / len(categories)
        
        scores['Overall'] = {
            'f1': overall_f1,
            'precision': overall_precision,
            'recall': overall_recall
        }
        
        return scores

def categorize_file(file_name: str) -> str:
    """Categorize the file based on its name."""
    if "inotification" in file_name.lower() or "database" in file_name.lower() or "data-struct" in file_name.lower() or "architecture" in file_name.lower():
        return "Architecture and Database Design Documents"
    elif "password" in file_name.lower() or "account" in file_name.lower() or "analyrics" in file_name.lower() or "configuration" in file_name.lower() or "threepids" in file_name.lower():
        return "Code Specification Documents"
    elif "readme" in file_name.lower():
        return "README Files"
    else:
        return "User and Developer Guides"

def process_annotation_csv(csv_path: str, evaluator: AnnotationEvaluator) -> Tuple[List[Dict], Dict[str, Dict]]:
    results = []
    grouped_metrics = {}
    combined_y_true = {'actions': [], 'data_types': [], 'purposes': []}
    combined_y_pred = {'actions': [], 'data_types': [], 'purposes': []}
    
    with open(csv_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            try:
                ground_truth = json.loads(row['Target Annotations'])
                file_name = ground_truth.get('file_name', 'Unknown')
                file_type = categorize_file(file_name)
                
                for response_col in ['Model Response 1', 'Model Response 2']:
                    full_text = row[response_col]
                    scores = evaluator.calculate_comprehensive_scores(
                        ground_truth['metadata'], 
                        full_text
                    )
                    result = {
                        'File': file_name,
                        'File Type': file_type,
                        'Model': row['Model'],
                        'Response': response_col,
                        'Metrics': scores
                    }
                    results.append(result)
                    
                    # Aggregate data for file type
                    if file_type not in grouped_metrics:
                        grouped_metrics[file_type] = {'actions': [], 'data_types': [], 'purposes': []}
                    
                    for category in ['actions', 'data_types', 'purposes']:
                        gt_labels = evaluator._normalize_label(', '.join(ground_truth['metadata'].get(category, [])))
                        pred_labels = evaluator._match_labels(full_text, gt_labels)
                        grouped_metrics[file_type][category].extend([1 if label in gt_labels else 0 for label in gt_labels])
                        grouped_metrics[file_type][category].extend([1 if label in pred_labels else 0 for label in gt_labels])
            except Exception as e:
                print(f"Error processing row: {e}")
                continue
    
    # Calculate combined overall metrics
    grouped_scores = {}
    for file_type, data in grouped_metrics.items():
        grouped_scores[file_type] = {}
        for category in ['actions', 'data_types', 'purposes']:
            try:
                grouped_scores[file_type][category] = {
                    'f1': f1_score(data[category], data[category], average='binary'),
                    'precision': precision_score(data[category], data[category], average='binary'),
                    'recall': recall_score(data[category], data[category], average='binary'),
                }
            except Exception as e:
                print(f"Error calculating combined metrics for {category} in {file_type}: {e}")
                grouped_scores[file_type][category] = {'f1': 0, 'precision': 0, 'recall': 0}
    
    return results, grouped_scores

def main():
    csv_paths = ['LLMAnnotation_groqGemma.csv', 'LLMAnnotation_groqLlama.csv', 'LLMAnnotation_gpt4o-latest.csv']
    ontology_path = 'privacy_ontology_simple.json'
    
    evaluator = AnnotationEvaluator(ontology_path)
    all_results = {}
    all_grouped_scores = {}
    
    for csv_path in csv_paths:
        csv_path = csv_path.strip()
        if not os.path.exists(csv_path):
            print(f"File not found: {csv_path}")
            continue
        print(f"Processing file: {csv_path}")
        results, grouped_scores = process_annotation_csv(csv_path, evaluator)
        all_results[os.path.basename(csv_path)] = results
        for file_type, scores in grouped_scores.items():
            if file_type not in all_grouped_scores:
                all_grouped_scores[file_type] = scores
            else:
                # Merge scores for the same file type
                for category, metrics in scores.items():
                    for metric, value in metrics.items():
                        all_grouped_scores[file_type][category][metric] += value
    
    print("\nGrouped Results Across File Types:")
    for file_type, scores in all_grouped_scores.items():
        print(f"File Type: {file_type}")
        for category, metrics in scores.items():
            print(f"  {category}: {metrics}")

if __name__ == "__main__":
    main()


Processing file: LLMAnnotation_groqGemma.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Processing file: LLMAnnotation_groqLlama.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Processing file: LLMAnnotation_gpt4o-latest.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Processing file: LLMAnnotation_hfLlama.csv
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'
Error processing row: 'metadata'

Comparison of Combined Metrics Across Files:
File	actions F1	actions Precision	actions Recall	data_types F1	data_types Precision	da

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

### Prepare huggingface data

In [5]:
import pandas as pd

def process_csv(input_csv_path, output_csv_path):
    """
    Processes the input CSV and outputs a new CSV formatted for Hugging Face datasets.
    
    Args:
        input_csv_path (str): Path to the input CSV file.
        output_csv_path (str): Path to save the formatted output CSV.
    """
    # Read the input CSV
    data = pd.read_csv(input_csv_path)

    # Initialize lists for the new dataset
    prompts = []
    chosen = []
    rejected = []

    # Process the data
    for _, row in data.iterrows():
        prompts.append(row['Prompt'])

        if row['Preferred_Response'] == 1:
            chosen.append(row['Model Response 1'])
            rejected.append(row['Model Response 2'])
        elif row['Preferred_Response'] == 2:
            chosen.append(row['Model Response 2'])
            rejected.append(row['Model Response 1'])

    # Create a new DataFrame with the desired format
    formatted_data = pd.DataFrame({
        'prompt': prompts,
        'chosen': chosen,
        'rejected': rejected
    })

    # Save the formatted DataFrame to a new CSV
    formatted_data.to_csv(output_csv_path, index=False)
    print(f"Formatted dataset saved to: {output_csv_path}")

# Example usage
input_csv_path = "LLMAnnotation-gpt40-shashank.csv"
output_csv_path = "privacy_analysis_dpo.csv"
process_csv(input_csv_path, output_csv_path)


Formatted dataset saved to: privacy_analysis_dpo.csv
