In [2]:
import re
import json
import text_processing
import secrets 

from typing import List, Dict
import openai  # Assuming OpenAI API, but adaptable to other models

# Load text file and taxonomy JSON

taxonomy_file_path = 'privacy_ontology_simple.json'  # Replace with actual path
example_file_path = 'annotations/Actual_Budget/Accounts_&_Transactions.txt'
target_file_path = 'input/lh-ehr/Direct_Messaging_README.txt'

# Load previously processed example files
example_data = text_processing.process_input(example_file_path)

### Building prompts


In [3]:

import json
from typing import Dict, List, Union
import tiktoken  # Library for token counting with OpenAI models
import prompt_templates

# Load the privacy ontology
ontology_path = "privacy_ontology_simple.json"
privacy_ontology = prompt_templates.load_privacy_ontology(ontology_path)

# Load one processed example file for demonstration
example_file = text_processing.process_input(example_file_path)[0]


# Define target text to be annotated (for demonstration) & Load annotated version 
target_file_annotations = text_processing.process_input(target_file_path)
new_text_to_annotate = open(target_file_path).read()

# Generate the prompt
prompt_example = prompt_templates.create_annotation_prompt(example_file, new_text_to_annotate, privacy_ontology)

# Print the prompt to review it
print(prompt_example)
print(f"\nToken Count: {prompt_templates.count_tokens(prompt_example)} tokens")


You are a privacy expert annotator tasked with annotating text files with metadata about privacy behaviors and stories. For the given text, annotate the following:

1. Actions: Actions performed or expected in the text.
2. Data Types: Types of data referenced in the text. Data types may include specific subcategories.
3. Purposes: Intentions or purposes related to the actions and data types.
4. Stories: Concise stories that describe how actions, data types, and purposes interact in context.

After providing your annotations, explain your rationale for these annotations. Place <R> tag between your annotations and your rationale.

Use only the categories listed below when annotating:

Actions:
Collect, Use, Share

Data Types:
Contact Data:
  Phone Number:
  Email address:
  User ID:
  Job Title:
  Company:
  Address:
  Name:
  Date of Birth:
  Image:
  Government ID:
  Biographical Data:
    CV:
    Education:
    Employment:
Health Data:
  Physical activity:
Social Media:
Location:
  Ap

### Annotating with LLMs 

In [10]:
import os
import json
import prompt_templates
import text_processing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

input_dir = 'input'

def find_matching_file(input_file_path, annotations_dir):
    """
    Find the matching annotation file for a given input file.
    
    Args:
        input_file_path (str): Path to the input file
        annotations_dir (str): Root directory of annotations
    
    Returns:
        str: Path to the matching annotation file, or None if not found
    """
    # Get the relative path from the input directory
    relative_path = os.path.relpath(input_file_path, input_dir)
    annotation_file_path = os.path.join(annotations_dir, relative_path)
    
    return annotation_file_path if os.path.exists(annotation_file_path) else None


def find_most_similar_file(input_file_path, annotations_dir, exclude_file):
    """
    Find the most similar annotation file to the input file, excluding a specified file.
    
    Args:
        input_file_path (str): Path to the input file
        annotations_dir (str): Directory containing the annotation files
        exclude_file (str): File to exclude from similarity search
    
    Returns:
        str: Path to the most similar annotation file
    """
    # Read the input file's content
    with open(input_file_path, 'r', encoding='utf-8') as f:
        input_text = f.read()
    
    # List to store (similarity_score, annotation_file_path)
    similarity_scores = []
    
    # Loop through the annotations directory to calculate similarity
    for root, _, files in os.walk(annotations_dir):
        for filename in files:
            if filename == exclude_file or filename.startswith('.'):
                continue
            
            annotation_file_path = os.path.join(root, filename)
            
            # Read the annotation file's content
            with open(annotation_file_path, 'r', encoding='utf-8') as f:
                annotation_text = f.read()
            
            # Compute similarity using TF-IDF and cosine similarity
            tfidf = TfidfVectorizer().fit_transform([input_text, annotation_text])
            similarity = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
            similarity_scores.append((similarity, annotation_file_path))
    
    # Sort by similarity score in descending order and return the most similar file
    similarity_scores.sort(reverse=True, key=lambda x: x[0])
    return similarity_scores[0][1] if similarity_scores else None


def create_prompt_templates_dict(input_dir='input', annotations_dir='annotations', ontology_path='privacy_ontology_simple.json'):
    """
    Create a dictionary of prompt templates matching input files with their corresponding annotation examples.
    
    Args:
        input_dir (str): Root directory containing input files to be annotated
        annotations_dir (str): Root directory containing annotated example files
        ontology_path (str): Path to the privacy ontology JSON file
    
    Returns:
        dict: Dictionary of prompt templates for each input file
    """
    # Load privacy ontology
    privacy_ontology = prompt_templates.load_privacy_ontology(ontology_path)
    
    # Dictionary to store prompt templates
    prompt_templates_dict = {}
    
    # Walk through all directories and files in the input directory
    for root, _, files in os.walk(input_dir):
        for filename in files:
            # Skip hidden files
            if filename.startswith('.'):
                continue
            
            # Full path to the input file
            input_file_path = os.path.join(root, filename)
            
            # Find corresponding annotation file (for exclusion from similarity search)
            annotation_file_path = find_matching_file(input_file_path, annotations_dir)
            
            if not annotation_file_path:
                print(f"No matching annotation found for {input_file_path}")
                continue
            
            try:
                # Read the input text to annotate
                with open(input_file_path, 'r', encoding='utf-8') as f:
                    new_text_to_annotate = f.read()
                
                # Find the most similar annotation file
                most_similar_annotation = find_most_similar_file(input_file_path, annotations_dir, os.path.basename(annotation_file_path))
                
                if not most_similar_annotation:
                    print(f"No similar annotation found for {input_file_path}")
                    continue
                
                # Process the example file from annotations
                example_file = text_processing.process_input(most_similar_annotation)[0]
                
                # Create prompt template
                prompt_template = prompt_templates.create_annotation_prompt(
                    example_file, 
                    new_text_to_annotate, 
                    privacy_ontology
                )
                
                # Create a unique key based on relative path
                relative_key = os.path.relpath(input_file_path, input_dir)
                
                # Store in dictionary
                prompt_templates_dict[relative_key] = {
                    'input_file_path': input_file_path,
                    'annotation_file_path': most_similar_annotation,
                    'prompt_template': prompt_template,
                    'target_annotations': text_processing.process_file(annotation_file_path),
                    'token_count': prompt_templates.count_tokens(prompt_template)
                }
            
            except Exception as e:
                print(f"Error processing {input_file_path}: {e}")
    
    return prompt_templates_dict


# Create the prompt templates dictionary
prompt_templates_dict = create_prompt_templates_dict()

# Save to JSON for inspection
with open('prompt_templates_output.json', 'w', encoding='utf-8') as f:
    json.dump({k: {**v, 'prompt_template': v['prompt_template']} 
              for k, v in prompt_templates_dict.items()}, 
              f, indent=2)

print(f"Created prompt templates for {len(prompt_templates_dict)} files")

# Optional: You can save the full dictionary using pickle if needed
import pickle
with open('prompt_templates.pkl', 'wb') as f:
    pickle.dump(prompt_templates_dict, f)


Error processing input\Actual_Budget\Backup_&_Restore.txt: 'charmap' codec can't decode byte 0x9d in position 232: character maps to <undefined>
Created prompt templates for 24 files


In [19]:
import os
import csv
import json
from getpass import getpass
import model_routing

# Ensure OPENAI_API_KEY is set
os.environ['OPENAI_API_KEY'] = getpass('Enter your OPENAI API key: ')
os.environ['GROQ_API_KEY'] ="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"

def save_results_to_csv(models, prompt_templates_dict, model_responses, output_file):
    """
    Save the model outputs, prompts, and file annotations to a CSV file.
    Ensures that each prompt from the input dictionary is listed once for each model.
    
    Args:
        models (list): List of model names.
        prompt_templates_dict (dict): Dictionary of prompt templates.
        model_responses (dict): Dictionary of model names as keys and lists of responses as values.
        output_file (str): Path to the output CSV file.
    """
    # Open the output CSV file
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        # Prepare the CSV header
        fieldnames = ['File', 'Prompt', 'Model', 'Target File Path', 'Target Annotations', 'Model Response 1', 'Model Response 2']
        
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Iterate through the prompt templates and models to pair responses
        for file_key, template_info in prompt_templates_dict.items():
            # For each model, get the corresponding responses
            for model in models:
                responses = model_responses.get(model, [])

                # Ensure that responses exist for the current model
                if responses:
                    # Ensure that the number of responses matches the number of prompts
                    num_responses = len(responses)
                    prompt = template_info['prompt_template']
                    file_path = template_info['input_file_path']
                    annotations = json.dumps(template_info['target_annotations'])

                    # Handle the mapping of responses to prompts
                    for i in range(num_responses):
                        # Use 'No Response' if there are not enough responses for a pair
                        response1 = responses[i] if i < num_responses else 'No Response'
                        response2 = responses[i+1] if i+1 < num_responses else 'No Response'

                        # Create a new row for each response
                        row = {
                            'File': file_key,
                            'Prompt': prompt,
                            'Model': model,
                            'Target File Path': file_path,
                            'Target Annotations': annotations,
                            'Model Response 1': json.dumps(response1, ensure_ascii=False),
                            'Model Response 2': json.dumps(response2, ensure_ascii=False)
                        }

                        # Write the row to the CSV file
                        writer.writerow(row)


def run_multi_file_annotations(prompt_templates_dict, output_csv, models=None):
    """
    Run annotation process for multiple files and save results to a CSV file.
    
    Args:
        prompt_templates_dict (dict): Dictionary of prompt templates.
        output_csv (str): Path to save the CSV results.
        models (list, optional): List of models to use. Defaults to GPT-4o-mini.
    """

    # Prepare prompts from the templates dictionary
    test_prompts = [template_info['prompt_template'] for template_info in prompt_templates_dict.values()]
    
    try:
        # Get model responses (this should be a dictionary with model names as keys and lists of responses as values)
        model_responses = model_routing.run_multi_model_prompts(
            models=models, 
            prompts=test_prompts, 
            num_runs=2  # Number of runs per model
        )

        # Check that model_responses is a dictionary
        if not isinstance(model_responses, dict):
            raise ValueError(f"Expected model_responses to be a dictionary, got {type(model_responses)}")

        # Save results to CSV 
        save_results_to_csv(
            models=models,
            prompt_templates_dict=prompt_templates_dict,
            model_responses=model_responses,
            output_file=output_csv
        )

    except Exception as e:
        print(f"An error occurred: {e}")
        import traceback
        traceback.print_exc()

# Path to output CSV file
output_csv = 'LLMAnnotation_MultiFile_12_8_24_.csv'
models = [
    'openai:gpt-4o-2024-11-20'
    # 'groq:gemma-7b-it'
    ] # Run annotations for up to the first 5 files
run_multi_file_annotations(prompt_templates_dict, output_csv, models=models)

print(f"Annotation results saved to {output_csv}")
print(f"Number of files processed: {len(prompt_templates_dict)}")


Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Complete

In [18]:
import os
import csv
import json
import model_routing
from getpass import getpass

# Ensure OPENAI_API_KEY is set
os.environ['OPENAI_API_KEY'] = getpass('Enter your OPENAI API key: ')

os.environ['GROQ_API_KEY'] ="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"


def save_results_to_csv(models, prompt_templates_dict, model_responses, output_file):
    """
    Save the model outputs, prompts, and file annotations to a CSV file.

    Args:
        models (list): List of model names.
        prompt_templates_dict (dict): Dictionary of prompt templates.
        model_responses (dict): Dictionary of model responses for each prompt.
        output_file (str): Path to the output CSV file.
    """
    # Open the output CSV file
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        # Prepare the CSV header
        fieldnames = ['File', 'Prompt', 'Model', 'Target File Path', 'Target Annotations', 'Model Response 1', 'Model Response 2']
        
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Write data for each prompt in the dictionary
        for file_key, template_info in prompt_templates_dict.items():
            for model_index, model in enumerate(models):
                responses = model_responses[model_index]  # Get responses for the current model
                
                # Loop through the responses two at a time
                for i in range(0, len(responses), 2):
                    response1 = responses[i] if i < len(responses) else 'No Response'
                    response2 = responses[i+1] if i+1 < len(responses) else 'No Response'

                    # Create a new row for each pair of responses
                    row = {
                        'File': file_key,
                        'Prompt': template_info['prompt_template'],
                        'Model': model,
                        'Target File Path': template_info['input_file_path'],
                        'Target Annotations': json.dumps(template_info.get('target_annotations', {}), ensure_ascii=False),
                        'Model Response 1': json.dumps(response1, ensure_ascii=False),
                        'Model Response 2': json.dumps(response2, ensure_ascii=False)
                    }
                    
                    # Write the row to the CSV file
                    writer.writerow(row)


def run_multi_file_annotations(prompt_templates_dict, output_csv, models=None):
    """
    Run annotation process for multiple files and save results to a CSV file.
    
    Args:
        prompt_templates_dict (dict): Dictionary of prompt templates.
        output_csv (str): Path to save the CSV results.
        models (list, optional): List of models to use. Defaults to GPT-4o-mini.
    """
    # Default to GPT-4o-mini if no models specified
    if models is None:
        models = ['groq:gemma-7b-it']

    # Prepare prompts from the templates dictionary (limit to the first 5 files for this case)
    test_prompts = [template_info['prompt_template'] for template_info in prompt_templates_dict.values()][:2]
    
    try:
        # Get model responses
        model_responses = model_routing.run_multi_model_prompts(
            models=models, 
            prompts=test_prompts, 
            num_runs=2  # Number of runs per model
        )

        # Save results to CSV 
        save_results_to_csv(
            models=models,
            prompt_templates_dict=prompt_templates_dict,
            model_responses=model_responses,
            output_file=output_csv
        )

    except Exception as e:
        print(f"An error occurred: {e}")
        import traceback
        traceback.print_exc()

output_csv = 'LLMAnnotation_MultiFile_12_8_24_GroqLLAMA.csv'

# Run annotations for all files
run_multi_file_annotations(prompt_templates_dict, output_csv)

print(f"Annotation results saved to {output_csv}")
print(f"Number of files processed: {len(prompt_templates_dict)}")


Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
Model: groq:gemma-7b-it - Completed a run
An error occurred: 0
Annotation results saved to LLMAnnotation_MultiFile_12_8_24_GroqLLAMA.csv
Number of files processed: 24


Traceback (most recent call last):
  File "C:\Users\Baldw\AppData\Local\Temp\ipykernel_25824\3475784030.py", line 81, in run_multi_file_annotations
    save_results_to_csv(
  File "C:\Users\Baldw\AppData\Local\Temp\ipykernel_25824\3475784030.py", line 34, in save_results_to_csv
    responses = model_responses[model_index]  # Get responses for the current model
                ~~~~~~~~~~~~~~~^^^^^^^^^^^^^
KeyError: 0


In [65]:
import pandas as pd
import json
import re
from sklearn.metrics import f1_score

# Load the privacy ontology from the JSON file
with open('privacy_ontology.json', 'r') as f:
    privacy_ontology = json.load(f)

# Function to extract labels from the JSON in the 'Target Annotations' column
def extract_labels_from_json(target_annotations):
    try:
        annotations = json.loads(target_annotations)
        actions = annotations.get("actions", [])
        data_types = annotations.get("data_types", [])
        purposes = annotations.get("purposes", [])
        
        # Filter labels to match the ontology keys
        actions = [action for action in actions if action in actions_ontology]
        data_types = [data_type for data_type in data_types if data_type in data_types_ontology]
        purposes = [purpose for purpose in purposes if purpose in purposes_ontology]
        
        return actions, data_types, purposes
    except (json.JSONDecodeError, TypeError):
        return [], [], []

# Function to extract labels before the <R> tag in Model Response columns
def extract_labels_before_r(model_response):
    match = re.split(r'<R>', model_response)
    if match:
        return match[0].strip().split(",")  # Split by commas if multiple labels are present
    return []

# Function to get the set of labels from the ontology for comparison
def get_labels_from_ontology(category):
    labels = set()

    # Recursive function to traverse the ontology and get labels for each category
    def traverse_ontology(data, category):
        if isinstance(data, dict):
            for key, value in data.items():
                if key == category:
                    labels.update(value.keys())  # Collect the keys (labels) in this category
                else:
                    traverse_ontology(value, category)
    
    traverse_ontology(privacy_ontology, category)
    return labels

# Function to compute F1 score
def compute_f1_score(true_labels, predicted_labels):
    true_labels_set = set(true_labels)
    predicted_labels_set = set(predicted_labels)

    # Debugging: Print out the labels to check if there are mismatches
    print(f"True labels: {true_labels_set}")
    print(f"Predicted labels: {predicted_labels_set}")

    if not true_labels_set and not predicted_labels_set:
        return 1.0  # If both are empty, F1 score is perfect
    if not true_labels_set or not predicted_labels_set:
        return 0.0  # If one is empty and the other isn't, F1 score is zero

    return f1_score(list(true_labels_set), list(predicted_labels_set), average='micro')

# Load CSV file (replace with your actual file path)
df = pd.read_csv('output.csv')

# Labels from ontology
actions_ontology = get_labels_from_ontology('Actions')
data_types_ontology = get_labels_from_ontology('Data Types')
purposes_ontology = get_labels_from_ontology('Purposes')

# Initialize lists to store the results
f1_actions_1, f1_data_types_1, f1_purposes_1 = [], [], []
f1_actions_2, f1_data_types_2, f1_purposes_2 = [], [], []

# Process each row to extract the necessary labels and compute F1 scores
for index, row in df.iterrows():
    # Extract target labels from 'Target Annotations' column
    target_actions, target_data_types, target_purposes = extract_labels_from_json(row['Target Annotations'])
    
    # Extract labels from 'Model Response 1'
    model_actions_1 = extract_labels_before_r(row['Model Response 1'])
    model_data_types_1 = extract_labels_before_r(row['Model Response 1'])
    model_purposes_1 = extract_labels_before_r(row['Model Response 1'])
    
    # Extract labels from 'Model Response 2'
    model_actions_2 = extract_labels_before_r(row['Model Response 2'])
    model_data_types_2 = extract_labels_before_r(row['Model Response 2'])
    model_purposes_2 = extract_labels_before_r(row['Model Response 2'])

    # Debugging: Print out the labels for each row to check
    print(f"Row {index}:")
    print(f"Target actions: {target_actions}")
    print(f"Model actions 1: {model_actions_1}")
    print(f"Model actions 2: {model_actions_2}")

    # Compute F1 scores for Model Response 1
    f1_actions_1.append(compute_f1_score(target_actions, model_actions_1))
    f1_data_types_1.append(compute_f1_score(target_data_types, model_data_types_1))
    f1_purposes_1.append(compute_f1_score(target_purposes, model_purposes_1))
    
    # Compute F1 scores for Model Response 2
    f1_actions_2.append(compute_f1_score(target_actions, model_actions_2))
    f1_data_types_2.append(compute_f1_score(target_data_types, model_data_types_2))
    f1_purposes_2.append(compute_f1_score(target_purposes, model_purposes_2))

# Add the F1 scores to the DataFrame (optional)
df['F1 Actions 1'] = f1_actions_1
df['F1 Data Types 1'] = f1_data_types_1
df['F1 Purposes 1'] = f1_purposes_1
df['F1 Actions 2'] = f1_actions_2
df['F1 Data Types 2'] = f1_data_types_2
df['F1 Purposes 2'] = f1_purposes_2

# Optionally, save the updated dataframe to a new CSV file
df.to_csv('processed_f1_output.csv', index=False)


Row 0:
Target actions: []
Model actions 1: ['["**Annotations:**\\n\\nActions: \\n- Collect\\n- Use\\n- Share\\n- Delete\\n\\nData Types:\\n- Financial:\\n  - Account Balance:\\n  - Orders:\\n  - Payment History:\\n\\nPurposes:\\n- Accounts\\n- Functionality\\n- Requirements\\n\\nStories:\\n1. We collect Account Balance for accounts management. We use Account Balance for functionality in tracking finances.\\n2. We share Payment History for requirements to ensure proper account management. We delete accounts when they are no longer needed for accounts management.\\n\\n']
Model actions 2: ['["**Actions:** \\nCollect', ' Use', ' Share\\n\\n**Data Types:** \\nAccount Information:\\n  Account Balance:\\n  Bank Account:\\n  Assets:\\n\\n**Purposes:** \\nAnalytics', ' Accounts', ' Functionality\\n\\n**Stories:** \\nWe collect account information', ' account balance', ' bank account', ' and assets for analytics. We use account information', ' account balance', ' bank account', ' and assets for 

Unnamed: 0,File,Prompt,Model,Target File Path,Target Annotations,Model Response 1,Model Response 2,F1 Actions 1,F1 Data Types 1,F1 Purposes 1,F1 Actions 2,F1 Data Types 2,F1 Purposes 2
0,Actual_Budget\Accounts_&_Transactions.txt,You are a privacy expert annotator tasked with...,openai:gpt-4o-mini,input\Actual_Budget\Accounts_&_Transactions.txt,"{""file_name"": ""Accounts_&_Transactions.txt"", ""...","[""**Annotations:**\n\nActions: \n- Collect\n- ...","[""**Actions:** \nCollect, Use, Share\n\n**Data...",0.0,0.0,0.0,0.0,0.0,0.0
1,Actual_Budget\Accounts_&_Transactions.txt,You are a privacy expert annotator tasked with...,openai:gpt-4o-mini,input\Actual_Budget\Accounts_&_Transactions.txt,"{""file_name"": ""Accounts_&_Transactions.txt"", ""...","[""**Actions:** \nCollect, Use, Share\n\n**Data...","[""**Actions:**\n1. Collect\n2. Use\n3. Share\n...",0.0,0.0,0.0,0.0,0.0,0.0
2,Akaunting\CreatingAnewAccount.txt,You are a privacy expert annotator tasked with...,openai:gpt-4o-mini,input\Akaunting\CreatingAnewAccount.txt,"{""file_name"": ""CreatingAnewAccount.txt"", ""full...","[""### Annotations:\n\n**Actions:**\n- Collect\...","[""### Annotations:\n\n**Actions:**\n1. Collect...",0.0,0.0,0.0,0.0,0.0,0.0
3,Attendize\features.txt,You are a privacy expert annotator tasked with...,openai:gpt-4o-mini,input\Attendize\features.txt,"{""file_name"": ""features.txt"", ""full_cleaned_te...","[""**Annotations:**\n\nActions: \n1. Collect \n...","[""**Annotations:**\n\nActions: \n- Collect\n- ...",0.0,0.0,0.0,0.0,0.0,0.0
4,element-android\add_threePids.txt,You are a privacy expert annotator tasked with...,openai:gpt-4o-mini,input\element-android\add_threePids.txt,"{""file_name"": ""add_threePids.txt"", ""full_clean...","[""**Annotations:**\n\n**Actions:**\n1. Collect...","[""### Annotations:\n\n**Actions:**\n1. Collect...",0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
import csv

def delete_rows(input_file, output_file):
    # Open the input CSV file with UTF-8 encoding
    with open(input_file, mode='r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)
        rows = list(reader)
    
    # We'll keep the first two rows, and then delete 25 and keep the 26th row, and so on.
    filtered_rows = []
    
    # Add the first two rows without modification
    filtered_rows.extend(rows[:2])

    # Now, for every group of 26 rows, we will delete the first 25 and keep the 26th
    for i in range(2, len(rows), 26):
        # Keep the 26th row (the last row of each 26-row block)
        filtered_rows.append(rows[i])
    
    # Write the filtered rows to the output CSV file
    with open(output_file, mode='w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerows(filtered_rows)

# Usage example
input_file = 'LLMAnnotation_12_8_24_Gemma.csv'  # Replace with your input CSV file path
output_file = 'LLMAnnotation_GroqGemma.csv'  # Replace with your desired output CSV file path
delete_rows(input_file, output_file)


In [1]:


import os

from groq import Groq

client = Groq(
    api_key="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",
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Explain the importance of fast language models",
        }
    ],
    model="llama3-8b-8192",
)

print(chat_completion.choices[0].message.content)


Fast language models are important because they enable large-scale natural language processing (NLP) applications to process and analyze vast amounts of text data quickly and efficiently. Here are some reasons why fast language models are crucial:

1. **Scalability**: Fast language models can process enormous amounts of text data, making them essential for applications that require processing large datasets, such as search engines, social media platforms, and e-commerce websites.
2. **Real-time processing**: Fast language models enable real-time processing of text data, which is critical for applications that require instant responses, such as chatbots, virtual assistants, and recommendation systems.
3. **Improved user experience**: Fast language models can provide faster and more accurate search results, recommendations, and translations, leading to a better user experience for consumers.
4. **Enhanced decision-making**: By processing large amounts of text data quickly, fast language 