In [79]:
import json
import os

def parse_ner_predictions(input_file):
    """Convert token-level BIO predictions into per-example entity lists.

    Args:
        input_file: Path to a JSON file holding a list of items, each with the
            keys 'text', 'tokenized_text', 'gold_label' and 'prediction'.

    Returns:
        A list with one dict per input item, of the form
        ``{'text': ..., 'gold': [...], 'prediction': [...]}`` where each
        entity is ``{'text': <space-joined tokens>, 'value': <TYPE upper-cased>}``.

    Raises:
        ValueError: If an item's tokens, gold labels and predicted labels do
            not all have the same length.
    """
    # JSON is decoded as UTF-8 explicitly so the result does not depend on
    # the platform's default locale encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    parsed_data = []

    for item in data:
        text = item['text']
        tokens = item['tokenized_text']
        gold_labels = item['gold_label']
        pred_labels = item['prediction']

        # Sanity check that lengths match
        if len(tokens) != len(gold_labels) or len(tokens) != len(pred_labels):
            raise ValueError(f"Mismatched lengths in {input_file}: tokens={len(tokens)}, gold={len(gold_labels)}, pred={len(pred_labels)}")

        parsed_data.append({
            'text': text,
            'gold': _collect_bio_entities(tokens, gold_labels),
            'prediction': _collect_bio_entities(tokens, pred_labels),
        })

    return parsed_data


def _collect_bio_entities(tokens, labels):
    """Group BIO-tagged tokens into a list of ``{'text', 'value'}`` entities.

    An entity is opened only by a ``B-<type>`` label, so the number of
    returned entities equals the number of ``B-`` labels with a non-empty
    type. An ``I-<type>`` label extends the open entity only when the type
    matches; otherwise the token is ignored and the open entity is left
    untouched. Labels other than ``B-*``, ``I-*`` and ``O`` are ignored.
    """
    entities = []
    current_type = ''
    current_tokens = []

    for token, label in zip(tokens, labels):
        if label.startswith('B-'):
            # A new entity starts: emit the one that was open, if any.
            if current_type:
                entities.append({'text': ' '.join(current_tokens), 'value': current_type.upper()})
            current_type = label[2:]
            current_tokens = [token]
        elif label.startswith('I-'):
            if current_type == label[2:]:
                current_tokens.append(token)
        elif label == 'O':
            if current_type:
                entities.append({'text': ' '.join(current_tokens), 'value': current_type.upper()})
                current_type = ''
                current_tokens = []

    # Emit an entity that runs up to the last token.
    if current_type:
        entities.append({'text': ' '.join(current_tokens), 'value': current_type.upper()})

    return entities

def process_ner_files(model_name):
    """Parse the NER prediction files of one model and write the results.

    Reads JSON files from ``NER_<model_name>``, converts each with
    ``parse_ner_predictions`` and writes the parsed data under the same file
    name into ``parsed_NER_<model_name>`` (created if missing).

    Only ``.json`` files whose name contains ``'casual_100'`` are processed.

    Args:
        model_name: Suffix used to build the input and output directory names.

    Raises:
        ValueError: If, for any example, the number of parsed gold/predicted
            entities differs from the number of ``B-`` labels in the input.
    """
    from collections import Counter

    input_dir = f'NER_{model_name}'
    output_dir = f'parsed_NER_{model_name}'

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        if 'casual_100' not in filename:
            continue
        if not filename.endswith('.json'):
            continue

        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, filename)

        parsed_data = parse_ner_predictions(input_path)

        # Sanity check: every B- label in the input must have produced
        # exactly one parsed entity, per example.
        with open(input_path, 'r', encoding='utf-8') as f:
            input_data = json.load(f)
        for i, (input_item, parsed_item) in enumerate(zip(input_data, parsed_data)):
            gold_count = sum(1 for label in input_item['gold_label'] if label.startswith('B-'))
            pred_count = sum(1 for label in input_item['prediction'] if label.startswith('B-'))
            if len(parsed_item['gold']) != gold_count:
                raise ValueError(f"Mismatch in gold entities for item {i} in {filename}: {len(parsed_item['gold'])} vs {gold_count}")
            if len(parsed_item['prediction']) != pred_count:
                raise ValueError(f"Mismatch in predicted entities for item {i} in {filename}: {len(parsed_item['prediction'])} vs {pred_count}")

        # NOTE(review): this file name does not contain 'casual_100', so with
        # the filter above this diagnostic branch cannot run. Confirm whether
        # the filter or this branch is leftover debugging code.
        if filename == 'bert_active_to_passive_ori.json':
            # Per-class entity counts, from the raw labels and the parsed output.
            input_gold_class_counts = Counter(
                label[2:]
                for item in input_data
                for label in item['gold_label']
                if label.startswith('B-')
            )
            input_pred_class_counts = Counter(
                label[2:]
                for item in input_data
                for label in item['prediction']
                if label.startswith('B-')
            )
            parsed_gold_class_counts = Counter(
                entity['value'] for item in parsed_data for entity in item['gold']
            )
            parsed_pred_class_counts = Counter(
                entity['value'] for item in parsed_data for entity in item['prediction']
            )

            print("Input gold entity counts by class:")
            for entity_class, count in input_gold_class_counts.items():
                print(f"{entity_class}: {count}")
            print("\nInput predicted entity counts by class:")
            for entity_class, count in input_pred_class_counts.items():
                print(f"{entity_class}: {count}")
            print("\nParsed gold entity counts by class:")
            for entity_class, count in parsed_gold_class_counts.items():
                print(f"{entity_class}: {count}")
            print("\nParsed predicted entity counts by class:")
            for entity_class, count in parsed_pred_class_counts.items():
                print(f"{entity_class}: {count}")

        with open(output_path, 'w') as f:
            json.dump(parsed_data, f, indent=2)


In [81]:
# Parse the NER prediction files of every model, in a fixed order.
for ner_model_name in ('BERT', 'GPT2', 'T5'):
    process_ner_files(ner_model_name)


In [112]:
import pandas as pd
import glob
import ast
import difflib
from tqdm import tqdm

In [113]:
def get_example_f1_and_counts(example):
    """Compute the entity-level F1 and per-class (tp, fp, fn) for one example.

    Args:
        example: Mapping with the keys 'gold' and 'prediction', each a list
            of entities. An entity may be a dict with 'text' plus 'value' or
            'class', a ``{text: label}`` dict, or the string repr of either.

    Returns:
        A tuple ``(f1, class_counts)`` where ``f1`` is a float in [0, 1] and
        ``class_counts`` maps each class seen in gold or prediction to a
        ``(tp, fp, fn)`` tuple. When both entity lists are empty the result
        is ``(0.0, {})``.
    """
    gold_entities = example['gold']
    pred_entities_raw = example['prediction']

    # Handle empty case
    if not gold_entities and not pred_entities_raw:
        return 0.0, {}

    true_entities = _normalize_entities(gold_entities)
    pred_entities = _normalize_entities(pred_entities_raw)

    class_counts = {}
    # Use the classes of both sides so that pure false positives and pure
    # false negatives are counted too.
    classes = set(label for _, label in true_entities) | set(label for _, label in pred_entities)

    for cls in classes:
        true_cls = [e for e in true_entities if e[1] == cls]
        pred_cls = [e for e in pred_entities if e[1] == cls]

        # Match one-to-one: each prediction can satisfy at most one gold
        # entity. Counting every gold entity that merely appears among the
        # predictions would let duplicates push tp above len(pred_cls),
        # giving a negative fp and an F1 above 1.
        unmatched_pred = list(pred_cls)
        tp = 0
        for gold_entity in true_cls:
            if gold_entity in unmatched_pred:
                unmatched_pred.remove(gold_entity)
                tp += 1

        fp = len(pred_cls) - tp  # predictions without a gold counterpart
        fn = len(true_cls) - tp  # gold entities that were not predicted

        class_counts[cls] = (tp, fp, fn)

    # Overall F1 for the example, pooled over classes.
    total_tp = sum(counts[0] for counts in class_counts.values())
    total_fp = sum(counts[1] for counts in class_counts.values())
    total_fn = sum(counts[2] for counts in class_counts.values())

    if total_tp == 0:
        return 0.0, class_counts

    precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
    recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0

    if precision + recall == 0:
        return 0.0, class_counts

    f1 = 2 * (precision * recall) / (precision + recall)
    return f1, class_counts


def _normalize_entities(raw_entities):
    """Convert entities in any supported format into ``(text, label)`` tuples.

    Gold and predicted entities go through the same rules, so a 'class'-keyed
    entity is accepted on either side.
    """
    normalized = []
    for entity in raw_entities:
        if isinstance(entity, str):
            entity = ast.literal_eval(entity)

        text = entity.get('text')
        if text is not None:
            label = entity.get('value')
            if label is None:
                label = entity.get('class')
            if label is not None:
                normalized.append((text, label))
                continue

        # Dictionary format: every string-valued item is a text -> label pair.
        for key, value in entity.items():
            if isinstance(value, str):
                normalized.append((key, value))
    return normalized

def get_f1_scores_and_counts(data):
    """Score every example and pool the per-class counts.

    Returns a pair ``(f1_scores, class_counts)``: the list of per-example F1
    values in input order, and a dict mapping each class to a three-element
    list ``[tp, fp, fn]`` summed over all examples. Falsy ``data`` yields
    ``([], {})``.
    """
    if not data:
        return [], {}

    f1_scores = []
    pooled = {}
    for example in data:
        example_f1, per_class = get_example_f1_and_counts(example)
        f1_scores.append(example_f1)

        for cls, (tp, fp, fn) in per_class.items():
            # One mutable [tp, fp, fn] accumulator per class.
            bucket = pooled.setdefault(cls, [0, 0, 0])
            bucket[0] += tp
            bucket[1] += fp
            bucket[2] += fn

    return f1_scores, pooled

def calculate_micro_f1(counts):
    """Compute F1 from (tp, fp, fn) counts.

    Args:
        counts: Either a single ``(tp, fp, fn)`` tuple, or a dict mapping a
            class name to a ``[tp, fp, fn]`` sequence.

    Returns:
        For a tuple: the F1 as a float.
        For a dict: a dict with the keys 'micro_f1' (F1 over the summed
        counts), 'support' (total tp + fn) and, for every class,
        ``{'f1': ..., 'support': ...}``. The per-class entries and the true
        support are reported even when there are no true positives at all.
    """
    if isinstance(counts, tuple):
        tp, fp, fn = counts
        return _f1_from_counts(tp, fp, fn)

    # Micro average: pool the counts of all classes before computing F1.
    total_tp = sum(counts[cls][0] for cls in counts)
    total_fp = sum(counts[cls][1] for cls in counts)
    total_fn = sum(counts[cls][2] for cls in counts)

    class_f1s = {'micro_f1': _f1_from_counts(total_tp, total_fp, total_fn)}
    total_support = 0

    for cls, (tp, fp, fn) in counts.items():
        support = tp + fn  # number of gold entities of this class
        total_support += support
        class_f1s[cls] = {'f1': _f1_from_counts(tp, fp, fn), 'support': support}

    class_f1s['support'] = total_support
    return class_f1s


def _f1_from_counts(tp, fp, fn):
    """Return the F1 score for one (tp, fp, fn) triple, 0.0 when undefined."""
    if tp == 0:
        return 0.0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)


In [114]:
import json
import numpy as np
from scipy import stats
import pandas as pd
import os
from pathlib import Path
import ast
def load_json_file(filepath):
    """Load and return the JSON document stored at ``filepath``.

    The file is decoded as UTF-8 explicitly so that the result does not
    depend on the platform's default locale encoding.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)

def _min_p_value(ori_scores, mod_scores):
    """Return the smaller p-value of a Wilcoxon and a Mann-Whitney U test.

    A test that raises ValueError contributes a p-value of 1.0.
    """
    try:
        _, p_wilcoxon = stats.wilcoxon(ori_scores, mod_scores)
    except ValueError:
        p_wilcoxon = 1.0
    try:
        _, p_mannwhitney = stats.mannwhitneyu(ori_scores, mod_scores)
    except ValueError:
        p_mannwhitney = 1.0
    return min(p_wilcoxon, p_mannwhitney)


def _significance_marker(p_value):
    """Map a p-value to the marker used in the result tables."""
    if p_value < 0.01:
        return "**"
    if p_value < 0.05:
        return "*"
    if p_value < 0.1:
        return "."
    return "ns"


# Get list of all modifications from filenames of the form
# '<model>_<modification>_ori.json' in each model's parsed directory.
modifications = set()
models = ['GPT2', 'BERT', 'T5']
for model in models:
    model_dir = f'parsed_NER_{model}'
    for filename in os.listdir(model_dir):
        if filename.startswith(f'{model.lower()}_') and filename.endswith('_ori.json'):
            mod = filename.replace(f'{model.lower()}_', '').replace('_ori.json', '')
            modifications.add(mod)
modifications = list(modifications)

# Load original and modified files for each model
ori_files = {model: {} for model in models}
modif_files = {model: {} for model in models}

for model in models:
    model_dir = f'parsed_NER_{model}'
    for modification in modifications:
        # Load original files
        ori_filepath = f'{model_dir}/{model.lower()}_{modification}_ori.json'
        ori_files[model][modification] = load_json_file(ori_filepath)

        # Load modified files
        modif_filepath = f'{model_dir}/{model.lower()}_{modification}_modif.json'
        modif_files[model][modification] = load_json_file(modif_filepath)


# Calculate and store F1 scores for each model
results = []
negation_type_results = []  # For storing negation type breakdown

for model in models:
    for modification in modifications:
        compare_file = Path(f'../../preprocessing/data_after_phase2/rahmad/{modification}_100.json')
        print(model, modification)
        if not compare_file.exists():
            continue
        # load_json_file closes the file handle, unlike json.load(open(...)).
        compare_df = load_json_file(compare_file)
        if len(compare_df) != len(ori_files[model][modification]):
            print('mismatch',modification, model)
            print(len(compare_df), len(ori_files[model][modification]))
        ori_f1_scores, ori_counts = get_f1_scores_and_counts(ori_files[model][modification])
        modif_f1_scores, modif_counts = get_f1_scores_and_counts(modif_files[model][modification])

        # Multiply F1 scores by 100
        ori_f1_scores = [f1 * 100 for f1 in ori_f1_scores]
        modif_f1_scores = [f1 * 100 for f1 in modif_f1_scores]

        # Calculate mean F1 scores
        ori_mean_f1 = np.mean(ori_f1_scores)
        modif_mean_f1 = np.mean(modif_f1_scores)
        # Calculate micro F1 scores
        ori_micro_f1 = calculate_micro_f1(ori_counts)
        modif_micro_f1 = calculate_micro_f1(modif_counts)

        # Multiply micro F1 scores by 100
        ori_micro_f1['micro_f1'] *= 100
        modif_micro_f1['micro_f1'] *= 100

        # Percentage change relative to the original score. A zero baseline
        # is reported as 0, matching the negation-type breakdown below; the
        # micro F1 is a plain float, so dividing by it unguarded would raise
        # ZeroDivisionError.
        mean_f1_pct_change = ((modif_mean_f1 - ori_mean_f1) / ori_mean_f1) * 100 if ori_mean_f1 > 0 else 0
        if ori_micro_f1['micro_f1'] > 0:
            micro_f1_pct_change = ((modif_micro_f1['micro_f1'] - ori_micro_f1['micro_f1']) / ori_micro_f1['micro_f1']) * 100
        else:
            micro_f1_pct_change = 0
        # Calculate weighted delta
        weighted_delta = (modif_mean_f1 - ori_mean_f1) * np.log10(ori_mean_f1) / np.log10(100)

        # Wilcoxon signed-rank and Mann-Whitney U tests on the per-example
        # F1 scores; the smaller of the two p-values is reported.
        p_value = _min_p_value(ori_f1_scores, modif_f1_scores)
        significance = _significance_marker(p_value)

        results.append({
            'model': model,
            'modification': modification,
            'original_mean_f1': ori_mean_f1,
            'modified_mean_f1': modif_mean_f1,
            'mean_f1_pct_change': mean_f1_pct_change,
            'original_micro_f1': ori_micro_f1,
            'modified_micro_f1': modif_micro_f1,
            'micro_f1_pct_change': micro_f1_pct_change,
            'weighted_delta': weighted_delta,
            'p_value': p_value,
            'significance': significance
        })

        # Additional analysis for negation types
        if modification == 'negation':
            # For 'negation' the comparison file loaded above is the negation
            # file, so it is reused and not read a second time.
            negation_data = compare_df

            # Group examples by negation type
            type_results = {}
            for idx, (ori_f1, mod_f1) in enumerate(zip(ori_f1_scores, modif_f1_scores)):
                neg_type = negation_data[idx].get('subtype', 'unknown')
                if neg_type not in type_results:
                    type_results[neg_type] = {'ori_f1s': [], 'mod_f1s': []}
                type_results[neg_type]['ori_f1s'].append(ori_f1)
                type_results[neg_type]['mod_f1s'].append(mod_f1)

            # Calculate metrics for each negation type
            for neg_type, scores in type_results.items():
                ori_mean = np.mean(scores['ori_f1s'])
                mod_mean = np.mean(scores['mod_f1s'])
                pct_change = ((mod_mean - ori_mean) / ori_mean) * 100 if ori_mean > 0 else 0
                weighted_delta = (mod_mean - ori_mean) * np.log10(ori_mean) / np.log10(100)

                # Statistical tests, only if we have enough samples
                if len(scores['ori_f1s']) > 1:
                    p_value = _min_p_value(scores['ori_f1s'], scores['mod_f1s'])
                else:
                    p_value = 1.0

                significance = _significance_marker(p_value)

                negation_type_results.append({
                    'model': model,
                    'negation_type': neg_type,
                    'original_mean_f1': ori_mean,
                    'modified_mean_f1': mod_mean,
                    'mean_f1_pct_change': pct_change,
                    'weighted_delta': weighted_delta,
                    'sample_size': len(scores['ori_f1s']),
                    'p_value': p_value,
                    'significance': significance
                })

# Create DataFrame and save to CSV
df = pd.DataFrame(results)
df.to_csv('ner_modification_results_plm.csv', index=False)

# Save negation type results
if negation_type_results:
    df_negation = pd.DataFrame(negation_type_results)
    df_negation.to_csv('ner_negation_type_results_plm.csv', index=False)


GPT2 casual
GPT2 dialectal
GPT2 derivation
GPT2 grammatical_role
GPT2 discourse
GPT2 punctuation
GPT2 concept_replacement
GPT2 geographical_bias
GPT2 length_bias
GPT2 coordinating_conjunction
GPT2 capitalization
GPT2 negation
GPT2 active_to_passive
GPT2 sentiment
GPT2 typo_bias
GPT2 singlish
GPT2 temporal_bias
GPT2 compound_word
BERT casual
BERT dialectal
BERT derivation
BERT grammatical_role
BERT discourse
BERT punctuation
BERT concept_replacement
BERT geographical_bias
BERT length_bias
BERT coordinating_conjunction
BERT capitalization
BERT negation
BERT active_to_passive
BERT sentiment
BERT typo_bias
BERT singlish
BERT temporal_bias
BERT compound_word
T5 casual
T5 dialectal
T5 derivation
T5 grammatical_role
T5 discourse
T5 punctuation
T5 concept_replacement
T5 geographical_bias
T5 length_bias
T5 coordinating_conjunction
T5 capitalization
T5 negation
T5 active_to_passive
T5 sentiment
T5 typo_bias
T5 singlish
T5 temporal_bias
T5 compound_word


In [115]:
import json
import numpy as np
from scipy import stats
import pandas as pd
import os


In [116]:
def get_example_f1_and_counts_list(gold, pred):
    """Score one example's predicted entities against its gold entities.

    ``gold`` and ``pred`` may each be a list of entity dicts, a
    ``{text: value}`` dict, or the string repr of either. A predicted entity
    is a true positive when an as-yet-unmatched gold entity has the same
    'text' and 'value'.

    Returns ``(f1, (tp, fp, fn))``.
    """
    def to_entity_dicts(raw):
        # Bring every supported layout to a list of {'text', 'value'} dicts;
        # anything else is passed through unchanged.
        if isinstance(raw, dict):
            return [{'text': name, 'value': label} for name, label in raw.items()]
        if not (isinstance(raw, list) and len(raw) > 0 and isinstance(raw[0], dict)):
            return raw
        entities = []
        for entry in raw:
            if 'text' in entry:
                entities.append(entry)
            else:
                entities.extend(
                    {'text': name, 'value': label} for name, label in entry.items()
                )
        return entities

    # String inputs hold the repr of the entity collection.
    gold, pred = (
        ast.literal_eval(raw) if isinstance(raw, str) else raw
        for raw in (gold, pred)
    )
    gold = to_entity_dicts(gold)
    pred = to_entity_dicts(pred)

    gold_total = len(gold)
    pred_total = len(pred)

    # Each prediction consumes the first gold entity still available that
    # agrees on both text and value.
    available_gold = list(gold)
    tp = 0
    for candidate in pred:
        for position, reference in enumerate(available_gold):
            if candidate['text'] == reference['text'] and candidate['value'] == reference['value']:
                del available_gold[position]
                tp += 1
                break

    fp = pred_total - tp  # predictions left without a gold match
    fn = gold_total - tp  # gold entities left unmatched

    predicted = tp + fp
    actual = tp + fn
    precision = tp / predicted if predicted > 0 else 0
    recall = tp / actual if actual > 0 else 0
    pr_sum = precision + recall
    f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0

    return f1, (tp, fp, fn)

def calculate_micro_f1_list(counts):
    """Return the micro-averaged F1 for a collection of (tp, fp, fn) triples.

    The triples are summed component-wise first; precision, recall and F1 are
    then computed from the totals. The result is 0 when F1 is undefined.
    """
    def column_total(position):
        return sum(map(lambda triple: triple[position], counts))

    true_pos = column_total(0)
    false_pos = column_total(1)
    false_neg = column_total(2)

    predicted = true_pos + false_pos
    actual = true_pos + false_neg
    precision = true_pos / predicted if predicted > 0 else 0
    recall = true_pos / actual if actual > 0 else 0

    pr_sum = precision + recall
    if pr_sum > 0:
        return 2 * (precision * recall) / pr_sum
    return 0


In [117]:
# Read and analyze modification results from rahmad directory
import glob
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy import stats
from pathlib import Path
import json
def _two_sided_min_p_value(ori_scores, mod_scores):
    """Return the smaller p-value of a Mann-Whitney U and a Wilcoxon test.

    A test that raises ValueError contributes a p-value of 1.0, the same
    fallback the PLM analysis uses.
    """
    try:
        _, p_mw = stats.mannwhitneyu(ori_scores, mod_scores, alternative='two-sided')
    except ValueError:
        p_mw = 1.0
    try:
        _, p_w = stats.wilcoxon(ori_scores, mod_scores)
    except ValueError:
        p_w = 1.0
    return min(p_mw, p_w)


rahmad_results_dir = '../../eval/results/rahmad/'
rahmad_results_files = glob.glob(os.path.join(rahmad_results_dir, '*.csv'))

print("\nAnalyzing results from rahmad directory:")
print("-" * 50)

# Create list to store results
results_data = []
negation_results_data = []

for results_file in rahmad_results_files:
    # Extract model and modification from filename
    filename = os.path.basename(results_file)
    print(filename)
    if 'DP' in filename or 'ner' in filename:
        continue
    model = filename.split('-')[0]
    # Only the '_100_new.csv' suffix is stripped; for any other file name the
    # suffix stays in `modification`, so the comparison-file lookup below
    # uses a path that still contains it.
    modification = filename.split('-')[2].replace('_100_new.csv', '')
    print(modification)

    print(f"\nResults from {filename}:")
    print("=" * 50)

    # Read the CSV file
    df = pd.read_csv(results_file)
    compare_file = Path(f'../../preprocessing/data_after_phase2/rahmad/{modification}_100.json')
    if not compare_file.exists():
        continue
    with open(compare_file) as compare_handle:
        compare_df = json.load(compare_handle)
    if len(compare_df) != len(df):
        print('mismatch',modification, model)

    # Get original and modified labels/predictions
    ori_labels = df['original_label'].values
    ori_preds = df['original_pred'].values
    mod_labels = df['modified_label'].values
    mod_preds = df['modified_pred'].values
    # Calculate per-example F1 scores (scaled to 0-100) using helper functions
    ori_f1_scores = []
    modif_f1_scores = []
    print('original')
    for l, p in tqdm(zip(ori_labels, ori_preds)):
        f1, _ = get_example_f1_and_counts_list(l, p)
        ori_f1_scores.append(f1 * 100)
    print('modified')
    for l, p in tqdm(zip(mod_labels, mod_preds)):
        f1, _ = get_example_f1_and_counts_list(l, p)
        modif_f1_scores.append(f1 * 100)
    print('done calculating f1 scores')
    # Calculate mean F1 scores
    ori_mean_f1 = np.mean(ori_f1_scores)
    modif_mean_f1 = np.mean(modif_f1_scores)
    # Calculate weighted delta
    weighted_delta = (modif_mean_f1 - ori_mean_f1) * np.log10(ori_mean_f1) / np.log10(100)
    # Mann-Whitney U and Wilcoxon signed-rank tests; report the smaller p-value
    p_value = _two_sided_min_p_value(ori_f1_scores, modif_f1_scores)

    # Determine significance level
    if p_value < 0.01:
        significance = "**"
    elif p_value < 0.05:
        significance = "*"
    elif p_value < 0.1:
        significance = "."
    else:
        significance = "ns"

    print(f"\n{model} - {modification.upper()} Modification:")
    print(f"Original Mean F1: {ori_mean_f1:.3f}")
    print(f"Modified Mean F1: {modif_mean_f1:.3f}")
    print(f"Weighted Delta: {weighted_delta:.3f}")
    print(f"P-value: {p_value:.4f}")
    print(f"Significance: {significance}")

    # For negation, get subtype results
    if modification == 'negation':
        for subtype in ['verbal', 'lexical', 'double', 'approximate', 'absolute']:
            subtype_df = df[df['type'] == subtype]
            if len(subtype_df) == 0:
                continue

            # Calculate F1 scores for subtype
            ori_subtype_f1 = []
            mod_subtype_f1 = []

            for l, p in zip(subtype_df['original_label'], subtype_df['original_pred']):
                f1, _ = get_example_f1_and_counts_list(l, p)
                ori_subtype_f1.append(f1 * 100)

            for l, p in zip(subtype_df['modified_label'], subtype_df['modified_pred']):
                f1, _ = get_example_f1_and_counts_list(l, p)
                mod_subtype_f1.append(f1 * 100)

            # Calculate stats. The subtype delta gets its own name: reusing
            # `weighted_delta` here would overwrite the overall value before
            # it is stored in the main results row below.
            ori_mean = np.mean(ori_subtype_f1)
            mod_mean = np.mean(mod_subtype_f1)
            subtype_weighted_delta = (mod_mean - ori_mean) * np.log10(ori_mean) / np.log10(100)

            # Statistical tests
            p_val = _two_sided_min_p_value(ori_subtype_f1, mod_subtype_f1)

            # Determine significance
            if p_val < 0.001:
                sig = '***'
            elif p_val < 0.01:
                sig = '**'
            elif p_val < 0.05:
                sig = '*'
            elif p_val < 0.1:
                sig = '.'
            else:
                sig = 'ns'

            # Store subtype results in negation results
            negation_results_data.append({
                'model': model,
                'negation_type': subtype,
                'original_mean_f1': ori_mean,
                'modified_mean_f1': mod_mean,
                'weighted_delta': subtype_weighted_delta,
                'sample_size': len(subtype_df),
                'p_value': p_val,
                'significance': sig
            })

    # Store main results
    results_data.append({
        'model': model,
        'modification': modification,
        'original_mean_f1': ori_mean_f1,
        'modified_mean_f1': modif_mean_f1,
        'weighted_delta': weighted_delta,
        'p_value': p_value,
        'significance': significance
    })

# Create DataFrame and save to CSV
results_df = pd.DataFrame(results_data)
results_df.to_csv('ner_modification_results_llm.csv', index=False)
print("\nResults saved to ner_modification_results_llm.csv")

# Save negation results
negation_results_df = pd.DataFrame(negation_results_data)
negation_results_df.to_csv('ner_negation_type_results_llm.csv', index=False)
print("\nNegation results saved to ner_negation_type_results_llm.csv")



Analyzing results from rahmad directory:
--------------------------------------------------
llama-0shot-concept_replacement_100.csv
concept_replacement_100.csv

Results from llama-0shot-concept_replacement_100.csv:
claude-0shot-capitalization_100_new.csv
capitalization

Results from claude-0shot-capitalization_100_new.csv:
original


100it [00:00, 24184.42it/s]


modified


100it [00:00, 30856.35it/s]




done calculating f1 scores

claude - CAPITALIZATION Modification:
Original Mean F1: 60.677
Modified Mean F1: 56.444
Weighted Delta: -3.774
P-value: 0.0792
Significance: .
gpt4o-0shot-sentiment_100_new.csv
sentiment

Results from gpt4o-0shot-sentiment_100_new.csv:
original


123it [00:00, 36039.08it/s]


modified


123it [00:00, 38342.58it/s]


done calculating f1 scores

gpt4o - SENTIMENT Modification:
Original Mean F1: 57.025
Modified Mean F1: 57.081
Weighted Delta: 0.049
P-value: 0.9547
Significance: ns
llama-0shot-negation_100.csv
negation_100.csv

Results from llama-0shot-negation_100.csv:
llama-0shot-temporal_bias_100.csv
temporal_bias_100.csv

Results from llama-0shot-temporal_bias_100.csv:
llama-0shot-discourse_100_compare.csv
discourse_100_compare.csv

Results from llama-0shot-discourse_100_compare.csv:
gpt4o-0shot-coordinating_conjunction_100.csv
coordinating_conjunction_100.csv

Results from gpt4o-0shot-coordinating_conjunction_100.csv:
llama-0shot-sentiment_100_compare.csv
sentiment_100_compare.csv

Results from llama-0shot-sentiment_100_compare.csv:
llama-0shot-grammatical_role_100.csv
grammatical_role_100.csv

Results from llama-0shot-grammatical_role_100.csv:
gpt4o-0shot-grammatical_role_100.csv
grammatical_role_100.csv

Results from gpt4o-0shot-grammatical_role_100.csv:
claude-0shot-compound_word_100_compare.c

100it [00:00, 42629.37it/s]


modified


100it [00:00, 44131.99it/s]


done calculating f1 scores

gpt4o - PUNCTUATION Modification:
Original Mean F1: 50.585
Modified Mean F1: 47.106
Weighted Delta: -2.965
P-value: 0.0978
Significance: .
claude-0shot-compound_word_100.csv
compound_word_100.csv

Results from claude-0shot-compound_word_100.csv:
llama-0shot-geographical_bias_100.csv
geographical_bias_100.csv

Results from llama-0shot-geographical_bias_100.csv:
gpt4o-0shot-coordinating_conjunction_100_compare.csv
coordinating_conjunction_100_compare.csv

Results from gpt4o-0shot-coordinating_conjunction_100_compare.csv:
llama-0shot-singlish_100_new.csv
singlish

Results from llama-0shot-singlish_100_new.csv:
original


96it [00:00, 43534.78it/s]


modified


96it [00:00, 54700.88it/s]


done calculating f1 scores

llama - SINGLISH Modification:
Original Mean F1: 59.550
Modified Mean F1: 56.677
Weighted Delta: -2.550
P-value: 0.0927
Significance: .
gpt4o-0shot-geographical_bias_100.csv
geographical_bias_100.csv

Results from gpt4o-0shot-geographical_bias_100.csv:
gpt4o-0shot-capitalization_100.csv
capitalization_100.csv

Results from gpt4o-0shot-capitalization_100.csv:
claude-0shot-capitalization_100.csv
capitalization_100.csv

Results from claude-0shot-capitalization_100.csv:
llama-0shot-punctuation_100.csv
punctuation_100.csv

Results from llama-0shot-punctuation_100.csv:
gpt4o-0shot-derivation_100_new.csv
derivation

Results from gpt4o-0shot-derivation_100_new.csv:
original


69it [00:00, 41196.72it/s]


modified


69it [00:00, 40836.32it/s]


done calculating f1 scores

gpt4o - DERIVATION Modification:
Original Mean F1: 55.509
Modified Mean F1: 55.502
Weighted Delta: -0.006
P-value: 0.9396
Significance: ns
claude-0shot-derivation_100_new.csv
derivation

Results from claude-0shot-derivation_100_new.csv:
original


69it [00:00, 42938.72it/s]


modified


69it [00:00, 43532.94it/s]


done calculating f1 scores

claude - DERIVATION Modification:
Original Mean F1: 56.304
Modified Mean F1: 54.351
Weighted Delta: -1.709
P-value: 0.0679
Significance: .
claude-0shot-discourse_100_compare.csv
discourse_100_compare.csv

Results from claude-0shot-discourse_100_compare.csv:
llama-0shot-compound_word_100_compare.csv
compound_word_100_compare.csv

Results from llama-0shot-compound_word_100_compare.csv:
claude-0shot-sentiment_100_compare.csv
sentiment_100_compare.csv

Results from claude-0shot-sentiment_100_compare.csv:
gpt4o-0shot-casual_100_compare.csv
casual_100_compare.csv

Results from gpt4o-0shot-casual_100_compare.csv:
claude-0shot-punctuation_100_new.csv
punctuation

Results from claude-0shot-punctuation_100_new.csv:
original


100it [00:00, 45704.52it/s]


modified


100it [00:00, 50430.49it/s]


done calculating f1 scores

claude - PUNCTUATION Modification:
Original Mean F1: 45.389
Modified Mean F1: 37.331
Weighted Delta: -6.676
P-value: 0.0007
Significance: **
llama-0shot-length_bias_100.csv
length_bias_100.csv

Results from llama-0shot-length_bias_100.csv:
claude-0shot-casual_100_compare.csv
casual_100_compare.csv

Results from claude-0shot-casual_100_compare.csv:
gpt4o-0shot-discourse_100_new.csv
discourse

Results from gpt4o-0shot-discourse_100_new.csv:
original


72it [00:00, 40313.69it/s]


modified


72it [00:00, 33821.24it/s]


done calculating f1 scores

gpt4o - DISCOURSE Modification:
Original Mean F1: 54.912
Modified Mean F1: 55.007
Weighted Delta: 0.083
P-value: 0.8125
Significance: ns
llama-0shot-capitalization_100.csv
capitalization_100.csv

Results from llama-0shot-capitalization_100.csv:
claude-0shot-temporal_bias_100_new.csv
temporal_bias

Results from claude-0shot-temporal_bias_100_new.csv:
original


91it [00:00, 47668.50it/s]


modified


91it [00:00, 44866.78it/s]


done calculating f1 scores

claude - TEMPORAL_BIAS Modification:
Original Mean F1: 53.267
Modified Mean F1: 48.995
Weighted Delta: -3.688
P-value: 0.0180
Significance: *
claude-0shot-negation_100_new.csv
negation

Results from claude-0shot-negation_100_new.csv:
original


110it [00:00, 44754.43it/s]


modified


110it [00:00, 42950.42it/s]


done calculating f1 scores

claude - NEGATION Modification:
Original Mean F1: 46.497
Modified Mean F1: 46.388
Weighted Delta: -0.091
P-value: 0.6768
Significance: ns
gpt4o-0shot-dialectal_100_new.csv
dialectal

Results from gpt4o-0shot-dialectal_100_new.csv:
original


99it [00:00, 47596.98it/s]


modified


99it [00:00, 43380.29it/s]


done calculating f1 scores

gpt4o - DIALECTAL Modification:
Original Mean F1: 51.646
Modified Mean F1: 54.935
Weighted Delta: 2.817
P-value: 0.2043
Significance: ns
llama-0shot-coordinating_conjunction_100_new.csv
coordinating_conjunction

Results from llama-0shot-coordinating_conjunction_100_new.csv:
original


61it [00:00, 33223.29it/s]


modified


61it [00:00, 30455.01it/s]


done calculating f1 scores

llama - COORDINATING_CONJUNCTION Modification:
Original Mean F1: 73.382
Modified Mean F1: 73.627
Weighted Delta: 0.229
P-value: 0.6528
Significance: ns
llama-0shot-casual_100_compare.csv
casual_100_compare.csv

Results from llama-0shot-casual_100_compare.csv:
claude-0shot-active_to_passive_100.csv
active_to_passive_100.csv

Results from claude-0shot-active_to_passive_100.csv:
llama-0shot-derivation_100_new.csv
derivation

Results from llama-0shot-derivation_100_new.csv:
original


69it [00:00, 43756.72it/s]


modified


69it [00:00, 39204.41it/s]


done calculating f1 scores

llama - DERIVATION Modification:
Original Mean F1: 60.106
Modified Mean F1: 58.870
Weighted Delta: -1.099
P-value: 0.1441
Significance: ns
gpt4o-0shot-discourse_100.csv
discourse_100.csv

Results from gpt4o-0shot-discourse_100.csv:
gpt4o-0shot-typo_bias_100_new.csv
typo_bias

Results from gpt4o-0shot-typo_bias_100_new.csv:
original


100it [00:00, 44897.28it/s]


modified


100it [00:00, 41634.94it/s]


done calculating f1 scores

gpt4o - TYPO_BIAS Modification:
Original Mean F1: 49.703
Modified Mean F1: 50.250
Weighted Delta: 0.464
P-value: 0.7088
Significance: ns
claude-0shot-temporal_bias_100.csv
temporal_bias_100.csv

Results from claude-0shot-temporal_bias_100.csv:
claude-0shot-typo_bias_100.csv
typo_bias_100.csv

Results from claude-0shot-typo_bias_100.csv:
claude-0shot-coordinating_conjunction_100.csv
coordinating_conjunction_100.csv

Results from claude-0shot-coordinating_conjunction_100.csv:
gpt4o-0shot-compound_word_100_compare.csv
compound_word_100_compare.csv

Results from gpt4o-0shot-compound_word_100_compare.csv:
llama-0shot-punctuation_100_new.csv
punctuation

Results from llama-0shot-punctuation_100_new.csv:
original


100it [00:00, 43500.35it/s]


modified


100it [00:00, 41577.16it/s]


done calculating f1 scores

llama - PUNCTUATION Modification:
Original Mean F1: 53.950
Modified Mean F1: 47.381
Weighted Delta: -5.689
P-value: 0.0030
Significance: **
claude-0shot-active_to_passive_100_compare.csv
active_to_passive_100_compare.csv

Results from claude-0shot-active_to_passive_100_compare.csv:
llama-0shot-singlish_100.csv
singlish_100.csv

Results from llama-0shot-singlish_100.csv:
claude-0shot-dialectal_100.csv
dialectal_100.csv

Results from claude-0shot-dialectal_100.csv:
claude-0shot-capitalization_100_compare.csv
capitalization_100_compare.csv

Results from claude-0shot-capitalization_100_compare.csv:
claude-0shot-concept_replacement_100_new.csv
concept_replacement

Results from claude-0shot-concept_replacement_100_new.csv:
original


85it [00:00, 46067.43it/s]


modified


85it [00:00, 39758.65it/s]


done calculating f1 scores

claude - CONCEPT_REPLACEMENT Modification:
Original Mean F1: 49.185
Modified Mean F1: 49.104
Weighted Delta: -0.069
P-value: 0.9161
Significance: ns
gpt4o-0shot-derivation_100_compare.csv
derivation_100_compare.csv

Results from gpt4o-0shot-derivation_100_compare.csv:
claude-0shot-length_bias_100_compare.csv
length_bias_100_compare.csv

Results from claude-0shot-length_bias_100_compare.csv:
gpt4o-0shot-singlish_100_new.csv
singlish

Results from gpt4o-0shot-singlish_100_new.csv:
original


96it [00:00, 39902.21it/s]


modified


96it [00:00, 53036.51it/s]


done calculating f1 scores

gpt4o - SINGLISH Modification:
Original Mean F1: 54.252
Modified Mean F1: 53.290
Weighted Delta: -0.835
P-value: 0.8925
Significance: ns
llama-0shot-active_to_passive_100.csv
active_to_passive_100.csv

Results from llama-0shot-active_to_passive_100.csv:
gpt4o-0shot-temporal_bias_100_new.csv
temporal_bias

Results from gpt4o-0shot-temporal_bias_100_new.csv:
original


91it [00:00, 42823.03it/s]


modified


91it [00:00, 40321.33it/s]


done calculating f1 scores

gpt4o - TEMPORAL_BIAS Modification:
Original Mean F1: 57.036
Modified Mean F1: 53.207
Weighted Delta: -3.362
P-value: 0.0275
Significance: *
gpt4o-0shot-active_to_passive_100_compare.csv
active_to_passive_100_compare.csv

Results from gpt4o-0shot-active_to_passive_100_compare.csv:
claude-0shot-concept_replacement_100.csv
concept_replacement_100.csv

Results from claude-0shot-concept_replacement_100.csv:
gpt4o-0shot-capitalization_100_compare.csv
capitalization_100_compare.csv

Results from gpt4o-0shot-capitalization_100_compare.csv:
llama-0shot-compound_word_100.csv
compound_word_100.csv

Results from llama-0shot-compound_word_100.csv:
gpt4o-0shot-active_to_passive_100.csv
active_to_passive_100.csv

Results from gpt4o-0shot-active_to_passive_100.csv:
llama-0shot-temporal_bias_100_new.csv
temporal_bias

Results from llama-0shot-temporal_bias_100_new.csv:
original


91it [00:00, 36402.64it/s]


modified


91it [00:00, 37099.70it/s]


done calculating f1 scores

llama - TEMPORAL_BIAS Modification:
Original Mean F1: 57.115
Modified Mean F1: 55.802
Weighted Delta: -1.154
P-value: 0.1730
Significance: ns
llama-0shot-geographical_bias_100_compare.csv
geographical_bias_100_compare.csv

Results from llama-0shot-geographical_bias_100_compare.csv:
gpt4o-0shot-punctuation_100_compare.csv
punctuation_100_compare.csv

Results from gpt4o-0shot-punctuation_100_compare.csv:
claude-0shot-grammatical_role_100_new.csv
grammatical_role

Results from claude-0shot-grammatical_role_100_new.csv:
original


83it [00:00, 32395.98it/s]


modified


83it [00:00, 38125.86it/s]


done calculating f1 scores

claude - GRAMMATICAL_ROLE Modification:
Original Mean F1: 60.329
Modified Mean F1: 57.999
Weighted Delta: -2.075
P-value: 0.0853
Significance: .
llama-0shot-ner.csv
llama-0shot-capitalization_100_compare.csv
capitalization_100_compare.csv

Results from llama-0shot-capitalization_100_compare.csv:
gpt4o-0shot-derivation_100.csv
derivation_100.csv

Results from gpt4o-0shot-derivation_100.csv:
llama-0shot-dialectal_100.csv
dialectal_100.csv

Results from llama-0shot-dialectal_100.csv:
claude-0shot-length_bias_100_new.csv
length_bias

Results from claude-0shot-length_bias_100_new.csv:
original


92it [00:00, 40764.42it/s]


modified


92it [00:00, 45343.83it/s]


done calculating f1 scores

claude - LENGTH_BIAS Modification:
Original Mean F1: 48.274
Modified Mean F1: 43.352
Weighted Delta: -4.144
P-value: 0.2576
Significance: ns
llama-0shot-sentiment_100_new.csv
sentiment

Results from llama-0shot-sentiment_100_new.csv:
original


123it [00:00, 37775.46it/s]


modified


123it [00:00, 42210.72it/s]


done calculating f1 scores

llama - SENTIMENT Modification:
Original Mean F1: 59.942
Modified Mean F1: 58.206
Weighted Delta: -1.543
P-value: 0.1921
Significance: ns
llama-0shot-concept_replacement_100_new.csv
concept_replacement

Results from llama-0shot-concept_replacement_100_new.csv:
original


85it [00:00, 35417.83it/s]


modified


85it [00:00, 29048.79it/s]


done calculating f1 scores

llama - CONCEPT_REPLACEMENT Modification:
Original Mean F1: 56.100
Modified Mean F1: 53.173
Weighted Delta: -2.559
P-value: 0.1257
Significance: ns
llama-0shot-casual_100_new.csv
casual

Results from llama-0shot-casual_100_new.csv:
original


98it [00:00, 37634.30it/s]


modified


98it [00:00, 37562.08it/s]


done calculating f1 scores

llama - CASUAL Modification:
Original Mean F1: 57.339
Modified Mean F1: 54.858
Weighted Delta: -2.181
P-value: 0.3201
Significance: ns
claude-0shot-typo_bias_100_new.csv
typo_bias

Results from claude-0shot-typo_bias_100_new.csv:
original


100it [00:00, 46402.30it/s]


modified


100it [00:00, 40563.87it/s]


done calculating f1 scores

claude - TYPO_BIAS Modification:
Original Mean F1: 46.693
Modified Mean F1: 45.667
Weighted Delta: -0.857
P-value: 0.2367
Significance: ns
llama-0shot-concept_replacement_100_compare.csv
concept_replacement_100_compare.csv

Results from llama-0shot-concept_replacement_100_compare.csv:
llama-0shot-typo_bias_100.csv
typo_bias_100.csv

Results from llama-0shot-typo_bias_100.csv:
gpt4o-0shot-temporal_bias_100.csv
temporal_bias_100.csv

Results from gpt4o-0shot-temporal_bias_100.csv:
llama-0shot-punctuation_100_compare.csv
punctuation_100_compare.csv

Results from llama-0shot-punctuation_100_compare.csv:
claude-0shot-dialectal_100_new.csv
dialectal

Results from claude-0shot-dialectal_100_new.csv:
original


99it [00:00, 10144.53it/s]


modified


99it [00:00, 10297.75it/s]


done calculating f1 scores

claude - DIALECTAL Modification:
Original Mean F1: 49.416
Modified Mean F1: 47.940
Weighted Delta: -1.250
P-value: 0.5723
Significance: ns
claude-0shot-temporal_bias_100_compare.csv
temporal_bias_100_compare.csv

Results from claude-0shot-temporal_bias_100_compare.csv:
gpt4o-0shot-dialectal_100_compare.csv
dialectal_100_compare.csv

Results from gpt4o-0shot-dialectal_100_compare.csv:
claude-0shot-discourse_100_new.csv
discourse

Results from claude-0shot-discourse_100_new.csv:
original


72it [00:00, 11189.37it/s]


modified


72it [00:00, 38421.11it/s]


done calculating f1 scores

claude - DISCOURSE Modification:
Original Mean F1: 53.172
Modified Mean F1: 52.152
Weighted Delta: -0.880
P-value: 0.2785
Significance: ns
claude-0shot-compound_word_100_new.csv
compound_word

Results from claude-0shot-compound_word_100_new.csv:
original


86it [00:00, 35802.50it/s]


modified


86it [00:00, 24048.95it/s]


done calculating f1 scores

claude - COMPOUND_WORD Modification:
Original Mean F1: 51.450
Modified Mean F1: 49.718
Weighted Delta: -1.483
P-value: 0.1088
Significance: ns
gpt4o-0shot-length_bias_100_new.csv
length_bias

Results from gpt4o-0shot-length_bias_100_new.csv:
original


92it [00:00, 19246.64it/s]


modified


92it [00:00, 31176.86it/s]


done calculating f1 scores

gpt4o - LENGTH_BIAS Modification:
Original Mean F1: 50.422
Modified Mean F1: 55.338
Weighted Delta: 4.185
P-value: 0.0750
Significance: .
llama-0shot-dialectal_100_compare.csv
dialectal_100_compare.csv

Results from llama-0shot-dialectal_100_compare.csv:
gpt4o-0shot-grammatical_role_100_new.csv
grammatical_role

Results from gpt4o-0shot-grammatical_role_100_new.csv:
original


83it [00:00, 27686.28it/s]


modified


83it [00:00, 32845.29it/s]

done calculating f1 scores

gpt4o - GRAMMATICAL_ROLE Modification:
Original Mean F1: 59.695
Modified Mean F1: 61.743
Weighted Delta: 1.818
P-value: 0.1935
Significance: ns
claude-0shot-active_to_passive_100_new.csv
active_to_passive

Results from claude-0shot-active_to_passive_100_new.csv:





original


81it [00:00, 48478.68it/s]


modified


81it [00:00, 47396.57it/s]


done calculating f1 scores

claude - ACTIVE_TO_PASSIVE Modification:
Original Mean F1: 51.052
Modified Mean F1: 52.909
Weighted Delta: 1.586
P-value: 0.3326
Significance: ns
llama-0shot-geographical_bias_100_new.csv
geographical_bias

Results from llama-0shot-geographical_bias_100_new.csv:
original


102it [00:00, 29623.25it/s]


modified


102it [00:00, 34390.60it/s]


done calculating f1 scores

llama - GEOGRAPHICAL_BIAS Modification:
Original Mean F1: 62.562
Modified Mean F1: 63.299
Weighted Delta: 0.662
P-value: 0.8502
Significance: ns
llama-0shot-grammatical_role_100_new.csv
grammatical_role

Results from llama-0shot-grammatical_role_100_new.csv:
original


83it [00:00, 30029.09it/s]


modified


83it [00:00, 38655.03it/s]

done calculating f1 scores

llama - GRAMMATICAL_ROLE Modification:
Original Mean F1: 64.082
Modified Mean F1: 63.701
Weighted Delta: -0.344
P-value: 0.7554
Significance: ns
llama-0shot-temporal_bias_100_compare.csv
temporal_bias_100_compare.csv

Results from llama-0shot-temporal_bias_100_compare.csv:
llama-0shot-coordinating_conjunction_100_compare.csv
coordinating_conjunction_100_compare.csv

Results from llama-0shot-coordinating_conjunction_100_compare.csv:
claude-0shot-grammatical_role_100_compare.csv
grammatical_role_100_compare.csv

Results from claude-0shot-grammatical_role_100_compare.csv:
claude-0shot-dialectal_100_compare.csv
dialectal_100_compare.csv

Results from claude-0shot-dialectal_100_compare.csv:
gpt4o-0shot-compound_word_100_new.csv
compound_word

Results from gpt4o-0shot-compound_word_100_new.csv:





original


86it [00:00, 44466.24it/s]


modified


86it [00:00, 42788.87it/s]


done calculating f1 scores

gpt4o - COMPOUND_WORD Modification:
Original Mean F1: 54.489
Modified Mean F1: 52.890
Weighted Delta: -1.388
P-value: 0.5281
Significance: ns
gpt4o-DP.csv
claude-0shot-geographical_bias_100_new.csv
geographical_bias

Results from claude-0shot-geographical_bias_100_new.csv:
original


102it [00:00, 31706.74it/s]


modified


102it [00:00, 39675.32it/s]

done calculating f1 scores

claude - GEOGRAPHICAL_BIAS Modification:
Original Mean F1: 65.997
Modified Mean F1: 61.513
Weighted Delta: -4.079
P-value: 0.2847
Significance: ns
claude-0shot-singlish_100.csv
singlish_100.csv

Results from claude-0shot-singlish_100.csv:
llama-0shot-active_to_passive_100_new.csv
active_to_passive

Results from llama-0shot-active_to_passive_100_new.csv:





original


81it [00:00, 35792.10it/s]


modified


81it [00:00, 42994.00it/s]


done calculating f1 scores

llama - ACTIVE_TO_PASSIVE Modification:
Original Mean F1: 54.761
Modified Mean F1: 56.830
Weighted Delta: 1.799
P-value: 0.2198
Significance: ns
llama-0shot-compound_word_100_new.csv
compound_word

Results from llama-0shot-compound_word_100_new.csv:
original


86it [00:00, 43438.12it/s]


modified


86it [00:00, 41513.42it/s]


done calculating f1 scores

llama - COMPOUND_WORD Modification:
Original Mean F1: 58.061
Modified Mean F1: 57.092
Weighted Delta: -0.855
P-value: 0.1797
Significance: ns
claude-0shot-coordinating_conjunction_100_compare.csv
coordinating_conjunction_100_compare.csv

Results from claude-0shot-coordinating_conjunction_100_compare.csv:
gpt4o-0shot-capitalization_100_new.csv
capitalization

Results from gpt4o-0shot-capitalization_100_new.csv:
original


100it [00:00, 47820.13it/s]


modified


100it [00:00, 46065.94it/s]


done calculating f1 scores

gpt4o - CAPITALIZATION Modification:
Original Mean F1: 55.648
Modified Mean F1: 55.061
Weighted Delta: -0.512
P-value: 0.7260
Significance: ns
gpt4o-0shot-sentiment_100.csv
sentiment_100.csv

Results from gpt4o-0shot-sentiment_100.csv:
claude-0shot-derivation_100.csv
derivation_100.csv

Results from claude-0shot-derivation_100.csv:
llama-0shot-negation_100_new.csv
negation

Results from llama-0shot-negation_100_new.csv:
original


110it [00:00, 32079.92it/s]


modified


110it [00:00, 36457.80it/s]


done calculating f1 scores

llama - NEGATION Modification:
Original Mean F1: 50.035
Modified Mean F1: 49.231
Weighted Delta: -0.683
P-value: 0.6871
Significance: ns
gpt4o-0shot-punctuation_100.csv
punctuation_100.csv

Results from gpt4o-0shot-punctuation_100.csv:
gpt4o-0shot-length_bias_100.csv
length_bias_100.csv

Results from gpt4o-0shot-length_bias_100.csv:
llama-0shot-active_to_passive_100_compare.csv
active_to_passive_100_compare.csv

Results from llama-0shot-active_to_passive_100_compare.csv:
gpt4o-0shot-typo_bias_100_compare.csv
typo_bias_100_compare.csv

Results from gpt4o-0shot-typo_bias_100_compare.csv:
gpt4o-0shot-dialectal_100.csv
dialectal_100.csv

Results from gpt4o-0shot-dialectal_100.csv:
gpt4o-0shot-geographical_bias_100_new.csv
geographical_bias

Results from gpt4o-0shot-geographical_bias_100_new.csv:
original


102it [00:00, 29990.82it/s]


modified


102it [00:00, 34776.38it/s]


done calculating f1 scores

gpt4o - GEOGRAPHICAL_BIAS Modification:
Original Mean F1: 64.277
Modified Mean F1: 69.178
Weighted Delta: 4.430
P-value: 0.0651
Significance: .
claude-0shot-negation_100_compare.csv
negation_100_compare.csv

Results from claude-0shot-negation_100_compare.csv:
gpt4o-0shot-grammatical_role_100_compare.csv
grammatical_role_100_compare.csv

Results from gpt4o-0shot-grammatical_role_100_compare.csv:
llama-0shot-typo_bias_100_compare.csv
typo_bias_100_compare.csv

Results from llama-0shot-typo_bias_100_compare.csv:
gpt4o-0shot-typo_bias_100.csv
typo_bias_100.csv

Results from gpt4o-0shot-typo_bias_100.csv:
llama-0shot-discourse_100_new.csv
discourse

Results from llama-0shot-discourse_100_new.csv:
original


72it [00:00, 37938.43it/s]


modified


72it [00:00, 40886.80it/s]


done calculating f1 scores

llama - DISCOURSE Modification:
Original Mean F1: 58.555
Modified Mean F1: 57.366
Weighted Delta: -1.050
P-value: 0.3946
Significance: ns
gpt4o-0shot-geographical_bias_100_compare.csv
geographical_bias_100_compare.csv

Results from gpt4o-0shot-geographical_bias_100_compare.csv:
claude-0shot-negation_100.csv
negation_100.csv

Results from claude-0shot-negation_100.csv:
llama-0shot-dialectal_100_new.csv
dialectal

Results from llama-0shot-dialectal_100_new.csv:
original


99it [00:00, 44103.67it/s]


modified


99it [00:00, 47832.75it/s]

done calculating f1 scores

llama - DIALECTAL Modification:
Original Mean F1: 57.443
Modified Mean F1: 56.364
Weighted Delta: -0.949
P-value: 0.6460
Significance: ns
llama-0shot-capitalization_100_new.csv
capitalization

Results from llama-0shot-capitalization_100_new.csv:
original



100it [00:00, 47760.24it/s]


modified


100it [00:00, 48782.32it/s]


done calculating f1 scores

llama - CAPITALIZATION Modification:
Original Mean F1: 62.331
Modified Mean F1: 55.166
Weighted Delta: -6.430
P-value: 0.0410
Significance: *
llama-0shot-derivation_100.csv
derivation_100.csv

Results from llama-0shot-derivation_100.csv:
claude-0shot-concept_replacement_100_compare.csv
concept_replacement_100_compare.csv

Results from claude-0shot-concept_replacement_100_compare.csv:
llama-0shot-typo_bias_100_new.csv
typo_bias

Results from llama-0shot-typo_bias_100_new.csv:
original


100it [00:00, 45104.89it/s]


modified


100it [00:00, 37409.06it/s]


done calculating f1 scores

llama - TYPO_BIAS Modification:
Original Mean F1: 53.180
Modified Mean F1: 54.728
Weighted Delta: 1.336
P-value: 0.2289
Significance: ns
llama-0shot-grammatical_role_100_compare.csv
grammatical_role_100_compare.csv

Results from llama-0shot-grammatical_role_100_compare.csv:
claude-0shot-sentiment_100_new.csv
sentiment

Results from claude-0shot-sentiment_100_new.csv:
original


123it [00:00, 48278.06it/s]


modified


123it [00:00, 49058.52it/s]


done calculating f1 scores

claude - SENTIMENT Modification:
Original Mean F1: 54.543
Modified Mean F1: 51.578
Weighted Delta: -2.575
P-value: 0.0172
Significance: *
llama-DP.csv
claude-0shot-singlish_100_new.csv
singlish

Results from claude-0shot-singlish_100_new.csv:
original


96it [00:00, 48629.61it/s]


modified


96it [00:00, 60842.13it/s]


done calculating f1 scores

claude - SINGLISH Modification:
Original Mean F1: 52.187
Modified Mean F1: 47.124
Weighted Delta: -4.348
P-value: 0.0493
Significance: *
claude-0shot-geographical_bias_100_compare.csv
geographical_bias_100_compare.csv

Results from claude-0shot-geographical_bias_100_compare.csv:
llama-0shot-length_bias_100_new.csv
length_bias

Results from llama-0shot-length_bias_100_new.csv:
original


92it [00:00, 43114.63it/s]


modified


92it [00:00, 42898.94it/s]


done calculating f1 scores

llama - LENGTH_BIAS Modification:
Original Mean F1: 54.248
Modified Mean F1: 55.622
Weighted Delta: 1.192
P-value: 0.5196
Significance: ns
claude-0shot-ner.csv
gpt4o-0shot-casual_100_new.csv
casual

Results from gpt4o-0shot-casual_100_new.csv:
original


98it [00:00, 41141.21it/s]


modified


98it [00:00, 43345.12it/s]


done calculating f1 scores

gpt4o - CASUAL Modification:
Original Mean F1: 56.961
Modified Mean F1: 57.416
Weighted Delta: 0.399
P-value: 0.3318
Significance: ns
claude-0shot-derivation_100_compare.csv
derivation_100_compare.csv

Results from claude-0shot-derivation_100_compare.csv:
llama-0shot-negation_100_compare.csv
negation_100_compare.csv

Results from llama-0shot-negation_100_compare.csv:
claude-0shot-discourse_100.csv
discourse_100.csv

Results from claude-0shot-discourse_100.csv:
claude-0shot-coordinating_conjunction_100_new.csv
coordinating_conjunction

Results from claude-0shot-coordinating_conjunction_100_new.csv:
original


61it [00:00, 36903.58it/s]


modified


61it [00:00, 30214.05it/s]

done calculating f1 scores

claude - COORDINATING_CONJUNCTION Modification:
Original Mean F1: 69.680
Modified Mean F1: 69.382
Weighted Delta: -0.275
P-value: 0.7167
Significance: ns
gpt4o-0shot-length_bias_100_compare.csv
length_bias_100_compare.csv

Results from gpt4o-0shot-length_bias_100_compare.csv:
gpt4o-0shot-coordinating_conjunction_100_new.csv
coordinating_conjunction

Results from gpt4o-0shot-coordinating_conjunction_100_new.csv:





original


61it [00:00, 31091.57it/s]


modified


61it [00:00, 31102.91it/s]


done calculating f1 scores

gpt4o - COORDINATING_CONJUNCTION Modification:
Original Mean F1: 64.273
Modified Mean F1: 68.988
Weighted Delta: 4.263
P-value: 0.0411
Significance: *
claude-0shot-casual_100_new.csv
casual

Results from claude-0shot-casual_100_new.csv:
original


98it [00:00, 49003.55it/s]


modified


98it [00:00, 42349.25it/s]


done calculating f1 scores

claude - CASUAL Modification:
Original Mean F1: 50.492
Modified Mean F1: 49.287
Weighted Delta: -1.027
P-value: 0.5316
Significance: ns
gpt4o-0shot-negation_100.csv
negation_100.csv

Results from gpt4o-0shot-negation_100.csv:
gpt4o-0shot-negation_100_compare.csv
negation_100_compare.csv

Results from gpt4o-0shot-negation_100_compare.csv:
claude-0shot-typo_bias_100_compare.csv
typo_bias_100_compare.csv

Results from claude-0shot-typo_bias_100_compare.csv:
gpt4o-0shot-concept_replacement_100.csv
concept_replacement_100.csv

Results from gpt4o-0shot-concept_replacement_100.csv:
gpt4o-0shot-compound_word_100.csv
compound_word_100.csv

Results from gpt4o-0shot-compound_word_100.csv:
gpt4o-0shot-negation_100_new.csv
negation

Results from gpt4o-0shot-negation_100_new.csv:
original


110it [00:00, 37324.93it/s]


modified


110it [00:00, 36317.18it/s]


done calculating f1 scores

gpt4o - NEGATION Modification:
Original Mean F1: 50.776
Modified Mean F1: 50.885
Weighted Delta: 0.093
P-value: 0.4870
Significance: ns
llama-0shot-sentiment_100.csv
sentiment_100.csv

Results from llama-0shot-sentiment_100.csv:
gpt4o-0shot-active_to_passive_100_new.csv
active_to_passive

Results from gpt4o-0shot-active_to_passive_100_new.csv:
original


81it [00:00, 39001.10it/s]


modified


81it [00:00, 42999.45it/s]


done calculating f1 scores

gpt4o - ACTIVE_TO_PASSIVE Modification:
Original Mean F1: 52.995
Modified Mean F1: 55.889
Weighted Delta: 2.495
P-value: 0.3258
Significance: ns
gpt4o-0shot-concept_replacement_100_new.csv
concept_replacement

Results from gpt4o-0shot-concept_replacement_100_new.csv:
original


85it [00:00, 29951.76it/s]


modified


85it [00:00, 38688.64it/s]


done calculating f1 scores

gpt4o - CONCEPT_REPLACEMENT Modification:
Original Mean F1: 52.962
Modified Mean F1: 57.130
Weighted Delta: 3.593
P-value: 0.0464
Significance: *
claude-0shot-punctuation_100_compare.csv
punctuation_100_compare.csv

Results from claude-0shot-punctuation_100_compare.csv:
gpt4o-0shot-temporal_bias_100_compare.csv
temporal_bias_100_compare.csv

Results from gpt4o-0shot-temporal_bias_100_compare.csv:

Results saved to ner_modification_results_llm.csv

Negation results saved to ner_negation_type_results_llm.csv


In [118]:
# Load the per-modification result tables written to the `_llm` and `_plm` files
llm_results = pd.read_csv("ner_modification_results_llm.csv")
plm_results = pd.read_csv("ner_modification_results_plm.csv")

# Stack them row-wise (LLM rows first) and renumber the index from 0
combined_results = pd.concat(
    (llm_results, plm_results),
    ignore_index=True,
)

# Write the merged table without the index column
combined_results.to_csv(
    "ner_modification_results_combined.csv",
    index=False,
)
print("\nCombined results saved to ner_modification_results_combined.csv")

# Same merge for the negation-type breakdown
negation_results_llm = pd.read_csv("ner_negation_type_results_llm.csv")
negation_results_plm = pd.read_csv("ner_negation_type_results_plm.csv")

combined_negation_results = pd.concat(
    (negation_results_llm, negation_results_plm),
    ignore_index=True,
)

combined_negation_results.to_csv(
    "ner_negation_type_results_combined.csv",
    index=False,
)
print("\nCombined negation results saved to ner_negation_type_results_combined.csv")



Combined results saved to ner_modification_results_combined.csv

Combined negation results saved to ner_negation_type_results_combined.csv


In [119]:
# Display order of the abbreviated "<category prefix>: <modification>" labels.
# The prefixes appear to abbreviate the category names used in `mod_mapping`
# (B=Bias, O=Orthographic, M=Morphological, Sx=Syntactic, Sm=Semantic,
# P=Pragmatic, G=Genre) -- TODO confirm.
# NOTE(review): negation is labelled "P: Neg" here, whereas `mod_mapping`
# files it under 'Semantic' -- confirm which grouping is intended.
modification_order = [
    "B: Tem", "B: Geo", "B: Len",
    "O: Spell", "O: Cap", "O: Punc",
    "M: Deri", "M: Com",
    "Sx: Voice", "Sx: Gra", "Sx: Conj",
    "Sm: Con",
    "P: Neg", "P: Disc", "P: Senti",
    "G: Cas", "G: Dial", "G: Sing",
]

In [120]:
import numpy as np

# Load the merged per-model, per-modification results
df = pd.read_csv('ner_modification_results_combined.csv')

# Raw modification key -> (category, display name).
# The insertion order of this dict is also the row order used when the
# LaTeX table is generated further down in this cell.
mod_mapping = {
    # Bias
    'temporal_bias': ('Bias', 'Temporal'),
    'geographical_bias': ('Bias', 'Geographical'),
    'length_bias': ('Bias', 'Length'),
    # Orthographic
    'typo_bias': ('Orthographic', 'Spelling'),
    'capitalization': ('Orthographic', 'Capitalization'),
    'punctuation': ('Orthographic', 'Punctuation'),
    # Morphological
    'derivation': ('Morphological', 'Derivation'),
    'compound_word': ('Morphological', 'Compound'),
    # Syntactic
    'active_to_passive': ('Syntactic', 'Voice'),
    'grammatical_role': ('Syntactic', 'Grammar'),
    'coordinating_conjunction': ('Syntactic', 'Conjunction'),
    # Semantic
    'concept_replacement': ('Semantic', 'Concept'),
    'negation': ('Semantic', 'Negation'),
    # Pragmatic
    'discourse': ('Pragmatic', 'Discourse'),
    'sentiment': ('Pragmatic', 'Sentiment'),
    # Genre
    'casual': ('Genre', 'Casual'),
    'dialectal': ('Genre', 'Dialectal'),
    'singlish': ('Genre', 'Singlish'),
}

# Column order for the table, plus the raw model names that must be renamed
# so they match the entries of `model_order`
model_order = ['BERT', 'GPT-2', 'T5', 'GPT-4o', 'Claude 3.5', 'Llama 3.1']
model_map = {
    'gpt4o': 'GPT-4o',
    'claude': 'Claude 3.5',
    'llama': 'Llama 3.1',
    'GPT2': 'GPT-2',
}
df['model'] = df['model'].replace(model_map)

# Derive the category from the raw key first, then overwrite the raw key with
# its display name. A key missing from `mod_mapping` raises KeyError here.
raw_modification = df['modification']
df['category'] = raw_modification.map(lambda key: mod_mapping[key][0])
df['modification'] = raw_modification.map(lambda key: mod_mapping[key][1])

# One (category, modification) x model table per reported quantity
shared_pivot_args = {'index': ['category', 'modification'], 'columns': 'model'}
pivot_df = df.pivot(values='weighted_delta', **shared_pivot_args)
p_values = df.pivot(values='p_value', **shared_pivot_args)
significance = df.pivot(values='significance', **shared_pivot_args)

print(pivot_df)

# Function to generate color based on value
def get_color(val, sig):
    """Render one table cell as a shaded LaTeX cellcolor entry.

    Args:
        val: Numeric cell value. NaN yields an empty cell.
        sig: Significance marker. '.', '*' and '**' put the number in bold;
            '*' and '**' are additionally appended after it. Any other
            value leaves the number plain.

    Returns:
        '' for NaN, otherwise a cellcolor command followed by the value
        formatted to two decimals. Strictly positive values are green and
        carry an explicit '+'; zero and negative values are red.
    """
    if np.isnan(val):
        return ''

    is_gain = val > 0
    shade = 'green' if is_gain else 'red'
    val_str = f'+{val:.2f}' if is_gain else f'{val:.2f}'

    if sig in ('.', '*', '**'):
        # '.' only bolds the number; the star markers are also printed after it.
        stars = '' if sig == '.' else sig
        val_str = f'\\textbf{{{val_str}}}{stars}'

    # Tint strength grows with |val| and saturates at 30 once |val| reaches 10.
    intensity = min(abs(val)/10, 1)
    return f'\\cellcolor{{{shade}!{int(intensity*30)}}} {val_str}'

# Generate LaTeX table
# Column spec: two left-aligned label columns (category, modification) plus one
# right-aligned column per model, i.e. exactly the cells each row emits below.
# (The previous spec declared one extra 'r' column that no row ever filled.)
header_cells = ' & '.join([f'\\textbf{{{col}}}' for col in model_order])
table_lines = [
    '\\begin{table}[h]',
    '\\centering',
    '\\resizebox{\\linewidth}{!}{',
    '\\begin{tabular}{ll' + 'r' * len(model_order) + '}',
    '\\hline',
    'Category & Modification & ' + header_cells + ' \\\\',
    '\\hline',
]

# Keep original order from mod_mapping
categories_seen = []
for category, modification in mod_mapping.values():
    if category not in categories_seen:
        if categories_seen:  # Add hline between categories except before first
            table_lines.append('\\hline')
        categories_seen.append(category)
        # The category label is printed only on the first row of its group
        row_start = f'\\textbf{{{category}}}'
    else:
        row_start = ' '

    row_key = (category, modification)
    cells = ' & '.join([get_color(pivot_df.loc[row_key, col], significance.loc[row_key, col])
                        for col in model_order])
    table_lines.append(f'{row_start} & \\textbf{{{modification}}} & ' + cells + ' \\\\')

table_lines += [
    '\\hline',
    '\\end{tabular}}',  # second brace closes the \resizebox group
    '\\caption{Weighted Delta Score by Model and Modification Type}',
    '\\label{tab:ner_results}',
    '\\end{table}',
]
latex_table = '\n'.join(table_lines)

# Save to file
with open('ner_results_table.tex', 'w') as f:
    f.write(latex_table)

print("LaTeX table saved to ner_results_table.tex")


model                              BERT  Claude 3.5      GPT-2    GPT-4o  \
category      modification                                                 
Bias          Geographical     7.246421   -4.078650   1.042956  4.430418   
              Length          -0.433732   -4.143969   0.244144  4.185142   
              Temporal        -4.977408   -3.687997  -2.310513 -3.361895   
Genre         Casual          -2.337139   -1.026684  -2.196722  0.399290   
              Dialectal       -6.829746   -1.250404  -3.255341  2.816613   
              Singlish       -11.513399   -4.347931  -4.296672 -0.834512   
Morphological Compound        -0.257036   -1.482735  -0.314691 -1.388483   
              Derivation      -0.193705   -1.709467  -2.493454 -0.006384   
Orthographic  Capitalization -12.276385   -3.773813 -16.853842 -0.512300   
              Punctuation     -4.466147   -6.675797   1.221348 -2.964893   
              Spelling        -3.247760   -0.856916  -0.873977  0.464285   
Pragmatic   

In [123]:
# Load results from CSV
results_df = pd.read_csv('ner_modification_results_combined.csv')
negation_results_df = pd.read_csv('ner_negation_type_results_combined.csv')

# Row and column orderings.
# NOTE: mod_mapping must be the full modification mapping (raw key -> (category, name));
# a later cell rebinds the same name to the negation-type mapping.
modification_order = list(mod_mapping.keys())
model_order = ['BERT', 'GPT2', 'T5', 'gpt4o', 'claude', 'llama']
negation_order = ['verbal', 'lexical', 'double', 'approximate', 'absolute']

# Empty tables with (model, statistic) multi-level columns
columns = pd.MultiIndex.from_product([model_order, ['original', 'modified', 'diff']])
results_df_pivot = pd.DataFrame(index=modification_order, columns=columns)

negation_columns = pd.MultiIndex.from_product([model_order, ['original', 'modified', 'diff']])
negation_results_df_pivot = pd.DataFrame(index=negation_order, columns=negation_columns)


def _fill_f1_table(table, source, key_column, row_order, models):
    """Copy original / modified / percent-change mean F1 from `source` into `table`.

    Args:
        table: Target DataFrame indexed by `row_order` with (model, statistic) columns;
            it is modified in place.
        source: Long-format results with `key_column`, 'model', 'original_mean_f1',
            'modified_mean_f1' and 'mean_f1_pct_change' columns.
        key_column: Column of `source` that identifies the table row.
        row_order: Row keys to look up.
        models: Model names to look up.
    """
    for key in row_order:
        for model in models:
            match = source[(source[key_column] == key) & (source['model'] == model)]
            if match.empty:
                continue  # no result for this pair: leave the cells unset
            # Only the first matching row is used if the pair occurs more than once
            table.loc[key, (model, 'original')] = match['original_mean_f1'].values[0]
            table.loc[key, (model, 'modified')] = match['modified_mean_f1'].values[0]
            table.loc[key, (model, 'diff')] = match['mean_f1_pct_change'].values[0]


# Fill DataFrame
_fill_f1_table(results_df_pivot, results_df, 'modification', modification_order, model_order)
_fill_f1_table(negation_results_df_pivot, negation_results_df, 'negation_type', negation_order, model_order)

# Save to CSV
results_df_pivot.to_csv('ner_results_df.csv')
negation_results_df_pivot.to_csv('ner_negation_results_df.csv')

print("Results saved to ner_results_df.csv")
print("Negation results saved to ner_negation_results_df.csv")


Results saved to ner_results_df.csv
Negation results saved to ner_negation_results_df.csv


In [124]:
import numpy as np

# Load the per-negation-type results
df = pd.read_csv('ner_negation_type_results_combined.csv')

# Raw negation type -> display name; the values, in this order, are the table rows
mod_mapping = {
    'verbal': 'Verbal',
    'lexical': 'Lexical',
    'double': 'Double',
    'approximate': 'Approximate',
    'absolute': 'Absolute',
}
negation_order = list(mod_mapping.values())

# Column order (raw model names, as used in the results file)
model_order = ['BERT', 'GPT2', 'T5', 'gpt4o', 'claude', 'llama']

# Switch to display names; types missing from mod_mapping become NaN
df['negation_type'] = df['negation_type'].map(mod_mapping)

# Negation types as rows, models as columns, rows arranged per negation_order
pivot_df = df.pivot(
    index='negation_type', columns='model', values='weighted_delta'
).reindex(negation_order)
p_values = df.pivot(
    index='negation_type', columns='model', values='p_value'
).reindex(negation_order)
significance = df.pivot(
    index='negation_type', columns='model', values='significance'
).reindex(negation_order)

print(pivot_df)

# Function to generate color based on value
def get_color(val, sig):
    """Build the LaTeX source for one colour-coded table cell.

    Args:
        val: Numeric cell value; NaN produces an empty string.
        sig: Significance marker. '.' bolds the number, '*' and '**' bold it
            and are printed after it, anything else leaves it plain.

    Returns:
        A cellcolor command plus the value with two decimals. Values above
        zero are green with a leading '+'; all others (zero included) are red.
    """
    if np.isnan(val):
        return ''

    if val > 0:
        colour, val_str = 'green', f'+{val:.2f}'
    else:
        colour, val_str = 'red', f'{val:.2f}'

    # Work out what (if anything) follows the bolded number
    if sig == '.':
        marker = ''
    elif sig in ('*', '**'):
        marker = sig
    else:
        marker = None
    if marker is not None:
        val_str = f'\\textbf{{{val_str}}}{marker}'

    # |val| of 10 or more gives the strongest tint (30)
    intensity = min(abs(val)/10, 1)
    return f'\\cellcolor{{{colour}!{int(intensity*30)}}} {val_str}'

# Generate LaTeX table: one label column plus one right-aligned column per model
header_cells = ' & '.join([f'\\textbf{{{col}}}' for col in model_order])
table_lines = [
    '\\begin{table}[h]',
    '\\centering',
    '\\begin{tabular}{l' + 'r' * len(model_order) + '}',
    '\\hline',
    'Modification & ' + header_cells + ' \\\\',
    '\\hline',
]

# NOTE(review): rows are separated by a rule whenever the FIRST LETTER of the
# negation type changes, so 'Approximate' and 'Absolute' share a group while every
# other row gets its own rule. Kept as-is; confirm this grouping is intended.
prev_category = None
for idx, row in pivot_df.iterrows():
    current_category = idx[0]  # first character of the negation type name
    if prev_category is not None and current_category != prev_category:
        table_lines.append('\\hline')
    prev_category = current_category

    cells = ' & '.join([get_color(row[col], significance.loc[idx, col]) for col in model_order])
    table_lines.append(f'\\textbf{{{idx}}} & ' + cells + ' \\\\')

# NOTE(review): caption and label are the same as in the table written to
# ner_results_table.tex; including both in one document gives a duplicate label.
table_lines += [
    '\\hline',
    '\\end{tabular}',
    '\\caption{Weighted Delta Score by Model and Modification Type}',
    '\\label{tab:ner_results}',
    '\\end{table}',
]
latex_table = '\n'.join(table_lines)

# Save to file
output_path = 'ner_negation_type_results_table.tex'
with open(output_path, 'w') as f:
    f.write(latex_table)

# Report the file that was actually written
print(f"LaTeX table saved to {output_path}")


model              BERT      GPT2        T5    claude     gpt4o     llama
negation_type                                                            
Verbal        -2.743259  0.104785  1.494364 -1.137108 -1.479953 -0.365473
Lexical       -2.725502 -3.123246  0.476221 -2.665256  1.513276  0.119679
Double         0.380551 -3.122061 -7.861116  2.605664  7.031438  2.042326
Approximate    0.224439  0.149489 -1.748570  3.908497  2.926344 -6.231944
Absolute      -2.978162  3.583626  2.732454 -0.084966 -4.571956 -0.338853
LaTeX table saved to ner_results_table.tex


In [111]:
import os

def _extract_entities(tokens, labels):
    """Return the BIO-tagged spans of one sentence as {"text", "value"} dicts."""
    entities = []
    total = len(labels)
    i = 0
    while i < total:
        label = labels[i]
        if not label.startswith('B-'):
            # 'O' and any 'I-' tag that does not continue a span are ignored
            i += 1
            continue

        entity_type = label[2:]
        continuation = 'I-' + entity_type
        end = i
        # Extend the span over the directly following I- tags of the same type
        while end + 1 < total and labels[end + 1] == continuation:
            end += 1

        entities.append({"text": " ".join(tokens[i:end + 1]), "value": entity_type})
        i = end + 1
    return entities


def _build_record(tokens, gold_labels, pred_labels):
    """Assemble the output record for one sentence."""
    return {
        "text": " ".join(tokens),
        "gold": _extract_entities(tokens, gold_labels),
        "prediction": _extract_entities(tokens, pred_labels),
    }


def convert_predictions_to_json(input_file, output_file):
    """
    Converts a tab-separated prediction file to JSON format.

    Input format: one token per line as "token<TAB>gold_label<TAB>predicted_label".
    A line with only "token<TAB>gold_label" gets the prediction "O". Lines with
    any other number of tab-separated fields are skipped. Sentences are separated
    by empty lines; a final sentence without a trailing empty line is still written.

    Output JSON format: list of records, each record is a dict with "text" (the
    tokens joined by spaces), "gold" and "prediction" keys. "gold" and
    "prediction" are lists of {"text": ..., "value": ...} entities, where a span
    starts at a "B-<type>" label and extends over the following "I-<type>" labels.

    Args:
        input_file: Path of the tab-separated prediction file to read.
        output_file: Path of the JSON file to write.
    """
    records = []
    tokens, gold_labels, pred_labels = [], [], []

    with open(input_file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:  # Empty line indicates sentence boundary
                if tokens:
                    records.append(_build_record(tokens, gold_labels, pred_labels))
                    tokens, gold_labels, pred_labels = [], [], []
                continue

            parts = line.split('\t')
            if len(parts) == 3:
                token, gold_label, pred_label = parts
            elif len(parts) == 2:  # in case there is no prediction column
                token, gold_label = parts
                pred_label = "O"  # default prediction as O
            else:
                continue

            tokens.append(token)
            gold_labels.append(gold_label)
            pred_labels.append(pred_label)

    # Process the last sentence if file does not end with empty line
    if tokens:
        records.append(_build_record(tokens, gold_labels, pred_labels))

    with open(output_file, 'w') as outfile:
        json.dump(records, outfile, indent=2)

# Example usage: convert the "casual" prediction files of each model
for model in ['bert', 'gpt2', 't5']:
    # Every .txt file in this model's directory whose path mentions "casual"
    casual_files = [path for path in glob.glob(f"{model}/*.txt") if "casual" in path]
    for input_prediction_file in casual_files:
        # Cut the path at its first '.', drop the "_100" tag, keep the file name only
        stem = os.path.basename(input_prediction_file.split(".")[0].replace("_100", ""))
        output_json_file = f"parsed_NER_{model.upper()}/{stem}.json"
        convert_predictions_to_json(input_prediction_file, output_json_file)
        print(f"Converted predictions from '{input_prediction_file}' to JSON format and saved to '{output_json_file}'")


Converted predictions from 'bert/bert_casual_100_ori.txt' to JSON format and saved to 'parsed_NER_BERT/bert_casual_ori.json'
Converted predictions from 'bert/bert_casual_100_modif.txt' to JSON format and saved to 'parsed_NER_BERT/bert_casual_modif.json'
Converted predictions from 'gpt2/gpt2_casual_100_modif.txt' to JSON format and saved to 'parsed_NER_GPT2/gpt2_casual_modif.json'
Converted predictions from 'gpt2/gpt2_casual_100_ori.txt' to JSON format and saved to 'parsed_NER_GPT2/gpt2_casual_ori.json'
Converted predictions from 't5/t5_casual_100_ori.txt' to JSON format and saved to 'parsed_NER_T5/t5_casual_ori.json'
Converted predictions from 't5/t5_casual_100_modif.txt' to JSON format and saved to 'parsed_NER_T5/t5_casual_modif.json'
