In [1]:
import json
import pandas as pd
from pathlib import Path
import decimal

# Load derivation data to get indices
derivation_path = Path('../preprocessing/data_after_phase2/thinh/derivation_100.json')
with open(derivation_path) as f:
    derivation_data = json.load(f)

# Extract indices from derivation data
derivation_indices = [item['index'] for item in derivation_data]
print(derivation_indices)
# Create mapping between modification types and their filenames
modification_mapping = {
    'derivation': 'derivation'
}

# Base path for results
base_path = Path('../pretrained_language_models/coreference_resolution/tmp')

# Create index mapping for each model and modification
index_mappings = {}
for model in ['bert-base-cased', 'gpt2', 't5-base']:
    index_mappings[model] = {}
    model_path = base_path / f'{model}_results'
    
    for mod_type, filename in modification_mapping.items():
        csv_path = model_path / f'{filename}_predictions.csv'
        if csv_path.exists():
            df = pd.read_csv(csv_path)
            # Replace indices with derivation indices
            df['index'] = derivation_indices[:len(df)]
            # Create new file with replaced indices
            output_path = model_path / f'{filename}_predictions.csv'
            df.to_csv(output_path, index=False)


[1203, 1292, 1393, 85, 185, 837, 783, 830, 356, 518, 315, 1492, 849, 232, 734, 671, 1486, 1478, 894, 601, 312, 1154, 211, 1225, 438, 817, 1451, 117, 89, 189, 37, 828, 965, 405, 794, 1386, 1339, 989, 1300, 1104, 1400, 27, 55, 534, 1381, 724, 547, 884, 285, 130, 124, 747, 1449, 1012, 114, 1219, 1056, 207, 274, 679, 845, 1433, 2, 553, 131, 1353, 1084, 684, 121, 436, 1232, 821, 1448, 120, 1141, 1191, 1312, 159, 77, 155, 87, 963, 376, 796, 198, 1187, 1308, 887, 902, 602, 964, 659, 871, 800, 1223, 772, 323, 1066]


In [2]:
# Additional test cases: (A, B) = (100, 90) and (50, 40)
import math
additional_test_cases = [(100, 90), (50, 40)]

# Compute scaled log drop for each case using the weighted formula
scaled_log_weighted_additional = [(B - A) * math.log(A) / math.log(100) for A, B in additional_test_cases]
print(scaled_log_weighted_additional)


delta = [(B-A)/A*100 for A, B in additional_test_cases]
print(delta)



[-10.0, -8.494850021680092]
[-10.0, -20.0]


In [3]:
import json
import os
import pandas as pd
from pathlib import Path
from scipy import stats
import numpy as np

# Base path for results
base_path = Path('../pretrained_language_models/dialogue_contradiction_detection/tmp')

# Function to load and convert predictions to binary
def load_predictions(filepath):
    with open(filepath) as f:
        preds = json.load(f)
    for pred in preds.items():
        if pred[1] == "contradictory":
            preds[pred[0]] = 1
        elif pred[1] == "not contradictory":
            preds[pred[0]] = 0
    return preds

# Load original model predictions
model_orig_preds = {}
model_names = ['bert-base-cased', 'gpt2', 't5-base']

for model in model_names:
    filepath = base_path / f'{model}_results/{model}_predictions.json'
    model_orig_preds[model] = load_predictions(filepath)

# Load predictions for each modification
modifications = []
for model in model_names:
    mod_path = base_path / f'{model}_results'
    if mod_path.exists():
        # Get all CSV files containing predictions
        modifications.extend([f.stem for f in mod_path.glob('*_predictions.csv')])
modifications = list(set(modifications))  # Remove duplicates
modifications = [mod.replace('_predictions', '') for mod in modifications]

# Load negation types from GPT4 results
gpt4_negation_df = pd.read_csv('../eval/results/rongxin/gpt4o-0shot-negation_100.csv')
negation_types = gpt4_negation_df['type'].tolist()

# Sanity check negation types
valid_types = {'absolute', 'double', 'lexical', 'approximate', 'verbal'}
for neg_type in negation_types:
    if neg_type not in valid_types:
        print(f"WARNING: Invalid negation type found: {neg_type}")

# Create results list to store accuracy and statistical test results
results_rows = []
negation_results_rows = []

for mod in modifications:
    for model in model_names:
        # Get original predictions
        orig_preds = model_orig_preds[model]
        
        # Get modified predictions from CSV file
        mod_filepath = base_path / f'{model}_results/{mod}_predictions.csv'
        if mod_filepath.exists():
            mod_df = pd.read_csv(mod_filepath)
            # Calculate accuracies
            orig_correct = 0
            mod_correct = 0
            total = 0
            orig_list = []
            mod_list = []
            
            # Track results by negation type if this is negation mod
            if mod == 'negation':
                results_by_type = {
                    'absolute': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []},
                    'double': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []},
                    'lexical': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []},
                    'approximate': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []},
                    'verbal': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []}
                }
            
            # Load labels from eval/results/rongxin CSV
            eval_filepath = Path(f'../preprocessing/data_after_phase2/rongxin/{mod}_100.json')
            if eval_filepath.exists():
                label_df = json.load(open(eval_filepath))
            if len(mod_df) != len(label_df):
                print(mod, model, len(mod_df), len(label_df))
                
            for idx, row in mod_df.iterrows():
                # Get ground truth labels from JSON in order
                orig_label = label_df[idx][1]['original_label'] if 'original_label' in label_df[idx][1] else label_df[idx][1]['label']
                mod_label = label_df[idx][1]['modified_label'] if 'modified_label' in label_df[idx][1] else label_df[idx][1]['label']
                
                # Get predictions from CSV
                orig_pred = row['original'] if 'original' in row else row['original_pred']
                mod_pred = row['modified'] if 'modified' in row else row['modified_pred']
                
                # Compare predictions with ground truth
                orig_correct_bool = orig_pred == orig_label
                mod_correct_bool = mod_pred == mod_label
                
                if orig_correct_bool:
                    orig_correct += 1
                    if mod == 'negation':
                        if idx >= len(negation_types):
                            print(f"WARNING: Index {idx} exceeds negation types list length")
                            continue
                        results_by_type[negation_types[idx]]['orig_correct'] += 1
                if mod_correct_bool:
                    mod_correct += 1
                    if mod == 'negation':
                        if idx >= len(negation_types):
                            continue
                        results_by_type[negation_types[idx]]['mod_correct'] += 1
                        
                if mod == 'negation':
                    if idx >= len(negation_types):
                        continue
                    results_by_type[negation_types[idx]]['total'] += 1
                    results_by_type[negation_types[idx]]['orig_binary'].append(1 if orig_correct_bool else 0)
                    results_by_type[negation_types[idx]]['mod_binary'].append(1 if mod_correct_bool else 0)
                    
                orig_list.append(orig_pred)
                mod_list.append(mod_pred)
                total += 1
                
            orig_acc = orig_correct / total * 100 if total > 0 else 0
            mod_acc = mod_correct / total * 100 if total > 0 else 0
            pct_diff = ((mod_acc - orig_acc) / orig_acc) * 100 if orig_acc > 0 else 0
            weighted_delta = (mod_acc - orig_acc) * np.log10(orig_acc) / np.log10(100)  if orig_acc > 0 else 0
            
            orig_list_binary = [1 if pred == label else 0 for pred, label in zip(orig_list, [label_df[i][1]['original_label'] if 'original_label' in label_df[i][1] else label_df[i][1]['label'] for i in range(len(orig_list))])]
            mod_list_binary = [1 if pred == label else 0 for pred, label in zip(mod_list, [label_df[i][1]['modified_label'] if 'modified_label' in label_df[i][1] else label_df[i][1]['label'] for i in range(len(mod_list))])]

            # Perform paired t-test on the raw predictions (0/1)
            try:
                _, p_value_mw = stats.mannwhitneyu(orig_list_binary, mod_list_binary)
                _, p_value_w = stats.wilcoxon(orig_list_binary, mod_list_binary)
                p_value = min(p_value_mw, p_value_w)  # Use the more conservative p-value
            except ValueError:
                # If all elements are identical, set p-value to 1.0 since there is no difference
                p_value = 1.0
                
            # Add significance level
            if p_value < 0.01:
                significance = '**'
            elif p_value < 0.05:
                significance = '*'
            elif p_value < 0.1:
                significance = '.'
            else:
                significance = 'ns'
            
            row = {
                'model': model,
                'modification': mod,
                'original_acc': decimal.Decimal(orig_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'modified_acc': decimal.Decimal(mod_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'percentage_diff': decimal.Decimal(pct_diff).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'weighted_delta': decimal.Decimal(weighted_delta).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'p_value': decimal.Decimal(p_value).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'significance': significance,
                'significant': 'Yes' if p_value < 0.05 else 'No'
            }
            results_rows.append(row)
            
            # Add rows for each negation type if this is negation mod
            if mod == 'negation':
                for neg_type, results in results_by_type.items():
                    type_orig_acc = results['orig_correct'] / results['total'] * 100 if results['total'] > 0 else 0
                    type_mod_acc = results['mod_correct'] / results['total'] * 100 if results['total'] > 0 else 0
                    type_pct_diff = ((type_mod_acc - type_orig_acc) / type_orig_acc) * 100 if type_orig_acc > 0 else 0
                    type_weighted_delta = (type_mod_acc - type_orig_acc) * np.log10(type_orig_acc) / np.log10(100) if type_orig_acc > 0 else 0

                    try:
                        _, p_value_mw = stats.mannwhitneyu(results['orig_binary'], results['mod_binary'])
                        _, p_value_w = stats.wilcoxon(results['orig_binary'], results['mod_binary'])
                        p_value = min(p_value_mw, p_value_w)  # Use the more conservative p-value
                    except ValueError:
                        # If all elements are identical, set p-value to 1.0 since there is no difference
                        p_value = 1.0
                        
                    # Add significance level
                    if p_value < 0.01:
                        significance = '**'
                    elif p_value < 0.05:
                        significance = '*'
                    elif p_value < 0.1:
                        significance = '.'
                    else:
                        significance = 'ns'

                    type_row = {
                        'model': model,
                        'modification': neg_type,
                        'original_acc': decimal.Decimal(type_orig_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'modified_acc': decimal.Decimal(type_mod_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'pct_diff': decimal.Decimal(type_pct_diff).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'weighted_delta': decimal.Decimal(type_weighted_delta).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'wilcoxon_pvalue': decimal.Decimal(p_value_w).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'mannwhitney_pvalue': decimal.Decimal(p_value_mw).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'pvalue': decimal.Decimal(p_value).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'significance': significance,
                        'significant': p_value < 0.05
                    }
                    negation_results_rows.append(type_row)

# Convert to dataframes
results_df = pd.DataFrame(results_rows)
negation_results_df = pd.DataFrame(negation_results_rows)

# Save results
results_df.to_csv(base_path / 'dialogue_plm_results.csv', index=False)
negation_results_df.to_csv(base_path / 'dialogue_plm_negation_results.csv', index=False)

# Display results
print("Results by modification and model:")
print(results_df)


  z = (r_plus - mn) / se
  z = (r_plus - mn) / se


Results by modification and model:
              model              modification original_acc modified_acc  \
0   bert-base-cased                    casual       90.000       82.000   
1              gpt2                    casual       93.000       85.000   
2           t5-base                    casual       82.000       88.000   
3   bert-base-cased  coordinating_conjunction       89.000       88.000   
4              gpt2  coordinating_conjunction       90.000       88.000   
5           t5-base  coordinating_conjunction       80.000       83.000   
6   bert-base-cased                  singlish       84.906       76.415   
7              gpt2                  singlish       80.189       77.358   
8           t5-base                  singlish       83.962       81.132   
9   bert-base-cased            capitalization       92.708       92.708   
10             gpt2            capitalization       90.625       86.458   
11          t5-base            capitalization       79.167       

In [4]:
# Also analyze results from eval/results/rongxin/
eval_base_path = Path('../eval/results/rongxin')
eval_results_rows = []
eval_negation_results_rows = []

for mod in os.listdir(eval_base_path):
    if not mod.endswith('_100.csv'):
        continue
    model = mod.split('-0shot-')[0]
    if model == 'mixtral':
        continue
    mod = mod.split('-0shot-')[1].replace('_100.csv', '')
    # Load predictions from CSV
    eval_filepath = eval_base_path / f'{model}-0shot-{mod}_100.csv'
    if not eval_filepath.exists():
        continue
    df = pd.read_csv(eval_filepath)
    
    compare_file = Path(f'../preprocessing/data_after_phase2/rongxin/{mod}_100.json')
    if not compare_file.exists():
        continue
    compare_df = json.load(open(compare_file))
    if len(df) != len(compare_df):
        print(f"Warning: Length mismatch for {mod} {model}")
    
    # Calculate accuracies
    orig_correct = sum(df['original_pred'] == df['original_label'])
    mod_correct = sum(df['modified_pred'] == df['modified_label'])
    total = len(df)
        
    orig_acc = orig_correct / total * 100 if total > 0 else 0
    mod_acc = mod_correct / total * 100 if total > 0 else 0
    pct_diff = ((mod_acc - orig_acc) / orig_acc) * 100 if orig_acc > 0 else 0
    
    # Calculate weighted delta
    weighted_delta = (mod_acc - orig_acc) * np.log10(orig_acc) / np.log10(100) if orig_acc > 0 else 0
    
    # Convert predictions to binary (0/1) based on correctness for t-test
    orig_binary = (df['original_pred'] == df['original_label']).astype(int)
    mod_binary = (df['modified_pred'] == df['modified_label']).astype(int)
    
    # Perform paired t-test on binary correctness values
    try:
        _, p_value_mw = stats.mannwhitneyu(orig_binary, mod_binary)
        _, p_value_wilc = stats.wilcoxon(orig_binary, mod_binary)
        p_value = min(p_value_mw, p_value_wilc)  # Use most conservative p-value
    except ValueError:
        # If all elements are identical, set p-value to 1.0 since there is no difference
        p_value = 1.0
        
    # Add significance level
    if p_value < 0.01:
        significance = '**'
    elif p_value < 0.05:
        significance = '*'
    elif p_value < 0.1:
        significance = '.'
    else:
        significance = 'ns'
    
    row = {
        'model': model,
        'modification': mod,
        'original_acc': decimal.Decimal(orig_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
        'modified_acc': decimal.Decimal(mod_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
        'percentage_diff': decimal.Decimal(pct_diff).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
        'weighted_delta': decimal.Decimal(weighted_delta).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
        'p_value': decimal.Decimal(p_value).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
        'significance': significance,
        'significant': p_value < 0.05
    }
    eval_results_rows.append(row)
    
    # Add rows for each negation type if this is negation mod
    if mod == 'negation':
        # Verify negation types match expected values
        expected_types = {'absolute', 'double', 'lexical', 'approximate', 'verbal'}
        actual_types = set(df['type'].unique())
        if actual_types != expected_types:
            print(f"Warning: Unexpected negation types for {model}")
            print(f"Expected: {expected_types}")
            print(f"Found: {actual_types}")
            
        for neg_type in df['type'].unique():
            type_df = df[df['type'] == neg_type]
            
            type_orig_correct = sum(type_df['original_pred'] == type_df['original_label'])
            type_mod_correct = sum(type_df['modified_pred'] == type_df['modified_label'])
            type_total = len(type_df)
            
            type_orig_acc = type_orig_correct / type_total * 100 if type_total > 0 else 0
            type_mod_acc = type_mod_correct / type_total * 100 if type_total > 0 else 0
            type_pct_diff = ((type_mod_acc - type_orig_acc) / type_orig_acc) * 100 if type_orig_acc > 0 else 0
            
            # Calculate weighted delta for this type
            type_weighted_delta = (type_mod_acc - type_orig_acc) * np.log10(type_orig_acc) / np.log10(100) if type_orig_acc > 0 else 0
            
            # Statistical tests for this negation type
            type_orig_binary = (type_df['original_pred'] == type_df['original_label']).astype(int)
            type_mod_binary = (type_df['modified_pred'] == type_df['modified_label']).astype(int)
            
            try:
                _, type_p_value_mw = stats.mannwhitneyu(type_orig_binary, type_mod_binary)
                _, type_p_value_wilc = stats.wilcoxon(type_orig_binary, type_mod_binary)
                type_p_value = min(type_p_value_mw, type_p_value_wilc)
            except ValueError:
                type_p_value = 1.0
                
            # Add significance level
            if type_p_value < 0.01:
                significance = '**'
            elif type_p_value < 0.05:
                significance = '*'
            elif type_p_value < 0.1:
                significance = '.'
            else:
                significance = 'ns'
                
            type_row = {
                'model': model,
                'modification': neg_type,
                'original_acc': decimal.Decimal(type_orig_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'modified_acc': decimal.Decimal(type_mod_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'pct_diff': decimal.Decimal(type_pct_diff).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'weighted_delta': decimal.Decimal(type_weighted_delta).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'wilcoxon_pvalue': decimal.Decimal(type_p_value_wilc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'mannwhitney_pvalue': decimal.Decimal(type_p_value_mw).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'pvalue': decimal.Decimal(type_p_value).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'significance': significance,
                'significant': type_p_value < 0.05
            }
            eval_negation_results_rows.append(type_row)

# Convert to dataframes
eval_results_df = pd.DataFrame(eval_results_rows)
eval_negation_results_df = pd.DataFrame(eval_negation_results_rows)

# Save results
eval_results_df.to_csv(base_path / 'dialogue_llm_results.csv', index=False)
eval_negation_results_df.to_csv(base_path / 'dialogue_llm_negation_results.csv', index=False)

# Display results
print("\nEvaluation Results by modification and model:")
# print(eval_results_df)
print("\nNegation Results by type and model:")
# print(eval_negation_results_df)



Evaluation Results by modification and model:

Negation Results by type and model:


In [5]:
# Load LLM and PLM results
base_path = Path('../pretrained_language_models/dialogue_contradiction_detection/tmp')
llm_results = pd.read_csv(base_path / 'dialogue_llm_results.csv')
plm_results = pd.read_csv(base_path / 'dialogue_plm_results.csv')

# Load LLM and PLM negation results
llm_neg_results = pd.read_csv(base_path / 'dialogue_llm_negation_results.csv')
plm_neg_results = pd.read_csv(base_path / 'dialogue_plm_negation_results.csv')

# Combine the regular results
combined_results = pd.concat([llm_results, plm_results], ignore_index=True)

# Combine the negation results
combined_neg_results = pd.concat([llm_neg_results, plm_neg_results], ignore_index=True)

# Sort by model and modification
combined_results = combined_results.sort_values(['model', 'modification'])
combined_neg_results = combined_neg_results.sort_values(['model', 'modification'])

# Save combined results
combined_results.to_csv(base_path / 'dialogue_combined_results.csv', index=False)
combined_neg_results.to_csv(base_path / 'dialogue_combined_negation_results.csv', index=False)

print("\nCombined Results:")
print(combined_results)
print("\nCombined Negation Results:")
print(combined_neg_results)



Combined Results:
              model         modification  original_acc  modified_acc  \
78  bert-base-cased    active_to_passive        87.000        90.000   
63  bert-base-cased       capitalization        92.708        92.708   
54  bert-base-cased               casual        90.000        82.000   
72  bert-base-cased        compound_word        89.000        85.000   
84  bert-base-cased  concept_replacement        89.000        86.000   
..              ...                  ...           ...           ...   
83          t5-base          punctuation        86.000        86.000   
98          t5-base            sentiment        84.000        81.000   
62          t5-base             singlish        83.962        81.132   
68          t5-base        temporal_bias        81.000        76.000   
92          t5-base            typo_bias        84.000        82.000   

    percentage_diff  weighted_delta  p_value significance significant  
78            3.448           2.909    0.317

In [7]:
import numpy as np
base_path = Path('../pretrained_language_models/dialogue_contradiction_detection/tmp')
modification_order = [
    "Temporal", "Geographical", "Length",
    "Spelling", "Capitalization", "Punctuation", 
    "Derivation", "Compound",
    "Voice", "Grammar", "Conjunction",
    "Concept", "Negation", 
    "Discourse", "Sentiment",
    "Casual", "Dialectal", "Singlish"
]

# Read and process data
df = pd.read_csv(base_path / 'dialogue_combined_results.csv')
df['modification'] = df['modification'].map({
    'temporal_bias': 'Temporal', 'geographical_bias': 'Geographical', 'length_bias': 'Length',
    'typo_bias': 'Spelling', 'capitalization': 'Capitalization', 'punctuation': 'Punctuation',
    'derivation': 'Derivation', 'compound_word': 'Compound', 'active_to_passive': 'Voice',
    'grammatical_role': 'Grammar', 'coordinating_conjunction': 'Conjunction',
    'concept_replacement': 'Concept', 'negation': 'Negation', 'discourse': 'Discourse',
    'sentiment': 'Sentiment', 'casual': 'Casual', 'dialectal': 'Dialectal',
    'singlish': 'Singlish'
})

models = {
    'bert-base-cased': 'BERT', 'gpt2': 'GPT-2', 't5-base': 'T5',
    'gpt4o': 'GPT-4o', 'claude-3-5-sonnet': 'Claude 3.5', 'llama': 'Llama 3.1'
}

# Calculate weighted delta instead of percentage difference

# Pivot data
pivot_df = df.pivot(index='modification', columns='model', values='weighted_delta')
p_values = df.pivot(index='modification', columns='model', values='p_value')
pivot_df = pivot_df.reindex(modification_order).reindex(models.keys(), axis=1)
p_values = p_values.reindex(modification_order).reindex(models.keys(), axis=1)

# Get mean original accuracy for each model
model_original_acc = df.groupby('model')['original_acc'].mean()

# Generate LaTeX table with adjustbox
latex = [
    '\\begin{table}[h]',
    '\\centering',
    '\\footnotesize',
    '\\begin{adjustbox}{max width=\\linewidth}',
    '\\begin{tabular}{llc' + 'c'*(len(models)) + '}',
    '\\hline',
    'Category & Modification & ' + ' & '.join([f'\\textbf{{{models[col]}}}' for col in pivot_df.columns]) + ' \\\\',
    '\\hline'
]

# Add mean original accuracy row with textit and grey background
cells = []
for col in pivot_df.columns:
    val = model_original_acc[col]
    cells.append(f'{val:.2f}')
latex.append('\\rowcolor{gray!20}\\textit{Original} & & ' + ' & '.join(cells) + ' \\\\')

categories = {
    'Temporal': 'Bias',
    'Geographical': 'Bias', 
    'Length': 'Bias',
    'Spelling': 'Orthographic',
    'Capitalization': 'Orthographic',
    'Punctuation': 'Orthographic',
    'Derivation': 'Morphological',
    'Compound': 'Morphological',
    'Voice': 'Syntactic',
    'Grammar': 'Syntactic',
    'Conjunction': 'Syntactic',
    'Concept': 'Semantic',
    'Negation': 'Semantic',
    'Discourse': 'Pragmatic',
    'Sentiment': 'Pragmatic',
    'Casual': 'Genre',
    'Dialectal': 'Genre',
    'Singlish': 'Genre'
}

prev_category = None
for idx, row in pivot_df.iterrows():
    current_category = categories[idx]
    if prev_category is not None and current_category != prev_category:
        latex.append('\\hline')
    
    cells = []
    for col, val in row.items():
        if np.isnan(val):
            cells.append('')
            continue
        color = 'green' if val > 0 else 'red'
        intensity = min(abs(val)/10, 1) * 30  # Adjusted scale for percentage
        val_str = f'{val:+.2f}' if val > 0 else f'{val:.2f}'  # Show as percentage
        p = p_values.loc[idx, col]
        if p < 0.01: val_str = f'\\textbf{{{val_str}}}**'
        elif p < 0.05: val_str = f'\\textbf{{{val_str}}}*'
        elif p < 0.1: val_str = f'\\textbf{{{val_str}}}'
        cells.append(f'\\cellcolor{{{color}!{int(intensity)}}} {val_str}')
    
    category_text = f'\\textbf{{{current_category}}}' if current_category != prev_category else ''
    latex.append(f'{category_text} & \\textbf{{{idx}}} & ' + ' & '.join(cells) + ' \\\\')
    prev_category = current_category

latex.extend([
    '\\hline',
    '\\end{tabular}',
    '\\end{adjustbox}',
    '\\caption{Percentage Change in Micro F1 Score by Model and Modification Type}',
    '\\label{tab:dialogue_results_table}',
    '\\end{table}'
])

with open(base_path / 'dialogue_results_table.tex', 'w') as f:
    f.write('\n'.join(latex))

print("LaTeX table saved to .tex")


LaTeX table saved to .tex


In [9]:
# Define modification mapping
mod_mapping = {
    'temporal_bias': 'Temporal',
    'geographical_bias': 'Geographical', 
    'length_bias': 'Length',
    'spelling': 'Spelling',
    'capitalization': 'Capitalization',
    'punctuation': 'Punctuation',
    'derivation': 'Derivation',
    'compound': 'Compound',
    'voice': 'Voice',
    'grammar': 'Grammar',
    'conjunction': 'Conjunction',
    'concept': 'Concept',
    'negation': 'Negation',
    'discourse': 'Discourse',
    'sentiment': 'Sentiment',
    'casual': 'Casual',
    'dialectal': 'Dialectal',
    'singlish': 'Singlish'
}

# Load results from CSV
base_path = Path('../pretrained_language_models/dialogue_contradiction_detection/tmp')

results_df = pd.read_csv(base_path / 'dialogue_combined_results.csv')

# Create lists of unique modifications and models
modification_order = list(mod_mapping.keys())
negation_order = ['verbal', 'lexical', 'double', 'approximate', 'absolute']
model_order = ['bert-base-cased', 'gpt2', 't5-base', 'gpt4o', 'claude-3-5-sonnet', 'llama']

# Create empty DataFrame with multi-level columns
columns = pd.MultiIndex.from_product([model_order, ['original', 'modified', 'diff']])
results_df_pivot = pd.DataFrame(index=modification_order, columns=columns)

# Fill DataFrame
for mod in modification_order:
    for model in model_order:
        row = results_df[(results_df['modification'] == mod) & (results_df['model'] == model)]
        if not row.empty:
            results_df_pivot.loc[mod, (model, 'original')] = row['original_acc'].values[0]
            results_df_pivot.loc[mod, (model, 'modified')] = row['modified_acc'].values[0]
            results_df_pivot.loc[mod, (model, 'diff')] = row['percentage_diff'].values[0]
            if mod == 'temporal_bias' and model == 'bert':
                print(row)

# Save to CSV
results_df_pivot.to_csv(base_path / 'dialogue_results_df.csv')

# Save negation results to separate CSV
# Load negation results from CSV
negation_results_df = pd.read_csv(base_path / 'dialogue_combined_negation_results.csv')

# Create empty DataFrame with multi-level columns for negation results
negation_df_pivot = pd.DataFrame(index=['negation'], columns=columns)

# Fill DataFrame with negation results
for mod in negation_order:  
    for model in model_order:
        row = negation_results_df[(negation_results_df['modification'] == mod) & (negation_results_df['model'] == model)]
        if not row.empty:
            negation_df_pivot.loc[mod, (model, 'original')] = row['original_acc'].values[0]
            negation_df_pivot.loc[mod, (model, 'modified')] = row['modified_acc'].values[0] 
            negation_df_pivot.loc[mod, (model, 'diff')] = row['pct_diff'].values[0]

# Save negation results to CSV
negation_df_pivot.to_csv(base_path / 'dialogue_negation_results_df.csv')

print("Results saved to dialogue_results_df.csv and dialogue_negation_results_df.csv")


Results saved to dialogue_results_df.csv and dialogue_negation_results_df.csv


In [10]:
import json
import os
import pandas as pd
import numpy as np
from pathlib import Path
from scipy import stats

# Base path for results
base_path = Path('../pretrained_language_models/coreference_resolution/tmp')

# Function to load and convert predictions to binary
def load_predictions(filepath):
    with open(filepath) as f:
        preds = json.load(f)
    return preds

# Load original model predictions
model_orig_preds = {}
model_names = ['bert-base-cased', 'gpt2', 't5-base']

for model in model_names:
    filepath = base_path / f'{model}_results/{model}_predictions.json'
    model_orig_preds[model] = load_predictions(filepath)

# Load predictions for each modification
modifications = []
for model in model_names:
    mod_path = base_path / f'{model}_results'
    if mod_path.exists():
        # Get all CSV files containing predictions
        modifications.extend([f.stem for f in mod_path.glob('*_predictions.csv')])
modifications = list(set(modifications))  # Remove duplicates
modifications = [mod.replace('_predictions', '') for mod in modifications]

# Load negation types from GPT4 results
gpt4_negation_df = pd.read_csv('../eval/results/thinh/gpt4o-0shot-negation_100.csv')
negation_types = gpt4_negation_df['type'].tolist()

# Sanity check negation types
valid_types = {'absolute', 'double', 'lexical', 'approximate', 'verbal'}
for neg_type in negation_types:
    if neg_type not in valid_types:
        print(f"WARNING: Invalid negation type found: {neg_type}")

# Create results list to store accuracy and statistical test results
results_rows = []
negation_results_rows = []

for mod in modifications:
    for model in model_names:
        # Get original predictions
        orig_preds = model_orig_preds[model]
        
        # Get modified predictions from CSV file
        mod_filepath = base_path / f'{model}_results/{mod}_predictions.csv'
        if mod_filepath.exists():
            mod_df = pd.read_csv(mod_filepath)
            # Calculate accuracies
            orig_correct = 0
            mod_correct = 0
            total = 0
            orig_list = []
            mod_list = []
            
            # Track results by negation type if this is negation mod
            if mod == 'negation':
                results_by_type = {
                    'absolute': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []},
                    'double': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []},
                    'lexical': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []},
                    'approximate': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []},
                    'verbal': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []}
                }
            
            # Load labels from eval/results/rongxin CSV
            eval_filepath = Path(f'../preprocessing/data_after_phase2/thinh/{mod}_100.json')
            if eval_filepath.exists():
                label_df = json.load(open(eval_filepath))
            if len(mod_df) != len(label_df):
                print(mod, model)
                
            for idx, row in mod_df.iterrows():
                # Get ground truth labels from JSON in order
                orig_label = label_df[idx]['original_label'] if 'original_label' in label_df[idx] else label_df[idx]['label']
                mod_label = label_df[idx]['modified_label'] if 'modified_label' in label_df[idx] else label_df[idx]['label']
                
                # Get predictions from CSV
                orig_pred = row['original'] if 'original' in row else row['original_pred']
                mod_pred = row['modified'] if 'modified' in row else row['modified_pred']
                
                # Compare predictions with ground truth
                orig_correct_bool = orig_pred == orig_label
                mod_correct_bool = mod_pred == mod_label
                
                if orig_correct_bool:
                    orig_correct += 1
                    if mod == 'negation':
                        if idx >= len(negation_types):
                            print(f"WARNING: Index {idx} exceeds negation types list length")
                            continue
                        results_by_type[negation_types[idx]]['orig_correct'] += 1
                if mod_correct_bool:
                    mod_correct += 1
                    if mod == 'negation':
                        if idx >= len(negation_types):
                            continue
                        results_by_type[negation_types[idx]]['mod_correct'] += 1
                        
                if mod == 'negation':
                    if idx >= len(negation_types):
                        continue
                    results_by_type[negation_types[idx]]['total'] += 1
                    results_by_type[negation_types[idx]]['orig_binary'].append(1 if orig_correct_bool else 0)
                    results_by_type[negation_types[idx]]['mod_binary'].append(1 if mod_correct_bool else 0)
                    
                orig_list.append(orig_pred)
                mod_list.append(mod_pred)
                total += 1
                
            orig_acc = orig_correct / total * 100 if total > 0 else 0
            mod_acc = mod_correct / total * 100 if total > 0 else 0
            pct_diff = ((mod_acc - orig_acc) / orig_acc) * 100 if orig_acc > 0 else 0
            weighted_delta = (mod_acc - orig_acc) * np.log10(orig_acc) / np.log10(100) if orig_acc > 0 else 0
            
            orig_list_binary = [1 if pred == label else 0 for pred, label in zip(orig_list, [label_df[i]['original_label'] if 'original_label' in label_df[i] else label_df[i]['label'] for i in range(len(orig_list))])]
            mod_list_binary = [1 if pred == label else 0 for pred, label in zip(mod_list, [label_df[i]['modified_label'] if 'modified_label' in label_df[i] else label_df[i]['label'] for i in range(len(mod_list))])]

            # Perform paired t-test on the raw predictions (0/1)
            try:
                _, p_value_mw = stats.mannwhitneyu(orig_list_binary, mod_list_binary)
                _, p_value_w = stats.wilcoxon(orig_list_binary, mod_list_binary)
                p_value = min(p_value_mw, p_value_w)  # Use the more conservative p-value
            except ValueError:
                # If all elements are identical, set p-value to 1.0 since there is no difference
                p_value = 1.0
                
            # Add significance level
            if p_value < 0.01:
                significance = '**'
            elif p_value < 0.05:
                significance = '*'
            elif p_value < 0.1:
                significance = '.'
            else:
                significance = 'ns'
            
            row = {
                'model': model,
                'modification': mod,
                'original_acc': decimal.Decimal(orig_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'modified_acc': decimal.Decimal(mod_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'percentage_diff': decimal.Decimal(pct_diff).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'weighted_delta': decimal.Decimal(weighted_delta).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'p_value': decimal.Decimal(p_value).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'significance': significance,
                'significant': 'Yes' if p_value < 0.05 else 'No'
            }
            results_rows.append(row)
            
            # Add rows for each negation type if this is negation mod
            if mod == 'negation':
                for neg_type, results in results_by_type.items():
                    type_orig_acc = results['orig_correct'] / results['total'] * 100 if results['total'] > 0 else 0
                    type_mod_acc = results['mod_correct'] / results['total'] * 100 if results['total'] > 0 else 0
                    type_pct_diff = ((type_mod_acc - type_orig_acc) / type_orig_acc) * 100 if type_orig_acc > 0 else 0
                    type_weighted_delta = (type_mod_acc - type_orig_acc) * np.log10(type_orig_acc) / np.log10(100) if type_orig_acc > 0 else 0

                    try:
                        _, p_value_mw = stats.mannwhitneyu(results['orig_binary'], results['mod_binary'])
                        _, p_value_w = stats.wilcoxon(results['orig_binary'], results['mod_binary'])
                        p_value = min(p_value_mw, p_value_w)  # Use the more conservative p-value
                    except ValueError:
                        # If all elements are identical, set p-value to 1.0 since there is no difference
                        p_value = 1.0
                        
                    # Add significance level
                    if p_value < 0.01:
                        significance = '**'
                    elif p_value < 0.05:
                        significance = '*'
                    elif p_value < 0.1:
                        significance = '.'
                    else:
                        significance = 'ns'

                    type_row = {
                        'model': model,
                        'modification': neg_type,
                        'original_acc': decimal.Decimal(type_orig_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'modified_acc': decimal.Decimal(type_mod_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'pct_diff': decimal.Decimal(type_pct_diff).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'weighted_delta': decimal.Decimal(type_weighted_delta).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'wilcoxon_pvalue': decimal.Decimal(p_value_w).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'mannwhitney_pvalue': decimal.Decimal(p_value_mw).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'pvalue': decimal.Decimal(p_value).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'significance': significance,
                        'significant': p_value < 0.05
                    }
                    negation_results_rows.append(type_row)

# Convert to dataframes
results_df = pd.DataFrame(results_rows)
negation_results_df = pd.DataFrame(negation_results_rows)

# Save results
results_df.to_csv(base_path / 'coreference_plm_results.csv', index=False)
negation_results_df.to_csv(base_path / 'coreference_plm_negation_results.csv', index=False)

# Display results
# print("Results by modification and model:")
# print(results_df)


In [20]:
# Also analyze results from eval/results/rongxin/
eval_base_path = Path('../eval/results/thinh')
eval_results_rows = []
eval_negation_results_rows = []

for mod in os.listdir(eval_base_path):
    if not mod.endswith('_100.csv'):
        continue
    model = mod.split('-0shot-')[0]
    if model == 'mixtral':
        continue
    mod = mod.split('-0shot-')[1].replace('_100.csv', '')
    # Load predictions from CSV
    eval_filepath = eval_base_path / f'{model}-0shot-{mod}_100.csv'
    if not eval_filepath.exists():
        continue
    df = pd.read_csv(eval_filepath)
    
    compare_file = Path(f'../preprocessing/data_after_phase2/thinh/{mod}_100.json')
    if not compare_file.exists():
        continue
    compare_df = json.load(open(compare_file))
    if len(df) != len(compare_df):
        print(f"Warning: Length mismatch for {mod} {model}")
    
    # Calculate accuracies
    orig_correct = sum(df['original_pred'] == df['original_label'])
    mod_correct = sum(df['modified_pred'] == df['modified_label'])
    total = len(df)
        
    orig_acc = orig_correct / total * 100 if total > 0 else 0
    mod_acc = mod_correct / total * 100 if total > 0 else 0
    pct_diff = ((mod_acc - orig_acc) / orig_acc) * 100 if orig_acc > 0 else 0
    weighted_delta = (mod_acc - orig_acc) * np.log10(orig_acc) / np.log10(100) if orig_acc > 0 else 0
    
    # Convert predictions to binary (0/1) based on correctness for t-test
    orig_binary = (df['original_pred'] == df['original_label']).astype(int)
    mod_binary = (df['modified_pred'] == df['modified_label']).astype(int)
    
    # Perform paired t-test on binary correctness values
    try:
        _, p_value_mw = stats.mannwhitneyu(orig_binary, mod_binary)
        _, p_value_wilc = stats.wilcoxon(orig_binary, mod_binary)
        p_value = min(p_value_mw, p_value_wilc)  # Use most conservative p-value
    except ValueError:
        # If all elements are identical, set p-value to 1.0 since there is no difference
        p_value = 1.0
        
    # Add significance level
    if p_value < 0.01:
        significance = '**'
    elif p_value < 0.05:
        significance = '*'
    elif p_value < 0.1:
        significance = '.'
    else:
        significance = 'ns'
    
    row = {
        'model': model,
        'modification': mod,
        'original_acc': decimal.Decimal(orig_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
        'modified_acc': decimal.Decimal(mod_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
        'percentage_diff': decimal.Decimal(pct_diff).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
        'weighted_delta': decimal.Decimal(weighted_delta).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
        'p_value': decimal.Decimal(p_value).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
        'significance': significance,
        'significant': p_value < 0.05
    }
    eval_results_rows.append(row)
    
    # Add rows for each negation type if this is negation mod
    if mod == 'negation':
        # Verify negation types match expected values
        expected_types = {'absolute', 'double', 'lexical', 'approximate', 'verbal'}
        actual_types = set(df['type'].unique())
        if actual_types != expected_types:
            print(f"Warning: Unexpected negation types for {model}")
            print(f"Expected: {expected_types}")
            print(f"Found: {actual_types}")
            
        for neg_type in df['type'].unique():
            type_df = df[df['type'] == neg_type]
            
            type_orig_correct = sum(type_df['original_pred'] == type_df['original_label'])
            type_mod_correct = sum(type_df['modified_pred'] == type_df['modified_label'])
            type_total = len(type_df)
            
            type_orig_acc = type_orig_correct / type_total * 100 if type_total > 0 else 0
            type_mod_acc = type_mod_correct / type_total * 100 if type_total > 0 else 0
            type_pct_diff = ((type_mod_acc - type_orig_acc) / type_orig_acc) * 100 if type_orig_acc > 0 else 0
            type_weighted_delta = (type_mod_acc - type_orig_acc) * np.log10(type_orig_acc) / np.log10(100) if type_orig_acc > 0 else 0
            
            # Statistical tests for this negation type
            type_orig_binary = (type_df['original_pred'] == type_df['original_label']).astype(int)
            type_mod_binary = (type_df['modified_pred'] == type_df['modified_label']).astype(int)
            
            try:
                _, type_p_value_mw = stats.mannwhitneyu(type_orig_binary, type_mod_binary)
                _, type_p_value_wilc = stats.wilcoxon(type_orig_binary, type_mod_binary)
                type_p_value = min(type_p_value_mw, type_p_value_wilc)
            except ValueError:
                type_p_value = 1.0
                
            # Add significance level
            if type_p_value < 0.01:
                significance = '**'
            elif type_p_value < 0.05:
                significance = '*'
            elif type_p_value < 0.1:
                significance = '.'
            else:
                significance = 'ns'
                
            type_row = {
                'model': model,
                'modification': neg_type,
                'original_acc': decimal.Decimal(type_orig_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'modified_acc': decimal.Decimal(type_mod_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'pct_diff': decimal.Decimal(type_pct_diff).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'weighted_delta': decimal.Decimal(type_weighted_delta).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'wilcoxon_pvalue': decimal.Decimal(type_p_value_wilc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'mannwhitney_pvalue': decimal.Decimal(type_p_value_mw).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'pvalue': decimal.Decimal(type_p_value).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'significance': significance,
                'significant': type_p_value < 0.05
            }
            eval_negation_results_rows.append(type_row)

# Convert to dataframes
eval_results_df = pd.DataFrame(eval_results_rows)
eval_negation_results_df = pd.DataFrame(eval_negation_results_rows)

# Save results
eval_results_df.to_csv(base_path / 'coreference_llm_results.csv', index=False)
eval_negation_results_df.to_csv(base_path / 'coreference_llm_negation_results.csv', index=False)

# Display results
print("\nEvaluation Results by modification and model:")
# print(eval_results_df)
print("\nNegation Results by type and model:")
# print(eval_negation_results_df)



Evaluation Results by modification and model:

Negation Results by type and model:


In [21]:
# Load LLM and PLM results
llm_results = pd.read_csv(base_path / 'coreference_llm_results.csv')
plm_results = pd.read_csv(base_path / 'coreference_plm_results.csv')

# Load LLM and PLM negation results
llm_neg_results = pd.read_csv(base_path / 'coreference_llm_negation_results.csv')
plm_neg_results = pd.read_csv(base_path / 'coreference_plm_negation_results.csv')

# Combine the regular results
combined_results = pd.concat([llm_results, plm_results], ignore_index=True)

# Combine the negation results
combined_neg_results = pd.concat([llm_neg_results, plm_neg_results], ignore_index=True)

# Sort by model and modification
combined_results = combined_results.sort_values(['model', 'modification'])
combined_neg_results = combined_neg_results.sort_values(['model', 'modification'])

# Save combined results
combined_results.to_csv(base_path / 'coreference_combined_results.csv', index=False)
combined_neg_results.to_csv(base_path / 'coreference_combined_negation_results.csv', index=False)

print("\nCombined Results:")
print(combined_results)
print("\nCombined Negation Results:")
print(combined_neg_results)



Combined Results:
              model         modification  original_acc  modified_acc  \
78  bert-base-cased    active_to_passive        49.474        47.368   
63  bert-base-cased       capitalization        53.535        48.485   
54  bert-base-cased               casual        55.000        43.000   
72  bert-base-cased        compound_word        60.417        60.417   
84  bert-base-cased  concept_replacement        44.000        42.000   
..              ...                  ...           ...           ...   
83          t5-base          punctuation        58.586        59.596   
98          t5-base            sentiment        64.000        64.000   
62          t5-base             singlish        42.157        44.118   
68          t5-base        temporal_bias        60.000        58.000   
92          t5-base            typo_bias        63.265        64.286   

    percentage_diff  weighted_delta  p_value significance significant  
78           -4.255          -1.784    0.732

In [22]:
import numpy as np
base_path = Path('../pretrained_language_models/coreference_resolution/tmp')
modification_order = [
    "Bias", "Temporal", "Geographical", "Length",
    "Orthographic", "Spelling", "Capitalization", "Punctuation",
    "Morphological", "Derivation", "Compound",
    "Syntactic", "Voice", "Grammar", "Conjunction",
    "Semantic", "Concept", "Negation",
    "Pragmatic", "Discourse", "Sentiment",
    "Genre", "Casual", "Dialectal", "Singlish"
]

# Read the combined results
df = pd.read_csv(base_path / 'coreference_combined_results.csv')

# Create mapping from modification names to standardized names
mod_mapping = {
    'temporal_bias': 'Temporal',
    'geographical_bias': 'Geographical', 
    'length_bias': 'Length',
    'typo_bias': 'Spelling',
    'capitalization': 'Capitalization',
    'punctuation': 'Punctuation',
    'derivation': 'Derivation',
    'compound_word': 'Compound',
    'active_to_passive': 'Voice',
    'grammatical_role': 'Grammar',
    'coordinating_conjunction': 'Conjunction',
    'concept_replacement': 'Concept',
    'negation': 'Negation',
    'discourse': 'Discourse',
    'sentiment': 'Sentiment',
    'casual': 'Casual',
    'dialectal': 'Dialectal',
    'singlish': 'Singlish'
}

# Map the modification names
df['modification'] = df['modification'].map(mod_mapping)

# Define model order and display names
model_order = ['bert-base-cased', 'gpt2', 't5-base', 'gpt4o', 'claude-3-5-sonnet', 'llama']
model_display = ['BERT', 'GPT-2', 'T5', 'GPT-4o', 'Claude 3.5', 'Llama 3.1']

# Get original accuracy for each model
original_acc = df.groupby('model')['original_acc'].first() * 100

# Pivot the data to get modifications as rows and models as columns
pivot_df = df.pivot(index='modification', columns='model', values='weighted_delta')
p_values = df.pivot(index='modification', columns='model', values='p_value')

# Reorder rows and columns
pivot_df = pivot_df.reindex(modification_order, axis=0)
pivot_df = pivot_df.reindex(model_order, axis=1)
p_values = p_values.reindex(modification_order, axis=0)
p_values = p_values.reindex(model_order, axis=1)

# Function to generate color based on value
def get_color(val, p_val):
    if np.isnan(val):
        return ''
    elif val > 0:
        # Green gradient for positive values
        intensity = min(abs(val)/10, 1)  # Scale to max 10% change
        val_str = f'+{val:.2f}'
        if p_val < 0.01:
            val_str = f'\\textbf{{{val_str}}}**'
        elif p_val < 0.05:
            val_str = f'\\textbf{{{val_str}}}*'
        elif p_val < 0.1:
            val_str = f'\\textbf{{{val_str}}}'
        return f'\\cellcolor{{green!{int(intensity*30)}}} {val_str}'
    else:
        # Red gradient for negative values
        intensity = min(abs(val)/10, 1)  # Scale to max 10% change
        val_str = f'{val:.2f}'
        if p_val < 0.01:
            val_str = f'\\textbf{{{val_str}}}**'
        elif p_val < 0.05:
            val_str = f'\\textbf{{{val_str}}}*'
        elif p_val < 0.1:
            val_str = f'\\textbf{{{val_str}}}'
        return f'\\cellcolor{{red!{int(intensity*30)}}} {val_str}'

# Generate LaTeX table
latex_table = '\\begin{table}[h]\n\\centering\n\\footnotesize\n\\begin{adjustbox}{max width=\\linewidth}\n'
latex_table += '\\begin{tabular}{ll' + 'c'*len(pivot_df.columns) + '}\n'
latex_table += '\\hline\n'
latex_table += 'Category & Modification & ' + ' & '.join([f'\\textbf{{{name}}}' for name in model_display]) + ' \\\\\n'
latex_table += '\\hline\n'

# Add original accuracy row
latex_table += '\\rowcolor{gray!20}\\textit{Original} & & ' + ' & '.join([f'{acc:.2f}' for acc in original_acc]) + ' \\\\\n'

# Track displayed categories
displayed_categories = set()

# Display all modifications including categories
for idx in modification_order:
    if idx in ['Bias', 'Orthographic', 'Morphological', 'Syntactic', 'Semantic', 'Pragmatic', 'Genre']:
        if idx not in displayed_categories:
            latex_table += f'\\textbf{{{idx}}} & & & & & & & \\\\\n'
            displayed_categories.add(idx)
    else:
        latex_table += f'& \\textbf{{{idx}}} & '
        if idx in pivot_df.index:
            latex_table += ' & '.join([get_color(val, p_values.loc[idx, col]) for col, val in pivot_df.loc[idx].items()]) + ' \\\\\n'
        else:
            latex_table += ' & '.join([''] * len(model_order)) + ' \\\\\n'

latex_table += '\\end{tabular}\n'
latex_table += '\\end{adjustbox}\n'
latex_table += '\\caption{Percentage Change in Micro F1 Score by Model and Modification Type}\n'
latex_table += '\\label{tab:dialogue_results_table}\n'
latex_table += '\\end{table}'

# Save to file
with open(base_path / 'coreference_results_table.tex', 'w') as f:
    f.write(latex_table)

print("LaTeX table saved to .tex")


LaTeX table saved to .tex


In [23]:
# Load results from CSV
base_path = Path('../pretrained_language_models/coreference_resolution/tmp')

results_df = pd.read_csv(base_path / 'coreference_combined_results.csv')

# Create lists of unique modifications and models
modification_order = list(mod_mapping.keys())
model_order = ['bert-base-cased', 'gpt2', 't5-base', 'gpt4o', 'claude-3-5-sonnet', 'llama']
negation_order = ['verbal', 'lexical', 'double', 'approximate', 'absolute']
# Create empty DataFrame with multi-level columns
columns = pd.MultiIndex.from_product([model_order, ['original', 'modified', 'diff']])
results_df_pivot = pd.DataFrame(index=modification_order, columns=columns)

# Fill DataFrame
for mod in modification_order:
    for model in model_order:
        row = results_df[(results_df['modification'] == mod) & (results_df['model'] == model)]
        if not row.empty:
            results_df_pivot.loc[mod, (model, 'original')] = row['original_acc'].values[0]
            results_df_pivot.loc[mod, (model, 'modified')] = row['modified_acc'].values[0]
            results_df_pivot.loc[mod, (model, 'diff')] = row['percentage_diff'].values[0]
            if mod == 'temporal_bias' and model == 'bert':
                print(row)

# Save to CSV
results_df_pivot.to_csv(base_path / 'coreference_results_df.csv')

# Save negation results to separate CSV
# Load negation results from CSV
negation_results_df = pd.read_csv(base_path / 'coreference_combined_negation_results.csv')

# Create empty DataFrame with multi-level columns for negation results
negation_df_pivot = pd.DataFrame(index=['negation'], columns=columns)

# Fill DataFrame with negation results
for mod in negation_order:
    for model in model_order:
        row = negation_results_df[(negation_results_df['modification'] == mod) & (negation_results_df['model'] == model)]
        if not row.empty:   
            negation_df_pivot.loc[mod, (model, 'original')] = row['original_acc'].values[0]
            negation_df_pivot.loc[mod, (model, 'modified')] = row['modified_acc'].values[0] 
            negation_df_pivot.loc[mod, (model, 'diff')] = row['pct_diff'].values[0]

# Save negation results to CSV
negation_df_pivot.to_csv(base_path / 'coreference_negation_results_df.csv')


print("Results saved to coreference_results_df.csv and coreference_negation_results_df.csv")


Results saved to coreference_results_df.csv and coreference_negation_results_df.csv


In [24]:
import numpy as np

# Read the combined results
base_path = Path('dialogue_contradiction_detection/tmp')
df = pd.read_csv(base_path / 'dialogue_combined_negation_results.csv')

# Create mapping from modification names to standardized names
negation_order= ['Verbal', 'Lexical', 'Double', 'Approximate', 'Absolute']
mod_mapping = {
    'verbal': 'Verbal',
    'lexical': 'Lexical',
    'double': 'Double',
    'approximate': 'Approximate',
    'absolute': 'Absolute',
}

# Define model order
model_order = ['bert-base-cased', 'gpt2', 't5-base', 'gpt4o', 'claude-3-5-sonnet', 'llama']

# Map the modification names
df['modification'] = df['modification'].map(mod_mapping)

# Pivot the data to get modifications as rows and models as columns
pivot_df = df.pivot(index='modification', columns='model', values='weighted_delta')
p_values = df.pivot(index='modification', columns='model', values='pvalue')
significance = df.pivot(index='modification', columns='model', values='significance')

# Reorder rows according to modification_order
pivot_df = pivot_df.reindex(negation_order)
p_values = p_values.reindex(negation_order)
significance = significance.reindex(negation_order)

print(pivot_df)

# Function to generate color based on value
def get_color(val, sig):
    if np.isnan(val):
        return ''
    elif val > 0:
        # Green gradient for positive values
        intensity = min(abs(val)/10, 1)  # Scale to max 10% change
        val_str = f'+{val:.1f}'
        if sig == '.':  # Just bold for '.'
            val_str = f'\\textbf{{{val_str}}}'
        elif sig == '*':  # One asterisk
            val_str = f'\\textbf{{{val_str}}}*'
        elif sig == '**':  # Two asterisks
            val_str = f'\\textbf{{{val_str}}}**'
        return f'\\cellcolor{{green!{int(intensity*30)}}} {val_str}'
    else:
        # Red gradient for negative values
        intensity = min(abs(val)/10, 1)  # Scale to max 10% change
        val_str = f'{val:.1f}'
        if sig == '.':  # Just bold for '.'
            val_str = f'\\textbf{{{val_str}}}'
        elif sig == '*':  # One asterisk
            val_str = f'\\textbf{{{val_str}}}*'
        elif sig == '**':  # Two asterisks
            val_str = f'\\textbf{{{val_str}}}**'
        return f'\\cellcolor{{red!{int(intensity*30)}}} {val_str}'

# Generate LaTeX table
latex_table = '\\begin{table}[h]\n\\centering\n\\begin{tabular}{l' + 'r'*len(model_order) + '}\n'
latex_table += '\\hline\n'
latex_table += 'Modification & ' + ' & '.join([f'\\textbf{{{col}}}' for col in model_order]) + ' \\\\\n'
latex_table += '\\hline\n'

prev_category = None
for idx, row in pivot_df.iterrows():
    current_category = idx[0]  # Get first character of modification name
    if prev_category is not None and current_category != prev_category:
        latex_table += '\\hline\n'
    prev_category = current_category
    
    latex_table += f'\\textbf{{{idx}}} & '
    latex_table += ' & '.join([get_color(row[col], significance.loc[idx, col]) for col in model_order]) + ' \\\\\n'

latex_table += '\\hline\n'
latex_table += '\\end{tabular}\n'
latex_table += '\\caption{Percentage Change in Micro F1 Score by Model and Modification Type}\n'
latex_table += '\\label{tab:ner_results}\n'
latex_table += '\\end{table}'

# Save to file
with open('dialogue_negation_type_results_table.tex', 'w') as f:
    f.write(latex_table)

print("LaTeX table saved to ner_results_table.tex")


model         bert-base-cased  claude-3-5-sonnet    gpt2   gpt4o   llama  \
modification                                                               
Verbal                -21.342             -5.413 -15.757   5.487 -27.778   
Lexical               -27.731            -27.309 -30.582 -30.097 -30.344   
Double                  8.199            -36.364 -26.708 -17.390 -44.514   
Approximate           -26.923            -23.077 -22.676 -15.117 -19.067   
Absolute              -22.240            -30.769 -29.653 -30.234 -38.462   

model         t5-base  
modification           
Verbal         -5.336  
Lexical       -27.731  
Double        -17.390  
Approximate   -18.897  
Absolute      -37.793  
LaTeX table saved to ner_results_table.tex


In [25]:
import numpy as np

# Read the combined results
base_path = Path('coreference_resolution/tmp')
df = pd.read_csv(base_path / 'coreference_combined_negation_results.csv')

# Create mapping from modification names to standardized names
negation_order= ['Verbal', 'Lexical', 'Double', 'Approximate', 'Absolute']
mod_mapping = {
    'verbal': 'Verbal',
    'lexical': 'Lexical',
    'double': 'Double',
    'approximate': 'Approximate',
    'absolute': 'Absolute',
}

# Define model order
model_order = ['bert-base-cased', 'gpt2', 't5-base', 'gpt4o', 'claude-3-5-sonnet', 'llama']

# Map the modification names
df['modification'] = df['modification'].map(mod_mapping)

# Pivot the data to get modifications as rows and models as columns
pivot_df = df.pivot(index='modification', columns='model', values='weighted_delta')
p_values = df.pivot(index='modification', columns='model', values='pvalue')
significance = df.pivot(index='modification', columns='model', values='significance')

# Reorder rows according to modification_order
pivot_df = pivot_df.reindex(negation_order)
p_values = p_values.reindex(negation_order)
significance = significance.reindex(negation_order)

print(pivot_df)

# Function to generate color based on value
def get_color(val, sig):
    if np.isnan(val):
        return ''
    elif val > 0:
        # Green gradient for positive values
        intensity = min(abs(val)/10, 1)  # Scale to max 10% change
        val_str = f'+{val:.1f}'
        if sig == '.':  # Just bold for '.'
            val_str = f'\\textbf{{{val_str}}}'
        elif sig == '*':  # One asterisk
            val_str = f'\\textbf{{{val_str}}}*'
        elif sig == '**':  # Two asterisks
            val_str = f'\\textbf{{{val_str}}}**'
        return f'\\cellcolor{{green!{int(intensity*30)}}} {val_str}'
    else:
        # Red gradient for negative values
        intensity = min(abs(val)/10, 1)  # Scale to max 10% change
        val_str = f'{val:.1f}'
        if sig == '.':  # Just bold for '.'
            val_str = f'\\textbf{{{val_str}}}'
        elif sig == '*':  # One asterisk
            val_str = f'\\textbf{{{val_str}}}*'
        elif sig == '**':  # Two asterisks
            val_str = f'\\textbf{{{val_str}}}**'
        return f'\\cellcolor{{red!{int(intensity*30)}}} {val_str}'

# Generate LaTeX table
latex_table = '\\begin{table}[h]\n\\centering\n\\begin{tabular}{l' + 'r'*len(model_order) + '}\n'
latex_table += '\\hline\n'
latex_table += 'Modification & ' + ' & '.join([f'\\textbf{{{col}}}' for col in model_order]) + ' \\\\\n'
latex_table += '\\hline\n'

prev_category = None
for idx, row in pivot_df.iterrows():
    current_category = idx[0]  # Get first character of modification name
    if prev_category is not None and current_category != prev_category:
        latex_table += '\\hline\n'
    prev_category = current_category
    
    latex_table += f'\\textbf{{{idx}}} & '
    latex_table += ' & '.join([get_color(row[col], significance.loc[idx, col]) for col in model_order]) + ' \\\\\n'

latex_table += '\\hline\n'
latex_table += '\\end{tabular}\n'
latex_table += '\\caption{Percentage Change in Micro F1 Score by Model and Modification Type}\n'
latex_table += '\\label{tab:ner_results}\n'
latex_table += '\\end{table}'

# Save to file
with open(base_path / 'coreference_negation_type_results_table.tex', 'w') as f:
    f.write(latex_table)

print("LaTeX table saved to coreference_negation_type_results_table.tex")

model         bert-base-cased  claude-3-5-sonnet    gpt2   gpt4o   llama  \
modification                                                               
Verbal                  4.351            -24.118   0.000 -29.314 -28.941   
Lexical                -3.540             -8.176  -7.224  -4.002 -12.138   
Double                  0.000              0.000   5.563  -6.218   0.000   
Approximate            -5.435              8.859  -5.525 -12.186  -3.075   
Absolute              -11.719              0.000  22.449  12.138  22.449   

model         t5-base  
modification           
Verbal        -14.063  
Lexical        -7.483  
Double         11.127  
Approximate    -2.763  
Absolute       22.449  
LaTeX table saved to coreference_negation_type_results_table.tex
