In [19]:
from datasets import load_dataset
import dspy
import openai
import os
import re
import pandas as pd
import json
from dotenv import load_dotenv
import glob

In [None]:
# Load variables from a local .env file into the environment; the credentials
# read via os.getenv in the following cell are expected to come from here.
load_dotenv()

In [21]:

# Read the OpenAI credentials from the environment instead of hard-coding them.
# NOTE(review): every model configured in this notebook uses the `together_ai/`
# route, so these OpenAI settings look unused here — confirm before removing.
openai.api_key = os.getenv('OPENAI_API_KEY')
openai.organization = os.getenv('OPENAI_ORGANIZATION')


In [22]:
# Language model used for every experiment below: Llama-3.1-405B-Instruct-Turbo
# through the `together_ai/` route, temperature 0, at most 300 generated tokens.
lm = dspy.LM('together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', temperature=0, max_tokens=300)
# Register it as the default LM for all DSPy modules created afterwards.
dspy.configure(lm=lm)

In [10]:
# Read the coreference test split and turn it straight into a list of
# per-row dicts (one dict per example), which is what the cells below iterate.
ds = pd.read_json('../preprocessing/train_dev_test_data/coref/test.json').to_dict('records')


In [None]:
# Preview only the first few records; echoing the whole list floods the cell
# output and bloats the saved notebook.
ds[:3]

In [23]:
def remove_space(text):
    """Normalise the spacing of tokenised text.

    Collapses every whitespace run to a single space, then applies an ordered
    list of regex substitutions that re-attach punctuation, English
    contractions and parentheses to the neighbouring words.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with leading/trailing whitespace stripped.
    """
    # Ordered (pattern, replacement) pairs; the order matters because later
    # rules see the output of earlier ones.
    substitutions = (
        # Spacing around sentence punctuation.
        (r'\s+([.,!?])', r'\1'),
        (r'([.,!?])\s+', r'\1 '),
        # Contractions split by the tokeniser.
        (r'\s*\'\s*s\b', "'s"),
        (r'\s*n\s*\'\s*t\b', "n't"),
        (r'\s*\'\s*ve\b', "'ve"),
        (r'\s*\'\s*re\b', "'re"),
        (r'\s*\'\s*ll\b', "'ll"),
        (r'\s*\'\s*d\b', "'d"),
        (r'\s*\'\s*m\b', "'m"),
        # Padding inside parentheses.
        (r'\(\s+', '('),
        (r'\s+\)', ')'),
    )

    # split()/join collapses all whitespace (including newlines) to single spaces.
    cleaned = ' '.join(text.split())
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()


In [None]:
# Wrap every test record in a DSPy example. The two candidates are rendered as
# a single "0: ..., 1: ..." string, and only text/pronoun/candidate are marked
# as inputs, so the gold `label` is kept out of the model inputs.
examples = [
    dspy.Example(
        {
            "text": remove_space(row["text"]),
            "pronoun": row["pronoun"],
            "candidate": f"0: {row['candidate'][0]!s}, 1: {row['candidate'][1]!s}",
            "label": row["label"],
        }
    ).with_inputs("text", "pronoun", "candidate")
    for row in ds
]


In [None]:
# Take the first example and print each of its fields under an upper-cased
# heading as a quick sanity check of the preprocessing.
example = examples[0]
for field_name, field_value in example.items():
    print(f"\n{field_name.upper()}:\n")
    print(field_value)


In [25]:
def extract_prediction(text):
    """Extract the predicted candidate index from a model answer.

    Looks for stand-alone digits 0, 1 or 2 and returns the last one, on the
    assumption that the final digit in a free-text answer is the verdict.

    Args:
        text: Raw model output. Usually a string, but ``None`` (no output),
            ints, and floats read back from a CSV (including NaN) are accepted
            instead of raising ``TypeError``.

    Returns:
        The matched digit as a one-character string, or ``""`` when there is
        no usable answer.
    """
    if text is None:
        return ""
    # A float such as 1.0 would stringify to "1.0", whose last stand-alone
    # digit is "0"; collapse integral floats to ints first. NaN is not
    # integral, stringifies to "nan", and therefore yields "".
    if isinstance(text, float) and text.is_integer():
        text = int(text)
    matches = re.findall(r'\b[0-2]\b', str(text))
    return matches[-1] if matches else ""

In [26]:
def eval_metric(true, prediction, trace=None):
    """Exact-match metric for the coreference task.

    Takes the last stand-alone single digit found in ``prediction.label`` and
    compares it with ``str(true.label)``.

    Args:
        true: Gold example; only its ``label`` attribute is read.
        prediction: Model prediction; only its ``label`` attribute is read.
        trace: Unused; accepted so the function matches the metric signature
            it is called with.

    Returns:
        True when the parsed answer equals the gold label, otherwise False.
        A prediction whose label is missing or ``None`` scores False instead
        of raising ``TypeError`` inside ``re.findall``.
    """
    pred = getattr(prediction, 'label', None)
    if pred is None:
        return False
    # str() lets a non-string label (e.g. an int) go through the same parser.
    # NOTE(review): this accepts any digit 0-9, whereas extract_prediction in
    # this notebook only accepts 0-2 — confirm which range is intended.
    matches = re.findall(r'\b[0-9]\b', str(pred))
    parsed_answer = matches[-1] if matches else ""
    return parsed_answer == str(true.label)

# Evaluate the original test set

In [27]:
from dspy.evaluate import Evaluate

# evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=10)


In [23]:
class Coref(dspy.Signature):
    """Which candidate does the pronoun refer to? Answer with either 0 or 1."""
    # NOTE: DSPy uses a signature's docstring as the task instruction sent to
    # the model, so the docstring above is part of the prompt — do not reword
    # it as if it were ordinary documentation.

    # Inputs: the passage, the pronoun to resolve, and the two candidates
    # rendered as one "0: ..., 1: ..." string (see how `examples` is built).
    text = dspy.InputField()
    pronoun = dspy.InputField()
    candidate = dspy.InputField()
    # Output: the chosen candidate index, requested under the "Answer:" prefix.
    label = dspy.OutputField(desc="The index 0 or 1 of the candidates.", prefix = 'Answer:')

In [24]:
class SimpleCoref(dspy.Module):
    """Zero-shot coreference program: a single `dspy.Predict` over `Coref`."""

    def __init__(self):
        super().__init__()
        self.prog = dspy.Predict(Coref)

    def forward(self, text, pronoun, candidate):
        """Run the predictor on one example and return its prediction.

        Args:
            text: Passage containing the pronoun.
            pronoun: Pronoun to resolve.
            candidate: Candidate string passed through to the signature.
        """
        fields = dict(text=text, pronoun=pronoun, candidate=candidate)
        return self.prog(**fields)


In [25]:
# Instantiate the zero-shot program used by the evaluation cells below.
simple_coref = SimpleCoref()

In [None]:
# Smoke test: run the zero-shot program on the first example and print the
# inputs next to the prediction (the gold label is deliberately not printed).
pred = simple_coref(text=example.text, pronoun=example.pronoun, candidate=example.candidate)
for heading, value in (
    ("\nQUESTION:\n", example.text),
    ("\nPRONOUN:\n", example.pronoun),
    ("\nCANDIDATES:\n", example.candidate),
    ("\nPREDICTION:\n", pred),
):
    print(heading)
    print(value)


In [None]:
# Score the single prediction above with the evaluation metric; the cell
# displays the returned boolean.
eval_metric(example, pred)

In [None]:
# Evaluate the zero-shot program on the full test set, keeping the
# per-example outputs so they can be written to disk.
evaluate = Evaluate(devset=examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=10, return_outputs=True, return_all_scores=True)
results = evaluate(simple_coref)

# results[1] is iterated as the per-example outputs: sample[0] is indexed as
# the example and sample[1] as the prediction. The raw `label` text is stored
# as `pred`; it is not parsed here.
items = [
    {
        'text': sample[0]['text'],
        'pronoun': sample[0]['pronoun'],
        'candidate': sample[0]['candidate'],
        'label': sample[0]['label'],
        'pred': sample[1]['label'],
    }
    for sample in results[1]
]
df_result = pd.DataFrame(data=items)

# Create the output directory first so a missing folder cannot throw away the
# results of the (slow, paid) evaluation run above.
os.makedirs('results/coref', exist_ok=True)
df_result.to_csv('results/coref/llama-0shot-coref.csv')

In [151]:
class CoTCoref(dspy.Module):
    """Chain-of-thought coreference program: `dspy.ChainOfThought` over `Coref`."""

    def __init__(self):
        super().__init__()
        self.prog = dspy.ChainOfThought(Coref)

    def forward(self, text, pronoun, candidate):
        """Run the chain-of-thought predictor on one example.

        Args:
            text: Passage containing the pronoun.
            pronoun: Pronoun to resolve.
            candidate: Candidate string passed through to the signature.
        """
        fields = dict(text=text, pronoun=pronoun, candidate=candidate)
        return self.prog(**fields)


In [None]:
# Smoke test for the chain-of-thought program on the first example (the gold
# label is deliberately not printed).
cot_coref = CoTCoref()
pred = cot_coref(text=example.text, pronoun=example.pronoun, candidate=example.candidate)
for heading, value in (
    ("\nQUESTION:\n", example.text),
    ("\nPREDICTION:\n", pred),
):
    print(heading)
    print(value)


In [None]:
# Evaluate the chain-of-thought program on the full test set, keeping the
# per-example outputs so they can be written to disk.
evaluate = Evaluate(devset=examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=10, return_outputs=True, return_all_scores=True)
results = evaluate(cot_coref)

# results[1] is iterated as the per-example outputs: sample[0] is indexed as
# the example and sample[1] as the prediction. The prediction's `reasoning`
# field is saved in the `rationale` column.
items = [
    {
        'text': sample[0]['text'],
        'pronoun': sample[0]['pronoun'],
        'candidate': sample[0]['candidate'],
        'rationale': sample[1]['reasoning'],
        'label': sample[0]['label'],
        'pred': sample[1]['label'],
    }
    for sample in results[1]
]
df_result = pd.DataFrame(data=items)

# Create the output directory first so a missing folder cannot throw away the
# results of the (slow, paid) evaluation run above.
os.makedirs('results/coref', exist_ok=True)
df_result.to_csv('results/coref/llama-0shot-cot-coref.csv')

# Evaluate by modification

## Without label change

In [17]:
def evaluate_modified_set(ds, program):
    """Evaluate `program` on a perturbed dataset whose labels are unchanged.

    Args:
        ds: Iterable of records providing `modified_text`, `original_text`,
            `modified_pronoun`, `modified_candidates` and `modified_label`.
        program: DSPy program to evaluate.

    Returns:
        Whatever `Evaluate` returns when called with `return_outputs=True`
        and `return_all_scores=True`.
    """
    def _as_example(record):
        # The gold label is stored under both `label` (read by the metric)
        # and `modified_label` (read when the results are exported).
        gold = int(record['modified_label'])
        options = record['modified_candidates']
        fields = {
            "text": remove_space(record['modified_text']),
            "original_text": remove_space(record['original_text']),
            "pronoun": record['modified_pronoun'],
            "candidate": f"0: {options[0]!s}, 1: {options[1]!s}",
            "label": gold,
            "modified_label": gold,
        }
        return dspy.Example(fields).with_inputs("text", "pronoun", "candidate")

    devset = [_as_example(record) for record in ds]
    scorer = Evaluate(devset=devset, metric=eval_metric, num_threads=6, display_progress=True, display_table=1, return_outputs=True, return_all_scores=True)
    return scorer(program)

In [18]:
# NOTE(review): this cell repeats the Coref / SimpleCoref definitions from
# earlier in the notebook with the same fields and prompt text; presumably so
# this section can be run on its own. Keep the copies in sync if either changes.
class Coref(dspy.Signature):
    """Which candidate does the pronoun refer to? Answer with either 0 or 1."""
    # The docstring above is the task instruction DSPy sends to the model —
    # it is prompt text, not ordinary documentation.
    text = dspy.InputField()
    pronoun = dspy.InputField()
    candidate = dspy.InputField()
    label = dspy.OutputField(desc="The index 0 or 1 of the candidates.", prefix = 'Answer:')
# Zero-shot program: a single dspy.Predict call over the Coref signature.
class SimpleCoref(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.Predict(Coref)

    def forward(self, text, pronoun, candidate):

        return self.prog(text = text, pronoun = pronoun, candidate = candidate)
# Instance evaluated by the loop in the next cell.
simple_coref = SimpleCoref()

In [None]:
# Configure Llama-3.1-405B-Instruct-Turbo (via the together_ai route) as the language model
lm = dspy.LM('together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', temperature=0, max_tokens=300)
dspy.configure(lm=lm)

# Get all json files in the specified directory
json_files = glob.glob('../data/modified_data/coref/*_100.json')

# Zero-shot predictions on the original test set. `pred` is read as text so
# the raw model output survives the CSV round-trip and can be parsed with the
# same extractor as the modified predictions below.
original_pred_ds = pd.read_csv('results/coref/llama-0shot-coref.csv', index_col=False, dtype={'pred': str})
# Normalise the text the same way as the modified records so the join matches.
original_pred_ds['text'] = original_pred_ds['text'].apply(remove_space)

# text -> raw prediction, keeping the first occurrence of each text. This
# replaces a full DataFrame scan per sample with a dict lookup.
original_pred_by_text = {}
for text_key, raw_pred in zip(original_pred_ds['text'], original_pred_ds['pred']):
    original_pred_by_text.setdefault(text_key, raw_pred)

# Make sure the output directory exists before any evaluation is run.
os.makedirs('results/coref', exist_ok=True)

for json_file in json_files:
    print(json_file)
    # Only these two modification types are evaluated in this section.
    if not any(x in json_file for x in ['grammatical_role', 'derivation']):
        continue

    # Load the json file
    with open(json_file, 'r') as f:
        data = json.load(f)

    results_modified = evaluate_modified_set(data, simple_coref)
    items = []
    for sample in results_modified[1]:
        original_text = remove_space(sample[0]['original_text'])
        if original_text not in original_pred_by_text:
            raise KeyError(
                f"No original prediction found for text from {json_file}: {original_text!r}"
            )

        # Parse both predictions with the same extractor so they are directly
        # comparable with the integer labels; a missing or non-string value
        # becomes "" (no answer).
        raw_modified_pred = sample[1]['label']
        modified_pred = extract_prediction(raw_modified_pred) if isinstance(raw_modified_pred, str) else ""
        raw_original_pred = original_pred_by_text[original_text]
        original_pred = extract_prediction(raw_original_pred) if isinstance(raw_original_pred, str) else ""

        items.append({
            'original_text': original_text,
            'modified_text': sample[0]['text'],
            'modified_pronoun': sample[0]['pronoun'],
            'modified_candidates': sample[0]['candidate'],
            'modified_label': sample[0]['modified_label'],
            'modified_pred': modified_pred,
            'original_pred': original_pred,
            # The label is unchanged by these modifications, so the example's
            # `label` doubles as the original label.
            'original_label': sample[0]['label'],
        })

    df_result = pd.DataFrame(data=items)

    # Save results with filename based on input json. basename/splitext also
    # handles the backslash separators glob returns on Windows.
    modification_stem = os.path.splitext(os.path.basename(json_file))[0]
    output_filename = f"results/coref/llama-0shot-{modification_stem}.csv"
    df_result.to_csv(output_filename)

## With label change

In [28]:
def evaluate_modified_set(ds, program):
    """Evaluate `program` on a perturbed dataset whose labels may have changed.

    This redefinition replaces the earlier `evaluate_modified_set` and also
    carries `original_label`, `original_pronoun` and `type` on each example.

    Args:
        ds: Iterable of records providing `modified_text`, `original_text`,
            `modified_pronoun`, `modified_candidates`, `modified_label`,
            `original_label`, `original_pronoun` and `type`.
        program: DSPy program to evaluate.

    Returns:
        Whatever `Evaluate` returns when called with `return_outputs=True`
        and `return_all_scores=True`.
    """
    def _as_example(record):
        options = record['modified_candidates']
        fields = {
            "text": remove_space(record['modified_text']),
            "original_text": remove_space(record['original_text']),
            "pronoun": record['modified_pronoun'],
            "candidate": f"0: {options[0]!s}, 1: {options[1]!s}",
            # The metric scores against the modified label.
            "label": int(record['modified_label']),
            "original_label": int(record['original_label']),
            "original_pronoun": record['original_pronoun'],
            "type": record['type'],
        }
        return dspy.Example(fields).with_inputs("text", "pronoun", "candidate")

    devset = [_as_example(record) for record in ds]
    scorer = Evaluate(devset=devset, metric=eval_metric, num_threads=6, display_progress=True, display_table=1, return_outputs=True, return_all_scores=True)
    return scorer(program)

In [29]:
# NOTE(review): this cell repeats the Coref / SimpleCoref definitions from
# earlier in the notebook with the same fields and prompt text; presumably so
# this section can be run on its own. Keep the copies in sync if either changes.
class Coref(dspy.Signature):
    """Which candidate does the pronoun refer to? Answer with either 0 or 1."""
    # The docstring above is the task instruction DSPy sends to the model —
    # it is prompt text, not ordinary documentation.
    text = dspy.InputField()
    pronoun = dspy.InputField()
    candidate = dspy.InputField()
    label = dspy.OutputField(desc="The index 0 or 1 of the candidates.", prefix = 'Answer:')
# Zero-shot program: a single dspy.Predict call over the Coref signature.
class SimpleCoref(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.Predict(Coref)

    def forward(self, text, pronoun, candidate):

        return self.prog(text = text, pronoun = pronoun, candidate = candidate)
# Instance evaluated by the loop in the next cell.
simple_coref = SimpleCoref()

In [None]:
# Configure Llama-3.1-405B-Instruct-Turbo (via the together_ai route) as the language model
lm = dspy.LM('together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', temperature=0, max_tokens=300)
dspy.configure(lm=lm)

# Get all json files in the specified directory
json_files = glob.glob('../data/modified_data/coref/*_100.json')

# Zero-shot predictions on the original test set. `pred` is read as text so
# the raw model output survives the CSV round-trip and can be parsed with the
# same extractor as the modified predictions below.
original_pred_ds = pd.read_csv('results/coref/llama-0shot-coref.csv', index_col=False, dtype={'pred': str})
# Normalise the text the same way as the modified records so the join matches.
original_pred_ds['text'] = original_pred_ds['text'].apply(remove_space)

# text -> raw prediction, keeping the first occurrence of each text. This
# replaces a full DataFrame scan per sample with a dict lookup.
original_pred_by_text = {}
for text_key, raw_pred in zip(original_pred_ds['text'], original_pred_ds['pred']):
    original_pred_by_text.setdefault(text_key, raw_pred)

# Make sure the output directory exists before any evaluation is run.
os.makedirs('results/coref', exist_ok=True)

for json_file in json_files:
    # Load the json file
    print(json_file)
    with open(json_file, 'r') as f:
        data = json.load(f)

    results_modified = evaluate_modified_set(data, simple_coref)
    items = []
    for sample in results_modified[1]:
        original_text = remove_space(sample[0]['original_text'])
        if original_text not in original_pred_by_text:
            raise KeyError(
                f"No original prediction found for text from {json_file}: {original_text!r}"
            )

        # Parse both predictions with the same extractor so they are directly
        # comparable with the integer labels; a missing or non-string value
        # becomes "" (no answer).
        raw_modified_pred = sample[1]['label']
        modified_pred = extract_prediction(raw_modified_pred) if isinstance(raw_modified_pred, str) else ""
        raw_original_pred = original_pred_by_text[original_text]
        original_pred = extract_prediction(raw_original_pred) if isinstance(raw_original_pred, str) else ""

        items.append({
            'original_text': original_text,
            'modified_text': sample[0]['text'],
            # In this section the example's `label` is the modified label.
            'modified_label': sample[0]['label'],
            'modified_pred': modified_pred,
            'original_pred': original_pred,
            'modified_pronoun': sample[0]['pronoun'],
            'modified_candidates': sample[0]['candidate'],
            'original_label': sample[0]['original_label'],
            'type': sample[0]['type'],
        })

    df_result = pd.DataFrame(data=items)

    # Save results with filename based on input json. basename/splitext also
    # handles the backslash separators glob returns on Windows.
    modification_stem = os.path.splitext(os.path.basename(json_file))[0]
    output_filename = f"results/coref/llama-0shot-{modification_stem}.csv"
    print('saved to', output_filename)
    df_result.to_csv(output_filename)

# Aggregate results

In [35]:
from scipy import stats

In [None]:
result_files = glob.glob('results/coref/llama-0shot-*_100.csv')

# NOTE(review): this notebook evaluates coreference resolution, so this task
# label looks copy-pasted from another notebook. It is left unchanged in case
# downstream code keys on it — confirm the intended name.
TASK_NAME = 'dialogue_contradiction_detection'

aggregated_results = []

for file in result_files:
    # Extract modification type from filename (text after the last '-', minus '.csv')
    mod_type = file.split('-')[-1].replace('.csv','')

    # Read results file
    df = pd.read_csv(file)
    total = len(df)
    if total == 0:
        # Nothing to score; skipping avoids a division by zero below.
        print(f"skipping {file}: no rows")
        continue

    # Per-row correctness of the original and the modified predictions
    original_hits = df['original_pred'] == df['original_label']
    modified_hits = df['modified_pred'] == df['modified_label']
    original_correct = original_hits.sum()
    modified_correct = modified_hits.sum()

    # Calculate accuracies
    original_acc = original_correct / total
    modified_acc = modified_correct / total

    # Accuracy change (modified - original); negative means a drop
    difference = -round(original_acc - modified_acc, 2)

    # Change in the number of correct answers relative to the number that were
    # correct originally, in percent; undefined when none were correct.
    if original_correct:
        pct_difference = -round((original_correct - modified_correct) / original_correct * 100, 2)
    else:
        pct_difference = float('nan')

    # Perform t-test between original and modified correctness.
    # NOTE(review): both vectors come from the same rows of `df`, so the
    # samples are paired, while ttest_ind treats them as independent. A paired
    # test (e.g. stats.ttest_rel or McNemar's test) may be more appropriate —
    # confirm before reporting p-values.
    t_stat, p_value = stats.ttest_ind(
        original_hits.astype(float),
        modified_hits.astype(float)
    )

    aggregated_results.append({
        'task': TASK_NAME,
        'modification': mod_type,
        'original_res': round(original_acc, 2),
        'modified_res': round(modified_acc, 2),
        'difference': difference,  # Difference in accuracy
        'pct_difference': pct_difference,  # Percentage change relative to originally-correct samples
        'p_value': p_value  # Add p-value from t-test
    })

if not aggregated_results:
    raise FileNotFoundError(
        "No non-empty result files matched 'results/coref/llama-0shot-*_100.csv'; "
        "run the evaluation cells first."
    )

# Create final results dataframe
results_df = pd.DataFrame(aggregated_results)

# Sort the results based on modification_name
modification_name = ['temporal_bias_100', 'geographical_bias_100','length_bias_100', 'typo_bias_100', 'capitalization_100', 'punctuation_100', 'derivation_100', 'compound_word_100','active_to_passive_100','grammatical_role_100', 'coordinating_conjunction_100', 'concept_replacement_100','negation_100','discourse_100','sentiment_100','casual_100', 'dialectal_100']
results_df['modification'] = pd.Categorical(results_df['modification'], categories=modification_name, ordered=True)
results_df = results_df.sort_values(by='modification')

# Calculate averages across all modifications (taken over the already-rounded
# per-modification values)
avg_original = results_df['original_res'].mean()
avg_modified = results_df['modified_res'].mean()
avg_difference = avg_original - avg_modified
avg_pct_difference = results_df['pct_difference'].mean()

# Add averages as a new row
results_df.loc[len(results_df)] = {
    'task': TASK_NAME,
    'modification': 'average',
    'original_res': round(avg_original, 2),
    'modified_res': round(avg_modified, 2),
    'difference': -round(avg_difference, 2),
    'pct_difference': round(avg_pct_difference, 2),
    'p_value': None  # No p-value for average row
}

print("\n")
results_df.to_csv('results/coref/llama-DP.csv')

# Apply styling to highlight rows where original_res > modified_res and significant p-values
def highlight_drops_and_significance(row):
    """Row styler: mark accuracy drops, and bold the significant ones.

    Args:
        row: A row providing `original_res` and `modified_res`, and
            optionally `p_value`.

    Returns:
        One CSS string per cell: empty when accuracy did not drop, a red
        background when it did, and red plus bold when the row also has a
        non-None `p_value` below 0.05.
    """
    n_cells = len(row)
    if not row['original_res'] > row['modified_res']:
        return [''] * n_cells

    p = row['p_value'] if 'p_value' in row else None
    if p is not None and p < 0.05:
        return ['background-color: red; font-weight: bold'] * n_cells
    return ['background-color: red'] * n_cells

# Display the table rounded to 2 decimals, styled row by row (axis=1) with
# the highlighter defined above.
results_df.round(2).style.apply(highlight_drops_and_significance, axis=1)


In [None]:
# Load zero-shot results from the different models. The first file holds the
# Llama predictions produced by this notebook, so it is named and reported as
# Llama (it was previously reported under a "GPT-4" label).
llama_df = pd.read_csv('results/coref/llama-0shot-coref.csv')
claude_df = pd.read_csv('results/coref/claude-3-5-sonnet-0shot-coref.csv')
mixtral_df = pd.read_csv('results/coref/mixtral-8x22b-0shot-coref.csv')

# Accuracy = share of rows whose prediction equals the label.
# NOTE(review): `pred` is compared as stored in the CSV; if a file contains
# raw model text rather than a bare index, those rows count as wrong — confirm
# the files hold parsed predictions.
llama_acc = (llama_df['pred'] == llama_df['label']).mean()
claude_acc = (claude_df['pred'] == claude_df['label']).mean()
mixtral_acc = (mixtral_df['pred'] == mixtral_df['label']).mean()

# Report the accuracy of each model
print(f"Llama-3.1-405B Average Accuracy: {llama_acc:.2%}")
print(f"Claude-3.5 Average Accuracy: {claude_acc:.2%}")
print(f"Mixtral Average Accuracy: {mixtral_acc:.2%}")

# Create comparison dataframe
comparison_df = pd.DataFrame({
    'Model': ['Llama-3.1-405B', 'Claude-3.5', 'Mixtral'],
    'Accuracy': [llama_acc, claude_acc, mixtral_acc]
})

# Style the dataframe
def highlight_max(s):
    """Column styler: green background on every cell equal to the column maximum.

    Args:
        s: The column being styled.

    Returns:
        One CSS string per cell; empty for cells that are not the maximum.
    """
    peak = s.max()
    return ['background-color: green' if value == peak else '' for value in s]

# Highlight the best value in the Accuracy column and display the styled table.
styled_df = comparison_df.style.apply(highlight_max, subset=['Accuracy'])
styled_df
