In [13]:
from datasets import load_dataset
import dspy
import openai
import os
import re
import pandas as pd
import json
from dotenv import load_dotenv
import glob

In [23]:
load_dotenv()

True

In [15]:

openai.api_key = os.getenv('OPENAI_API_KEY')
openai.organization = os.getenv('OPENAI_ORGANIZATION')


In [16]:
lm = dspy.LM('openai/gpt-4o', temperature=0, max_tokens=250)
dspy.configure(lm=lm)

In [10]:
ds = pd.read_json('../preprocessing/train_dev_test_data/coref/test.json')
ds = ds.to_dict('records')


In [11]:
ds

[{'label': 0,
  'candidates': ['Sabina', 'Maria'],
  'pronoun': 'she',
  'text': 'Sabina is trying to look for Maria, but to no avail, as she worries for her safety.'},
 {'label': 1,
  'candidates': ['The map', 'The building'],
  'pronoun': 'it',
  'text': "I'm sure that my map will show this building; it is very famous."},
 {'label': 0,
  'candidates': ['Gino', 'Ross'],
  'pronoun': 'he',
  'text': 'Though Gino tries to avoid Ross, he can not help but eventually fall in love with him.'},
 {'label': 1,
  'candidates': ['Yueh', 'De Vries'],
  'pronoun': 'he',
  'text': 'De Vries kills Yueh but he also dies with Leto in the assassination attempt ; however Harkonnen survives.'},
 {'label': 0,
  'candidates': ['Hair Stylists', "the Cowboy's Cheerleaders"],
  'pronoun': 'they',
  'text': "Hair Stylists transformed the Cowboy's Cheerleaders into beauties, so they put much product into their hair."},
 {'label': 1,
  'candidates': ['Derek Smeathe', 'Ed Helms'],
  'pronoun': 'him',
  'text': 'E

In [17]:
def remove_space(text):
    # Remove multiple spaces
    text = ' '.join(text.split())
    # lines = text.split('\n')
    # for i,line in enumerate(lines):
        
    # lines[i] = lines[i].replace('  ', ' ')
    # Fix spacing around punctuation
    text = re.sub(r'\s+([.,!?])', r'\1', text)
    text = re.sub(r'([.,!?])\s+', r'\1 ', text)
    
    # Fix contractions
    text = re.sub(r'\s*\'\s*s\b', "'s", text)
    text = re.sub(r'\s*n\s*\'\s*t\b', "n't", text)
    text = re.sub(r'\s*\'\s*ve\b', "'ve", text)
    text = re.sub(r'\s*\'\s*re\b', "'re", text)
    text = re.sub(r'\s*\'\s*ll\b', "'ll", text)
    text = re.sub(r'\s*\'\s*d\b', "'d", text)
    text = re.sub(r'\s*\'\s*m\b', "'m", text)
    
    # Fix spaces around parentheses
    text = re.sub(r'\(\s+', '(', text)
    text = re.sub(r'\s+\)', ')', text)
    
    # Remove spaces before and after text
    text = text.strip()
    # text = text.replace('agent 0: ','')
    # text = text.replace('agent 1: ','')
    return text


In [14]:
examples = [
    dspy.Example({ 
                  "text" : remove_space(r["text"]), 
                  "pronoun": r['pronoun'],
                  "candidate": '0: '+  str(r['candidates'][0]) +  ', 1: ' + str(r['candidates'][1]),
                  "label": r['label']

                }
                  ).with_inputs("text", 'pronoun', 'candidate') 
    
    for r in ds
    
    ]


In [15]:
example = examples[0]
for k, v in example.items():
    print(f"\n{k.upper()}:\n")
    print(v)



TEXT:

Sabina is trying to look for Maria, but to no avail, as she worries for her safety.

PRONOUN:

she

CANDIDATE:

0: Sabina, 1: Maria

LABEL:

0


In [18]:
def extract_prediction(text):
    matches = re.findall(r'\b[0-2]\b', text)
    # print(matches)
    parsed_answer = matches[-1] if matches else ""
    return parsed_answer

In [19]:
def eval_metric(true, prediction, trace=None):
    pred = prediction.label
    matches = re.findall(r'\b[0-9]\b', pred)
    # print(matches)
    parsed_answer = matches[-1] if matches else ""
    # print(parsed_answer)
    return parsed_answer == str(true.label)

# Evaluate the original test set

In [20]:
from dspy.evaluate import Evaluate

# evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=10)


In [165]:
class Coref(dspy.Signature):
    """Which candidate does the pronoun refer to? Answer with either 0 or 1."""
    text = dspy.InputField()
    pronoun = dspy.InputField()
    candidate = dspy.InputField()
    label = dspy.OutputField(desc="The index 0 or 1 of the candidates.", prefix = 'Answer:')

In [166]:
class SimpleCoref(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.Predict(Coref)

    def forward(self, text, pronoun, candidate):

        return self.prog(text = text, pronoun = pronoun, candidate = candidate)


In [167]:
simple_coref = SimpleCoref()

In [168]:
pred = simple_coref(text=example.text, pronoun = example.pronoun, candidate = example.candidate)
print("\nQUESTION:\n")
print(example.text)
print("\nPRONOUN:\n")
print(example.pronoun)
print("\nCANDIDATES:\n")
print(example.candidate)

# print("\nANSWER:\n")
# print(example.label)
print("\nPREDICTION:\n")
print(pred)



QUESTION:

The sniper shot the terrorist because he was a bad guy.

PRONOUN:

he

CANDIDATES:

0: The sniper, 1: the terrorist

PREDICTION:

Prediction(
    label='1'
)


In [148]:
eval_metric(example, pred)

True

In [169]:
evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=10, return_outputs= True, return_all_scores=True)
results = evaluate(simple_coref)
items = []
for sample in results[1]:
    item = {}
    sentence = sample[0]['text']
    pronoun = sample[0]['pronoun']
    candidate = sample[0]['candidate']
    label = sample[0]['label']
    pred = sample[1]['label']
    item['text'] = sentence
    item['pronoun'] = pronoun
    item['candidate'] = candidate
    item['label'] = label
    item['pred'] = pred
    items.append(item)
df_result = pd.DataFrame(data = items)
df_result.to_csv('results/coref/gpt4o-0shot-coref.csv')

Average Metric: 1 / 1  (100.0):   0%|          | 0/2118 [00:00<?, ?it/s]

Average Metric: 972 / 1227  (79.2):  58%|█████▊    | 1227/2118 [02:45<32:05,  2.16s/it]IOStream.flush timed out
Average Metric: 47 / 50  (94.0):   2%|▏         | 50/2118 [28:59<19:59:15, 34.79s/it]

Average Metric: 1556 / 2118  (73.5): 100%|██████████| 2118/2118 [04:37<00:00,  7.62it/s]


Unnamed: 0,text,pronoun,candidate,example_label,pred_label,eval_metric
0,The sniper shot the terrorist because he was a bad guy.,he,"0: The sniper, 1: the terrorist",1,1,✔️ [True]
1,The sniper shot the terrorist because he had orders.,he,"0: The sniper, 1: the terrorist",0,0,✔️ [True]
2,The chimpanzee could not use Linux because it was an animal.,it,"0: The chimpanzee, 1: Linux",0,0,✔️ [True]
3,The chimpanzee could not use Linux because it uses different commands than Windows.,it,"0: The chimpanzee, 1: Linux",1,1,✔️ [True]
4,Bill punched Bob in the face because he was being rude to Mary.,he,"0: Bill, 1: Bob",1,1,✔️ [True]
5,Bill punched Bob in the face because he wanted to protect Mary.,he,"0: Bill, 1: Bob",0,0,✔️ [True]
6,"Carl borrowed a book from Richard, but the book was never returned to him.",him,"0: Carl, 1: Richard",1,1,✔️ [True]
7,"Carl borrowed a book from Richard, but the book was unreadable to him.",him,"0: Carl, 1: Richard",0,0,✔️ [True]
8,Bill likes to play with Bob because he loves playing.,he,"0: Bill, 1: Bob",0,0,✔️ [True]
9,Bill likes to play with Bob but he did not want to.,he,"0: Bill, 1: Bob",1,1,✔️ [True]


In [151]:
class CoTCoref(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.ChainOfThought(Coref)

    def forward(self, text, pronoun, candidate):

        return self.prog(text = text, pronoun = pronoun, candidate = candidate)


In [152]:
cot_coref = CoTCoref()
pred = cot_coref(text=example.text, pronoun = example.pronoun, candidate = example.candidate)
print("\nQUESTION:\n")
print(example.text)
# print("\nANSWER:\n")
# print(example.label)
print("\nPREDICTION:\n")
print(pred)



QUESTION:

The sniper shot the terrorist because he was a bad guy.

PREDICTION:

Prediction(
    reasoning='The pronoun "he" in the sentence "The sniper shot the terrorist because he was a bad guy" refers to the reason for the sniper\'s action. The phrase "because he was a bad guy" suggests that the person being referred to as "he" is the one perceived as a "bad guy." In this context, it is more logical to conclude that "he" refers to "the terrorist," as the sniper shot him due to this perception.',
    label='1'
)


In [153]:
evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=10, return_outputs= True, return_all_scores=True)
results = evaluate(cot_coref)
items = []
for sample in results[1]:
    item = {}
    sentence = sample[0]['text']
    pronoun = sample[0]['pronoun']
    candidate = sample[0]['candidate']
    label = sample[0]['label']
    pred = sample[1]['label']
    reasoning = sample[1]['reasoning']
    item['text'] = sentence
    item['pronoun'] = pronoun
    item['candidate'] = candidate
    item['rationale'] = reasoning
    item['label'] = label
    item['pred'] = pred
    items.append(item)
df_result = pd.DataFrame(data = items)
df_result.to_csv('results/coref/gpt4o-0shot-cot-coref.csv')

Average Metric: 1656 / 2118  (78.2): 100%|██████████| 2118/2118 [11:32<00:00,  3.06it/s]


Unnamed: 0,text,pronoun,candidate,example_label,reasoning,pred_label,eval_metric
0,The sniper shot the terrorist because he was a bad guy.,he,"0: The sniper, 1: the terrorist",1,"The pronoun ""he"" in the sentence ""The sniper shot the terrorist because he was a bad guy"" refers to the reason for the sniper's action....",1,✔️ [True]
1,The sniper shot the terrorist because he had orders.,he,"0: The sniper, 1: the terrorist",0,"The pronoun ""he"" in the sentence ""The sniper shot the terrorist because he had orders"" is most likely referring to ""The sniper."" This is because...",0,✔️ [True]
2,The chimpanzee could not use Linux because it was an animal.,it,"0: The chimpanzee, 1: Linux",0,"The pronoun ""it"" in the sentence ""The chimpanzee could not use Linux because it was an animal"" refers to ""The chimpanzee."" The reason is that...",0,✔️ [True]
3,The chimpanzee could not use Linux because it uses different commands than Windows.,it,"0: The chimpanzee, 1: Linux",1,"The pronoun ""it"" in the sentence ""The chimpanzee could not use Linux because it uses different commands than Windows"" refers to the subject that uses...",1,✔️ [True]
4,Bill punched Bob in the face because he was being rude to Mary.,he,"0: Bill, 1: Bob",1,"The pronoun ""he"" in the sentence ""Bill punched Bob in the face because he was being rude to Mary"" refers to the person who was...",1,✔️ [True]
5,Bill punched Bob in the face because he wanted to protect Mary.,he,"0: Bill, 1: Bob",0,"The pronoun ""he"" is used to indicate the person who wanted to protect Mary. In the sentence, Bill is the one who took action by...",0,✔️ [True]
6,"Carl borrowed a book from Richard, but the book was never returned to him.",him,"0: Carl, 1: Richard",1,"The pronoun ""him"" refers to the person who originally owned the book and to whom it should have been returned. In this context, Richard is...",1,✔️ [True]
7,"Carl borrowed a book from Richard, but the book was unreadable to him.",him,"0: Carl, 1: Richard",0,"The pronoun ""him"" refers to the person who found the book unreadable. Since Carl borrowed the book from Richard, it is likely that Carl is...",0,✔️ [True]
8,Bill likes to play with Bob because he loves playing.,he,"0: Bill, 1: Bob",0,"The sentence states that ""Bill likes to play with Bob because he loves playing."" The pronoun ""he"" is likely referring to the person who loves...",0,✔️ [True]
9,Bill likes to play with Bob but he did not want to.,he,"0: Bill, 1: Bob",1,"The sentence states that ""Bill likes to play with Bob but he did not want to."" The pronoun ""he"" is used to indicate someone who...",1,✔️ [True]


# Evaluate by modification

## Without label change

In [18]:
def evaluate_modified_set(ds, program):
    examples = [
    dspy.Example({ 
                  "text" : remove_space(r['modified_text']), 
                  "original_text": remove_space(r['original_text']),
                  "pronoun": r['modified_pronoun'],
                  "candidate": '0: '+  str(r['modified_candidates'][0]) +  ', 1: ' + str(r['modified_candidates'][1]),
                  "label": int(r['modified_label']),
                  "modified_label": int(r['modified_label'])
                }
                  ).with_inputs("text", "pronoun", "candidate") 
    for r in ds
    ]
    evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=1, return_outputs= True, return_all_scores=True)
    results = evaluate(program)
    return results

In [19]:
class Coref(dspy.Signature):
    """Which candidate does the pronoun refer to? Answer with either 0 or 1."""
    text = dspy.InputField()
    pronoun = dspy.InputField()
    candidate = dspy.InputField()
    label = dspy.OutputField(desc="The index 0 or 1 of the candidates.", prefix = 'Answer:')
class SimpleCoref(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.Predict(Coref)

    def forward(self, text, pronoun, candidate):

        return self.prog(text = text, pronoun = pronoun, candidate = candidate)
simple_coref = SimpleCoref()

In [39]:
# Configure GPT-4 as the language model
lm = dspy.LM('openai/gpt-4o', temperature=0, max_tokens=300)
dspy.configure(lm=lm)

# Get all json files in the specified directory
json_files = glob.glob('../data/modified_data/coref/*_100.json')
original_pred_ds = pd.read_csv('results/coref/gpt4o-0shot-coref.csv', index_col=False)
original_pred_ds['text'] = original_pred_ds['text'].apply(remove_space)  # Replace 'your_function' with the actual function
# print(original_pred_ds['dialog'][1958])
for json_file in json_files:
    print(json_file)
    if 'grammatical_role' in json_file or 'negation' in json_file:
        continue
    # Load the json file
    # with open(json_file, 'r') as f:
    #     data = json.load(f)
    with open(json_file,'r') as f:
        data = json.load(f)
        # data = pd.read_json(json_file)
        # data = data.to_json(orient = 'records')
        # data = ast.literal_eval(data)
    # print(data)
    results_modified = evaluate_modified_set(data, simple_coref)
    items = []
    for sample in results_modified[1]:
        item = {}
        # print(sample[0])
        modified_text = sample[0]['text']
        original_text = sample[0]['original_text']

        pred = sample[1]['label']
        # rationale = sample[1]['reasoning']
        # original_pred = compare_dialog(original_pred_ds, original_pred_ds['dialog'], original_text)
        original_text = remove_space(original_text)
        # print(original_text)
        pred = extract_prediction(pred)
        # print()
        original_pred = original_pred_ds.loc[original_pred_ds['text'] == original_text]['pred'].values[0]
        item['original_text'] = original_text
        item['modified_text'] = modified_text
        item['modified_pronoun'] = sample[0]['pronoun']
        item['modified_candidates'] = sample[0]['candidate']
        item['modified_label'] = sample[0]['modified_label']
        item['modified_pred'] = pred
        item['original_pred'] = original_pred
        item['original_label'] = sample[0]['label']
        # item['reasoning'] = rationale
        items.append(item)
    
    df_result = pd.DataFrame(data=items)
    
    # Save results with filename based on input json
    output_filename = f"results/coref/gpt4o-0shot-{json_file.split('/')[-1].replace('.json', '')}.csv"
    df_result.to_csv(output_filename)

../data/modified_data/coref/casual_100.json


Average Metric: 76 / 100  (76.0): 100%|██████████| 100/100 [00:02<00:00, 37.04it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,modified_label,pred_label,eval_metric
0,"Sally's got a crush on Bella, but she's bummed out to find out she's tying the knot with Emily Vincent.","Sally becomes attracted to Bella, but she is disappointed to learn she is marrying Emily Vincent.",she,"0: Sally, 1: Bella",1,1,1,✔️ [True]


../data/modified_data/coref/discourse_100.json


Average Metric: 69 / 100  (69.0): 100%|██████████| 100/100 [00:02<00:00, 42.14it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,modified_label,pred_label,eval_metric
0,"The villagers attempt to have the werewolves executed, however, they are instead exiled by the clan leader.","The villagers attempt to have the werewolves executed, but they are instead exiled by the clan leader.",they,"0: the werewolves, 1: The villagers",0,0,1,


../data/modified_data/coref/compound_word_100.json


Average Metric: 78 / 96  (81.2): 100%|██████████| 96/96 [00:00<00:00, 142.35it/s] 


Unnamed: 0,text,original_text,pronoun,candidate,example_label,modified_label,pred_label,eval_metric
0,"Cathy realizes that Heather has overheard, so she is overcome by guilt and she runs out after her into a raging thunderstorm.","Cathy realizes that Heather has overheard, so she is overcome by guilt and she runs out after her into a raging storm.",her,"0: Cathy, 1: Heather",1,1,1,✔️ [True]


../data/modified_data/coref/temporal_bias_100.json


Average Metric: 80 / 100  (80.0): 100%|██████████| 100/100 [00:02<00:00, 35.87it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,modified_label,pred_label,eval_metric
0,"Josie did not fancy Sarah, but she did not tell her.","Josie did not like Sarah, but she did not tell her.",she,"0: Josie, 1: Sarah",0,0,0,✔️ [True]


../data/modified_data/coref/coordinating_conjunction_100.json


Average Metric: 76 / 100  (76.0): 100%|██████████| 100/100 [00:01<00:00, 83.45it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,modified_label,pred_label,eval_metric
0,"Shatterstar attempts to kill Arcade, but he only destroys and dismantles a robotic double.","Shatterstar attempts to kill Arcade, but he only destroys a robotic double.",he,"0: Arcade, 1: Shatterstar",1,1,1,✔️ [True]


../data/modified_data/coref/capitalization_100.json


Average Metric: 81 / 100  (81.0): 100%|██████████| 100/100 [00:05<00:00, 18.48it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,modified_label,pred_label,eval_metric
0,"Lingad was defeated by Estelito Mendoza, but he raised charges of FRAUD which led to the staging of a new election for governor.","Lingad was defeated by Estelito Mendoza, but he raised charges of fraud which led to the staging of a new election for governor.",he,"0: Lingad, 1: Estelito Mendoza",0,0,0,✔️ [True]


../data/modified_data/coref/dialectal_100.json


Average Metric: 77 / 100  (77.0): 100%|██████████| 100/100 [00:01<00:00, 83.55it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,modified_label,pred_label,eval_metric
0,Journalist Jack Anderson was thinking maybe Watergate Special Prosecutor Archibald Cox kena fired cos he starting to kaypoh into Rebozo's part in Nixon taking secret...,Journalist Jack Anderson speculated that Watergate Special Prosecutor Archibald Cox had been fired because he had started to investigate Rebozo's role in Nixon accepting covert...,he,"0: Journalist Jack Anderson, 1: Watergate Special Prosecutor Archibald Cox",1,1,0,


../data/modified_data/coref/sentiment_100.json


Average Metric: 74 / 100  (74.0): 100%|██████████| 100/100 [00:00<00:00, 147.23it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,modified_label,pred_label,eval_metric
0,The cat was afraid of the friendly dog because it was timid.,The cat was afraid of the dog because it was timid.,it,"0: The cat, 1: the friendly dog",0,0,0,✔️ [True]


../data/modified_data/coref/grammatical_role_100.json
../data/modified_data/coref/length_bias_100.json


Average Metric: 75 / 100  (75.0): 100%|██████████| 100/100 [00:01<00:00, 77.69it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,modified_label,pred_label,eval_metric
0,"Janis has been ordered to kill Noelia, but she is unsure if she can actually go along with this plan.","Janis is ordered to kill Noelia, but doesn't know if she can go along with this.",she,"0: Janis, 1: Noelia",0,0,0,✔️ [True]


../data/modified_data/coref/concept_replacement_100.json


Average Metric: 72 / 100  (72.0): 100%|██████████| 100/100 [00:00<00:00, 125.98it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,modified_label,pred_label,eval_metric
0,"Dan walks away from Samuel and he tries plarding after him, but he trips and falls.","Dan walks away from Samuel and he tries running after him, but he trips and falls.",him,"0: Dan, 1: Samuel",0,0,0,✔️ [True]


../data/modified_data/coref/typo_bias_100.json


Average Metric: 78 / 100  (78.0): 100%|██████████| 100/100 [00:03<00:00, 33.30it/s] 


Unnamed: 0,text,original_text,pronoun,candidate,example_label,modified_label,pred_label,eval_metric
0,"Jason built Rocky a robot, and he gave it to himm.","Jason built Rocky a robot, and he gave it to him.",he,"0: Jason, 1: Rocky",0,0,0,✔️ [True]


../data/modified_data/coref/geographical_bias_100.json


Average Metric: 72 / 100  (72.0): 100%|██████████| 100/100 [00:00<00:00, 163.29it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,modified_label,pred_label,eval_metric
0,Yosuo refused to support Emilio since he was waiting to see whether Chuukese workers would rebel and whether Chuukese soldiers would refuse to follow orders.,Trotsky refused to support Lenin since he was waiting to see whether German workers would rebel and whether German soldiers would refuse to follow orders.,he,"0: Yosuo, 1: Emilio",1,1,0,


../data/modified_data/coref/punctuation_100.json


Average Metric: 82 / 100  (82.0): 100%|██████████| 100/100 [00:00<00:00, 133.02it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,modified_label,pred_label,eval_metric
0,"The cat was afraid of the dog, because it was timid.",The cat was afraid of the dog because it was timid.,it,"0: The cat, 1: the dog",0,0,0,✔️ [True]


../data/modified_data/coref/derivation_100.json


Average Metric: 78 / 100  (78.0): 100%|██████████| 100/100 [00:00<00:00, 167.45it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,modified_label,pred_label,eval_metric
0,Cosby backed Jay because Thea really liked him.,Cosby supported Jay because Thea really liked him.,him,"0: Cosby, 1: Jay",1,1,1,✔️ [True]


../data/modified_data/coref/active_to_passive_100.json


Average Metric: 67 / 95  (70.5): 100%|██████████| 95/95 [00:04<00:00, 20.79it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,modified_label,pred_label,eval_metric
0,The flashlights were turned on by the campers because it was dark without them.,The campers turned on their flashlights because it was dark without them.,them,"0: The campers, 1: their flashlights",1,1,1,✔️ [True]


../data/modified_data/coref/negation_100.json


## With label change

In [21]:
def evaluate_modified_set(ds, program):
    examples = [
    dspy.Example({ 
                  "text" : remove_space(r['modified_text']), 
                  "original_text": remove_space(r['original_text']),
                  "pronoun": r['modified_pronoun'],
                  "candidate": '0: '+  str(r['modified_candidates'][0]) +  ', 1: ' + str(r['modified_candidates'][1]),
                  "label": int(r['modified_label']),
                  "original_label": int(r['original_label']),
                  "original_pronoun": r['original_pronoun'],
                  "type": r['type']
                }
                  ).with_inputs("text", "pronoun", "candidate") 
    for r in ds
    ]
    evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=1, return_outputs= True, return_all_scores=True)
    results = evaluate(program)
    return results

In [10]:
class Coref(dspy.Signature):
    """Which candidate does the pronoun refer to? Answer with either 0 or 1."""
    text = dspy.InputField()
    pronoun = dspy.InputField()
    candidate = dspy.InputField()
    label = dspy.OutputField(desc="The index 0 or 1 of the candidates.", prefix = 'Answer:')
class SimpleCoref(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.Predict(Coref)

    def forward(self, text, pronoun, candidate):

        return self.prog(text = text, pronoun = pronoun, candidate = candidate)
simple_coref = SimpleCoref()

In [22]:
# Configure GPT-4 as the language model
lm = dspy.LM('openai/gpt-4o', temperature=0, max_tokens=300)
dspy.configure(lm=lm)

# Get all json files in the specified directory
json_files = glob.glob('../data/modified_data/coref/*_100.json')
original_pred_ds = pd.read_csv('results/coref/gpt4o-0shot-coref.csv', index_col=False)
original_pred_ds['text'] = original_pred_ds['text'].apply(remove_space)  # Replace 'your_function' with the actual function
# print(original_pred_ds['dialog'][1958])
for json_file in json_files:
    # if not any(x in json_file for x in ['active_to_passive']):
    #     continue
 
    # Load the json file
    print(json_file)
    # with open(json_file, 'r') as f:
    #     data = json.load(f)
    with open(json_file,'r') as f:
        data = json.load(f)
        # data = pd.read_json(json_file)
        # data = data.to_json(orient = 'records')
        # data = ast.literal_eval(data)
    # print(data)
    results_modified = evaluate_modified_set(data, simple_coref)
    items = []
    for sample in results_modified[1]:
        item = {}
        modified_text = sample[0]['text']
        original_text = sample[0]['original_text']

        label = sample[0]['label']
        pred = sample[1]['label']
        # rationale = sample[1]['reasoning']
        # original_pred = compare_dialog(original_pred_ds, original_pred_ds['dialog'], original_text)
        original_text = remove_space(original_text)
        # print(original_text)
        pred = extract_prediction(pred)

        # print()
        original_pred = original_pred_ds.loc[original_pred_ds['text'] == original_text]['pred'].values[0]
        item['original_text'] = original_text
        item['modified_text'] = modified_text
        item['modified_label'] = sample[0]['label']
        item['modified_pred'] = pred
        item['original_pred'] = original_pred
        item['modified_pronoun'] = sample[0]['pronoun']
        # if sample[0]['pronoun'] != sample[0]['original_pronoun']:
        #     continue
        item['modified_candidates'] = sample[0]['candidate']
        item['original_label'] = sample[0]['original_label']
        item['type'] = sample[0]['type']
        # item['reasoning'] = rationale
        items.append(item)
    
    df_result = pd.DataFrame(data=items)
    
    # Save results with filename based on input json
    output_filename = f"results/coref/gpt4o-0shot-{json_file.split('/')[-1].replace('.json', '')}.csv"
    df_result.to_csv(output_filename)

../data/modified_data/coref/casual_100.json


Average Metric: 74 / 100  (74.0): 100%|██████████| 100/100 [00:01<00:00, 81.54it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,original_label,original_pronoun,type,pred_label,eval_metric
0,"Sally's got a crush on Bella, but she's bummed out to find out she's tying the knot with Emily Vincent.","Sally becomes attracted to Bella, but she is disappointed to learn she is marrying Emily Vincent.",she,"0: Sally, 1: Bella",1,1,she,casual,1,✔️ [True]


../data/modified_data/coref/discourse_100.json


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:00<00:00, 765.57it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,original_label,original_pronoun,type,pred_label,eval_metric
0,"The villagers attempt to have the werewolves executed, however, they are instead exiled by the clan leader.","The villagers attempt to have the werewolves executed, but they are instead exiled by the clan leader.",they,"0: the werewolves, 1: The villagers",0,0,they,reverse,1,


../data/modified_data/coref/compound_word_100.json


Average Metric: 80 / 96  (83.3): 100%|██████████| 96/96 [00:00<00:00, 825.66it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,original_label,original_pronoun,type,pred_label,eval_metric
0,"Cathy realizes that Heather has overheard, so she is overcome by guilt and she runs out after her into a raging thunderstorm.","Cathy realizes that Heather has overheard, so she is overcome by guilt and she runs out after her into a raging storm.",her,"0: Cathy, 1: Heather",1,1,her,compound_word,1,✔️ [True]


../data/modified_data/coref/temporal_bias_100.json


Average Metric: 79 / 100  (79.0): 100%|██████████| 100/100 [00:00<00:00, 663.67it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,original_label,original_pronoun,type,pred_label,eval_metric
0,"Josie did not fancy Sarah, but she did not tell her.","Josie did not like Sarah, but she did not tell her.",she,"0: Josie, 1: Sarah",0,0,she,temporal_bias,0,✔️ [True]


../data/modified_data/coref/coordinating_conjunction_100.json


Average Metric: 76 / 97  (78.4): 100%|██████████| 97/97 [00:00<00:00, 182.41it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,original_label,original_pronoun,type,pred_label,eval_metric
0,"Shatterstar attempts to kill Arcade, but he only destroys and dismantles a robotic double.","Shatterstar attempts to kill Arcade, but he only destroys a robotic double.",he,"0: Arcade, 1: Shatterstar",1,1,he,coordinating_conjunction,1,✔️ [True]


../data/modified_data/coref/capitalization_100.json


Average Metric: 83 / 99  (83.8): 100%|██████████| 99/99 [00:02<00:00, 44.99it/s]  


Unnamed: 0,text,original_text,pronoun,candidate,example_label,original_label,original_pronoun,type,pred_label,eval_metric
0,"Lingad was defeated by Estelito Mendoza, but he raised charges of FRAUD which led to the staging of a new election for governor.","Lingad was defeated by Estelito Mendoza, but he raised charges of fraud which led to the staging of a new election for governor.",he,"0: Lingad, 1: Estelito Mendoza",0,0,he,all_caps,0,✔️ [True]


../data/modified_data/coref/dialectal_100.json


Average Metric: 79 / 100  (79.0): 100%|██████████| 100/100 [00:00<00:00, 822.60it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,original_label,original_pronoun,type,pred_label,eval_metric
0,Journalist Jack Anderson was thinking maybe Watergate Special Prosecutor Archibald Cox kena fired cos he starting to kaypoh into Rebozo's part in Nixon taking secret...,Journalist Jack Anderson speculated that Watergate Special Prosecutor Archibald Cox had been fired because he had started to investigate Rebozo's role in Nixon accepting covert...,he,"0: Journalist Jack Anderson, 1: Watergate Special Prosecutor Archibald Cox",1,1,he,singaporean_english,0,


../data/modified_data/coref/sentiment_100.json


Average Metric: 80 / 100  (80.0): 100%|██████████| 100/100 [00:00<00:00, 811.57it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,original_label,original_pronoun,type,pred_label,eval_metric
0,The cat was afraid of the friendly dog because it was timid.,The cat was afraid of the dog because it was timid.,it,"0: The cat, 1: the friendly dog",0,0,it,sentiment,0,✔️ [True]


../data/modified_data/coref/grammatical_role_100.json


Average Metric: 60 / 72  (83.3): 100%|██████████| 72/72 [00:00<00:00, 108.28it/s] 


Unnamed: 0,text,original_text,pronoun,candidate,example_label,original_label,original_pronoun,type,pred_label,eval_metric
0,Vivan asked Lakshman to get him some ice cream because he was hot.,Lakshman asked Vivan to get him some ice cream because he was hot.,he,"0: Vivan, 1: Lakshman",0,0,he,grammatical_role,0,✔️ [True]


../data/modified_data/coref/length_bias_100.json


Average Metric: 84 / 99  (84.8): 100%|██████████| 99/99 [00:01<00:00, 71.34it/s] 


Unnamed: 0,text,original_text,pronoun,candidate,example_label,original_label,original_pronoun,type,pred_label,eval_metric
0,"Janis has been ordered to kill Noelia, but she is unsure if she can actually go along with this plan.","Janis is ordered to kill Noelia, but doesn't know if she can go along with this.",she,"0: Janis, 1: Noelia",0,0,she,length_bias,0,✔️ [True]


../data/modified_data/coref/concept_replacement_100.json


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:00<00:00, 826.71it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,original_label,original_pronoun,type,pred_label,eval_metric
0,"Dan walks away from Samuel and he tries plarding after him, but he trips and falls.","Dan walks away from Samuel and he tries running after him, but he trips and falls.",him,"0: Dan, 1: Samuel",0,0,him,nonce,0,✔️ [True]


../data/modified_data/coref/typo_bias_100.json


Average Metric: 77 / 98  (78.6): 100%|██████████| 98/98 [00:00<00:00, 844.14it/s] 


Unnamed: 0,text,original_text,pronoun,candidate,example_label,original_label,original_pronoun,type,pred_label,eval_metric
0,"Jason built Rocky a robot, and he gave it to himm.","Jason built Rocky a robot, and he gave it to him.",he,"0: Jason, 1: Rocky",0,0,he,addition,0,✔️ [True]


../data/modified_data/coref/geographical_bias_100.json


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:00<00:00, 511.32it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,original_label,original_pronoun,type,pred_label,eval_metric
0,Yosuo refused to support Emilio since he was waiting to see whether Chuukese workers would rebel and whether Chuukese soldiers would refuse to follow orders.,Trotsky refused to support Lenin since he was waiting to see whether German workers would rebel and whether German soldiers would refuse to follow orders.,he,"0: Yosuo, 1: Emilio",1,1,he,geographical_bias,0,


../data/modified_data/coref/punctuation_100.json


Average Metric: 82 / 99  (82.8): 100%|██████████| 99/99 [00:00<00:00, 107.03it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,original_label,original_pronoun,type,pred_label,eval_metric
0,"The cat was afraid of the dog, because it was timid.",The cat was afraid of the dog because it was timid.,it,"0: The cat, 1: the dog",0,0,it,addition,0,✔️ [True]


../data/modified_data/coref/derivation_100.json


Average Metric: 79 / 98  (80.6): 100%|██████████| 98/98 [00:00<00:00, 796.11it/s]


Unnamed: 0,text,original_text,pronoun,candidate,example_label,original_label,original_pronoun,type,pred_label,eval_metric
0,The blimp impacted the tree because it was obstructing.,The blimp hit the tree because it was in the way.,it,"0: The blimp, 1: the tree",1,1,it,derivation,1,✔️ [True]


../data/modified_data/coref/active_to_passive_100.json


Average Metric: 79 / 95  (83.2): 100%|██████████| 95/95 [00:01<00:00, 84.01it/s]  


Unnamed: 0,text,original_text,pronoun,candidate,example_label,original_label,original_pronoun,type,pred_label,eval_metric
0,The flashlights were turned on by the campers because it was dark without them.,The campers turned on their flashlights because it was dark without them.,them,"0: the campers, 1: The flashlights",1,1,them,active_to_passive,1,✔️ [True]


../data/modified_data/coref/negation_100.json


Average Metric: 69 / 98  (70.4): 100%|██████████| 98/98 [00:00<00:00, 180.04it/s] 


Unnamed: 0,text,original_text,pronoun,candidate,example_label,original_label,original_pronoun,type,pred_label,eval_metric
0,Kathy paid Jane to leave but she did not return several weeks later.,Kathy paid Jane to leave but she returned several weeks later.,she,"0: Kathy, 1: Jane",1,1,she,verbal,1,✔️ [True]


# Aggregate results

In [44]:
modification_name = ['temporal_bias', 'geographical_bias','length_bias', 'typo_bias', 'capitalization', 'punctuation', 'derivation', 'compound_word','active_to_passive','grammatical_role', 'coordinating_conjunction', 'concept_replacement','negation','discourse','sentiment','casual', 'dialectal']

In [45]:
from scipy import stats

In [46]:
result_files = glob.glob('results/coref/gpt4o-0shot-*_100.csv')

aggregated_results = []

for file in result_files:
    # Extract modification type from filename
    mod_type = file.split('-')[-1].replace('.csv','')
    
    # Read results file
    df = pd.read_csv(file)

    # Calculate accuracies
    original_correct = (df['original_pred'] == df['original_label']).sum()
    modified_correct = (df['modified_pred'] == df['modified_label']).sum()
    total = len(df)

    original_acc = original_correct / total
    modified_acc = modified_correct / total
    
    # Calculate the difference between original_res and modified_res
    difference = -round(original_acc - modified_acc, 2)
    
    # Calculate percentage difference with respect to total samples
    pct_difference = -round((original_correct - modified_correct) / original_correct * 100, 2)
    
    # Perform t-test between original and modified predictions
    t_stat, p_value = stats.ttest_ind(
        (df['original_pred'] == df['original_label']).astype(float),
        (df['modified_pred'] == df['modified_label']).astype(float)
    )
    
    aggregated_results.append({
        'task': 'dialogue_contradiction_detection',
        'modification': mod_type,
        'original_res': round(original_acc, 2),
        'modified_res': round(modified_acc, 2),
        'difference': difference,  # Difference in accuracy
        'pct_difference': pct_difference,  # Percentage difference relative to total samples
        'p_value': p_value  # Add p-value from t-test
    })

# Create final results dataframe
results_df = pd.DataFrame(aggregated_results)

# Sort the results based on modification_name
modification_name = ['temporal_bias_100', 'geographical_bias_100','length_bias_100', 'typo_bias_100', 'capitalization_100', 'punctuation_100', 'derivation_100', 'compound_word_100','active_to_passive_100','grammatical_role_100', 'coordinating_conjunction_100', 'concept_replacement_100','negation_100','discourse_100','sentiment_100','casual_100', 'dialectal_100']
results_df['modification'] = pd.Categorical(results_df['modification'], categories=modification_name, ordered=True)
results_df = results_df.sort_values(by='modification')

# Calculate averages across all modifications
avg_original = results_df['original_res'].mean()
avg_modified = results_df['modified_res'].mean()
avg_difference = avg_original - avg_modified
avg_pct_difference = results_df['pct_difference'].mean()

# Add averages as a new row
results_df.loc[len(results_df)] = {
    'task': 'dialogue_contradiction_detection',
    'modification': 'average',
    'original_res': round(avg_original, 2),
    'modified_res': round(avg_modified, 2),
    'difference': -round(avg_difference, 2),
    'pct_difference': round(avg_pct_difference, 2),
    'p_value': None  # No p-value for average row
}

print("\n")
results_df.to_csv('results/coref/gpt4o-DP.csv')

# Apply styling to highlight rows where original_res > modified_res and significant p-values
def highlight_drops_and_significance(row):
    colors = [''] * len(row)
    if row['original_res'] > row['modified_res']:
        colors = ['background-color: red'] * len(row)
        # If p-value < 0.05, add bold text
        if 'p_value' in row and row['p_value'] is not None and row['p_value'] < 0.05:
            colors = ['background-color: red; font-weight: bold'] * len(row)
    return colors

results_df.round(2).style.apply(highlight_drops_and_significance, axis=1)






  results_df.loc[len(results_df)] = {


Unnamed: 0,task,modification,original_res,modified_res,difference,pct_difference,p_value
8,dialogue_contradiction_detection,temporal_bias_100,0.82,0.8,-0.02,-2.44,0.72
2,dialogue_contradiction_detection,geographical_bias_100,0.77,0.72,-0.05,-6.49,0.42
11,dialogue_contradiction_detection,length_bias_100,0.78,0.75,-0.03,-3.85,0.62
13,dialogue_contradiction_detection,typo_bias_100,0.81,0.78,-0.03,-3.7,0.6
3,dialogue_contradiction_detection,capitalization_100,0.8,0.81,0.01,1.25,0.86
10,dialogue_contradiction_detection,punctuation_100,0.83,0.82,-0.01,-1.2,0.85
7,dialogue_contradiction_detection,derivation_100,0.81,0.8,-0.01,-1.23,0.86
16,dialogue_contradiction_detection,compound_word_100,0.84,0.81,-0.03,-3.7,0.57
6,dialogue_contradiction_detection,active_to_passive_100,0.65,0.71,0.05,8.06,0.44
1,dialogue_contradiction_detection,grammatical_role_100,0.91,0.87,-0.04,-4.17,0.44


In [177]:
# Load results from different models
gpt4_df = pd.read_csv('results/coref/gpt4o-0shot-coref.csv')
claude_df = pd.read_csv('results/coref/claude-3-5-sonnet-0shot-coref.csv')
mixtral_df = pd.read_csv('results/coref/mixtral-8x22b-0shot-coref.csv')

# Calculate accuracy between predictions and labels
gpt4_acc = (gpt4_df['pred'] == gpt4_df['label']).mean()
claude_acc = (claude_df['pred'] == claude_df['label']).mean()
mixtral_acc = (mixtral_df['pred'] == mixtral_df['label']).mean()
# Calculate average accuracy for each model
print(f"GPT-4 Average Accuracy: {gpt4_acc:.2%}")
print(f"Claude-3.5 Average Accuracy: {claude_acc:.2%}")
print(f"Mixtral Average Accuracy: {mixtral_acc:.2%}")

# Create comparison dataframe
comparison_df = pd.DataFrame({
    'Model': ['GPT-4', 'Claude-3.5', 'Mixtral'],
    'Accuracy': [gpt4_acc, claude_acc, mixtral_acc]
})

# Style the dataframe
def highlight_max(s):
    is_max = s == s.max()
    return ['background-color: green' if v else '' for v in is_max]

styled_df = comparison_df.style.apply(highlight_max, subset=['Accuracy'])
styled_df


GPT-4 Average Accuracy: 73.47%
Claude-3.5 Average Accuracy: 76.53%
Mixtral Average Accuracy: 61.10%


Unnamed: 0,Model,Accuracy
0,GPT-4,0.734655
1,Claude-3.5,0.765345
2,Mixtral,0.610954
