In [51]:
import pandas as pd
import nltk
# Download the 'punkt' NLTK package; nltk.word_tokenize is called further down in
# this notebook when nouns are extracted from the sentences.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yaara.shriki/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [52]:
# Read the "niki" experiment file; lines=True means one JSON record per line.
df_niki = pd.read_json("../data/experiment_niki.jsonl", lines=True)
df_niki

Unnamed: 0,sentence,question,question_type,length,gender_match,correct_answer
0,The tourist hired the guides that the locals a...,Did the locals anger the guides?,filler-Q,short,mismatch,true
1,The tourists hired the guides that the locals ...,Did the locals anger the guides?,distractor-Q,short,match,true
2,The tourist hired the guides that two days ago...,Did the locals anger the guides?,filler-Q,long,mismatch,true
3,The tourists hired the guides who two days ago...,Did the locals anger the guides?,distractor-Q,long,match,true
4,The tourist hired the guides that the locals a...,Did the locals anger the tourist?,filler-Q,short,mismatch,false
...,...,...,...,...,...,...
251,The donors bored the researchers that last yea...,Did the interns disturb the researchers?,distractor-Q,long,match,true
252,The donor bored the researchers that interns d...,Did the interns disturb the donor?,filler-Q,short,mismatch,false
253,The donors bored the researchers that interns ...,Did the interns disturb the donors?,distractor-Q,short,match,false
254,The donor bored the researchers that last year...,Did the interns disturb the donor?,filler-Q,long,mismatch,false


In [53]:
# Example: all rows that share one particular sentence
example_sentence = "The tourists hired the guides that the locals angered due to a misunderstanding"
df_niki.loc[df_niki["sentence"].eq(example_sentence)]

Unnamed: 0,sentence,question,question_type,length,gender_match,correct_answer
1,The tourists hired the guides that the locals ...,Did the locals anger the guides?,distractor-Q,short,match,True
5,The tourists hired the guides that the locals ...,Did the locals anger the tourists?,distractor-Q,short,match,False


In [54]:
# Keep only the items whose expected answer is "true" and renumber the rows.
# The comparison runs on a lower-cased string view of the column so that the filter
# works whether read_json produced the strings "true"/"false" or real booleans (the
# saved tables in this notebook render the column both as `true` and as `True`).
# Comparing a boolean column with the string 'true' would silently select no rows.
is_true_answer = df_niki["correct_answer"].astype(str).str.lower().eq("true")
df_niki = df_niki.loc[is_true_answer].reset_index(drop=True)

In [55]:
# Display the frame after the correct_answer filter (the index was reset there).
df_niki

Unnamed: 0,sentence,question,question_type,length,gender_match,correct_answer
0,The tourist hired the guides that the locals a...,Did the locals anger the guides?,filler-Q,short,mismatch,true
1,The tourists hired the guides that the locals ...,Did the locals anger the guides?,distractor-Q,short,match,true
2,The tourist hired the guides that two days ago...,Did the locals anger the guides?,filler-Q,long,mismatch,true
3,The tourists hired the guides who two days ago...,Did the locals anger the guides?,distractor-Q,long,match,true
4,The coach removed the gymnasts that the fans d...,Did the fans disrespect the gymnasts?,filler-Q,short,mismatch,true
...,...,...,...,...,...,...
123,The editors censored the journalists that last...,Did the readers influence the journalists?,distractor-Q,long,match,true
124,The donor bored the researchers that interns d...,Did the interns disturb the researchers?,filler-Q,short,mismatch,true
125,The donors bored the researchers that interns ...,Did the interns disturb the researchers?,distractor-Q,short,match,true
126,The donor bored the researchers that last year...,Did the interns disturb the researchers?,filler-Q,long,mismatch,true


In [56]:
# A few questions carry a past-tense verb after "Did", e.g.
# "Did the musicians excited the producers?". Put those verbs back into their base
# form so the question reads "Did the musicians excite the producers?".
for past_form, base_form in (("heard", "hear"), ("excited", "excite")):
    has_past_form = df_niki.question.str.contains(past_form)
    df_niki.loc[has_past_form, "question"] = df_niki.question.str.replace(past_form, base_form)

In [57]:
# The verb is the fourth whitespace-separated word of the question
# (position 3 when counting from zero).
df_niki["verb"] = df_niki["question"].str.split().str.get(3)

In [58]:
# Turn the yes/no question into a statement stem: "Did" becomes "Therefore," and the
# question mark is dropped.
# regex=False makes both replacements literal. "?" is a regex quantifier and the
# default of `regex` differs between pandas versions (the saved output of this cell
# echoes the original line, presumably as pandas' warning about that default), so the
# literal behaviour is requested explicitly.
df_niki["question"] = (
    df_niki["question"]
    .str.replace("Did", "Therefore,", regex=False)
    .str.replace("?", "", regex=False)
)

# The last word of the stem is the word to be predicted.
df_niki["label"] = df_niki["question"].str.split().str[-1]

# Replace that last word with the [MASK] placeholder.
df_niki["question"] = df_niki["question"].str.split().str[:-1].str.join(" ") + " [MASK]"

  df_niki["question"] = df_niki["question"].str.replace("Did", "Therefore,").str.replace("?", "")


In [59]:
# Distinct base-form verbs that occur in the questions.
verbs = df_niki["verb"].unique().tolist()

# Hand-written mapping from base form to simple past (includes the irregular
# forms paid / heard / shot).
verbs_to_past_dict = {
    "anger": "angered",
    "disrespect": "disrespected",
    "excite": "excited",
    "threaten": "threatened",
    "abuse": "abused",
    "care": "cared",
    "need": "needed",
    "ignore": "ignored",
    "pay": "paid",
    "watch": "watched",
    "help": "helped",
    "hear": "heard",
    "support": "supported",
    "dislike": "disliked",
    "mock": "mocked",
    "rebel": "rebelled",
    "gossip": "gossiped",
    "look": "looked",
    "fear": "feared",
    "harass": "harassed",
    "impress": "impressed",
    "compliment": "complimented",
    "admire": "admired",
    "escape": "escaped",
    "approach": "approached",
    "shoot": "shot",
    "trust": "trusted",
    "advise": "advised",
    "appreciate": "appreciated",
    "influence": "influenced",
    "disturb": "disturbed",
}

# Fail here, with the offending verbs listed, instead of with a bare KeyError inside
# the row-wise lookup that uses this dictionary later on.
missing_verbs = [verb for verb in verbs if verb not in verbs_to_past_dict]
if missing_verbs:
    raise KeyError(f"verbs_to_past_dict has no past-tense form for: {missing_verbs}")

In [60]:
def _verb_to_past(row):
    """Return row["question"] with every token equal to row["verb"] put in the past tense.

    Only whole whitespace-separated tokens are replaced. A plain ``str.replace``
    would also rewrite the verb where it is merely a substring of another word
    (e.g. "support" inside "supporters"). Raises KeyError when the verb has no
    entry in ``verbs_to_past_dict``.
    """
    past_form = verbs_to_past_dict[row["verb"]]
    return " ".join(
        past_form if token == row["verb"] else token
        for token in row["question"].split()
    )


# Put the verb of each question stem into the simple past.
df_niki["question"] = df_niki.apply(_verb_to_past, axis=1)

In [61]:
# Prompt = sentence, then ". ", then the masked question stem.
df_niki["prompt"] = df_niki["sentence"].str.cat(df_niki["question"], sep=". ")

In [62]:
# The column named gender_match is renamed to plural_match.
df_niki = df_niki.rename(columns={"gender_match": "plural_match"})

In [63]:
# Download the tagger package before nltk.pos_tag is called below.
nltk.download("averaged_perceptron_tagger")

# Plain Python list of the sentence strings, in row order.
sentences = df_niki["sentence"].to_list()

# Function to extract the first and second nouns from a sentence
def extract_nouns(sentence):
    """Return up to the first two noun tokens of ``sentence``.

    The sentence is tokenised with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; every token whose tag starts with ``NN`` counts as a noun.
    The result is a list of zero, one or two tokens in sentence order.
    """
    leading_nouns = []
    for token, tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
        if not tag.startswith('NN'):
            continue
        leading_nouns.append(token)
        if len(leading_nouns) == 2:
            # Only the first two nouns are ever returned, so stop scanning.
            break
    return leading_nouns

# Run the noun extractor on every sentence. Each result is padded with None so that a
# sentence with fewer than two nouns still yields a (first, second) pair.
noun_pairs = [(extract_nouns(sentence) + [None, None])[:2] for sentence in sentences]
first_nouns = [pair[0] for pair in noun_pairs]
second_nouns = [pair[1] for pair in noun_pairs]

# Wrap the two lists as pandas Series
first_nouns_series = pd.Series(first_nouns)
second_nouns_series = pd.Series(second_nouns)

# Store the raw values positionally (no index alignment) as new DataFrame columns
df_niki["first_noun"] = first_nouns_series.to_numpy()
df_niki["second_noun"] = second_nouns_series.to_numpy()


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/yaara.shriki/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [64]:
# Add a column with the wrong answer (the "distractor"): whichever of the two
# extracted nouns is not the label.
# The comparison has to be against "label" (a noun). "correct_answer" holds the
# true/false flag of the original question and can never equal a noun, which made
# the second_noun branch unreachable.
df_niki["distractor"] = df_niki.apply(
    lambda row: row["first_noun"] if row["first_noun"] != row["label"] else row["second_noun"],
    axis=1,
)

# Fix double word labels

In [67]:
# Rows whose label is one of these four words
two_word_tails = ["ones", "women", "players", "managers"]
df_niki[df_niki["label"].isin(two_word_tails)]

Unnamed: 0,sentence,question,question_type,length,plural_match,correct_answer,verb,label,prompt,first_noun,second_noun,distractor
52,The defense attorney accompanied the accused o...,"Therefore, the jurors disliked the accused [MASK]",filler-Q,short,mismatch,True,dislike,ones,The defense attorney accompanied the accused o...,defense,attorney,defense
53,The defense attorneys accompanied the accused ...,"Therefore, the jurors disliked the accused [MASK]",distractor-Q,short,match,True,dislike,ones,The defense attorneys accompanied the accused ...,defense,attorneys,defense
54,The defense attorney accompanied the accused o...,"Therefore, the jurors disliked the accused [MASK]",filler-Q,long,mismatch,True,dislike,ones,The defense attorney accompanied the accused o...,defense,attorney,defense
55,The defense attorneys accompanied the accused ...,"Therefore, the jurors disliked the accused [MASK]",distractor-Q,long,match,True,dislike,ones,The defense attorneys accompanied the accused ...,defense,attorneys,defense
64,The tenant hosted the old women that the neigh...,"Therefore, the neighbors gossiped about the ol...",filler-Q,short,mismatch,True,gossip,women,The tenant hosted the old women that the neigh...,tenant,women,tenant
65,The tenants hosted the old women that the neig...,"Therefore, the neighbors gossiped about the ol...",distractor-Q,short,match,True,gossip,women,The tenants hosted the old women that the neig...,tenants,women,tenants
66,The tenant hosted the old women that last week...,"Therefore, the neighbors gossiped about the ol...",filler-Q,long,mismatch,True,gossip,women,The tenant hosted the old women that last week...,tenant,women,tenant
67,The tenants hosted the old women that last wee...,"Therefore, the neighbors gossiped about the ol...",distractor-Q,long,match,True,gossip,women,The tenants hosted the old women that last wee...,tenants,women,tenants
96,The referee disqualified the tennis players th...,"Therefore, the spectators approached the tenni...",filler-Q,short,mismatch,True,approach,players,The referee disqualified the tennis players th...,referee,tennis,referee
97,The referees disqualified the tennis players t...,"Therefore, the spectators approached the tenni...",distractor-Q,short,match,True,approach,players,The referees disqualified the tennis players t...,referees,tennis,referees


In [68]:
# Fix the double-word labels: swap the second word of each noun phrase for its first
# word (the word that precedes [MASK] in the question).
two_word_label_fixes = {
    "ones": "accused",
    "women": "old",
    "players": "tennis",
    "managers": "bank",
}
for tail_word, head_word in two_word_label_fixes.items():
    df_niki.loc[df_niki["label"] == tail_word, "label"] = head_word

In [73]:
# Rows whose label is "bank"
df_niki.query("label == 'bank'")

Unnamed: 0,sentence,question,question_type,length,plural_match,correct_answer,verb,label,prompt,first_noun,second_noun,distractor
100,The clerk found the bank managers that the rob...,"Therefore, the robbers shot the bank [MASK]",filler-Q,short,mismatch,True,shoot,bank,The clerk found the bank managers that the rob...,clerk,bank,clerk
101,The clerks found the bank managers that the ro...,"Therefore, the robbers shot the bank [MASK]",distractor-Q,short,match,True,shoot,bank,The clerks found the bank managers that the ro...,clerks,bank,clerks
102,The clerk found the bank managers that this mo...,"Therefore, the robbers shot the bank [MASK]",filler-Q,long,mismatch,True,shoot,bank,The clerk found the bank managers that this mo...,clerk,bank,clerk
103,The clerks found the bank managers that this m...,"Therefore, the robbers shot the bank [MASK]",distractor-Q,long,match,True,shoot,bank,The clerks found the bank managers that this m...,clerks,bank,clerks


In [74]:
# Bring second_noun in line with the corrected labels: for each of these four labels
# the value written to second_noun is the label itself.
# NOTE(review): first_noun is not touched here; the saved output for the "accused"
# rows shows first_noun / distractor as "defense" -- confirm that this is intended.
for fixed_label in ("accused", "old", "tennis", "bank"):
    df_niki.loc[df_niki["label"] == fixed_label, "second_noun"] = fixed_label

In [83]:
# Function to modify prompt
def modify_prompt(row):
    """Re-mask the prompt of a row whose label is the first word of a two-word phrase.

    For the labels "accused", "old", "tennis" and "bank" the prompt ends in
    "<label> [MASK]". The label word and the old placeholder are removed and a
    single "[MASK]" is appended, so the placeholder now stands where the label was.
    The placeholder is "[MASK]", the same token the question-building step appends,
    so every prompt in the frame uses one mask token.

    Prompts of other labels are returned unchanged. So are prompts whose
    second-to-last word is not the label (for example, already rewritten ones),
    which makes re-running the cell harmless.

    Parameters: row -- mapping / Series with string entries "label" and "prompt".
    Returns: the (possibly rewritten) prompt string.
    """
    if row["label"] not in ("accused", "old", "tennis", "bank"):
        return row["prompt"]
    words = row["prompt"].split()
    if len(words) < 2 or words[-2] != row["label"]:
        # Not in the expected "<label> [MASK]" shape: leave it alone rather than
        # stripping two unrelated words.
        return row["prompt"]
    return " ".join(words[:-2]) + " [MASK]"

In [84]:
# Rewrite the prompt column row by row with modify_prompt
df_niki["prompt"] = df_niki.apply(modify_prompt, axis="columns")

In [86]:
# Rows carrying one of the four corrected labels
fixed_labels = ["accused", "old", "tennis", "bank"]
df_niki[df_niki["label"].isin(fixed_labels)]

Unnamed: 0,sentence,question,question_type,length,plural_match,correct_answer,verb,label,prompt,first_noun,second_noun,distractor
52,The defense attorney accompanied the accused o...,"Therefore, the jurors disliked the accused [MASK]",filler-Q,short,mismatch,True,dislike,accused,The defense attorney accompanied the accused o...,defense,accused,defense
53,The defense attorneys accompanied the accused ...,"Therefore, the jurors disliked the accused [MASK]",distractor-Q,short,match,True,dislike,accused,The defense attorneys accompanied the accused ...,defense,accused,defense
54,The defense attorney accompanied the accused o...,"Therefore, the jurors disliked the accused [MASK]",filler-Q,long,mismatch,True,dislike,accused,The defense attorney accompanied the accused o...,defense,accused,defense
55,The defense attorneys accompanied the accused ...,"Therefore, the jurors disliked the accused [MASK]",distractor-Q,long,match,True,dislike,accused,The defense attorneys accompanied the accused ...,defense,accused,defense
64,The tenant hosted the old women that the neigh...,"Therefore, the neighbors gossiped about the ol...",filler-Q,short,mismatch,True,gossip,old,The tenant hosted the old women that the neigh...,tenant,old,tenant
65,The tenants hosted the old women that the neig...,"Therefore, the neighbors gossiped about the ol...",distractor-Q,short,match,True,gossip,old,The tenants hosted the old women that the neig...,tenants,old,tenants
66,The tenant hosted the old women that last week...,"Therefore, the neighbors gossiped about the ol...",filler-Q,long,mismatch,True,gossip,old,The tenant hosted the old women that last week...,tenant,old,tenant
67,The tenants hosted the old women that last wee...,"Therefore, the neighbors gossiped about the ol...",distractor-Q,long,match,True,gossip,old,The tenants hosted the old women that last wee...,tenants,old,tenants
96,The referee disqualified the tennis players th...,"Therefore, the spectators approached the tenni...",filler-Q,short,mismatch,True,approach,tennis,The referee disqualified the tennis players th...,referee,tennis,referee
97,The referees disqualified the tennis players t...,"Therefore, the spectators approached the tenni...",distractor-Q,short,match,True,approach,tennis,The referees disqualified the tennis players t...,referees,tennis,referees


In [88]:
# Drop the five listed columns in a single call.
df_niki = df_niki.drop(
    columns=["question_type", "correct_answer", "verb", "sentence", "question"]
)

In [89]:
# Select the five output columns in their final order
output_columns = ["prompt", "label", "distractor", "length", "plural_match"]
df_niki = df_niki.loc[:, output_columns]

In [90]:
# Display the frame after the column selection (prompt, label, distractor, length, plural_match).
df_niki

Unnamed: 0,prompt,label,distractor,length,plural_match
0,The tourist hired the guides that the locals a...,guides,tourist,short,mismatch
1,The tourists hired the guides that the locals ...,guides,tourists,short,match
2,The tourist hired the guides that two days ago...,guides,tourist,long,mismatch
3,The tourists hired the guides who two days ago...,guides,tourists,long,match
4,The coach removed the gymnasts that the fans d...,gymnasts,coach,short,mismatch
...,...,...,...,...,...
123,The editors censored the journalists that last...,journalists,editors,long,match
124,The donor bored the researchers that interns d...,researchers,donor,short,mismatch
125,The donors bored the researchers that interns ...,researchers,donors,short,match
126,The donor bored the researchers that last year...,researchers,donor,long,mismatch


In [92]:
# Write the frame to CSV; index=False leaves the row index out of the file.
df_niki.to_csv(path_or_buf="../data_processed/niki.csv", index=False)

# Gordon

In [None]:
# object extracted -> the subject is the correct answer, otherwise remove the row
# subject extracted -> if condition question == "matrix verb" -> the subject is the correct answer, otherwise the distractor is the correct answer
# subject extracted -> if condition question == "embedded verb" -> the distractor is the correct answer, otherwise the subject is the correct answer

In [None]:
# Read the "gordon" experiment (JSON Lines). The path is given relative to the parent
# directory, like the other two experiment files loaded in this notebook
# ("../data/experiment_niki.jsonl", "../data/experiment_naama.jsonl"); the previous
# "data/..." path pointed at a different directory.
# NOTE(review): confirm the file really lives next to the other two.
df_gordon = pd.read_json(path_or_buf="../data/experiment_gordon.jsonl", lines=True)
df_gordon.head(20)

In [None]:
# Frequency of each value in the subject_value column
df_gordon["subject_value"].value_counts()

In [None]:
# Rows whose condition_sentence is "object_extracted"
df_gordon[df_gordon["condition_sentence"] == "object_extracted"]

In [None]:
# Frequency of each value in the condition_question column
df_gordon["condition_question"].value_counts()

In [None]:
import pandas as pd

In [None]:
# Read the "naama" experiment file; lines=True means one JSON record per line.
df_naama = pd.read_json("../data/experiment_naama.jsonl", lines=True)
df_naama

In [None]:
# Example: all rows that share one particular sentence
naama_example = "The actress who was in the French director's studio this week hit the expensive vase at the entrance."
df_naama.loc[df_naama["sentence"].eq(naama_example)]

In [None]:
# Frequency of each gender value, most frequent first
df_naama["gender"].value_counts(sort=True, ascending=False)

In [None]:
# Notes: dependency and animacy are the important columns; gender is not.
# Prompt = sentence + question, where the subject is replaced by the mask and the label is taken from the correct answer; for example, with the candidates actress,director the label is actress.
#

In [None]:
# Counts per gender value (the same expression already appears in an earlier cell).
df_naama["gender"].value_counts()

# Data Exploration

In [None]:
# Look at a single record: the row at position 1 (the second row) of df_niki.
df_niki.iloc[1]
