In [1]:
import pandas as pd
import math

In [2]:
# Relative path of the CLadder v1 question file read into `questions`.
questions_file = "cladder-v1-questions.json"
questions = pd.read_json(questions_file)

In [3]:
# Print column dtypes / non-null counts, then show the first row as the cell result.
questions.info()
questions.head(1)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10560 entries, 0 to 10559
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   question_id  10560 non-null  int64 
 1   desc_id      10560 non-null  object
 2   given_info   10560 non-null  object
 3   question     10560 non-null  object
 4   answer       10560 non-null  object
 5   meta         10560 non-null  object
 6   reasoning    8916 non-null   object
dtypes: int64(1), object(6)
memory usage: 577.6+ KB


Unnamed: 0,question_id,desc_id,given_info,question,answer,meta,reasoning
0,19,alarm-mediation-nie-model1-spec1-q1,For husbands that don't set the alarm and wive...,Does husband negatively affect alarm clock thr...,no,"{'story_id': 'alarm', 'graph_id': 'mediation',...",{'step0': 'Let X = husband; V2 = wife; Y = ala...


In [4]:
# Keep only the questions whose `reasoning` entry is missing, then summarize them.
reasoning_missing = questions["reasoning"].isna()
no_reasoning_qs = questions.loc[reasoning_missing]
no_reasoning_qs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1644 entries, 4 to 9078
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   question_id  1644 non-null   int64 
 1   desc_id      1644 non-null   object
 2   given_info   1644 non-null   object
 3   question     1644 non-null   object
 4   answer       1644 non-null   object
 5   meta         1644 non-null   object
 6   reasoning    0 non-null      object
dtypes: int64(1), object(6)
memory usage: 102.8+ KB


In [5]:
# Check whether the questions without reasoning are exactly the 'backadj' ones.
def _count_backadj(frame):
    """Count the rows of `frame` whose meta['query_type'] equals 'backadj'."""
    return sum(frame["meta"].map(lambda m: m["query_type"]) == 'backadj')


backadj_total = _count_backadj(questions)
backadj_without_reasoning = _count_backadj(no_reasoning_qs)

print("Number of backadj questions: " + str(backadj_total))
print("Number of questions with reasoning==None: " + str(len(no_reasoning_qs)))
print("Overlap between these groups: " + str(backadj_without_reasoning))

Number of backadj questions: 1644
Number of questions with reasoning==None: 1644
Overlap between these groups: 1644


In [6]:
# Count the reasoning-less questions whose given_info contains both method markers.
method_markers = ("Method 1: We look", "Method 2: We look")
sum(
    no_reasoning_qs["given_info"].map(
        lambda text: all(marker in text for marker in method_markers)
    )
)

1644

In [7]:
# Collect the distinct query_type values found in the questions' meta dicts
query_type_per_question = questions["meta"].apply(lambda m: m["query_type"])
qtypes = query_type_per_question.unique()

In [8]:
# Store the number of instances and associated rung for each query_type.
# The query type and rung of every question are extracted once up front
# instead of re-mapping the whole `meta` column on each loop iteration.
question_qtypes = questions["meta"].map(lambda val : val["query_type"])
question_rungs = questions["meta"].map(lambda val : val["rung"])

qtype_info = {}
for t in qtypes:
    t_inds = question_qtypes == t
    # Plain Python ints keep the stored tuples free of numpy scalar types.
    num_q = int(t_inds.sum())
    # The most frequent rung among this type's questions is recorded as its rung.
    rung = int(question_rungs[t_inds].mode().iloc[0])
    qtype_info[t] = (num_q, rung)

In [9]:
# Show the {query_type: (number of questions, rung)} mapping as the cell result.
qtype_info

{'nie': (828, 3),
 'marginal': (1644, 1),
 'nde': (384, 3),
 'backadj': (1644, 2),
 'ate': (1476, 2),
 'ett': (1296, 3),
 'correlation': (1476, 1),
 'collider_bias': (168, 2),
 'exp_away': (168, 1),
 'det-counterfactual': (1476, 3)}

Now we just need to extract an appropriate number of questions from each group, ensuring that the sample keeps the same 50% positive-class (yes/no) breakdown as the overall dataset.

Potential issue: the paper mentions that the v1.0 benchmark set is primarily balanced across all stories, but the number of stories per variant (commonsense, anticommonsense, and nonsense) varies significantly, so the benchmark is unbalanced in terms of sensicalness.

CLadder has stories where the natural language description/terms oppose the "common sense" intuition/interpretation/assumptions of how the scenario should work.

Decision made: balance between common/anti-common/nonsense stories, try to balance story content inside those groups
Nonsense is easy to find since desc_id starts with "nonsense", but need to figure out how to determine anti-common vs common sense

Turns out that the way to determine common/anti-common/nonsense is to connect each question to its associated model from the cladder-v1-meta-models.json file, using the model_id value of the question's 'meta' field. Should have realized that before but oh well.

In [10]:
# Read the meta-models file; it is used to attach a sense type to each question
meta_models_file = 'cladder-v1-meta-models.json'
meta = pd.read_json(meta_models_file)

In [11]:
# Print column dtypes and non-null counts of the meta-models frame.
meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7064 entries, 0 to 7063
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   model_id          7064 non-null   int64  
 1   story_id          7064 non-null   object 
 2   graph_id          7064 non-null   object 
 3   spec_id           7064 non-null   int64  
 4   spec              7064 non-null   object 
 5   seed              7064 non-null   int64  
 6   builder           7064 non-null   object 
 7   difficulty        4440 non-null   object 
 8   equation_type     7064 non-null   object 
 9   background        7064 non-null   object 
 10  variable_mapping  7064 non-null   object 
 11  structure         7064 non-null   object 
 12  params            7064 non-null   object 
 13  groundtruth       7064 non-null   object 
 14  simpson           512 non-null    float64
 15  anticommonsense   2352 non-null   object 
 16  nonsense          2360 non-null   float64


In [12]:
# Find subsets appropriately: a model belongs to the anticommonsense / nonsense
# group when the column of the same name is non-null for it.
is_anticommon = meta['anticommonsense'].notna()
is_nonsense = meta['nonsense'].notna()

# Common is everything that isn't anti or non.
# Explicit boolean masks replace the earlier frame-wise isin + dropna(how='all'),
# which only worked because NaN never compares equal to NaN and which pushed
# the remaining values through a NaN-masked intermediate frame.
is_common = ~(is_anticommon | is_nonsense)

# Restrict to the model_ids (each result is a Series that keeps meta's row index)
anticommon = meta.loc[is_anticommon, 'model_id']
nonsense = meta.loc[is_nonsense, 'model_id']
common = meta.loc[is_common, 'model_id']

In [13]:
def sense(model_id):
    '''
    Gets the sense type (anticommon, non, common) of the provided model_id

    Looks the id up in the module-level `anticommon` and `nonsense` Series of
    model_ids, in that order.

    Parameters:
        model_id: the model identifier to classify.

    Returns:
        'anticommonsense' if the id is among the values of `anticommon`,
        'nonsense' if it is among the values of `nonsense`,
        otherwise 'commonsense'.
    '''
    # `x in series` tests the Series' index labels rather than its values, so
    # membership is checked against the underlying value arrays instead.
    if model_id in anticommon.values:
        return 'anticommonsense'
    elif model_id in nonsense.values:
        return 'nonsense'
    else:
        return 'commonsense'

In [14]:
# Attach a 'sense' column by classifying the model_id stored in each question's meta
def _question_sense(question_meta):
    """Return the sense type for one question's meta dict."""
    return sense(question_meta['model_id'])


questions['sense'] = questions['meta'].map(_question_sense)

In [15]:
# Split the questions by rung (1 to 3), keeping each rung's subset and its size
question_rung = questions["meta"].map(lambda m : m["rung"])

rungqs = {}
num_per_rung = {}
for i in range(1,4):
    rungi_qs = question_rung == i
    num_per_rung[i] = sum(rungi_qs)
    rungqs[i] = questions.loc[rungi_qs]

In [16]:
# For each rung, report its distinct query types and its number of questions.
for i in range(1,4):
    rung_qtypes = rungqs[i]['meta'].map(lambda m : m['query_type']).unique()
    rung_label = f"Rung {i} question types:"
    rung_count = str(num_per_rung[i])
    print(rung_label + str(rung_qtypes) + " Number of questions: " + rung_count)

Rung 1 question types:['marginal' 'correlation' 'exp_away'] Number of questions: 3288
Rung 2 question types:['backadj' 'ate' 'collider_bias'] Number of questions: 3288
Rung 3 question types:['nie' 'nde' 'ett' 'det-counterfactual'] Number of questions: 3984


In [17]:
# Within each rung, need to assign numbers to each qtype st total sum is 500/3 ~ 167
#  Ideally respect approximate distributions
total_qs = 500
result_qs = {}
sense_types = {'anticommonsense','nonsense','commonsense'}
# Set RNG seed for repeatable sampling
rng = 50288
# Questions drawn per answer ('yes' / 'no') for every (rung, sense type) pair.
# 28 + 28 = 56 per pair, so yes/no stay balanced and every group has the same
# size (no jank with all but one group being 57).
samples_per_answer = 28

dfs = []

# loop through rungs
for i in range(1,4):

    # get the questions
    qs = rungqs[i]

    # divide and sample by sense types.
    # Iterate in sorted order: the iteration order of a set of strings can
    # change between Python processes (hash randomization), which would change
    # the row order of the sampled output from run to run despite the fixed seed.
    for stype in sorted(sense_types):
        sense_qs = qs.loc[qs['sense'] == stype]

        # split yes/no for even positive responses
        s_yes_sample = sense_qs.loc[sense_qs['answer'] == 'yes'].sample(samples_per_answer, random_state=rng)
        s_no_sample = sense_qs.loc[sense_qs['answer'] == 'no'].sample(samples_per_answer, random_state=rng)
        dfs.append(s_yes_sample)
        dfs.append(s_no_sample)
        
    

# for t in qtypes:
#     # Number of instances of the question type
#     num_qtype = qtype_info[t][0]
    
#     # Rung that t belongs to
#     rung = qtype_info[t][1]
    
#     # Number of qtype samples that should be extracted for balanced distribution
#     num_to_get = math.ceil(num_qtype/num_per_rung[rung]*total_qs/3)
    
#     # Subset the questions DF
#     qtype_qs = questions.loc[questions['meta'].map(lambda val : val['query_type']) == t]
    
#     # Split into yes and no responses
#     t_yes = qtype_qs.loc[questions['answer'] == 'yes']
#     t_no = qtype_qs.loc[questions['answer'] == 'no']
    
#     # Sample appropriately from each
#     t_yes_sample = t_yes.sample(math.ceil(num_to_get/2), random_state=rng)
#     t_no_sample = t_no.sample(math.ceil(num_to_get/2), random_state=rng)
#     result_qs[t] = pd.concat([t_yes_sample, t_no_sample], ignore_index=True)

In [18]:
# Stack the sampled subsets into one frame with a fresh index, then write it
# out as JSON records
output_file = "sampled_questions.json"
output = pd.concat(objs=dfs, ignore_index=True)
output.to_json(output_file, orient='records')

In [19]:
# Print column dtypes and non-null counts of the sampled output frame.
output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   question_id  504 non-null    int64 
 1   desc_id      504 non-null    object
 2   given_info   504 non-null    object
 3   question     504 non-null    object
 4   answer       504 non-null    object
 5   meta         504 non-null    object
 6   reasoning    420 non-null    object
 7   sense        504 non-null    object
dtypes: int64(1), object(7)
memory usage: 31.6+ KB


In [22]:
# Per query type, store the number of sampled questions and their rung.
# The query type and rung columns are extracted once, outside the loop.
sampled_qtypes = output["meta"].map(lambda val : val["query_type"])
sampled_rungs = output["meta"].map(lambda val : val["rung"])

s_qtype_info = {}
for t in qtypes:
    t_inds = sampled_qtypes == t
    num_q = int(t_inds.sum())
    # A query type may be missing from the sample altogether; mode() is then
    # empty, so record None for the rung instead of indexing into it.
    rung = int(sampled_rungs[t_inds].mode().iloc[0]) if num_q else None
    s_qtype_info[t] = (num_q, rung)
s_qtype_info

{'nie': (37, 3),
 'marginal': (76, 1),
 'nde': (10, 3),
 'backadj': (84, 2),
 'ate': (70, 2),
 'ett': (46, 3),
 'correlation': (80, 1),
 'collider_bias': (14, 2),
 'exp_away': (12, 1),
 'det-counterfactual': (75, 3)}