In [1]:
import pandas as pd
import numpy as np

In [207]:
df = pd.read_json("data/snli_1.0/snli_1.0_train.jsonl", lines=True)
# df = pd.read_json("data/contrast.jsonl", lines=True)

In [208]:
df = df.drop(columns=['sentence1_binary_parse','sentence1_parse','sentence2_binary_parse','sentence2_parse','captionID','pairID'])
df = df.rename(columns={'sentence1':'premise','sentence2':'hypothesis','gold_label':'label'})

## Ambiguity

In [210]:
df['annotator_labels_str'] = ''

for index, row in df.iterrows():
    d = [1 if x=='neutral' else (2 if x=='contradiction' else 0) for x in row['annotator_labels']]
    d = sorted(d)
    if(len(d) == 4):
        s = str(d[0]) +str(d[1]) + str(d[2]) + str(d[3])
    elif len(d)==5:
        s = str(d[0]) +str(d[1]) + str(d[2]) + str(d[3]) + str(d[4])
    elif len(d)==3:
        s = str(d[0]) +str(d[1]) + str(d[2])
    df.at[index,'annotator_labels_str'] = s

    df.at[index,'label'] = 1 if row['label']=='neutral' else (2 if row['label']=='contradiction' else 0)

df = df.drop(columns=['annotator_labels'])

In [None]:
df.groupby('annotator_labels_str')['label'].agg(len)

annotator_labels_str
0000       7479
00000    115460
00001     28250
00002     10976
0001       2443
00011     13279
00012      3694
0002       1138
00022      1182
0011       1200
00111     15116
00112      3017
0012        575
00122      1121
0022        135
00222      1332
0111       2062
01111     29955
01112      6688
0112        648
01122      2274
0122        252
01222      3137
0222        797
02222      9138
1111       4290
11111     75604
11112     26831
1112       2798
11122     10954
1122       1132
11222      9512
1222       1663
12222     22479
2222       8028
22222    125513
Name: label, dtype: int64

In [211]:
ambiguous = df[(df['annotator_labels_str'].isin(['00011','00012','00022','00111','00112','00122','00222','01112','01122','01222','11122','11222',
                                                 '0011','0012','0022','0112','0122','1122',
                                                 '012'])) ]

In [212]:
print(len(ambiguous))
print(len(df))

75237
550152


In [None]:
ambiguous.to_json('ambiguous.jsonl',orient='records',lines=True)

## Lexical Overlap

In [None]:
from collections import Counter
from typing import List

class Indexer(object):
    """
    Bijection between objects and integers starting at 0. Useful for mapping
    labels, features, etc. into coordinates of a vector space.

    Attributes:
        objs_to_ints
        ints_to_objs
    """
    def __init__(self):
        self.objs_to_ints = {}
        self.ints_to_objs = {}

    def __repr__(self):
        return str([str(self.get_object(i)) for i in range(0, len(self))])

    def __str__(self):
        return self.__repr__()

    def __len__(self):
        return len(self.objs_to_ints)

    def get_object(self, index):
        """
        :param index: integer index to look up
        :return: Returns the object corresponding to the particular index or None if not found
        """
        if (index not in self.ints_to_objs):
            return None
        else:
            return self.ints_to_objs[index]

    def contains(self, object):
        """
        :param object: object to look up
        :return: Returns True if it is in the Indexer, False otherwise
        """
        return self.index_of(object) != -1

    def index_of(self, object):
        """
        :param object: object to look up
        :return: Returns -1 if the object isn't present, index otherwise
        """
        if (object not in self.objs_to_ints):
            return -1
        else:
            return self.objs_to_ints[object]

    def add_and_get_index(self, object, add=True):
        """
        Adds the object to the index if it isn't present, always returns a nonnegative index
        :param object: object to look up or add
        :param add: True by default, False if we shouldn't add the object. If False, equivalent to index_of.
        :return: The index of the object
        """
        if not add:
            return self.index_of(object)
        if (object not in self.objs_to_ints):
            new_idx = len(self.objs_to_ints)
            self.objs_to_ints[object] = new_idx
            self.ints_to_objs[new_idx] = object
        return self.objs_to_ints[object]


class UnigramFeatureExtractor():
    """
    Extracts unigram bag-of-words features from a sentence. It's up to you to decide how you want to handle counts
    and any additional preprocessing you want to do.
    """
    def __init__(self, indexer: Indexer):
        self.indexer = indexer
    
    def extract_features(self, sentence: List[str], add_to_indexer: bool=False) -> Counter:
        lower_sentence = [word.lower() for word in sentence]
        indices = []

        for word in lower_sentence:
            if add_to_indexer:
                idx = self.indexer.add_and_get_index(word)
            else:
                idx = self.indexer.index_of(word)

            if idx >= 0:
                indices.append(idx)

        counter = Counter(indices)     
        return counter


In [None]:
import string

extractor = UnigramFeatureExtractor(Indexer())

def strip_punct(s):
    s = s.replace("<s>"," ")
    s = s.replace("</s>"," ")
    s = s.translate(str.maketrans('', '', string.punctuation))
    return s.split(" ")

def passes_threshold(hypothesis, premise):
    count = 0
    for key in set(hypothesis.elements()):
        if premise[key] > 0:
            count += 1
    return count/len(hypothesis)

def precision(premise: str, hypothesis: str) -> bool:
    premise_counter = extractor.extract_features(strip_punct(premise),True)
    hypothesis_counter = extractor.extract_features(strip_punct(hypothesis), True)

    return passes_threshold(hypothesis_counter,premise_counter)

In [213]:
df['lexical_overlap'] = 0
for index, row in df.iterrows():
    df.at[index,'lexical_overlap'] = precision(row['premise'],row['hypothesis'])


In [214]:
df['lexical_overlap'].describe()

count    550152.000000
mean          0.456587
std           0.242531
min           0.000000
25%           0.285714
50%           0.428571
75%           0.625000
max           1.000000
Name: lexical_overlap, dtype: float64

In [None]:
overlap = df[(df['lexical_overlap'] > .285) & (df['lexical_overlap'] < .428)]

In [None]:
df[df['lexical_overlap'] <= .385].groupby('label')['premise'].agg(len) / len(df[df['lexical_overlap'] <= .385])

label
0    0.195953
1    0.371479
2    0.432568
Name: premise, dtype: float64

In [None]:
df[df['lexical_overlap'] >= .525].groupby('label')['premise'].agg(len) / len(df[df['lexical_overlap'] >= .525])

label
0    0.518863
1    0.267636
2    0.213501
Name: premise, dtype: float64

In [202]:
df[(df['lexical_overlap'] > .385) & (df['lexical_overlap'] < .525)].groupby('label')['premise'].agg(len) / len(df[(df['lexical_overlap'] > .385) & (df['lexical_overlap'] < .525)])

label
0    0.222222
1    0.388889
2    0.388889
Name: premise, dtype: float64

In [181]:
len(overlap)/len(df)

0.21472611205630443

In [182]:
overlap.to_json('overlap.jsonl',orient='records',lines=True)

In [204]:
overlap.head()

Unnamed: 0,premise,hypothesis,label,lexical_overlap
1,Two women are embracing while holding to go pa...,The sisters are hugging each other while holdi...,0,0.375
12,"Two young children in blue jerseys, one with t...",Two kids in jackets walk to school.,2,0.285714
15,A man in a blue shirt standing in front of a g...,A man is repainting a garage,1,0.4
16,A man in a blue shirt standing in front of a g...,A man is painting a picture,2,0.4
33,"At an outdoor event in an Asian-themed area, a...",A single man is to the side of a camera,2,0.333333


## Combined

In [205]:
df.iloc[[33,45,52]]

Unnamed: 0,premise,hypothesis,label,lexical_overlap
33,"At an outdoor event in an Asian-themed area, a...",A single man is to the side of a camera,2,0.333333
45,A white dog with long hair jumps to catch a re...,An animal is jumping to catch an object.,0,0.285714
52,An Indian woman is washing and cleaning dirty ...,An Indian woman is doing her job at a lake.,1,0.7


In [190]:
combined = df[(df['lexical_overlap'] > .285) & (df['lexical_overlap'] < .428) & 
              ((df['annotator_labels_str'].isin(['00011','00012','00022','00111','00112','00122','00222','01112','01122','01222','11122','11222',
                                                 '0011','0012','0022','0112','0122','1122',
                                                 '012'])))]

In [191]:
len(combined)/len(df)

0.02940278323081621

In [192]:
combined.head()

Unnamed: 0,label,premise,hypothesis,annotator_labels_str,lexical_overlap
0,1,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,1112,0.375
3,1,Children smiling and waving at camera,They are smiling at their parents,1112,0.333333
6,2,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.,1112,0.4
9,1,An older man sits with his orange juice at a s...,An older man drinks his juice as he waits for ...,1112,0.333333
14,0,Two blond women are hugging one another.,There are women showing affection.,111,0.4


In [193]:
combined.to_json('combined.jsonl',orient='records',lines=True)