In [1]:
# !pip install transformers==3.0.2
# !pip install nlp

In [2]:
import pandas as pd
import numpy as np
import csv
import sys

def _read_tsv(input_file, quotechar=None):
    """Reads a tab separated value file."""
    with open(input_file, "r", encoding='utf-8') as f:
        reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
        lines = []
        for line in reader:
            if sys.version_info[0] == 2:
                line = list(unicode(cell, 'utf-8') for cell in line)
            lines.append(line)
        return lines
    
def _create_examples_snli(lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue
        guid = "%s-%s" % (set_type, line[0])
        text_a = line[7]
        text_b = line[8]
        label = line[-1]
        examples.append([guid, text_a, text_b, label])
    return examples

def _create_examples_mnli(lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue
        guid = "%s-%s" % (set_type, line[0])
        text_a = line[8]
        text_b = line[9]
        label = line[-1]
        examples.append([guid, text_a, text_b, label])
    return examples

train_snli = _create_examples_snli(_read_tsv('glue_data/SNLI/train.tsv'), 'train_s')
test_snli = _create_examples_snli(_read_tsv('glue_data/SNLI/test.tsv'), 'test_s')

In [3]:
col_names = ['id', 'text_a', 'text_b', 'label']
train_df_s = pd.DataFrame(train_snli, columns = col_names)
test_df_s = pd.DataFrame(test_snli, columns = col_names)

In [4]:
# SNLI data
train_df_s.head()

Unnamed: 0,id,text_a,text_b,label
0,train_s-0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,neutral
1,train_s-1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",contradiction
2,train_s-2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",entailment
3,train_s-3,Children smiling and waving at camera,They are smiling at their parents,neutral
4,train_s-4,Children smiling and waving at camera,There are children present,entailment


In [5]:
train_df_s

Unnamed: 0,id,text_a,text_b,label
0,train_s-0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,neutral
1,train_s-1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",contradiction
2,train_s-2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",entailment
3,train_s-3,Children smiling and waving at camera,They are smiling at their parents,neutral
4,train_s-4,Children smiling and waving at camera,There are children present,entailment
...,...,...,...,...
549362,train_s-549362,Four dirty and barefooted children.,four kids won awards for 'cleanest feet',contradiction
549363,train_s-549363,Four dirty and barefooted children.,"four homeless children had their shoes stolen,...",neutral
549364,train_s-549364,A man is surfing in a bodysuit in beautiful bl...,A man in a bodysuit is competing in a surfing ...,neutral
549365,train_s-549365,A man is surfing in a bodysuit in beautiful bl...,A man in a business suit is heading to a board...,contradiction


### Algorithm of filtering describtion:
- define top 1000 the most frequent hypothesis
- define distribution of class for each hypotesis
- if one of the class has more than 80% of samples move it to filtering list
- filter out from dataset pairs for phrases from filtering list in the way to have the same amount of pairs for top frequent class and secound top frequent within each hypothesis

In [6]:
# Selecting the most frequent hypothesis:
top_texts = train_df_s.text_b.value_counts()[:1000].index.values
df = train_df_s[train_df_s.text_b.isin(top_texts)]
tmp = pd.DataFrame(df.groupby(['text_b', 'label'])['id', 'text_a'].count().to_records())
tmp.head()

  after removing the cwd from sys.path.


Unnamed: 0,text_b,label,id,text_a
0,A baby is crying.,contradiction,9,9
1,A baby is crying.,entailment,9,9
2,A baby is crying.,neutral,1,1
3,A baby is sleeping.,contradiction,9,9
4,A baby is sleeping.,entailment,9,9


In [7]:
def get_ratio(x=0,y=0,z=0, ratio = 0.8):
    summ = x+y+z
    ratios = np.array([x/summ, y/summ, z/summ])
    if np.sum(ratios>0.7)>0:
        return True
    else:
        return False

In [8]:
texts_to_filter = []
for t in top_texts:
    tmp_inner = tmp[tmp.text_b == t]
    if get_ratio(*tmp_inner.id.values):
        texts_to_filter.append(t)

In [9]:
len(texts_to_filter)

719

In [10]:
from tqdm.auto import tqdm
ids_to_filter = []

for t in tqdm(texts_to_filter):
    sample = train_df_s[train_df_s.text_b == t]
    tt = sample.groupby('label')['id'].count()
    
    classes = list(tt.index)
    values = tt.values
    
    if len(values)>1:
        biggest = np.argsort(values)[-1]
        middle = np.argsort(values)[-2]
        values_to_drop = values[biggest] - values[middle]
        class_to_drop = classes[biggest]
    else:
        biggest = np.argsort(values)[-1]
        values_to_drop = values[biggest]
        class_to_drop = classes[biggest]
    
    ids = [int(i[8:]) for i in sample[sample.label == class_to_drop][:values_to_drop]['id'].values]
    
    ids_to_filter += ids

HBox(children=(FloatProgress(value=0.0, max=719.0), HTML(value='')))




In [11]:
len(ids_to_filter)

16462

### Filtering ids from the training list

In [28]:
import pandas as pd
import numpy as np
import csv
import sys

input_file = 'glue_data/SNLI/train.tsv'
output_file = 'glue_data/SNLI/train_filtered.tsv'

initial_line = 'index	captionID	pairID	sentence1_binary_parse	sentence2_binary_parse	sentence1_parse	sentence2_parse	sentence1	sentence2	label1	label2	label3	label4	label5	gold_label'

filter_set = set([str(i) for i in ids_to_filter])

def _read_filter_tsv(input_file, output_file, filter_set, quotechar=None):
    """Reads a tab separated value file."""

    outF = open(output_file, "w")
    outF.writelines([initial_line])
    count_filtered = 0
    with open(input_file, "r", encoding='utf-8') as f:
        reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
        lines_to_write = []
        for line in tqdm(reader):
            if sys.version_info[0] == 2:
                line = list(unicode(cell, 'utf-8') for cell in line)
            if line[0] not in filter_set:
                lines_to_write.append("\t".join(line)+'\n')
        outF.writelines(lines_to_write)   
    outF.close()
    
_read_filter_tsv(input_file, output_file, filter_set, quotechar=None)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [21]:
16493/len(train_df_s)

0.030021825118727554

In [27]:
import json

file_in = '/Users/trokhymovych/Documents/UCU/diplom/data/snli/snli_1.0_train.jsonl'
file_out = '/Users/trokhymovych/Documents/UCU/diplom/data/snli/snli_1.0_train_filtered.jsonl'

with open(file_in, 'r') as json_file:
    json_list = list(json_file)

count_filtered = 0
outF = open(file_out, "w") 
for i, line in tqdm(enumerate(json_list)):
    if str(i) not in filter_set:
        count_filtered+=1
        outF.writelines(line)  
outF.close()

print(count_filtered)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


533659


In [26]:
filter_set

{'74350',
 '28580',
 '18942',
 '485166',
 '110183',
 '143477',
 '397595',
 '11107',
 '496142',
 '148475',
 '11567',
 '247234',
 '227040',
 '530496',
 '478705',
 '446364',
 '53997',
 '473748',
 '336400',
 '218282',
 '368542',
 '403209',
 '451511',
 '165242',
 '418615',
 '245441',
 '185239',
 '314900',
 '76883',
 '116696',
 '300348',
 '91402',
 '370082',
 '388034',
 '236524',
 '341515',
 '306862',
 '334872',
 '207450',
 '152364',
 '530532',
 '242182',
 '271515',
 '64833',
 '24640',
 '337442',
 '144729',
 '449402',
 '276970',
 '450470',
 '424583',
 '167225',
 '255295',
 '220061',
 '365690',
 '282484',
 '354356',
 '15653',
 '319581',
 '304060',
 '138498',
 '277774',
 '115206',
 '484449',
 '397191',
 '118402',
 '123781',
 '316604',
 '217169',
 '11723',
 '41784',
 '145099',
 '495735',
 '36395',
 '239258',
 '319056',
 '162139',
 '226101',
 '514401',
 '281136',
 '186642',
 '356723',
 '226095',
 '392562',
 '344742',
 '265158',
 '133767',
 '200619',
 '153107',
 '109594',
 '8580',
 '447879',
 '18

In [30]:
test_df_s

Unnamed: 0,id,text_a,text_b,label
0,test_s-0,This church choir sings to the masses as they ...,The church has cracks in the ceiling.,neutral
1,test_s-1,This church choir sings to the masses as they ...,The church is filled with song.,entailment
2,test_s-2,This church choir sings to the masses as they ...,A choir singing at a baseball game.,contradiction
3,test_s-3,"A woman with a green headscarf, blue shirt and...",The woman is young.,neutral
4,test_s-4,"A woman with a green headscarf, blue shirt and...",The woman is very happy.,entailment
...,...,...,...,...
9819,test_s-9819,Two women are observing something together.,Two women are standing with their eyes closed.,contradiction
9820,test_s-9820,Two women are observing something together.,Two girls are looking at something.,entailment
9821,test_s-9821,A man in a black leather jacket and a book in ...,A man is flying a kite.,contradiction
9822,test_s-9822,A man in a black leather jacket and a book in ...,A man is speaking in a classroom.,entailment


In [52]:
import altair as alt

# customizing main Axes 
axisX = alt.Axis(grid=False, 
                 domain=False,
                 ticks = False,
                 labels = False,
                 labelAngle = 180,
                 labelPadding = 2,
                 titleFontWeight = 300,
                 titlePadding = 10,
                 titleAngle = 10,
                 titleY =  -25,
                 titleX = 770,
                 labelFontSize= 20,
                 titleFontSize=25)

axisY = alt.Axis(grid = True,
                 domain=False, 
                 ticks = False,
                 titleFontWeight = 300,
                 labelFontSize= 20,
                 titleFontSize=25)

alt.Chart(tmp[:1000], title = '').mark_bar().encode(
    x=alt.X('label:O', title = '', axis = axisX),
    y=alt.Y('id:Q', title = 'Number of items', axis = axisY),
    color=alt.Color('label:N', title = 'Label', legend = alt.Legend(labelFontSize = 20, titleFontSize=20)),
    column=alt.Column('text_b:N', title = '', spacing = 5, header=alt.Header(labelAngle=90, 
                                                                             labelLimit = 200, 
                                                                             labelOrient = 'bottom',
                                                                             labelAlign = 'left', 
                                                                             labelFontSize= 20,
                                                                             titleFontSize=25), sort = alt.SortArray(top_texts))
).properties(width=50, height=300).configure_view(
    stroke='transparent')