In [1]:
# !pip install transformers==3.0.2
# !pip install nlp

In [2]:
import pandas as pd
import numpy as np
import csv
import sys

def _read_tsv(input_file, quotechar=None):
    """Reads a tab separated value file."""
    with open(input_file, "r", encoding='utf-8') as f:
        reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
        lines = []
        for line in reader:
            if sys.version_info[0] == 2:
                line = list(unicode(cell, 'utf-8') for cell in line)
            lines.append(line)
        return lines
    
def _create_examples_snli(lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue
        guid = "%s-%s" % (set_type, line[0])
        text_a = line[7]
        text_b = line[8]
        label = line[-1]
        examples.append([guid, text_a, text_b, label])
    return examples

def _create_examples_mnli(lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue
        guid = "%s-%s" % (set_type, line[0])
        text_a = line[8]
        text_b = line[9]
        label = line[-1]
        examples.append([guid, text_a, text_b, label])
    return examples

# Load the SNLI and MNLI training splits as lists of [guid, text_a, text_b, label] rows.
# The paths are relative to the current working directory; the second argument is
# the prefix used when building each example's guid.
train_snli = _create_examples_snli(_read_tsv('glue_data/SNLI/train.tsv'), 'train_s')
train_mnli = _create_examples_mnli(_read_tsv('glue_data/MNLI/train.tsv'), 'train_m')

In [3]:
# Column layout of the example rows: [guid, text_a, text_b, label].
col_names = ['id', 'text_a', 'text_b', 'label']

# One DataFrame per corpus, both with the same columns.
train_df_s = pd.DataFrame(data=train_snli, columns=col_names)
train_df_m = pd.DataFrame(data=train_mnli, columns=col_names)

In [4]:
# SNLI data: preview the first rows of the SNLI training frame
train_df_s.head()

Unnamed: 0,id,text_a,text_b,label
0,train_s-0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,neutral
1,train_s-1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",contradiction
2,train_s-2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",entailment
3,train_s-3,Children smiling and waving at camera,They are smiling at their parents,neutral
4,train_s-4,Children smiling and waving at camera,There are children present,entailment


In [5]:
# MNLI data: preview the first rows of the MNLI training frame
train_df_m.head()

Unnamed: 0,id,text_a,text_b,label
0,train_m-0,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...,neutral
1,train_m-1,you know during the season and i guess at at y...,You lose the things to the following level if ...,entailment
2,train_m-2,One of our number will carry out your instruct...,A member of my team will execute your orders w...,entailment
3,train_m-3,How do you know? All this is their information...,This information belongs to them.,entailment
4,train_m-4,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices.,neutral


### What plots I want to have:
- 1) Number of elements per class
- 2) Avg length of texts in class
- 3) The most frequent texts for text_a and text_b + counts
- 4) Top words that characterise the classes and % - refer to the paper

### 1) Number of elements per class - done

In [6]:
# Number of examples per label in each corpus (Series indexed by label).
s_count = train_df_s.groupby('label')['id'].count()
m_count = train_df_m.groupby('label')['id'].count()

# Take the label names (and how many there are) from the groupby results rather
# than hard-coding them, so every count stays paired with the label it belongs to
# even if a corpus contains a different set of labels.
counts = list(s_count) + list(m_count)
labels = list(s_count.index) + list(m_count.index)
dataset = ['SNLI']*len(s_count) + ['MNLI']*len(m_count)

# Long-format table: one row per (dataset, label) pair.
df = pd.DataFrame({'dataset':dataset, 'labels': labels, 'counts':counts})
df

Unnamed: 0,dataset,labels,counts
0,SNLI,contradiction,183187
1,SNLI,entailment,183416
2,SNLI,neutral,182764
3,MNLI,contradiction,130903
4,MNLI,entailment,130899
5,MNLI,neutral,130900


In [7]:
import altair as alt

# Axis styling: no grid, domain line or ticks on x; grid only on y.
axisX = alt.Axis(
    grid=False, domain=False, ticks=False,
    labelAngle=0, labelPadding=2,
    titleFontWeight=300, titlePadding=10, titleAngle=0,
    titleX=770, titleY=-25,
)
axisY = alt.Axis(grid=True, domain=False, ticks=False, titleFontWeight=300)

# Bar chart of `df`: one facet column per label, one bar per dataset in each facet.
(
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('dataset:O', title='', axis=axisX),
        y=alt.Y('counts:Q', title='Number of items', axis=axisY),
        color='dataset:N',
        column=alt.Column('labels:N', title=''),
    )
    .properties(width=100, height=250)
    .configure_view(stroke='transparent')
)

### 2) Avg length of texts in class

In [8]:
def num_of_tokens(x):
    """Return the number of whitespace-separated tokens in ``x``.

    ``str.split()`` without an argument splits on runs of whitespace and
    ignores leading/trailing whitespace, so doubled spaces do not produce
    empty "tokens" and an empty string counts as 0 tokens rather than 1.

    Args:
        x: The text to measure.

    Returns:
        The token count as an int.
    """
    return len(x.split())

# Add token counts for both texts, plus their difference, to each training frame.
for frame in (train_df_s, train_df_m):
    frame['len_a'] = frame['text_a'].apply(num_of_tokens)
    frame['len_b'] = frame['text_b'].apply(num_of_tokens)
    # Positive when text_a has more tokens than text_b.
    frame['len_diff'] = frame['len_a'] - frame['len_b']

train_df_s.head(1)

Unnamed: 0,id,text_a,text_b,label,len_a,len_b,len_diff
0,train_s-0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,neutral,11,9,2


In [9]:
# Per-label distribution of text_b length (len_b) in the SNLI training frame.
# Rows of `dataset_plot` are grouped in this fixed label order.
label_order = ['neutral', 'entailment', 'contradiction']

# Filter each label subset once. value_counts(normalize=True) gives
# count / subset size for every length, and sort_index() orders the lengths
# ascending, which is what grouping on the length column produced.
length_dist = [
    train_df_s.loc[train_df_s.label == label, 'len_b'].value_counts(normalize=True).sort_index()
    for label in label_order
]

len_n, len_e, len_c = [list(dist) for dist in length_dist]
index_n, index_e, index_c = [list(dist.index) for dist in length_dist]

# Flatten the three per-label series into parallel columns.
lens_a = len_n + len_e + len_c
index_a = index_n + index_e + index_c
class_a = ['neutral']*len(len_n) + ['entailment']*len(len_e) + ['contradiction']*len(len_c)

dataset_plot = pd.DataFrame({'length_probability':lens_a, 'number_of_words':index_a, 'class':class_a})
dataset_plot

Unnamed: 0,length_probability,number_of_words,class
0,0.000060,1,neutral
1,0.000799,2,neutral
2,0.014270,3,neutral
3,0.082467,4,neutral
4,0.100107,5,neutral
...,...,...,...
117,0.000005,36,contradiction
118,0.000005,38,contradiction
119,0.000005,39,contradiction
120,0.000005,56,contradiction


In [15]:
# Axis styling for the line chart; x ticks are placed at the odd values 1..17.
axisX = alt.Axis(
    values=list(range(1, 18, 2)),
    grid=False, domain=True, ticks=True,
    labelAngle=0, labelPadding=2, labelFontSize=40,
    titleAngle=0, titlePadding=10, titleFontWeight=300, titleFontSize=45,
)
axisY = alt.Axis(
    grid=True, domain=True, ticks=False,
    titleFontWeight=300, titleFontSize=45, labelFontSize=40,
)

# One line per class from the current `dataset_plot`; only rows with
# number_of_words below 18 are drawn.
(
    alt.Chart(dataset_plot.loc[dataset_plot['number_of_words'] < 18], title='')
    .mark_line()
    .encode(
        x=alt.X('number_of_words:N', title='Number of words', axis=axisX),
        y=alt.Y('length_probability:Q', title='Probability', axis=axisY),
        color=alt.Color(
            'class',
            title='label',
            legend=alt.Legend(labelFontSize=28, titleFontSize=40),
        ),
    )
    .properties(width=700, height=550)
    .configure_view(stroke='transparent')
)

In [16]:
# Per-label distribution of text_b length (len_b) in the MNLI training frame.
# Rows of `dataset_plot` are grouped in this fixed label order.
label_order = ['neutral', 'entailment', 'contradiction']

# Filter each label subset once. value_counts(normalize=True) gives
# count / subset size for every length, and sort_index() orders the lengths
# ascending, which is what grouping on the length column produced.
length_dist = [
    train_df_m.loc[train_df_m.label == label, 'len_b'].value_counts(normalize=True).sort_index()
    for label in label_order
]

len_n, len_e, len_c = [list(dist) for dist in length_dist]
index_n, index_e, index_c = [list(dist.index) for dist in length_dist]

# Flatten the three per-label series into parallel columns.
lens_a = len_n + len_e + len_c
index_a = index_n + index_e + index_c
class_a = ['neutral']*len(len_n) + ['entailment']*len(len_e) + ['contradiction']*len(len_c)

dataset_plot = pd.DataFrame({'length_probability':lens_a, 'number_of_words':index_a, 'class':class_a})
dataset_plot

Unnamed: 0,length_probability,number_of_words,class
0,0.000512,1,neutral
1,0.001406,2,neutral
2,0.006272,3,neutral
3,0.019213,4,neutral
4,0.037403,5,neutral
...,...,...,...
150,0.000015,42,contradiction
151,0.000008,43,contradiction
152,0.000023,44,contradiction
153,0.000008,45,contradiction


In [17]:
# Axis styling for the line chart; x ticks are placed at the odd values 1..17.
axisX = alt.Axis(
    values=list(range(1, 18, 2)),
    grid=False, domain=True, ticks=True,
    labelAngle=0, labelPadding=2, labelFontSize=40,
    titleAngle=0, titlePadding=10, titleFontWeight=300, titleFontSize=45,
)
axisY = alt.Axis(
    grid=True, domain=True, ticks=False,
    titleFontWeight=300, titleFontSize=45, labelFontSize=40,
)

# One line per class from the current `dataset_plot`; only rows with
# number_of_words below 18 are drawn.
(
    alt.Chart(dataset_plot.loc[dataset_plot['number_of_words'] < 18], title='')
    .mark_line()
    .encode(
        x=alt.X('number_of_words:N', title='Number of words', axis=axisX),
        y=alt.Y('length_probability:Q', title='Probability', axis=axisY),
        color=alt.Color(
            'class',
            title='label',
            legend=alt.Legend(labelFontSize=28, titleFontSize=40),
        ),
    )
    .properties(width=700, height=550)
    .configure_view(stroke='transparent')
)

In [210]:
# Per-label distribution of len_diff (len_a - len_b) in the SNLI training frame.
# Rows of `dataset_plot` are grouped in this fixed label order.
label_order = ['neutral', 'entailment', 'contradiction']

# Filter each label subset once. value_counts(normalize=True) gives
# count / subset size for every difference, and sort_index() orders the
# differences ascending, which is what grouping on the column produced.
length_dist = [
    train_df_s.loc[train_df_s.label == label, 'len_diff'].value_counts(normalize=True).sort_index()
    for label in label_order
]

len_n, len_e, len_c = [list(dist) for dist in length_dist]
index_n, index_e, index_c = [list(dist.index) for dist in length_dist]

# Flatten the three per-label series into parallel columns.
lens_a = len_n + len_e + len_c
index_a = index_n + index_e + index_c
class_a = ['neutral']*len(len_n) + ['entailment']*len(len_e) + ['contradiction']*len(len_c)

# NOTE: the column is still called 'number_of_words' (the plotting cell reads
# it under that name), but here it holds the length difference.
dataset_plot = pd.DataFrame({'length_probability':lens_a, 'number_of_words':index_a, 'class':class_a})
dataset_plot

Unnamed: 0,length_probability,number_of_words,class
0,0.000011,-22,neutral
1,0.000005,-20,neutral
2,0.000011,-18,neutral
3,0.000038,-17,neutral
4,0.000066,-16,neutral
...,...,...,...
260,0.000011,64,contradiction
261,0.000011,68,contradiction
262,0.000005,69,contradiction
263,0.000005,70,contradiction


In [211]:
# Axis styling for the length-difference line chart.
axisX = alt.Axis(
    grid=False, domain=True, ticks=True,
    labelAngle=0, labelPadding=2,
    titleAngle=0, titlePadding=10, titleFontWeight=300,
)
axisY = alt.Axis(grid=True, domain=True, ticks=False, titleFontWeight=300)

# One line per class; only rows with number_of_words strictly between -5 and 15
# are drawn.
(
    alt.Chart(
        dataset_plot[(dataset_plot['number_of_words'] < 15) & (dataset_plot['number_of_words'] > -5)],
        title='Difference in Number of words between Text and Hypothesis',
    )
    .mark_line()
    .encode(
        x=alt.X('number_of_words:N', title='', axis=axisX),
        y=alt.Y('length_probability:Q', title='Probability', axis=axisY),
        color=alt.Color('class', title='label'),
    )
    .properties(width=250, height=150)
    .configure_view(stroke='transparent')
)

### 3) The most frequent texts for text_a and text_b + counts

In [354]:
# The 15 most frequent text_b strings in the MNLI training frame
# (value_counts orders them from most to least frequent).
top_texts = train_df_m['text_b'].value_counts().index[:15].values

# Keep only the rows whose text_b is one of those strings.
df = train_df_m.loc[train_df_m['text_b'].isin(top_texts)]

In [355]:
# Count rows per (text_b, label) pair. Several columns must be selected from a
# groupby with a list ([[...]]); the bare-tuple form ['id', 'text_a'] is
# deprecated and is rejected by newer pandas versions.
# reset_index() turns the (text_b, label) index levels back into ordinary
# columns, giving the columns text_b, label, id, text_a.
tmp = df.groupby(['text_b', 'label'])[['id', 'text_a']].count().reset_index()
tmp.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,text_b,label,id,text_a
0,Absolutely not.,contradiction,3,3
1,Absolutely not.,entailment,6,6
2,Definitely not.,contradiction,8,8
3,Definitely not.,entailment,5,5
4,I don't know.,contradiction,4,4


In [356]:
import altair as alt

# Axis styling for the faceted bar chart; x-axis labels are switched off
# (labels=False), the other label/title options are kept as configured.
axisX = alt.Axis(
    grid=False, domain=False, ticks=False, labels=False,
    labelAngle=180, labelPadding=2, labelFontSize=20,
    titleAngle=10, titlePadding=10, titleX=770, titleY=-25,
    titleFontWeight=300, titleFontSize=25,
)
axisY = alt.Axis(
    grid=True, domain=False, ticks=False,
    titleFontWeight=300, titleFontSize=25, labelFontSize=20,
)

# Facet header: text_b labels rotated by 90 and oriented to the bottom.
text_header = alt.Header(
    labelAngle=90, labelLimit=200, labelOrient='bottom', labelAlign='left',
    labelFontSize=20, titleFontSize=25,
)

# One facet column per text_b (sorted in the order of top_texts), one bar per label.
(
    alt.Chart(tmp, title='')
    .mark_bar()
    .encode(
        x=alt.X('label:O', title='', axis=axisX),
        y=alt.Y('id:Q', title='Number of items', axis=axisY),
        color=alt.Color(
            'label:N',
            title='Label',
            legend=alt.Legend(labelFontSize=20, titleFontSize=20),
        ),
        column=alt.Column(
            'text_b:N',
            title='',
            spacing=5,
            header=text_header,
            sort=alt.SortArray(top_texts),
        ),
    )
    .properties(width=50, height=300)
    .configure_view(stroke='transparent')
)