In [1]:
import pandas as pd
import metrics
import string
from tqdm import tqdm
import spacy

In [2]:
# Load the spaCy pipeline once. Every sentence pair processed below is run
# through `nlp` before being handed to metrics.wpd / metrics.ld.
SPACY_MODEL_NAME = "en_core_web_lg"
nlp = spacy.load(SPACY_MODEL_NAME)

In [3]:
# Long-format accumulator shared by all datasets: each later cell extends
# these columns with one entry per (sentence pair, metric) combination.
# Every column starts as its own, independent empty list.
OUTPUT_COLUMNS = ("s1", "s2", "dataset", "value", "type")
output_data = {column: [] for column in OUTPUT_COLUMNS}

# MRPC

In [4]:
# Read both MRPC splits, keeping only the rows whose `label` column equals 1,
# then stack the two filtered splits into a single frame.
df_train = pd.read_csv("./mrpc/mrpc_train.csv").loc[
    lambda split: split["label"] == 1
]
df_test = pd.read_csv("./mrpc/mrpc_test.csv").loc[
    lambda split: split["label"] == 1
]
df = pd.concat([df_train, df_test])
print(df.shape)
df.head()

(3900, 4)


Unnamed: 0.1,Unnamed: 0,s1,s2,label
0,0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...",1
2,2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...",1
4,4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,1
5,5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,1
7,7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,1


In [5]:
# Per-pair MRPC results. All seven lists are appended to exactly once per
# row, so they stay index-aligned with each other.
s1_list, s2_list = [], []
edit_dist_list, rougel_list, sbleu_list = [], [], []
pos_dev_list, lex_dev_list = [], []

sentence_pairs = zip(df["s1"], df["s2"])
for raw_s1, raw_s2 in tqdm(sentence_pairs, total=len(df)):
    clean_s1 = metrics.preprocess_text(raw_s1)
    clean_s2 = metrics.preprocess_text(raw_s2)
    # Sanity check: both preprocessed sentences must have length > 4.
    assert min(len(clean_s1), len(clean_s2)) > 4

    # The stored sentences keep their casing; only the metrics see the
    # lower-cased versions.
    s1_list.append(clean_s1)
    s2_list.append(clean_s2)
    lower_s1 = clean_s1.lower()
    lower_s2 = clean_s2.lower()

    # Metrics computed directly on the lower-cased strings.
    edit_dist_list.append(metrics.edit_distance(lower_s1, lower_s2))
    rougel_list.append(metrics.rouge_l(lower_s1, lower_s2))
    sbleu_list.append(metrics.self_bleu(lower_s1, lower_s2))

    # wpd / ld are given the objects returned by the spaCy pipeline instead
    # of raw strings.
    doc_s1 = nlp(lower_s1)
    doc_s2 = nlp(lower_s2)
    pos_dev_list.append(metrics.wpd(doc_s1, doc_s2))
    lex_dev_list.append(metrics.ld(doc_s1, doc_s2))

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
100%|██████████| 3900/3900 [00:34<00:00, 113.27it/s]


In [6]:
# Append the MRPC results to the long-format accumulator. The sentence pairs
# are repeated once per metric so that every value carries its own s1/s2.
# NOTE: the lists in output_data are extended in place, so running this cell
# twice appends the MRPC rows twice.
mrpc_results = (
    ("position deviation", pos_dev_list),
    ("lexical deviation", lex_dev_list),
    ("edit distance", edit_dist_list),
    ("rouge-l", rougel_list),
    ("self-bleu", sbleu_list),
)

for metric_type, metric_values in mrpc_results:
    n_values = len(metric_values)
    output_data["s1"].extend(s1_list)
    output_data["s2"].extend(s2_list)
    output_data["value"].extend(metric_values)
    output_data["dataset"].extend(["mrpc"] * n_values)
    output_data["type"].extend([metric_type] * n_values)

# PAWS

In [7]:
# Read the three tab-separated PAWS splits, keeping only the rows whose
# `label` column equals 1, and report the shape of each filtered split
# before stacking them into a single frame.
df_train = pd.read_csv("./paws/train.tsv", sep="\t").loc[
    lambda split: split["label"] == 1
]
print(df_train.shape)

df_dev = pd.read_csv("./paws/dev.tsv", sep="\t").loc[
    lambda split: split["label"] == 1
]
print(df_dev.shape)

df_test = pd.read_csv("./paws/test.tsv", sep="\t").loc[
    lambda split: split["label"] == 1
]
print(df_test.shape)

df = pd.concat([df_train, df_dev, df_test])
print(df.shape)
df.head()

(21829, 4)
(3539, 4)
(3536, 4)
(28904, 4)


Unnamed: 0,id,sentence1,sentence2,label
1,2,The NBA season of 1975 -- 76 was the 30th seas...,The 1975 -- 76 season of the National Basketba...,1
3,4,When comparable rates of flow can be maintaine...,The results are high when comparable flow rate...,1
4,5,It is the seat of Zerendi District in Akmola R...,It is the seat of the district of Zerendi in A...,1
5,6,William Henry Henry Harman was born on 17 Febr...,"William Henry Harman was born in Waynesboro , ...",1
7,8,With a discrete amount of probabilities Formul...,Given a discrete set of probabilities formula ...,1


In [8]:
# Per-pair PAWS results. All seven lists are appended to exactly once per
# row, so they stay index-aligned with each other.
s1_list, s2_list = [], []
edit_dist_list, rougel_list, sbleu_list = [], [], []
pos_dev_list, lex_dev_list = [], []

sentence_pairs = zip(df["sentence1"], df["sentence2"])
for raw_s1, raw_s2 in tqdm(sentence_pairs, total=len(df)):
    clean_s1 = metrics.preprocess_text(raw_s1)
    clean_s2 = metrics.preprocess_text(raw_s2)

    # The stored sentences keep their casing; only the metrics see the
    # lower-cased versions.
    s1_list.append(clean_s1)
    s2_list.append(clean_s2)
    lower_s1 = clean_s1.lower()
    lower_s2 = clean_s2.lower()

    # Metrics computed directly on the lower-cased strings.
    edit_dist_list.append(metrics.edit_distance(lower_s1, lower_s2))
    rougel_list.append(metrics.rouge_l(lower_s1, lower_s2))
    sbleu_list.append(metrics.self_bleu(lower_s1, lower_s2))

    # wpd / ld are given the objects returned by the spaCy pipeline instead
    # of raw strings.
    doc_s1 = nlp(lower_s1)
    doc_s2 = nlp(lower_s2)
    pos_dev_list.append(metrics.wpd(doc_s1, doc_s2))
    lex_dev_list.append(metrics.ld(doc_s1, doc_s2))

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
100%|██████████| 28904/28904 [04:01<00:00, 119.86it/s]


In [9]:
# Append the PAWS results to the long-format accumulator. The sentence pairs
# are repeated once per metric so that every value carries its own s1/s2.
# NOTE: the lists in output_data are extended in place, so running this cell
# twice appends the PAWS rows twice.
paws_results = (
    ("position deviation", pos_dev_list),
    ("lexical deviation", lex_dev_list),
    ("edit distance", edit_dist_list),
    ("rouge-l", rougel_list),
    ("self-bleu", sbleu_list),
)

for metric_type, metric_values in paws_results:
    n_values = len(metric_values)
    output_data["s1"].extend(s1_list)
    output_data["s2"].extend(s2_list)
    output_data["value"].extend(metric_values)
    output_data["dataset"].extend(["paws"] * n_values)
    output_data["type"].extend([metric_type] * n_values)

## Output

In [10]:
# Turn the accumulator into a table: each key of output_data becomes a
# column, each list position becomes a row.
df_out = pd.DataFrame(output_data)
df_out.head()

Unnamed: 0,s1,s2,dataset,value,type
0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...",mrpc,0.19977,position deviation
1,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...",mrpc,0.240221,position deviation
2,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,mrpc,0.196429,position deviation
3,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,mrpc,0.137821,position deviation
4,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,mrpc,0.067235,position deviation


In [11]:
# Show the last five rows to check the end of the table.
df_out.tail(5)

Unnamed: 0,s1,s2,dataset,value,type
164015,Twice Sparrow sold the island twice to Thomas ...,Sparrow twice sold the island to Thomas Polloc...,paws,0.672318,self-bleu
164016,"The name in Tupi means "" insensitive stone "", ...","The name in Tupi means "" hard stone "", "" insen...",paws,0.59699,self-bleu
164017,"The company has branches in Tokyo, based in th...",The company has branches in Tokyo based in Sai...,paws,0.455616,self-bleu
164018,The modern coat of arms of Bavaria was designe...,The modern coat of arms of Bavaria was designe...,paws,0.573057,self-bleu
164019,"It is located near Point Pleasant Borough, a m...","It is near Point Pleasant borough, a municipal...",paws,0.487649,self-bleu


In [12]:
# Persist the long-format table without the row index. `index` is documented
# as a bool, so pass False explicitly instead of relying on None being falsy.
df_out.to_csv("./dataset_stats.csv", index=False)