In [8]:
import numpy as np
from datasets import load_dataset
from scipy.spatial import distance
import seaborn as sns
import pandas as pd

In [9]:
def filter_for_single_pt(df):
    """Group the rows of ``df`` by the paraphrase types they are annotated with.

    Each cell of the ``'paraphrase_types'`` column is either a collection of
    labels (``list`` or ``numpy.ndarray``) or a single scalar label.  For every
    distinct label found in the column, the rows whose cell contains that label
    (or, for scalar cells, equals it) are collected into their own sub-frame.
    A row annotated with several labels therefore appears under each of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'paraphrase_types'`` column.  Labels must be hashable.

    Returns
    -------
    dict
        Maps each label to the sub-frame of matching rows.  Keys are ordered by
        first appearance in the column, so the result is identical from run to
        run (iterating a ``set`` of strings is not stable across interpreter
        sessions).  Rows with an empty label collection contribute no keys and
        appear in no sub-frame.
    """
    def _labels_of(cell):
        # list/ndarray cells hold several labels; anything else is one scalar label.
        if isinstance(cell, (list, np.ndarray)):
            return list(cell)
        return [cell]

    # Normalise every cell once instead of re-inspecting the column per label.
    labels_per_row = [_labels_of(cell) for cell in df['paraphrase_types']]

    # dict.fromkeys de-duplicates while keeping first-seen order.
    ordered_labels = dict.fromkeys(
        label for labels in labels_per_row for label in labels
    )

    paraphrase_type_data_dict = {}
    for label in ordered_labels:
        # An explicit boolean ndarray is always treated as a positional row mask.
        row_mask = np.array(
            [label in labels for labels in labels_per_row], dtype=bool
        )
        paraphrase_type_data_dict[label] = df[row_mask]

    return paraphrase_type_data_dict

In [10]:
# Load the 'train' split of the "jpwahle/etpc" dataset and convert it to a pandas DataFrame.
df = load_dataset("jpwahle/etpc")['train'].to_pandas()
# Build one sub-frame per paraphrase type; a row carrying several types is
# present in each of the corresponding sub-frames.
single_type_dict = filter_for_single_pt(df)

# Print each paraphrase type followed by the number of rows annotated with it.
for key, value in single_type_dict.items():
    print(key, len(value))

Entailment 81
Change of format 207
Subordination and nesting changes 448
Same Polarity Substitution (named ent.) 448
Addition/Deletion 2988
Synthetic/analytic substitution 806
Non-paraphrase 605
Opposite polarity substitution (habitual) 4
Identity 3870
Ellipsis 64
Converse substitution 42
Opposite polarity substitution (contextual) 12
Punctuation changes 748
Derivational Changes 181
Same Polarity Substitution (habitual) 681
Semantic based 328
Syntax/discourse structure changes 305
Change of order 766
Diathesis alternation 161
Spelling changes 534
Modal Verb Changes 180
Inflectional Changes 544
Direct/indirect style alternations 66
Negation switching 20
Same Polarity Substitution (contextual) 2511
Coordination changes 47


In [11]:
# Keep only the row(s) whose 'idx' equals '5_4'; the result is still a DataFrame.
row = df[df['idx'] == '5_4']

In [28]:
# Lift pandas' column-width limit so long sentences and index lists are shown in full.
# NOTE: this changes a session-wide display option and affects every later output.
pd.set_option('display.max_colwidth', None)
# Show both sentences next to their paraphrase-type labels and segment index lists.
row[['sentence1', 'sentence2', 'paraphrase_types', 'sentence1_segment_location_indices', 'sentence2_segment_location_indices']]

Unnamed: 0,sentence1,sentence2,paraphrase_types,sentence1_segment_location_indices,sentence2_segment_location_indices
4,"The stock rose $2.11, or about 11 percent, to close Friday at $21.51 on the New York Stock Exchange.",PG&E Corp. shares jumped $1.63 or 8 percent to $21.03 on the New York Stock Exchange on Friday.,"[Same Polarity Substitution (contextual), Same Polarity Substitution (habitual), Same Polarity Substitution (contextual), Synthetic/analytic substitution, Change of order, Addition/Deletion, Identity, Non-paraphrase, Non-paraphrase, Non-paraphrase, Punctuation changes]","[[0, 1], [2], [11, 12, 14], [13], [13], [7], [3, 6, 9, 15, 17, 18, 19, 20, 21, 22, 23], [4], [8], [16]]","[[0, 1, 2, 3, 4], [5], [11], [20, 21], [20, 21], [6, 8, 10, 12, 14, 15, 16, 17, 18, 19, 22], [7], [9], [13], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]]"


In [29]:
# Pull the first 'sentence1' value out of the selected row(s) as a bare Python object.
row['sentence1'].values[0]

'The stock rose $2.11, or about 11 percent, to close Friday at $21.51 on the New York Stock Exchange.'

('The stock', array([0, 1], dtype=int32))
('rose', array([2], dtype=int32))
('to close at', array([11, 12, 14], dtype=int32))
('Friday', array([13], dtype=int32))
('Friday', array([13], dtype=int32))
('about', array([7], dtype=int32))
('$ or percent $ on the New York Stock Exchange .', array([ 3,  6,  9, 15, 17, 18, 19, 20, 21, 22, 23], dtype=int32))
('2.11', array([4], dtype=int32))
('11', array([8], dtype=int32))
('21.51', array([16], dtype=int32))
0 The
1 stock
2 rose
3 $2.11,
4 or
5 about
6 11
7 percent,
8 to
9 close
10 Friday
11 at
12 $21.51
13 on
14 the
15 New
16 York
17 Stock
18 Exchange.


4
19



In [5]:
# Count the entries of a hand-written token list for the example sentence
# (punctuation and '$' are listed as separate tokens).
len(["The", "stock", "rose", "$", "2.11", ",", "or", "about", "11", "percent", ",", "to", "close", "Friday", "at", "$", "21.51", "on", "the", "New", "York", "Stock", "Exchange", "."])

24

In [25]:
import os

from transformers import BertTokenizer

# Directory of the checkpoint whose tokenizer files are loaded below.
# The original machine-specific location is kept as the default so the cell
# behaves exactly as before when nothing is configured; set the
# ETPC_TOKENIZER_PATH environment variable to run the notebook elsewhere.
TOKENIZER_PATH = os.environ.get(
    'ETPC_TOKENIZER_PATH',
    '/Users/yasir/github/paraphrase-types/out/cls-models/bert-large-uncased-jpwahle/etpc-paraphrase-detection/checkpoint-3045',
)

tokenizer = BertTokenizer.from_pretrained(TOKENIZER_PATH)
# Example sentence tokenized below so its tokens can be inspected.
input_string = "The stock rose $2.11, or about 11 percent, to close Friday at $21.51 on the New York Stock Exchange."
tokens = tokenizer.tokenize(input_string)

print(tokens)


['the', 'stock', 'rose', '$', '2', '.', '11', ',', 'or', 'about', '11', 'percent', ',', 'to', 'close', 'friday', 'at', '$', '21', '.', '51', 'on', 'the', 'new', 'york', 'stock', 'exchange', '.']


In [27]:
# Display the 'sentence1_segment_text' column for the whole frame.
df['sentence1_segment_text']

0                                                                                                                                [whom, called, Amrozi accused his brother, `` the witness '' , of deliberately distorting his evidence .]
1                                                                                                                                                                                                                                       []
2                                                                                                                          [They, cargo, on June 10, , he added, had published an advertisement on the Internet , offering the for sale .]
3                                                                                                                                                                                                                                       []
4       [The stock, rose, to close at, Friday, Friday, about

In [44]:
# Preview the first five rows with every column to see the full schema.
df.head()

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,negation,paraphrase_types,paraphrase_type_ids,sentence1_segment_location,sentence2_segment_location,sentence1_segment_location_indices,sentence2_segment_location_indices,sentence1_segment_text,sentence2_segment_text
0,1_0,"Amrozi accused his brother, whom he called ""the witness"", of deliberately distorting his evidence.","Referring to him as only ""the witness"", Amrozi accused his brother of deliberately distorting his evidence.","[Amrozi, accused, his, brother, ,, whom, he, called, ``, the, witness, '', ,, of, deliberately, distorting, his, evidence, .]","[Referring, to, him, as, only, ``, the, witness, '', ,, Amrozi, accused, his, brother, of, deliberately, distorting, his, evidence, .\n]",1,1,0,"[Same Polarity Substitution (habitual), Same Polarity Substitution (contextual), Change of order, Addition/Deletion, Identity]","[5, 6, 26, 25, 29]","[26, 26, 26, 26, 0, 5, 0, 6, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25]","[6, 5, 5, 0, 25, 0, 0, 0, 0, 0, 26, 26, 26, 26, 0, 0, 0, 0, 0, 0]","[[5], [7], [0, 1, 2, 3], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]]","[[1, 2], [0], [10, 11, 12, 13], [4]]","[whom, called, Amrozi accused his brother, `` the witness '' , of deliberately distorting his evidence .]","[to him, Referring, Amrozi accused his brother, only, `` the witness '' , of deliberately distorting his evidence .\n]"
1,2_1,Yucaipa owned Dominick's before selling the chain to Safeway in 1998 for $2.5 billion.,Yucaipa bought Dominick's in 1995 for $693 million and sold it to Safeway for $1.8 billion in 1998.,"[Yucaipa, owned, Dominick, 's, before, selling, the, chain, to, Safeway, in, 1998, for, $, 2.5, billion, .]","[Yucaipa, bought, Dominick, 's, in, 1995, for, $, 693, million, and, sold, it, to, Safeway, for, $, 1.8, billion, in, 1998, .\n]",0,0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",[],[],[],[]
2,3_2,"They had published an advertisement on the Internet on June 10, offering the cargo for sale, he added.","On June 10, the ship's owners had published an advertisement on the Internet, offering the explosives for sale.","[They, had, published, an, advertisement, on, the, Internet, on, June, 10, ,, offering, the, cargo, for, sale, ,, he, added, .]","[On, June, 10, ,, the, ship, 's, owners, had, published, an, advertisement, on, the, Internet, ,, offering, the, explosives, for, sale, .\n]",1,1,0,"[Same Polarity Substitution (contextual), Same Polarity Substitution (contextual), Change of order, Addition/Deletion, Identity]","[6, 6, 26, 25, 29]","[6, 0, 0, 0, 0, 0, 0, 0, 26, 26, 26, 0, 0, 0, 6, 0, 0, 25, 25, 25, 0]","[26, 26, 26, 26, 6, 6, 6, 6, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 6, 25, 25, 25]","[[0], [14], [8, 9, 10], [17, 18, 19]]","[[4, 5, 6, 7], [18], [0, 1, 2, 3], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21]]","[They, cargo, on June 10, , he added, had published an advertisement on the Internet , offering the for sale .]","[the ship 's owners, explosives, On June 10 ,, had published an advertisement on the Internet , offering the for sale .\n]"
3,4_3,"Around 0335 GMT, Tab shares were up 19 cents, or 4.4%, at A$4.56, having earlier set a record high of A$4.57.","Tab shares jumped 20 cents, or 4.6%, to set a record closing high at A$4.57.","[Around, 0335, GMT, ,, Tab, shares, were, up, 19, cents, ,, or, 4.4, %, ,, at, A, $, 4.56, ,, having, earlier, set, a, record, high, of, A, $, 4.57, .]","[Tab, shares, jumped, 20, cents, ,, or, 4.6, %, ,, to, set, a, record, closing, high, at, A, $, 4.57, .\n]",0,0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",[],[],[],[]
4,5_4,"The stock rose $2.11, or about 11 percent, to close Friday at $21.51 on the New York Stock Exchange.",PG&E Corp. shares jumped $1.63 or 8 percent to $21.03 on the New York Stock Exchange on Friday.,"[The, stock, rose, $, 2.11, ,, or, about, 11, percent, ,, to, close, Friday, at, $, 21.51, on, the, New, York, Stock, Exchange, .]","[PG, &, E, Corp., shares, jumped, $, 1.63, or, 8, percent, to, $, 21.03, on, the, New, York, Stock, Exchange, on, Friday, .\n]",0,1,0,"[Same Polarity Substitution (contextual), Same Polarity Substitution (habitual), Same Polarity Substitution (contextual), Synthetic/analytic substitution, Change of order, Addition/Deletion, Identity, Non-paraphrase, Non-paraphrase, Non-paraphrase, Punctuation changes]","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]","[6, 6, 5, 29, 30, 0, 29, 25, 30, 29, 0, 6, 6, 26, 6, 29, 30, 29, 29, 29, 29, 29, 29, 29]","[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30]","[[0, 1], [2], [11, 12, 14], [13], [13], [7], [3, 6, 9, 15, 17, 18, 19, 20, 21, 22, 23], [4], [8], [16]]","[[0, 1, 2, 3, 4], [5], [11], [20, 21], [20, 21], [6, 8, 10, 12, 14, 15, 16, 17, 18, 19, 22], [7], [9], [13], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]]","[The stock, rose, to close at, Friday, Friday, about, $ or percent $ on the New York Stock Exchange ., 2.11, 11, 21.51, The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .]","[PG & E Corp. shares, jumped, to, on Friday, on Friday, $ or percent $ on the New York Stock Exchange .\n, 1.63, 8, 21.03, PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York Stock Exchange on Friday .\n]"
