In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'
import seaborn as sns
import pandas as pd
import spacy
import metrics
# Load the spaCy pipeline once for the whole notebook; compute_wpd_ld calls it on
# both input strings every time it is invoked, so it must exist before any example cell runs.
nlp = spacy.load("en_core_web_lg")

def compute_wpd_ld(s1, s2):
    """Score a pair of raw sentences with ``metrics.wpd`` and ``metrics.ld``.

    Each string is first passed through the module-level ``nlp`` pipeline; the
    two parsed results are then given, in the same order, to both metric
    functions.

    Args:
        s1: First sentence, as a plain string.
        s2: Second sentence, as a plain string.

    Returns:
        A ``(wpd, ld)`` tuple holding whatever ``metrics.wpd`` and
        ``metrics.ld`` return for the parsed pair.
    """
    # Parse each sentence exactly once and share the results between metrics.
    parsed_first = nlp(s1)
    parsed_second = nlp(s2)
    return metrics.wpd(parsed_first, parsed_second), metrics.ld(parsed_first, parsed_second)

In [2]:
# Both sentences end with the same "a maximum penalty of 10 years in prison and a
# $250,000 fine." tail; only the opening clause is worded differently.
s1, s2 = (
    "A conviction could bring a maximum penalty of 10 years in prison and a $250,000 fine.",
    "If convicted, he faces a maximum penalty of 10 years in prison and a $250,000 fine.",
)
compute_wpd_ld(s1, s2)

(0.027149321266968323, 0.38888888888888884)

In [3]:
# Same figures (4.45 percent, incomes above $500,000) in both sentences, but the
# clauses appear in a different order and parts are reworded.
s1, s2 = (
    "The top rate will go to 4.45 percent for all residents with taxable incomes above $500,000.",
    "For residents with incomes above $500,000, the income-tax rate will increase to 4.45 percent.",
)
compute_wpd_ld(s1, s2)

(0.4950980392156862, 0.33333333333333337)

In [4]:
# Word order is almost unchanged; s2 drops the leading "However," and swaps
# "guards" -> "corrections officers", "though" -> "although", "is" -> "was".
s1, s2 = (
    "However, prosecutors have declined to take criminal action against guards, though Fine said his inquiry is not finished.",
    "Prosecutors have declined to take criminal action against corrections officers, although Fine said his inquiry was not finished.",
)
compute_wpd_ld(s1, s2)

(0.06176470588235296, 0.2857142857142857)

In [5]:
# The "New York Stock Exchange" phrase opens s1 but closes s2; s2 also adds
# "yesterday" and writes "Kraft's shares" instead of "Kraft shares".
s1, s2 = (
    "In trading on the New York Stock Exchange, Kraft shares fell 25 cents to close at $32.30.",
    "Kraft's shares fell 25 cents to close at $32.30 yesterday on the New York Stock Exchange.",
)
compute_wpd_ld(s1, s2)

(0.4422084623323013, 0.21052631578947367)

In [6]:
# Pure reordering: s2 uses exactly the words of s1 but moves "last month" and
# "in the Senate" to the end of the sentence.
s1, s2 = (
    "An attempt last month in the Senate to keep the fund open for another year fell flat.",
    "An attempt to keep the fund open for another year fell flat in the Senate last month.",
)
compute_wpd_ld(s1, s2)

(0.30968858131487886, 0.0)

In [7]:
# s1 is phrased passively ("Prisoners were tortured ..."), s2 actively
# ("They ... tortured ... prisoners"), with several word substitutions.
s1, s2 = (
    "Prisoners were tortured and executed -- their ears and scalps severed for souvenirs.",
    "They frequently tortured and shot prisoners, severing ears and scalps for souvenirs.",
)
compute_wpd_ld(s1, s2)

(0.08974358974358976, 0.4285714285714286)

Appendix - MRPC

In [8]:
# "however" moves from sentence-initial position to after "The talk", and
# "was downplayed" becomes "has been downplayed"; the rest is identical.
s1, s2 = (
    "However, the talk was downplayed by PBL which said it would focus only on smaller purchases that were immediately earnings and cash flow-accretive.",
    "The talk, however, has been downplayed by PBL which said it would focus only on smaller purchases that were immediately earnings and cash flow-accretive.",
)
compute_wpd_ld(s1, s2)

(0.03793322062552832, 0.04166666666666663)

In [9]:
# The "with an estimated net worth of $1.7 billion" clause moves from the front
# of s1 to the back of s2; s2 also drops "Mrs.".
s1, s2 = (
    "With an estimated net worth of $1.7 billion, Mrs. Kroc ranked No. 121 on Forbes magazine's latest list of the nation's wealthiest people.",
    "Kroc ranked No. 121 on Forbes magazine's latest list of the nation's wealthiest people, with an estimated net worth of $1.7 billion.",
)
compute_wpd_ld(s1, s2)

(0.45066137566137565, 0.04347826086956519)

In [10]:
# s1 makes the federal body the subject, s2 makes the registry the subject; note
# s1 says "Administration" where s2 says "Agency".
s1, s2 = (
    "Federal Emergency Management Administration designated $20 million to establish the registry.",
    "The registry was launched with $20 million from the Federal Emergency Management Agency.",
)
compute_wpd_ld(s1, s2)

(0.4087301587301587, 0.5625)

In [11]:
# Heavily reworded pair: "As a result" vs "The verdict means", "jail term" vs
# "prison", "instead of" vs "rather than".
s1, s2 = (
    "As a result, Nelson now faces up to a 10 year jail term instead of life.",
    "The verdict means Nelson faces up to 10 years in prison rather than a life sentence.",
)
compute_wpd_ld(s1, s2)

(0.136437908496732, 0.6521739130434783)

Appendix - PAWS

In [12]:
# s2 inserts "located" and drops the comma after "Rhode Island"; otherwise the
# two sentences are word-for-word the same.
s1, s2 = (
    "Brockton is approximately 25 miles northeast of Providence, Rhode Island, and 30 miles south of Boston.",
    "Brockton is located approximately 25 miles northeast of Providence, Rhode Island and 30 miles south of Boston.",
)
compute_wpd_ld(s1, s2)

(0.02951388888888889, 0.06666666666666665)

In [13]:
# Same content with the phrases reordered; the date is written "31 August 1819"
# in s1 and "August 31, 1819" in s2.
s1, s2 = (
    "Wollstonecraft arrived in Grenada on board the ship 'Sydney' on 31 August 1819.",
    "Wollstonecraft arrived on August 31, 1819 on board the ship 'Sydney' in Grenada.",
)
compute_wpd_ld(s1, s2)

(0.3010416666666666, 0.0)

In [14]:
# s2 differs from s1 only by the inserted word "has".
s1, s2 = (
    "Based on the city of Baltimore, only mentioned, never visited in the show.",
    "Based on the city of Baltimore, only mentioned, has never visited in the show.",
)
compute_wpd_ld(s1, s2)

(0.015029761904761905, 0.07692307692307687)

In [15]:
# Small lexical edits in place: "have increased" vs "increased", "total return on"
# vs "real 'total return' of", "approximately" vs "about"; s2 also has a space
# before the final period.
s1, s2 = (
    "The dividends have increased the total return on the average equity to double, approximately 3.2 %.",
    "The dividends increased the real 'total return' of the average equity to double, about 3.2 % .",
)
compute_wpd_ld(s1, s2)

(0.02794316107009604, 0.375)

In [16]:
"""
df = pd.read_csv("./dataset_stats.csv")
df.head()
FILTER_FOR = "edit distance"
df = pd.read_csv("./dataset_stats.csv")
df = df[df["dataset"]=="mrpc"]
df = df[df["type"]==FILTER_FOR]
df.head()
df = df[df["value"]>0.5][df["value"]<0.9]
df.head()

for index, row in df.iterrows():
    s1, s2, value = row['s1'], row['s2'], row["value"]
    if len(s1.split(" ")) < 20:
        s1, s2 = nlp(s1), nlp(s2)
        wpd = metrics.wpd(s1, s2)
        ld = metrics.ld(s1, s2)
        if wpd > 0.3 and ld < 0.1:
            print(s1)
            print(s2)
            print(FILTER_FOR, value)
            print("WPD", wpd)
            print("LD", ld)
            print(" ")
            
for index, row in df.iterrows():
    s1, s2, value = row['s1'], row['s2'], row["value"]
    if len(s1.split(" ")) < 20:
        s1, s2 = nlp(s1), nlp(s2)
        wpd = metrics.wpd(s1, s2)
        ld = metrics.ld(s1, s2)
        if wpd < 0.1 and ld > 0.3:
            print(s1)
            print(s2)
            print(FILTER_FOR, value)
            print("WPD", wpd)
            print("LD", ld)
            print(" ")
"""

'\ndf = pd.read_csv("./dataset_stats.csv")\ndf.head()\nFILTER_FOR = "edit distance"\ndf = pd.read_csv("./dataset_stats.csv")\ndf = df[df["dataset"]=="mrpc"]\ndf = df[df["type"]==FILTER_FOR]\ndf.head()\ndf = df[df["value"]>0.5][df["value"]<0.9]\ndf.head()\n\nfor index, row in df.iterrows():\n    s1, s2, value = row[\'s1\'], row[\'s2\'], row["value"]\n    if len(s1.split(" ")) < 20:\n        s1, s2 = nlp(s1), nlp(s2)\n        wpd = metrics.wpd(s1, s2)\n        ld = metrics.ld(s1, s2)\n        if wpd > 0.3 and ld < 0.1:\n            print(s1)\n            print(s2)\n            print(FILTER_FOR, value)\n            print("WPD", wpd)\n            print("LD", ld)\n            print(" ")\n            \nfor index, row in df.iterrows():\n    s1, s2, value = row[\'s1\'], row[\'s2\'], row["value"]\n    if len(s1.split(" ")) < 20:\n        s1, s2 = nlp(s1), nlp(s2)\n        wpd = metrics.wpd(s1, s2)\n        ld = metrics.ld(s1, s2)\n        if wpd < 0.1 and ld > 0.3:\n            print(s1)\n     