## Imports and Function Definitions

In [10]:
import numpy as np
from scipy import stats
from statistics import mean, stdev
from math import sqrt,abs
import PyNonpar
import pandas as pd

In [3]:
def notnormallydistributed(a,b):
    result = stats.wilcoxon(a, b)
    return result

In [4]:
def normallydistributed(a,b):
    result = stats.ttest_rel(a, b)
    return result

In [5]:
def normal_distribution_effect(a, b):
    # #Cohen suggested that d = 0.2 be considered a 'small' effect size, 0.5 represents a 'medium' effect size and 0.8 a 'large' effect size. 
    cohens_d = (np.mean(a) - np.mean(b)) / np.sqrt((np.std(a) ** 2 + np.std(b) ** 2) / 2)
    return cohens_d

In [7]:
def not_normal_distribution_effect(group1, group2):
    # cliff's delta |d| < 0.147: negligible , |d| < 0.33: small, |d| < 0.474: medium, |d| >= 0.474: large
    n1 = len(group1)
    n2 = len(group2)
    group1, group2 = np.array(group1), np.array(group2)
    greater = sum(x > y for x in group1 for y in group2)
    less = sum(x < y for x in group1 for y in group2)

    delta = (greater - less) / (n1 * n2)
    return delta

## SBERT

In [28]:
# Recall Rates at k 1 to 100
whole_enriched_average = [21.650219950599201, 28.62246201842, 35.3733575705568, 35.3733575705568, 35.617458712244, 37.4774390658813, 37.962653895944, 39.2583947029442, 41.119855678338, 42.091849986956596, 43.1797134767045, 43.1704929217418, 43.178630783935696, 46.604315219207905, 46.3603109712974, 46.883049863196504, 46.8890561021924, 46.880164427795205, 48.416238387763, 48.6472130655637, 57.2708631866937]
whole_average = [12.982539682539698, 24.4909523809524, 29.948253968254, 29.948253968254, 29.948253968254, 29.948253968254, 29.948253968254, 29.948253968254, 30.476190476190503, 30.476190476190503, 30.476190476190503, 30.476190476190503, 30.476190476190503, 30.476190476190503, 31.269841269841297, 31.269841269841297, 32.3809523809524, 44.047619047619, 48.1587301587302, 48.1587301587302, 49.1334920634921]
sim_average = [53.971538461538499, 76.098461538461497, 79.8887179487179, 79.8887179487179, 79.8887179487179, 79.8887179487179, 79.8887179487179, 79.8887179487179, 80.1887179487179, 80.1887179487179, 80.1887179487179, 80.3846153846154, 80.3846153846154, 80.3846153846154, 81.46153846154, 81.53846154, 83.3846153846154, 84.3846153846154, 84.6784549, 89.12234325, 92.29]
sim_enriched_average = [56.84, 86.166666666666667, 88.36, 70.512820512820509, 72.846153846153847, 75.230769230769226, 77.512820512820509, 79.846153846153847, 82.128205128205124, 84.461538461538467, 86.717948717948718, 89.025641025641022, 89.025641025641022, 89.025641025641022, 89.025641025641022, 89.025641025641022, 89.025641025641022, 89.025641025641022, 89.025641025641022, 89.025641025641022, 96.04]
dissim_enriched_average = [52.9100529100529, 74.60317460317461, 78.3068783068783, 78.3068783068783, 78.3068783068783, 82.010582010582, 82.010582010582, 85.7142857142857, 85.7142857142857, 85.7142857142857, 85.7142857142857, 85.7142857142857, 85.7142857142857, 90.4761904761905, 90.4761904761905, 90.4761904761905, 90.4761904761905, 90.4761904761905, 90.4761904761905, 90.4761904761905, 90.4761904761905]
dis_average = [49.206349206349195, 74.60317460317461, 74.60317460317461, 74.60317460317461, 74.60317460317461, 78.3068783068783, 78.3068783068783, 78.3068783068783, 78.3068783068783, 78.3068783068783, 78.3068783068783, 78.3068783068783, 78.3068783068783, 78.3068783068783, 78.3068783068783, 78.3068783068783, 83.0687830687831, 83.0687830687831, 83.0687830687831, 83.0687830687831, 83.0687830687831]

In [29]:
# Testing and plotting for each dataset
#If p ≤ 0.05: then the null hypothesis can be rejected (i.e. the variable is NOT normally distributed).
#If p > 0.05: then the null hypothesis cannot be rejected (i.e. the variable MAY BE normally distributed).

print(stats.shapiro(whole_average), stats.shapiro(whole_enriched_average)) # not normal, normal
print(stats.shapiro(sim_average), stats.shapiro(sim_enriched_average)) # not normal, not normal
print(stats.shapiro(dis_average),stats.shapiro(dissim_enriched_average)) # not normal, not normal

ShapiroResult(statistic=0.7636807379239798, pvalue=0.0001921302898587873) ShapiroResult(statistic=0.9471442647026478, pvalue=0.30049320609997826)
ShapiroResult(statistic=0.6501524957927944, pvalue=7.002237055378483e-06) ShapiroResult(statistic=0.8094441369690316, pvalue=0.0009180913873723957)
ShapiroResult(statistic=0.5738312031585339, pvalue=1.0542197594828412e-06) ShapiroResult(statistic=0.7127193360746931, pvalue=3.9825671336225474e-05)


In [37]:

# not normally distributed
print("SBERT vs SBERT Enriched")

print("Whole: ",notnormallydistributed(whole_average, whole_enriched_average))
print("Effect: ", abs(not_normal_distribution_effect (whole_average, whole_enriched_average)))

print("Textually Similar: t-statistic = ",notnormallydistributed(sim_average, sim_enriched_average))
print("Effect: ",abs(not_normal_distribution_effect (sim_average, sim_enriched_average)))

print("Textually Dissimilar : ",notnormallydistributed(dis_average, dissim_enriched_average))
print("Effect: ",abs(not_normal_distribution_effect (dis_average, dissim_enriched_average)))

SBERT vs SBERT Enriched
Whole:  WilcoxonResult(statistic=0.0, pvalue=9.5367431640625e-07)
Effect:  0.546485260770975
Textually Similar: t-statistic =  WilcoxonResult(statistic=50.0, pvalue=0.02157115936279297)
Effect:  0.3197278911564626
Textually Dissimilar :  WilcoxonResult(statistic=0.0, pvalue=6.175568347266167e-05)
Effect:  0.6394557823129252


## BM25

In [38]:
# Recall Rates at k 1 to 100 for BM25 and BM25 Enriched
regular = [18.67, 31.18, 37.02, 38.08, 39.14, 40.2, 41.26, 42.32, 43.38, 44.44, 45.5, 46.56, 47.62, 48.68, 49.74, 50.8, 51.86, 52.92, 53.98, 55.04, 55.86]
enriched = [19.63, 31.85, 38.39, 39.5, 40.6, 41.7, 42.8, 43.9, 45.0, 46.1, 47.2, 48.3, 49.4, 50.5, 51.6, 52.7, 53.8, 54.9, 56.0, 57.1, 57.39]
# Textually Similar
sim_reg = [22.29, 38.71, 43.73, 44.8, 45.85, 46.9, 47.95, 49.0, 50.05, 51.1, 52.15, 53.2, 54.25, 55.3, 56.35, 57.4, 58.45, 59.5, 60.55, 61.6, 62.72]
sim_enriched = [28.45, 42.84, 47.96, 48.95, 49.95, 50.95, 51.95, 52.95, 53.95, 54.95, 55.95, 56.95, 57.95, 58.95, 59.95, 60.95, 61.95, 62.95, 63.95, 64.95, 65.95]
# Textually Dissimilar
dissim_reg = [16.4, 28.8, 34.6, 35.7, 36.8, 37.9, 39.0, 40.1, 41.2, 42.3, 43.4, 44.5, 45.6, 46.7, 47.8, 48.9, 50.0, 51.1, 52.2, 53.3, 52.95]
dissim_enriched = [25.56, 31.99, 37.16, 38.2, 39.24, 40.28, 41.32, 42.36, 43.4, 44.44, 45.48, 46.52, 47.56, 48.6, 49.64, 50.68, 51.72, 52.76, 53.8, 54.84, 57.36]

In [40]:
# Testing and plotting for each dataset
#If p ≤ 0.05: then the null hypothesis can be rejected (i.e. the variable is NOT normally distributed).
#If p > 0.05: then the null hypothesis cannot be rejected (i.e. the variable MAY BE normally distributed).

print(stats.shapiro(regular), stats.shapiro(enriched)) # normal, normal
print(stats.shapiro(sim_reg),stats.shapiro(sim_enriched)) # not normal, not normal
print(stats.shapiro(dissim_reg),stats.shapiro(dissim_enriched)) # normal, normal

ShapiroResult(statistic=0.9160086872991104, pvalue=0.07223974823836606) ShapiroResult(statistic=0.9126816844853531, pvalue=0.06204307684021796)
ShapiroResult(statistic=0.89089106545577, pvalue=0.023392225717368004) ShapiroResult(statistic=0.8991337002735843, pvalue=0.03366665870432764)
ShapiroResult(statistic=0.9132401454207475, pvalue=0.06364513717181519) ShapiroResult(statistic=0.9710592423054386, pvalue=0.7563775719440912)


In [43]:
print("BM25 vs BM25 Enriched")

# normally distributed
print("Whole: ",normallydistributed(regular, enriched))
print("Effect: ", abs(normal_distribution_effect (regular, enriched)))

print("Textually Dissimilar: t-statistic = ",normallydistributed(dissim_reg, dissim_enriched))
print("Effect: ",abs(normal_distribution_effect (dissim_reg, dissim_enriched)))

#not normally distributed
print("Textually Similar: t-statistic = ",notnormallydistributed(sim_reg, sim_enriched))
print("Effect: ",abs(not_normal_distribution_effect (sim_reg, sim_enriched)))


BM25 vs BM25 Enriched
Whole:  TtestResult(statistic=-21.921258830408814, pvalue=1.861801522626024e-15, df=20)
Effect:  0.1853118903797667
Textually Dissimilar: t-statistic =  TtestResult(statistic=-7.139125366159098, pvalue=6.458841644649449e-07, df=20)
Effect:  0.31055065939294535
Textually Similar: t-statistic =  WilcoxonResult(statistic=0.0, pvalue=9.5367431640625e-07)
Effect:  0.2834467120181406


## LDA+GLOVE

In [45]:
#  Recall at K 1 to 100 for LDA and LDA Enriched
# Whole
regular = [0.0, 5.5, 9.33, 10.33, 11.4, 12.27, 13.0, 13.54, 14.08, 14.56, 15.0, 15.32, 15.65, 15.94, 16.22, 16.47, 16.7, 16.91, 17.12, 17.31, 17.5]
enriched = [0.0, 7.78, 13.17, 14.05, 14.93, 15.81, 16.69, 17.57, 18.45, 19.33, 20.21, 21.09, 21.97, 22.85, 23.73, 24.61, 25.49, 26.37, 27.25, 28.13, 20.01]
# Textually Similar
sim_reg = [0.0, 6.67, 9.83, 10.94, 12.05, 13.16, 14.27, 15.38, 16.49, 17.6, 18.71, 19.82, 20.93, 22.04, 23.15, 24.26, 25.37, 26.48, 27.59, 28.7, 18.5]
sim_enriched = [4.14, 10.88, 15.39, 16.41, 17.43, 18.45, 19.47, 20.49, 21.51, 22.53, 23.55, 24.57, 25.59, 26.61, 27.63, 28.65, 29.67, 30.69, 31.71, 32.73, 22.42]
# Textually Dissimilar
dissim_reg = [0.0, 3.67, 7.0, 7.89, 8.78, 9.67, 10.56, 11.45, 12.34, 13.23, 14.12, 15.01, 15.9, 16.79, 17.68, 18.57, 19.46, 20.35, 21.24, 22.13, 13.83]
dissim_enriched = [3.45, 8.69, 13.37, 14.49, 15.61, 16.73, 17.85, 18.97, 20.09, 21.21, 22.33, 23.45, 24.57, 25.69, 26.81, 27.93, 29.05, 30.17, 31.29, 32.41, 19.3]

In [47]:
# Testing and plotting for each dataset
#If p ≤ 0.05: then the null hypothesis can be rejected (i.e. the variable is NOT normally distributed).
#If p > 0.05: then the null hypothesis cannot be rejected (i.e. the variable MAY BE normally distributed).

print(stats.shapiro(regular), stats.shapiro(enriched)) # not normal, normal
print(stats.shapiro(sim_reg),stats.shapiro(sim_enriched)) # normal, normal
print(stats.shapiro(dissim_reg),stats.shapiro(dissim_enriched)) # normal, normal

ShapiroResult(statistic=0.8035179313028193, pvalue=0.0007429089847418348) ShapiroResult(statistic=0.926939275221735, pvalue=0.1195088484202588)
ShapiroResult(statistic=0.9720562534294691, pvalue=0.7780345028768536) ShapiroResult(statistic=0.9592783729964569, pvalue=0.5017081753535937)
ShapiroResult(statistic=0.9739518069617576, pvalue=0.8179509728304076) ShapiroResult(statistic=0.9712365622689744, pvalue=0.7602547931056709)


In [48]:
print("LDA vs LDA Enriched")

# normally distributed
print("Textually Dissimilar: t-statistic = ",normallydistributed(dissim_reg, dissim_enriched))
print("Effect: ",abs(normal_distribution_effect (dissim_reg, dissim_enriched)))

print("Textually Similar: t-statistic = ",normallydistributed(sim_reg, sim_enriched))
print("Effect: ",abs(normal_distribution_effect (sim_reg, sim_enriched)))

#not normally distributed
print("Whole: ",notnormallydistributed(regular, enriched))
print("Effect: ", abs(not_normal_distribution_effect (regular, enriched)))

LDA vs LDA Enriched
Textually Dissimilar: t-statistic =  TtestResult(statistic=-20.19105868850517, pvalue=9.006520007958382e-15, df=20)
Effect:  1.1859172202677204
Textually Similar: t-statistic =  TtestResult(statistic=-41.99371024042359, pvalue=5.570082277582102e-21, df=20)
Effect:  0.6644358425000066
Whole:  WilcoxonResult(statistic=0.0, pvalue=8.857457687863549e-05)
Effect:  0.5895691609977324


## CUPID

In [49]:
import pandas as pd
whole_df = pd.read_csv('whole_dataset.csv')
sim_df = pd.read_csv('textually_similar.csv')
dis_df = pd.read_csv('textually_dissimilar.csv')

In [50]:
# Testing and plotting for each dataset
#If p ≤ 0.05: then the null hypothesis can be rejected (i.e. the variable is NOT normally distributed).
#If p > 0.05: then the null hypothesis cannot be rejected (i.e. the variable MAY BE normally distributed).

whole_perf = list(whole_df['CUPID'])
whole_enriched = list(whole_df['CUPID_Enriched'])
sim_perf = list(sim_df['CUPID'])
sim_enriched = list(sim_df['CUPID_Enriched'])
dis_perf = list(dis_df['CUPID'])
dis_enriched = list(dis_df['CUPID_Enriched'])

print(stats.shapiro(whole_perf), stats.shapiro(whole_enriched)) # not normal, not normal
print(stats.shapiro(sim_perf), stats.shapiro(sim_enriched)) # not normal,  normal
print(stats.shapiro(dis_perf), stats.shapiro(dis_enriched)) # not normal, not normal


ShapiroResult(statistic=0.8721412154981281, pvalue=0.010476194633603246) ShapiroResult(statistic=0.8996371699994118, pvalue=0.03443082659964294)
ShapiroResult(statistic=0.8864472709518897, pvalue=0.019275282343937234) ShapiroResult(statistic=0.9181234277303109, pvalue=0.07960141683090527)
ShapiroResult(statistic=0.9023127785249615, pvalue=0.03880721576767272) ShapiroResult(statistic=0.8945751445946227, pvalue=0.027504479001652468)


In [51]:
print("CUPID vs CUPID Enriched")

# not normally distributed
print("Whole: ",notnormallydistributed(whole_perf, whole_enriched))
print("Effect: ", abs(not_normal_distribution_effect (whole_perf, whole_enriched)))

print("Textually Similar: ",notnormallydistributed(sim_perf, sim_enriched))
print("Effect: ", abs(not_normal_distribution_effect (sim_perf, sim_enriched)))

print("Textually Dissimilar: ",notnormallydistributed(dis_perf, dis_enriched))
print("Effect: ", abs(not_normal_distribution_effect (dis_perf, dis_enriched)))

CUPID vs CUPID Enriched
Whole:  WilcoxonResult(statistic=4.0, pvalue=0.009779628315844426)
Effect:  0.015873015873015872
Textually Similar:  WilcoxonResult(statistic=0.0, pvalue=0.007685794055213263)
Effect:  0.024943310657596373
Textually Dissimilar:  WilcoxonResult(statistic=0.0, pvalue=9.5367431640625e-07)
Effect:  0.21315192743764172


## SiameseCNN

### data

In [144]:
df = pd.read_csv('siamese.csv')

regular_df = df.loc[df['dataset'] == 'regular']
enriched_df = df.loc[df['dataset'] == 'enriched']
sim_df = df.loc[df['dataset'] == 'sim_reg']
sim_enriched_df = df.loc[df['dataset'] == 'sim_enriched']
dis_df = df.loc[df['dataset'] == 'dissim_reg']
dis_enriched_df = df.loc[df['dataset'] == 'dissim_enriched']


### Precision

In [145]:
regular = list(regular_df['PRECISION'])
enriched = list(enriched_df['PRECISION'])
sim = list(sim_df['PRECISION'])
sim_enriched = list(sim_enriched_df['PRECISION'])
dis = list(dis_df['PRECISION'])
dis_enriched = list(dis_enriched_df['PRECISION'])

# Testing and plotting for each dataset
#If p ≤ 0.05: then the null hypothesis can be rejected (i.e. the variable is NOT normally distributed).
#If p > 0.05: then the null hypothesis cannot be rejected (i.e. the variable MAY BE normally distributed).

print(stats.shapiro(regular), stats.shapiro(enriched)) # not normal, normal
print(stats.shapiro(sim_reg),stats.shapiro(sim_enriched)) # normal,not normal
print(stats.shapiro(dissim_reg),stats.shapiro(dissim_enriched)) # normal,normal


ShapiroResult(statistic=0.3083727130181436, pvalue=8.809927442588677e-09) ShapiroResult(statistic=0.5937380439426659, pvalue=2.537915270102269e-06)
ShapiroResult(statistic=0.9720562534294691, pvalue=0.7780345028768536) ShapiroResult(statistic=0.5938823570888548, pvalue=2.5465857349678898e-06)
ShapiroResult(statistic=0.9739518069617576, pvalue=0.8179509728304076) ShapiroResult(statistic=0.9712365622689744, pvalue=0.7602547931056709)


In [146]:
print("Siamese vs Siamese Enriched -- Precision")

# not normally distributed
print("Whole: ",notnormallydistributed(regular, enriched))
print("Effect: ", abs(not_normal_distribution_effect (regular, enriched)))

print("Textually Similar: ",notnormallydistributed(sim, sim_enriched))
print("Effect: ", abs(not_normal_distribution_effect (sim, sim_enriched)))

# normally distributed
print("Textually Dissimilar: ",normallydistributed(dis, dis_enriched))
print("Effect: ", abs(normal_distribution_effect (dis, dis_enriched)))

Siamese vs Siamese Enriched -- Precision
Whole:  WilcoxonResult(statistic=82.0, pvalue=0.40909767150878906)
Effect:  0.13
Textually Similar:  WilcoxonResult(statistic=83.0, pvalue=0.4304332733154297)
Effect:  0.1875
Textually Dissimilar:  TtestResult(statistic=0.5910832899692705, pvalue=0.5614312900629633, df=19)
Effect:  0.195976531829905


### Recall

In [148]:
regular = list(regular_df['RECALL'])
enriched = list(enriched_df['RECALL'])
sim = list(sim_df['RECALL'])
sim_enriched = list(sim_enriched_df['RECALL'])
dis = list(dis_df['RECALL'])
dis_enriched = list(dis_enriched_df['RECALL'])

# Testing and plotting for each dataset
#If p ≤ 0.05: then the null hypothesis can be rejected (i.e. the variable is NOT normally distributed).
#If p > 0.05: then the null hypothesis cannot be rejected (i.e. the variable MAY BE normally distributed).

print(stats.shapiro(regular), stats.shapiro(enriched)) # not normal, not normal
print(stats.shapiro(sim_reg),stats.shapiro(sim_enriched)) # normal,not normal
print(stats.shapiro(dissim_reg),stats.shapiro(dissim_enriched))# normal,normal

ShapiroResult(statistic=0.7513106328567731, pvalue=0.00017678129579570825) ShapiroResult(statistic=0.6820868013549368, pvalue=2.3767126445441104e-05)
ShapiroResult(statistic=0.9720562534294691, pvalue=0.7780345028768536) ShapiroResult(statistic=0.6594627762992744, pvalue=1.2997843898660698e-05)
ShapiroResult(statistic=0.9739518069617576, pvalue=0.8179509728304076) ShapiroResult(statistic=0.9712365622689744, pvalue=0.7602547931056709)


In [149]:
print("Siamese vs Siamese Enriched - Precision")

# not normally distributed
print("Whole: ",notnormallydistributed(regular, enriched))
print("Effect: ", abs(not_normal_distribution_effect (regular, enriched)))

# not normally distributed
print("Textually Similar: ",notnormallydistributed(sim, sim_enriched))
print("Effect: ", abs(not_normal_distribution_effect (sim, sim_enriched)))

# normally distributed
print("Textually Dissimilar: ",normallydistributed(dis, dis_enriched))
print("Effect: ", abs(normal_distribution_effect (dis, dis_enriched)))

Siamese vs Siamese Enriched - Precision
Whole:  WilcoxonResult(statistic=70.0, pvalue=0.20244979858398438)
Effect:  0.325
Textually Similar:  WilcoxonResult(statistic=56.0, pvalue=0.069580078125)
Effect:  0.38
Textually Dissimilar:  TtestResult(statistic=0.36928652906311565, pvalue=0.7159976507840129, df=19)
Effect:  0.10598334481118586


### F1 

In [151]:
regular = list(regular_df['F1'])
enriched = list(enriched_df['F1'])
sim = list(sim_df['F1'])
sim_enriched = list(sim_enriched_df['F1'])
dis = list(dis_df['F1'])
dis_enriched = list(dis_enriched_df['F1'])

# Testing and plotting for each dataset
#If p ≤ 0.05: then the null hypothesis can be rejected (i.e. the variable is NOT normally distributed).
#If p > 0.05: then the null hypothesis cannot be rejected (i.e. the variable MAY BE normally distributed).

print(stats.shapiro(regular), stats.shapiro(enriched)) # not normal, not normal
print(stats.shapiro(sim_reg),stats.shapiro(sim_enriched)) # normal, not normal
print(stats.shapiro(dissim_reg),stats.shapiro(dissim_enriched))# normal, normal

ShapiroResult(statistic=0.8033783437373435, pvalue=0.0009706358133565053) ShapiroResult(statistic=0.7428386510523691, pvalue=0.00013630232857151907)
ShapiroResult(statistic=0.9720562534294691, pvalue=0.7780345028768536) ShapiroResult(statistic=0.7321177429341672, pvalue=9.869046888092826e-05)
ShapiroResult(statistic=0.9739518069617576, pvalue=0.8179509728304076) ShapiroResult(statistic=0.9712365622689744, pvalue=0.7602547931056709)


In [152]:
print("Siamese vs Siamese Enriched - F1")

# normally distributed
print("Whole: ",normallydistributed(regular, enriched))
print("Effect: ", abs(normal_distribution_effect (regular, enriched)))

# not normally distributed
print("Textually Similar: ",notnormallydistributed(sim, sim_enriched))
print("Effect: ", abs(not_normal_distribution_effect (sim, sim_enriched)))

# normally distributed
print("Textually Dissimilar: ",normallydistributed(dis, dis_enriched))
print("Effect: ", abs(normal_distribution_effect (dis, dis_enriched)))

Siamese vs Siamese Enriched - F1
Whole:  TtestResult(statistic=0.5701057627538798, pvalue=0.575286156030125, df=19)
Effect:  0.18494462768167713
Textually Similar:  WilcoxonResult(statistic=86.0, pvalue=0.49800872802734375)
Effect:  0.13
Textually Dissimilar:  TtestResult(statistic=0.4004856548137504, pvalue=0.6932662537185239, df=19)
Effect:  0.1285563302568183


### AUC

In [153]:
regular = list(regular_df['AUC'])
enriched = list(enriched_df['AUC'])
sim = list(sim_df['AUC'])
sim_enriched = list(sim_enriched_df['AUC'])
dis = list(dis_df['AUC'])
dis_enriched = list(dis_enriched_df['AUC'])

# Testing and plotting for each dataset
#If p ≤ 0.05: then the null hypothesis can be rejected (i.e. the variable is NOT normally distributed).
#If p > 0.05: then the null hypothesis cannot be rejected (i.e. the variable MAY BE normally distributed).

print(stats.shapiro(regular), stats.shapiro(enriched)) # normal, normal
print(stats.shapiro(sim_reg),stats.shapiro(sim_enriched)) # normal,not normal
print(stats.shapiro(dissim_reg),stats.shapiro(dissim_enriched)) # normal, normal

ShapiroResult(statistic=0.9446220030494078, pvalue=0.2927200930270471) ShapiroResult(statistic=0.9825005545566754, pvalue=0.9621521849437302)
ShapiroResult(statistic=0.9720562534294691, pvalue=0.7780345028768536) ShapiroResult(statistic=0.8904270368123959, pvalue=0.027382830599984104)
ShapiroResult(statistic=0.9739518069617576, pvalue=0.8179509728304076) ShapiroResult(statistic=0.9712365622689744, pvalue=0.7602547931056709)


In [154]:
print("Siamese vs Siamese Enriched - AUC")

# normally distributed
print("Whole: ",normallydistributed(regular, enriched))
print("Effect: ", abs(normal_distribution_effect (regular, enriched)))

# not normally distributed
print("Textually Similar: ",notnormallydistributed(sim, sim_enriched))
print("Effect: ", abs(not_normal_distribution_effect (sim, sim_enriched)))

# normally distributed
print("Textually Dissimilar: ",normallydistributed(dis, dis_enriched))
print("Effect: ", abs(normal_distribution_effect (dis, dis_enriched)))

Siamese vs Siamese Enriched - AUC
Whole:  TtestResult(statistic=-3.9744464133253468, pvalue=0.0008122474909890802, df=19)
Effect:  1.3760832390691136
Textually Similar:  WilcoxonResult(statistic=50.5, pvalue=0.039989471435546875)
Effect:  0.4175
Textually Dissimilar:  TtestResult(statistic=-3.2270439254467838, pvalue=0.004436691263151411, df=19)
Effect:  1.0865623898406878


## DCCNN

### data

In [97]:
df = pd.read_csv('dccnn.csv')

regular_df = df.loc[df['dataset'] == 'regular']
enriched_df = df.loc[df['dataset'] == 'enriched']
sim_df = df.loc[df['dataset'] == 'sim_reg']
sim_enriched_df = df.loc[df['dataset'] == 'sim_enriched']
dis_df = df.loc[df['dataset'] == 'dissim_reg']
dis_enriched_df = df.loc[df['dataset'] == 'dissim_enriched']


### Precision

In [98]:
regular = list(regular_df['PRECISION'])
enriched = list(enriched_df['PRECISION'])
sim = list(sim_df['PRECISION'])
sim_enriched = list(sim_enriched_df['PRECISION'])
dis = list(dis_df['PRECISION'])
dis_enriched = list(dis_enriched_df['PRECISION'])

In [102]:
# Testing and plotting for each dataset
#If p ≤ 0.05: then the null hypothesis can be rejected (i.e. the variable is NOT normally distributed).
#If p > 0.05: then the null hypothesis cannot be rejected (i.e. the variable MAY BE normally distributed).

print(stats.shapiro(regular), stats.shapiro(enriched)) # not normal, normal
print(stats.shapiro(sim_reg),stats.shapiro(sim_enriched)) # normal,normal
print(stats.shapiro(dissim_reg),stats.shapiro(dissim_enriched)) # normal,normal

ShapiroResult(statistic=0.9004670789074739, pvalue=0.04207902853813876) ShapiroResult(statistic=0.9383128650932706, pvalue=0.22276292288896973)
ShapiroResult(statistic=0.9720562534294691, pvalue=0.7780345028768536) ShapiroResult(statistic=0.930509990589305, pvalue=0.15794589407222015)
ShapiroResult(statistic=0.9739518069617576, pvalue=0.8179509728304076) ShapiroResult(statistic=0.9712365622689744, pvalue=0.7602547931056709)


In [103]:
print("DCCNN vs DCCNN Enriched --- PRECISION")

# not normally distributed
print("Whole: ",notnormallydistributed(regular, enriched))
print("Effect: ", abs(not_normal_distribution_effect (regular, enriched)))

# normally distributed
print("Textually Similar: ",normallydistributed(sim, sim_enriched))
print("Effect: ", abs(normal_distribution_effect (sim, sim_enriched)))

print("Textually Dissimilar: ",normallydistributed(dis, dis_enriched))
print("Effect: ", abs(normal_distribution_effect (dis, dis_enriched)))

DCCNN vs DCCNN Enriched --- PRECISION
Whole:  WilcoxonResult(statistic=1.0, pvalue=3.814697265625e-06)
Effect:  0.95
Textually Similar:  TtestResult(statistic=-10.831715999325198, pvalue=1.4315620472135025e-09, df=19)
Effect:  3.194862206559338
Textually Dissimilar:  TtestResult(statistic=-12.17562543202102, pvalue=2.02631034420627e-10, df=19)
Effect:  3.5553877047543327


### Recall

In [105]:
regular = list(regular_df['RECALL'])
enriched = list(enriched_df['RECALL'])
sim = list(sim_df['RECALL'])
sim_enriched = list(sim_enriched_df['RECALL'])
dis = list(dis_df['RECALL'])
dis_enriched = list(dis_enriched_df['RECALL'])

# Testing and plotting for each dataset
#If p ≤ 0.05: then the null hypothesis can be rejected (i.e. the variable is NOT normally distributed).
#If p > 0.05: then the null hypothesis cannot be rejected (i.e. the variable MAY BE normally distributed).

print(stats.shapiro(regular), stats.shapiro(enriched)) # not normal, normal
print(stats.shapiro(sim_reg),stats.shapiro(sim_enriched)) # normal,normal
print(stats.shapiro(dissim_reg),stats.shapiro(dissim_enriched))# normal,normal

ShapiroResult(statistic=0.9004670789074739, pvalue=0.04207902853813876) ShapiroResult(statistic=0.9383128650932706, pvalue=0.22276292288896973)
ShapiroResult(statistic=0.9720562534294691, pvalue=0.7780345028768536) ShapiroResult(statistic=0.930509990589305, pvalue=0.15794589407222015)
ShapiroResult(statistic=0.9739518069617576, pvalue=0.8179509728304076) ShapiroResult(statistic=0.9712365622689744, pvalue=0.7602547931056709)


In [106]:
print("DCCNN vs DCCNN Enriched --- RECALL")

print("Whole: ",notnormallydistributed(regular, enriched))
print("Effect: ", abs(not_normal_distribution_effect (regular, enriched)))

# normally distributed
print("Textually Similar: ",normallydistributed(sim, sim_enriched))
print("Effect: ", abs(normal_distribution_effect (sim, sim_enriched)))

print("Textually Dissimilar: ",normallydistributed(dis, dis_enriched))
print("Effect: ", abs(normal_distribution_effect (dis, dis_enriched)))

DCCNN vs DCCNN Enriched --- RECALL
Whole:  WilcoxonResult(statistic=1.0, pvalue=3.814697265625e-06)
Effect:  0.95
Textually Similar:  TtestResult(statistic=-10.831715999325198, pvalue=1.4315620472135025e-09, df=19)
Effect:  3.194862206559338
Textually Dissimilar:  TtestResult(statistic=-12.17562543202102, pvalue=2.02631034420627e-10, df=19)
Effect:  3.5553877047543327


### F1

In [108]:
regular = list(regular_df['F1'])
enriched = list(enriched_df['F1'])
sim = list(sim_df['F1'])
sim_enriched = list(sim_enriched_df['F1'])
dis = list(dis_df['F1'])
dis_enriched = list(dis_enriched_df['F1'])

# Testing and plotting for each dataset
#If p ≤ 0.05: then the null hypothesis can be rejected (i.e. the variable is NOT normally distributed).
#If p > 0.05: then the null hypothesis cannot be rejected (i.e. the variable MAY BE normally distributed).

print(stats.shapiro(regular), stats.shapiro(enriched)) # normal, normal
print(stats.shapiro(sim_reg),stats.shapiro(sim_enriched)) # normal,normal
print(stats.shapiro(dissim_reg),stats.shapiro(dissim_enriched))# normal, normal

ShapiroResult(statistic=0.9584187193972296, pvalue=0.5127640757323777) ShapiroResult(statistic=0.9573826610098888, pvalue=0.49297931751704643)
ShapiroResult(statistic=0.9720562534294691, pvalue=0.7780345028768536) ShapiroResult(statistic=0.9121310879110895, pvalue=0.06999094543550446)
ShapiroResult(statistic=0.9739518069617576, pvalue=0.8179509728304076) ShapiroResult(statistic=0.9712365622689744, pvalue=0.7602547931056709)


In [109]:
print("DCCNN vs DCCNN Enriched --- F1")
#normally distributed
print("Whole: ",normallydistributed(regular, enriched))
print("Effect: ", abs(normal_distribution_effect (regular, enriched)))

#normally distributed
print("Textually Similar: ",normallydistributed(sim, sim_enriched))
print("Effect: ", abs(normal_distribution_effect (sim, sim_enriched)))

# normally distributed
print("Textually Dissimilar: ",normallydistributed(dis, dis_enriched))
print("Effect: ", abs(normal_distribution_effect (dis, dis_enriched)))

DCCNN vs DCCNN Enriched --- F1
Whole:  TtestResult(statistic=-7.978126701924815, pvalue=1.7454716032656797e-07, df=19)
Effect:  2.9907190522030285
Textually Similar:  TtestResult(statistic=-13.36338276114618, pvalue=4.123173647695916e-11, df=19)
Effect:  3.9388508460502867
Textually Dissimilar:  TtestResult(statistic=-15.71576009706058, pvalue=2.417582156662584e-12, df=19)
Effect:  4.908257573769444


### AUC

In [111]:
regular = list(regular_df['AUC'])
enriched = list(enriched_df['AUC'])
sim = list(sim_df['AUC'])
sim_enriched = list(sim_enriched_df['AUC'])
dis = list(dis_df['AUC'])
dis_enriched = list(dis_enriched_df['AUC'])

# Testing and plotting for each dataset
#If p ≤ 0.05: then the null hypothesis can be rejected (i.e. the variable is NOT normally distributed).
#If p > 0.05: then the null hypothesis cannot be rejected (i.e. the variable MAY BE normally distributed).

print(stats.shapiro(regular), stats.shapiro(enriched)) # normal, normal
print(stats.shapiro(sim_reg),stats.shapiro(sim_enriched)) # normal,not normal
print(stats.shapiro(dissim_reg),stats.shapiro(dissim_enriched)) # normal, normal

ShapiroResult(statistic=0.9497626018449111, pvalue=0.36350061783633275) ShapiroResult(statistic=0.9434913433290599, pvalue=0.2788744646207144)
ShapiroResult(statistic=0.9720562534294691, pvalue=0.7780345028768536) ShapiroResult(statistic=0.898123656072401, pvalue=0.03803548211764927)
ShapiroResult(statistic=0.9739518069617576, pvalue=0.8179509728304076) ShapiroResult(statistic=0.9712365622689744, pvalue=0.7602547931056709)


In [112]:
print("DCCNN vs DCCNN Enriched --- AUC")

# not normally distributed
print("Textually Similar: ",notnormallydistributed(sim, sim_enriched))
print("Effect: ", abs(not_normal_distribution_effect (sim, sim_enriched)))

# normally distributed
print("Whole: ",normallydistributed(regular, enriched))
print("Effect: ", abs(normal_distribution_effect (regular, enriched)))

print("Textually Dissimilar: ",normallydistributed(dis, dis_enriched))
print("Effect: ", abs(normal_distribution_effect (dis, dis_enriched)))

DCCNN vs DCCNN Enriched --- AUC
Textually Similar:  WilcoxonResult(statistic=0.0, pvalue=1.9073486328125e-06)
Effect:  1.0
Whole:  TtestResult(statistic=-12.260741097750394, pvalue=1.80055713257448e-10, df=19)
Effect:  3.9130799062616326
Textually Dissimilar:  TtestResult(statistic=-9.169180051246238, pvalue=2.0882261463790397e-08, df=19)
Effect:  2.409630923029376


## CTEDB

### data

In [130]:
df = pd.read_csv('ctedb.csv')

regular_df = df.loc[df['dataset'] == 'regular']
enriched_df = df.loc[df['dataset'] == 'enriched']
sim_df = df.loc[df['dataset'] == 'sim_reg']
sim_enriched_df = df.loc[df['dataset'] == 'sim_enriched']
dis_df = df.loc[df['dataset'] == 'dissim_reg']
dis_enriched_df = df.loc[df['dataset'] == 'dissim_enriched']


### Precision

In [132]:
regular = list(regular_df['PRECISION'])
enriched = list(enriched_df['PRECISION'])
sim = list(sim_df['PRECISION'])
sim_enriched = list(sim_enriched_df['PRECISION'])
dis = list(dis_df['PRECISION'])
dis_enriched = list(dis_enriched_df['PRECISION'])

# Testing and plotting for each dataset
#If p ≤ 0.05: then the null hypothesis can be rejected (i.e. the variable is NOT normally distributed).
#If p > 0.05: then the null hypothesis cannot be rejected (i.e. the variable MAY BE normally distributed).

print(stats.shapiro(regular), stats.shapiro(enriched)) # not normal, not normal
print(stats.shapiro(sim_reg),stats.shapiro(sim_enriched)) # normal,not normal
print(stats.shapiro(dissim_reg),stats.shapiro(dissim_enriched)) # normal,normal

ShapiroResult(statistic=0.7349841797912511, pvalue=0.00010751791613944022) ShapiroResult(statistic=0.7364150044015249, pvalue=0.0001122355030994515)
ShapiroResult(statistic=0.9720562534294691, pvalue=0.7780345028768536) ShapiroResult(statistic=0.754696177142339, pvalue=0.00019638422292597173)
ShapiroResult(statistic=0.9739518069617576, pvalue=0.8179509728304076) ShapiroResult(statistic=0.9712365622689744, pvalue=0.7602547931056709)


In [133]:
print("CTEDB vs CTEDB Enriched --- PRECISION")

# not normally distributed
print("Whole: ",notnormallydistributed(regular, enriched))
print("Effect: ", abs(not_normal_distribution_effect (regular, enriched)))

# not normally distributed
print("Textually Similar: ",notnormallydistributed(sim, sim_enriched))
print("Effect: ", abs(not_normal_distribution_effect (sim, sim_enriched)))

# normally distributed
print("Textually Dissimilar: ",normallydistributed(dis, dis_enriched))
print("Effect: ", abs(normal_distribution_effect (dis, dis_enriched)))

CTEDB vs CTEDB Enriched --- PRECISION
Whole:  WilcoxonResult(statistic=95.0, pvalue=0.7285060882568359)
Effect:  0.035
Textually Similar:  WilcoxonResult(statistic=69.0, pvalue=0.1893482208251953)
Effect:  0.295
Textually Dissimilar:  TtestResult(statistic=0.6353367955826335, pvalue=0.5327866211368384, df=19)
Effect:  0.19675631389693932


### Recall

In [135]:
regular = list(regular_df['RECALL'])
enriched = list(enriched_df['RECALL'])
sim = list(sim_df['RECALL'])
sim_enriched = list(sim_enriched_df['RECALL'])
dis = list(dis_df['RECALL'])
dis_enriched = list(dis_enriched_df['RECALL'])

# Testing and plotting for each dataset
#If p ≤ 0.05: then the null hypothesis can be rejected (i.e. the variable is NOT normally distributed).
#If p > 0.05: then the null hypothesis cannot be rejected (i.e. the variable MAY BE normally distributed).

print(stats.shapiro(regular), stats.shapiro(enriched)) # not normal, not normal
print(stats.shapiro(sim_reg),stats.shapiro(sim_enriched)) # normal, not normal
print(stats.shapiro(dissim_reg),stats.shapiro(dissim_enriched))# normal,normal

ShapiroResult(statistic=0.7285496955836597, pvalue=8.876663689352502e-05) ShapiroResult(statistic=0.8376950580324753, pvalue=0.0033386412702655823)
ShapiroResult(statistic=0.9720562534294691, pvalue=0.7780345028768536) ShapiroResult(statistic=0.7810981559594907, pvalue=0.00045755664005867443)
ShapiroResult(statistic=0.9739518069617576, pvalue=0.8179509728304076) ShapiroResult(statistic=0.9712365622689744, pvalue=0.7602547931056709)


In [136]:
print("CTEDB vs CTEDB Enriched --- RECALL")

# not normally distributed
print("Whole: ",notnormallydistributed(regular, enriched))
print("Effect: ", abs(not_normal_distribution_effect (regular, enriched)))

# not normally distributed
print("Textually Similar: ",notnormallydistributed(sim, sim_enriched))
print("Effect: ", abs(not_normal_distribution_effect (sim, sim_enriched)))

# normally distributed
print("Textually Dissimilar: ",normallydistributed(dis, dis_enriched))
print("Effect: ", abs(normal_distribution_effect (dis, dis_enriched)))

CTEDB vs CTEDB Enriched --- RECALL
Whole:  WilcoxonResult(statistic=75.0, pvalue=0.2773551940917969)
Effect:  0.32
Textually Similar:  WilcoxonResult(statistic=56.0, pvalue=0.069580078125)
Effect:  0.265
Textually Dissimilar:  TtestResult(statistic=-0.5972016670973637, pvalue=0.5574233984240566, df=19)
Effect:  0.22047505947072113


### F1

In [138]:
regular = list(regular_df['F1'])
enriched = list(enriched_df['F1'])
sim = list(sim_df['F1'])
sim_enriched = list(sim_enriched_df['F1'])
dis = list(dis_df['F1'])
dis_enriched = list(dis_enriched_df['F1'])

# Testing and plotting for each dataset
#If p ≤ 0.05: then the null hypothesis can be rejected (i.e. the variable is NOT normally distributed).
#If p > 0.05: then the null hypothesis cannot be rejected (i.e. the variable MAY BE normally distributed).

print(stats.shapiro(regular), stats.shapiro(enriched)) # not normal, not normal
print(stats.shapiro(sim_reg),stats.shapiro(sim_enriched)) # normal,not normal
print(stats.shapiro(dissim_reg),stats.shapiro(dissim_enriched))# normal, normal

ShapiroResult(statistic=0.8321474728451028, pvalue=0.002715791386436769) ShapiroResult(statistic=0.8542533682809174, pvalue=0.0062861342441866)
ShapiroResult(statistic=0.9720562534294691, pvalue=0.7780345028768536) ShapiroResult(statistic=0.8642645232712977, pvalue=0.00933019214041117)
ShapiroResult(statistic=0.9739518069617576, pvalue=0.8179509728304076) ShapiroResult(statistic=0.9712365622689744, pvalue=0.7602547931056709)


In [139]:
print("CTEDB vs CTEDB Enriched --- F1")

# not normally distributed
print("Whole: ",notnormallydistributed(regular, enriched))
print("Effect: ", abs(not_normal_distribution_effect (regular, enriched)))

# not normally distributed
print("Textually Similar: ",notnormallydistributed(sim, sim_enriched))
print("Effect: ", abs(not_normal_distribution_effect (sim, sim_enriched)))

# normally distributed
print("Textually Dissimilar: ",normallydistributed(dis, dis_enriched))
print("Effect: ", abs(normal_distribution_effect (dis, dis_enriched)))

CTEDB vs CTEDB Enriched --- F1
Whole:  WilcoxonResult(statistic=91.0, pvalue=0.6215133666992188)
Effect:  0.095
Textually Similar:  WilcoxonResult(statistic=73.0, pvalue=0.24548721313476562)
Effect:  0.265
Textually Dissimilar:  TtestResult(statistic=-0.12376551925415874, pvalue=0.9028008669207572, df=19)
Effect:  0.041722842661881705


### AUC

In [141]:
regular = list(regular_df['AUC'])
enriched = list(enriched_df['AUC'])
sim = list(sim_df['AUC'])
sim_enriched = list(sim_enriched_df['AUC'])
dis = list(dis_df['AUC'])
dis_enriched = list(dis_enriched_df['AUC'])

# Testing and plotting for each dataset
#If p ≤ 0.05: then the null hypothesis can be rejected (i.e. the variable is NOT normally distributed).
#If p > 0.05: then the null hypothesis cannot be rejected (i.e. the variable MAY BE normally distributed).

print(stats.shapiro(regular), stats.shapiro(enriched)) # normal, normal
print(stats.shapiro(sim_reg),stats.shapiro(sim_enriched)) # normal, normal
print(stats.shapiro(dissim_reg),stats.shapiro(dissim_enriched)) # normal, normal

ShapiroResult(statistic=0.9416088600805577, pvalue=0.2571259986445322) ShapiroResult(statistic=0.924359233599332, pvalue=0.12022545461850166)
ShapiroResult(statistic=0.9720562534294691, pvalue=0.7780345028768536) ShapiroResult(statistic=0.923545122345794, pvalue=0.11595850546589183)
ShapiroResult(statistic=0.9739518069617576, pvalue=0.8179509728304076) ShapiroResult(statistic=0.9712365622689744, pvalue=0.7602547931056709)


In [142]:
print("CTEDB vs CTEDB Enriched --- AUC")

# normally distributed
print("Whole: ",normallydistributed(regular, enriched))
print("Effect: ", abs(normal_distribution_effect (regular, enriched)))

# normally distributed
print("Textually Similar: ",normallydistributed(sim, sim_enriched))
print("Effect: ", abs(normal_distribution_effect (sim, sim_enriched)))

# normally distributed
print("Textually Dissimilar: ",normallydistributed(dis, dis_enriched))
print("Effect: ", abs(normal_distribution_effect (dis, dis_enriched)))

CTEDB vs CTEDB Enriched --- AUC
Whole:  TtestResult(statistic=-6.9995673053323335, pvalue=1.1481442309579028e-06, df=19)
Effect:  2.419359097321053
Textually Similar:  TtestResult(statistic=-4.1349346978928665, pvalue=0.0005630030089048191, df=19)
Effect:  1.1151283356086215
Textually Dissimilar:  TtestResult(statistic=-7.476845011704827, pvalue=4.5081554266528216e-07, df=19)
Effect:  2.433092932378661
