In [1]:
import pandas as pd

In [2]:
gridsearch_data = (
	pd.read_csv("../data/processed/aggregated/evaluated-gridsearch.csv")
	.rename({
		"sample_rate": "window_size",
		"aggregator": "window_skip",
		"sampler": "aggregator"
	}, axis=1)
).assign(sampler="Optimized")

gridsearch_data.aggregator = gridsearch_data.aggregator.replace({
	"bradleyterry": "Bradley-Terry",
	"additive": "Additive",
	"greedy": "Greedy",
	"pagerank": "PageRank"
})

gridsearch_data = (
	pd.merge(
		(
			gridsearch_data
			.loc[
				gridsearch_data.collection == "dl-passages",
				["sampler", "aggregator", "window_size", "window_skip", "query", "NDCG_10"]
			]
			.groupby(
				["sampler", "aggregator", "window_size", "window_skip"]
			)
			.mean()
			.drop("query", axis=1)
			.sort_values(["window_size", "sampler", "aggregator", "NDCG_10"], ascending=False)
			.reset_index()
			.groupby(["window_size", "sampler", "aggregator"])
			.head(1)
			.drop("NDCG_10", axis=1)
		),
	    (
			gridsearch_data
			.loc[
				gridsearch_data["collection"] == "dl-passages",
				["sampler", "aggregator", "window_size", "window_skip", "query", "NDCG_10"]
			]
		),
		how = "left",
		on = ["window_size", "sampler", "aggregator", "window_skip"]
	)
	.assign(sample_rate=lambda df: (
		df["window_size"]
		.astype(int)
		.apply(lambda v: (50 * v) / (50 * (50-1)))
		.apply(lambda s: s*20)
		.round()
		.astype(int)
		.apply(lambda s: s / 20)
	))
	.assign(keep=lambda df: df["window_size"].isin(
		df
		.loc[
			:,
			["window_size", "sample_rate"]
		]
		.groupby("sample_rate")
		.max()
		["window_size"]
		.values
		.tolist()
	))
	.drop(["window_skip", "window_size"], axis=1)
)

gridsearch_data = gridsearch_data.loc[
	gridsearch_data.keep &
	(gridsearch_data.sample_rate != 1.0) &
    (gridsearch_data.sample_rate != 0.0),
	['sample_rate', 'sampler', 'aggregator', 'query', 'NDCG_10']
]

In [3]:
ndcg_data = pd.read_csv("../data/processed/aggregated/evaluated-sampled.csv")
ndcg_data = (
	pd.merge(
		# i-value with minimum mean NDCG
		(
			ndcg_data
			.loc[
				ndcg_data["collection"] == "dl-passages",
				[
					"sample_rate",
					"sampler",
					"aggregator",
					"i",
					"query",
					"NDCG_10"
				]
			]
			.groupby(["sample_rate", "sampler", "aggregator", "i"])
			.mean()
			.drop("query", axis=1)
			.sort_values(["sample_rate", "sampler", "aggregator", "NDCG_10"])
			.reset_index()
			.groupby(["sample_rate", "sampler", "aggregator"])
			.head(1)
			.drop("NDCG_10", axis=1)
		),
		# NDCG per query
	    (
			ndcg_data
			.loc[
				ndcg_data["collection"] == "dl-passages",
				[
					"sample_rate",
					"sampler",
					"aggregator",
					"i",
					"query",
					"NDCG_10"
				]
			]
		),
		how = "left",
		on = ["sample_rate", "sampler", "aggregator", "i"]
	)
	.drop("i", axis=1)
)

ndcg_data = ndcg_data.loc[
	ndcg_data.aggregator != "kwiksort",
	['sample_rate', 'sampler', 'aggregator', 'query', 'NDCG_10']
]

In [4]:
baseline_data = pd.read_csv("../data/processed/aggregated/evaluated-full.csv")

baseline_data = (
	baseline_data
	.loc[
		(baseline_data.collection == "dl-passages") &
        (baseline_data.aggregator != "Kwiksort"),
		["aggregator", "query", "NDCG_10"]
	]
)

In [5]:
unified_data = pd.concat([gridsearch_data, ndcg_data], axis=0).reset_index(drop=True)

In [6]:
from scipy.stats import ttest_rel

df = []
for aggregator in baseline_data.aggregator.unique():

	sample_a = (
		baseline_data
		.sort_values("query")
		.loc[
			baseline_data.aggregator == aggregator,
			("query", "NDCG_10")
		]
    )

	p_values = []
	for sampler in unified_data.sampler.unique():
		for sample_rate in unified_data.sample_rate.unique():

			sample_b = (
				unified_data
				.sort_values("query")
				.loc[
					(unified_data.aggregator == aggregator) &
					(unified_data.sample_rate == sample_rate) &
                    (unified_data.sampler == sampler),
					("query", "NDCG_10")
				]
	        )

			merged = pd.merge(sample_a, sample_b, on="query").dropna()
			_, p = ttest_rel(merged["NDCG_10_x"], merged["NDCG_10_y"])

			ndcg_a = merged["NDCG_10_x"].mean()
			ndcg_b = merged["NDCG_10_y"].mean()
			var_a = merged["NDCG_10_x"].std()
			var_b = merged["NDCG_10_y"].std()

			p_values.append((sampler, sample_rate, p, ndcg_a, var_a, ndcg_b, var_b, len(merged)))

	df.append(pd.DataFrame(p_values, columns=["sampler", "sample_rate", "p", "ndcg_full", "var_full", "ndcg_sampled", "var_sampled", "n_topics"]).assign(aggregator=aggregator))

df = pd.concat(df)
df


Unnamed: 0,sampler,sample_rate,p,ndcg_full,var_full,ndcg_sampled,var_sampled,n_topics,aggregator
0,Optimized,0.95,6.762254e-01,0.691431,0.231714,0.692740,0.233130,42,Bradley-Terry
1,Optimized,0.90,7.734706e-01,0.691431,0.231714,0.692356,0.235051,42,Bradley-Terry
2,Optimized,0.85,1.493758e-01,0.691431,0.231714,0.683722,0.233849,42,Bradley-Terry
3,Optimized,0.80,1.186676e-01,0.691431,0.231714,0.682315,0.240335,42,Bradley-Terry
4,Optimized,0.75,1.603958e-01,0.691431,0.231714,0.682441,0.235570,42,Bradley-Terry
...,...,...,...,...,...,...,...,...,...
52,Structured,0.25,1.280301e-04,0.695342,0.232178,0.623268,0.257993,42,PageRank
53,Structured,0.20,3.696521e-06,0.695342,0.232178,0.606459,0.247890,42,PageRank
54,Structured,0.15,6.842998e-06,0.695342,0.232178,0.587584,0.250468,42,PageRank
55,Structured,0.10,1.724925e-06,0.700942,0.232172,0.561971,0.275782,41,PageRank


In [7]:
from statsmodels.stats.multitest import multipletests


df = (
	df
	.groupby(["sampler", "aggregator"])
	.apply(lambda group: group.assign(
		p_corrected=multipletests(group["p"].values, alpha=0.95, method="bonferroni")[1],
		accepted=multipletests(group["p"].values, alpha=0.95, method="bonferroni")[0]
	))
)

In [10]:
(
	df
	.loc[
		(df.aggregator == "Bradley-Terry") &
        (df.sampler == "Random")
	]
)

Unnamed: 0,sampler,sample_rate,p,ndcg_full,var_full,ndcg_sampled,var_sampled,n_topics,aggregator,p_corrected,accepted
19,Random,0.95,0.04386449,0.691431,0.231714,0.673564,0.232992,42,Bradley-Terry,0.8334253,True
20,Random,0.9,0.01162555,0.691431,0.231714,0.654963,0.251588,42,Bradley-Terry,0.2208855,True
21,Random,0.85,0.002150729,0.691431,0.231714,0.649173,0.238514,42,Bradley-Terry,0.04086386,True
22,Random,0.8,3.936392e-05,0.691431,0.231714,0.640747,0.253804,42,Bradley-Terry,0.0007479144,True
23,Random,0.75,0.0002310533,0.691431,0.231714,0.640002,0.249981,42,Bradley-Terry,0.004390012,True
24,Random,0.7,2.135494e-06,0.691431,0.231714,0.617308,0.23874,42,Bradley-Terry,4.057439e-05,True
25,Random,0.65,6.918066e-05,0.691431,0.231714,0.61664,0.266235,42,Bradley-Terry,0.001314433,True
26,Random,0.6,1.731293e-05,0.691431,0.231714,0.612793,0.241249,42,Bradley-Terry,0.0003289457,True
27,Random,0.55,1.763532e-05,0.691431,0.231714,0.60883,0.248212,42,Bradley-Terry,0.0003350712,True
28,Random,0.5,1.006611e-05,0.691431,0.231714,0.604001,0.25064,42,Bradley-Terry,0.0001912561,True


In [13]:
(
	df
	.assign(diff=lambda df: df["ndcg_sampled"] - df["ndcg_full"])
	.loc[df.accepted == False]
	.sort_values(["sampler", "aggregator", "sample_rate"], ascending=True)
	.groupby(["sampler", "aggregator"])
	.head(1)
	.dropna()
	.round(3)
	.assign(value=lambda df: (df["sample_rate"] + 0.05).round(2).astype(str) + " (" + df["diff"].astype(str) + ")")
	.pivot(
		"aggregator",
		"sampler",
		"value"
	)
	#.to_latex()
)

sampler,Optimized,Random,Structured
aggregator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Additive,0.25 (-0.021),0.85 (-0.017),0.95 (-0.004)
Bradley-Terry,0.35 (-0.011),,0.85 (-0.014)
Greedy,0.2 (-0.018),0.7 (-0.013),0.45 (-0.018)
PageRank,0.3 (-0.016),0.75 (-0.016),0.85 (-0.022)


In [30]:
(
	pd.read_csv("../data/processed/runs-sampled/dl-passages-3b/0-50-structured-0.2-additive.txt.gz", sep=" ", header=None)
	.rename({
		0: "query",
		1: "q0",
		2: "doc",
		3: "rank",
		4: "score",
		5: "tag"
	}, axis=1)
	.loc[:, "query"]
	.nunique()
)

95