In [3]:
import pandas as pd

# Load the cleaned experiment results from the zipped JSON file.
# Presumably one row per SampLNS run — TODO confirm against the producing notebook.
data = pd.read_json(path_or_buf="./05_clean_data.json.zip")

In [4]:
# Per-instance reference values, taken over all runs of that instance:
#   best_baseline - smallest initial sample size
#   best_lns_ub   - smallest optimized sample size
#   best_lb       - largest lower bound
#   mean_lns_ub   - mean optimized sample size
best_solutions = (
    data.groupby("instance")[["initial_sample_size", "optimized_sample_size"]]
    .min()
    .rename(
        columns={
            "initial_sample_size": "best_baseline",
            "optimized_sample_size": "best_lns_ub",
        }
    )
    .reset_index()
)
best_lb = (
    data.groupby("instance")[["lower_bound"]]
    .max()
    .rename(columns={"lower_bound": "best_lb"})
    .reset_index()
)
mean_ub = (
    data.groupby("instance")[["optimized_sample_size"]]
    .mean()
    .rename(columns={"optimized_sample_size": "mean_lns_ub"})
    .reset_index()
)
# Combine the three per-instance tables into one frame keyed by "instance".
best_values = best_solutions.merge(best_lb, on="instance").merge(mean_ub, on="instance")

In [5]:
# Mean optimized sample size per instance, restricted to the runs whose
# baseline algorithm is "YASA(m=1)".
samplns_yasa = (
    data.loc[data["baseline_alg"] == "YASA(m=1)", ["instance", "optimized_sample_size"]]
    .groupby("instance")
    .mean()
    .reset_index()
    .rename(columns={"optimized_sample_size": "samplns_yasa"})
)
# Drop a "samplns_yasa" column left over from an earlier execution of this
# cell before merging; otherwise a re-run would produce samplns_yasa_x /
# samplns_yasa_y instead of the single column the later cells read.
best_values = best_values.drop(columns="samplns_yasa", errors="ignore").merge(
    samplns_yasa, on="instance"
)
best_values

Unnamed: 0,instance,best_baseline,best_lns_ub,best_lb,mean_lns_ub,samplns_yasa
0,APL,9,7,7,7.0,7.0
1,APL-Model,10,8,8,8.0,8.0
2,BankingSoftware,40,29,28,29.0,29.0
3,BattleofTanks,444,322,256,331.763636,330.4
4,ChatClient,8,7,7,7.0,7.0
5,DMIE,25,16,16,16.0,16.0
6,E-Shop,20,12,10,12.0,12.0
7,EMBToolkit,1883,1883,1872,1928.9,1883.0
8,FameDB,8,8,8,8.0,8.0
9,FeatureIDE,11,8,8,8.0,8.0


In [6]:
# Attach the per-instance reference values from `best_values` to every run.
# Reference columns left over from an earlier execution of this cell are
# dropped first, so re-running it does not create suffixed duplicates
# (e.g. best_lb_x / best_lb_y) that would break the later column lookups.
data = data.drop(
    columns=best_values.columns.drop("instance"), errors="ignore"
).merge(best_values, on="instance")

In [7]:
# Keep one row per instance: the last run in `data` whose initial sample size
# equals the best baseline size of that instance.
# The explicit copy makes `t` an independent frame; later cells add columns to
# it and sort it in place, which should not act on a slice of `data`.
t = (
    data[data["initial_sample_size"] == data["best_baseline"]]
    .drop_duplicates(subset=["instance"], keep="last")
    .copy()
)

In [8]:
# Display the selected row per instance for inspection.
t

Unnamed: 0,instance,#Variables,#Clauses,iteration_info,baseline_alg,initial_sample_path,initial_sample_size,lower_bound,optimized_sample_size,best_baseline,best_lns_ub,best_lb,mean_lns_ub,samplns_yasa
45,BattleofTanks,144,769,"[{'nbrhd_tuples': 248, 'nbrhd_confs': 74, 'ite...",YASA(m=10),2023-03-01_13-51-03/22_1_10_1_sample.csv,444,242,334,444,322,256,331.763636,330.4
103,XSEngine,1273,2942,"[{'nbrhd_tuples': 247, 'nbrhd_confs': 3, 'iter...",YASA(m=5),2023-03-01_13-51-03/44_1_9_1_sample.csv,63,30,41,63,38,33,40.04,39.4
151,toybox_2006-10-31_23-30-06,16,13,"[{'nbrhd_tuples': 202, 'nbrhd_confs': 7, 'iter...",YASA(m=3),2023-03-01_13-51-03/5_1_8_4_sample.csv,9,8,8,9,8,8,8.0,8.0
214,APL,23,35,"[{'nbrhd_tuples': 207, 'nbrhd_confs': 8, 'iter...",YASA(m=10),2023-03-01_13-51-03/9_1_10_1_sample.csv,9,7,7,9,7,7,7.0,7.0
259,aaed2000,1298,3036,"[{'nbrhd_tuples': 168, 'nbrhd_confs': 1, 'iter...",FIDE-YASA(m=10),2023-03-01_13-51-03/45_1_6_1_sample.csv,89,51,58,89,55,51,58.68,58.8
311,integrator_arm7,1272,2980,"[{'nbrhd_tuples': 166, 'nbrhd_confs': 2, 'iter...",FIDE-YASA(m=5),2023-03-01_13-51-03/43_1_5_2_sample.csv,66,31,40,66,39,33,40.42,40.2
356,busybox_2007-01-24_09-14-09,540,429,"[{'nbrhd_tuples': 207, 'nbrhd_confs': 5, 'iter...",YASA(m=10),2023-03-01_13-51-03/32_1_10_4_sample.csv,35,21,21,35,21,21,21.054545,21.0
418,fiasco_2020-12-01_14-09-14,258,1542,"[{'nbrhd_tuples': 246, 'nbrhd_confs': 15, 'ite...",YASA(m=10),2023-03-01_13-51-03/25_1_10_4_sample.csv,213,145,197,213,196,196,196.345455,196.2
474,fs_2017-05-22,557,4992,"[{'nbrhd_tuples': 1, 'nbrhd_confs': 1, 'iterat...",YASA(m=10),2023-03-01_13-51-03/33_1_10_4_sample.csv,398,396,396,398,396,396,397.66,396.6
510,financial_services,771,7238,"[{'nbrhd_tuples': 247, 'nbrhd_confs': 172, 'it...",FIDE-YASA(m=10),2023-03-01_13-51-03/35_1_6_2_sample.csv,4396,1191,4396,4396,4396,2731,4402.75,4417.0


In [9]:
# Text cells for the result table, built by iterating the needed columns
# directly instead of a row-wise DataFrame.apply (same strings, and it also
# works when `t` has no rows).
# "<initial sample size>/<baseline algorithm>" of the selected run.
t["baseline"] = [
    f"{size}/{alg}"
    for size, alg in zip(t["initial_sample_size"], t["baseline_alg"])
]
# "<best optimized size> (\O <rounded mean optimized size>)".
t["sampLNS"] = [
    f"{best} (\\O {int(round(mean))})"
    for best, mean in zip(t["best_lns_ub"], t["mean_lns_ub"])
]
def shorten(s):
    """Return `s` unchanged if it has at most 23 characters.

    Longer strings are cut to their first 20 characters followed by "...",
    so the result never exceeds 23 characters.
    """
    limit = 23
    ellipsis = "..."
    if len(s) <= limit:
        return s
    return s[: limit - len(ellipsis)] + ellipsis
# Only the (shortened) instance name goes into this cell; the variable and
# clause counts are emitted as separate table columns further down.
t["instance_details"] = t["instance"].map(shorten)

# Size of the YASA(m=1)-based SampLNS result relative to the best baseline.
relative_size = t["samplns_yasa"] / t["best_baseline"]
t["reduction"] = 100 * (1 - relative_size)
t["rel_size_after_samplns"] = 100 * relative_size
# Same quantity as "reduction"; both column names are kept.
t["savings"] = 100 * (1 - relative_size)

# opt_1: the best baseline already matches the best known lower bound.
t["opt_1"] = t["best_baseline"] == t["best_lb"]
# opt_2: the rounded mean SampLNS size matches the best known lower bound.
t["opt_2"] = t["samplns_yasa"].map(round) == t["best_lb"]
t

Unnamed: 0,instance,#Variables,#Clauses,iteration_info,baseline_alg,initial_sample_path,initial_sample_size,lower_bound,optimized_sample_size,best_baseline,...,mean_lns_ub,samplns_yasa,baseline,sampLNS,instance_details,reduction,rel_size_after_samplns,savings,opt_1,opt_2
45,BattleofTanks,144,769,"[{'nbrhd_tuples': 248, 'nbrhd_confs': 74, 'ite...",YASA(m=10),2023-03-01_13-51-03/22_1_10_1_sample.csv,444,242,334,444,...,331.763636,330.4,444/YASA(m=10),322 (\O 332),BattleofTanks,25.585586,74.414414,25.585586,False,False
103,XSEngine,1273,2942,"[{'nbrhd_tuples': 247, 'nbrhd_confs': 3, 'iter...",YASA(m=5),2023-03-01_13-51-03/44_1_9_1_sample.csv,63,30,41,63,...,40.04,39.4,63/YASA(m=5),38 (\O 40),XSEngine,37.460317,62.539683,37.460317,False,False
151,toybox_2006-10-31_23-30-06,16,13,"[{'nbrhd_tuples': 202, 'nbrhd_confs': 7, 'iter...",YASA(m=3),2023-03-01_13-51-03/5_1_8_4_sample.csv,9,8,8,9,...,8.0,8.0,9/YASA(m=3),8 (\O 8),toybox_2006-10-31_23...,11.111111,88.888889,11.111111,False,True
214,APL,23,35,"[{'nbrhd_tuples': 207, 'nbrhd_confs': 8, 'iter...",YASA(m=10),2023-03-01_13-51-03/9_1_10_1_sample.csv,9,7,7,9,...,7.0,7.0,9/YASA(m=10),7 (\O 7),APL,22.222222,77.777778,22.222222,False,True
259,aaed2000,1298,3036,"[{'nbrhd_tuples': 168, 'nbrhd_confs': 1, 'iter...",FIDE-YASA(m=10),2023-03-01_13-51-03/45_1_6_1_sample.csv,89,51,58,89,...,58.68,58.8,89/FIDE-YASA(m=10),55 (\O 59),aaed2000,33.932584,66.067416,33.932584,False,False
311,integrator_arm7,1272,2980,"[{'nbrhd_tuples': 166, 'nbrhd_confs': 2, 'iter...",FIDE-YASA(m=5),2023-03-01_13-51-03/43_1_5_2_sample.csv,66,31,40,66,...,40.42,40.2,66/FIDE-YASA(m=5),39 (\O 40),integrator_arm7,39.090909,60.909091,39.090909,False,False
356,busybox_2007-01-24_09-14-09,540,429,"[{'nbrhd_tuples': 207, 'nbrhd_confs': 5, 'iter...",YASA(m=10),2023-03-01_13-51-03/32_1_10_4_sample.csv,35,21,21,35,...,21.054545,21.0,35/YASA(m=10),21 (\O 21),busybox_2007-01-24_0...,40.0,60.0,40.0,False,True
418,fiasco_2020-12-01_14-09-14,258,1542,"[{'nbrhd_tuples': 246, 'nbrhd_confs': 15, 'ite...",YASA(m=10),2023-03-01_13-51-03/25_1_10_4_sample.csv,213,145,197,213,...,196.345455,196.2,213/YASA(m=10),196 (\O 196),fiasco_2020-12-01_14...,7.887324,92.112676,7.887324,False,True
474,fs_2017-05-22,557,4992,"[{'nbrhd_tuples': 1, 'nbrhd_confs': 1, 'iterat...",YASA(m=10),2023-03-01_13-51-03/33_1_10_4_sample.csv,398,396,396,398,...,397.66,396.6,398/YASA(m=10),396 (\O 398),fs_2017-05-22,0.351759,99.648241,0.351759,False,False
510,financial_services,771,7238,"[{'nbrhd_tuples': 247, 'nbrhd_confs': 172, 'it...",FIDE-YASA(m=10),2023-03-01_13-51-03/35_1_6_2_sample.csv,4396,1191,4396,4396,...,4402.75,4417.0,4396/FIDE-YASA(m=10),4396 (\O 4403),financial_services,-0.477707,100.477707,-0.477707,False,False


In [10]:
# Code to query the time of the last change in lower or upper bound.

# Collect the data from the iteration_info events.
class EventCollector:
    """Flatten the per-run ``iteration_info`` logs into column lists.

    An instance is callable with one row of the results table (anything that
    supports ``row["key"]``, e.g. the Series handed over by
    ``DataFrame.apply(axis=1)``). Every call appends entries to ``self.events``,
    a dict of equally long lists that can be passed to ``pd.DataFrame``.

    Parameters
    ----------
    time_limit : number, optional
        Events whose ``time`` is greater than this value are ignored. It is in
        the same unit as the ``time`` field of the iteration log (presumably
        seconds, since the stored times are divided by 60). Defaults to 900,
        the previously hard-coded cutoff.
    """

    def __init__(self, time_limit=900):
        self.time_limit = time_limit
        # One list per output column; the key order fixes the column order of
        # a DataFrame built from this dict.
        self.events = {
            "time": [],  # event time divided by 60
            "val": [],  # bound value
            "type": [],  # "Lower" or "Upper"
            "instance": [],
            "path": [],  # initial_sample_path of the run
            "alg": [],  # baseline_alg of the run
            "initial_sample_size": [],
            "y": [],  # value as a percentage of the instance's best_lb
            "final": [],  # True if the value equals the run's final bound
        }

    def __call__(self, row):
        """Append all bound events of one run to ``self.events``.

        Reads ``iteration_info``, ``instance``, ``baseline_alg``,
        ``initial_sample_path``, ``initial_sample_size``, ``best_lb``,
        ``lower_bound`` and ``optimized_sample_size`` from ``row``.
        Returns ``None``.
        """
        it_data = row["iteration_info"]

        def add(val, lbub, time, final=False):
            # Ignore everything reported after the cutoff.
            if time > self.time_limit:
                return
            self.events["time"].append(time / 60)
            self.events["val"].append(val)
            self.events["type"].append(lbub)
            self.events["instance"].append(row["instance"])
            self.events["alg"].append(row["baseline_alg"])
            self.events["path"].append(row["initial_sample_path"])
            self.events["initial_sample_size"].append(row["initial_sample_size"])
            self.events["y"].append(100 * (val / row["best_lb"]))
            self.events["final"].append(final)

        # Synthetic start entries at time 0: lower bound 0, upper bound equal
        # to the initial sample size. A run that contributes nothing else has
        # a maximum event time of 0.
        add(0, "Lower", 0)
        add(row["initial_sample_size"], "Upper", 0)
        for event in it_data:
            # An event is "final" once its bound equals the value the run
            # ended with.
            add(
                event["lb"],
                "Lower",
                event["time"],
                final=event["lb"] == row["lower_bound"],
            )
            add(
                event["ub"],
                "Upper",
                event["time"],
                final=event["ub"] == row["optimized_sample_size"],
            )

# Instances with their model size, ordered by number of variables, then clauses.
instance_infos = (
    data[["instance", "#Variables", "#Clauses"]]
    .drop_duplicates()
    .sort_values(by=["#Variables", "#Clauses"])
)

# Replay the iteration logs of the "FIDE-YASA(m=1)" runs into a flat event table.
# NOTE(review): the SampLNS sizes in `best_values` are computed from
# "YASA(m=1)" runs, while the timings here use "FIDE-YASA(m=1)" — confirm
# that this difference is intended.
ec = EventCollector()
(
    data.loc[data["baseline_alg"] == "FIDE-YASA(m=1)"]
    .sort_values(by=["#Variables", "#Clauses"])
    .apply(ec, axis=1)
)
t_events = pd.DataFrame(ec.events)

# Latest event time per run. A maximum of 0 means that only the synthetic
# start entries were recorded for that run; such instances count as unsolved.
t_ = (
    t_events.groupby(["instance", "path"])["time"]
    .max()
    .reset_index()
)
unsolved_instances = t_.loc[t_["time"] == 0, "instance"].unique().tolist()
solved_instances = instance_infos.loc[
    ~instance_infos["instance"].isin(unsolved_instances), "instance"
].tolist()

# For solved instances: earliest time at which each run reached its final
# bound (per bound type), averaged over the runs of the instance.
time_until_last_change = (
    t_events[t_events["final"] & t_events["instance"].isin(solved_instances)]
    .groupby(["instance", "path", "type"])[["time"]]
    .min()
    .reset_index()
    .groupby(["instance", "type"])[["time"]]
    .mean()
)

unsolved_symbol = "$*$"
def get_time_info(instance):
    """Return the LaTeX table cell with the time of the last bound change.

    Looks up `instance` in the module-level `time_until_last_change` frame and
    takes the largest "time" recorded for it. That value is multiplied by 60
    and rounded to a whole number for the siunitx second cell.

    Returns `unsolved_symbol` if the instance has no entry, a "< 1 second"
    cell if the rounded value is 0, and "\\SI{<n>}{\\second}" otherwise.
    """
    latest_per_instance = (
        time_until_last_change.groupby("instance")["time"].max().to_dict()
    )
    latest = latest_per_instance.get(instance)
    if latest is None:
        return unsolved_symbol

    secs = round(latest * 60)
    if secs == 0:
        return "$<\\SI{1}{\\second}$"
    return f"\\SI{{{secs}}}{{\\second}}"

In [11]:
# Accumulates the generated LaTeX table rows; reset on every run of this cell.
tex_table = ""


def append(*args):
    """Add one line to the module-level `tex_table` string.

    All arguments are converted with `str`, joined by single spaces and
    terminated with a newline.
    """
    global tex_table
    tex_table += " ".join(map(str, args)) + "\n"
# Emit one LaTeX table row per instance, ordered by model size.
t.sort_values(by=["#Variables", "#Clauses"], inplace=True)

# Opening of a siunitx number that follows the surrounding text series, so
# that \textbf{...} around it takes effect.
num_open = "\\num[text-series-to-math=true]{"

for idx in t.index:
    unsolved = t["instance"][idx] in unsolved_instances

    # Baseline cell is bold when the baseline already equals the lower bound.
    baseline_open = ("\\textbf{" if t["opt_1"][idx] else "{") + num_open

    if unsolved:
        # No SampLNS result and no savings are reported for unsolved instances.
        samplns_cell = unsolved_symbol
        savings_cell = unsolved_symbol
    else:
        # SampLNS cell is bold when the rounded mean equals the lower bound.
        samplns_cell = (
            ("\\textbf{" if t["opt_2"][idx] else "{")
            + num_open
            + str(int(round(t["samplns_yasa"][idx])))
            + "}}"
        )
        savings_cell = (
            "\\SI{" + f"{round(t['savings'][idx], 2):.2g}" + "}{\\percent}"
        )

    append(
        t["instance_details"][idx].replace("_", "\\_"),
        "& \\num{",
        t["#Variables"][idx],
        "} & \\num{",
        t["#Clauses"][idx],
        "} &",
        baseline_open,
        t["best_baseline"][idx],
        " }} &",
        samplns_cell,
        " & \\num{",
        round(t["best_lb"][idx]),
        "} &",
        savings_cell,
        "&",
        get_time_info(t["instance"][idx]),
        "\\\\",
    )

print(tex_table)

\begin{center}
\begin{tabular}{ l c c c}
Instance & $|\features|/|\clauses|$ & Baseline & SampLNS & Lower bound & Relative size\\
\hline
calculate & \num{ 9 } & \num{ 15 } & {\num[text-series-to-math=true]{ 9  }} & \textbf{\num[text-series-to-math=true]{5}}  & \num{ 5 } & \SI{44}{\percent} & $<\SI{1}{\second}$ \\
lcm & \num{ 9 } & \num{ 16 } & {\num[text-series-to-math=true]{ 8  }} & \textbf{\num[text-series-to-math=true]{6}}  & \num{ 6 } & \SI{25}{\percent} & $<\SI{1}{\second}$ \\
email & \num{ 10 } & \num{ 17 } & \textbf{\num[text-series-to-math=true]{ 6  }} & \textbf{\num[text-series-to-math=true]{6}}  & \num{ 6 } & \SI{0}{\percent} & $<\SI{1}{\second}$ \\
ChatClient & \num{ 14 } & \num{ 20 } & {\num[text-series-to-math=true]{ 8  }} & \textbf{\num[text-series-to-math=true]{7}}  & \num{ 7 } & \SI{12}{\percent} & \SI{1}{\second} \\
toybox\_2006-10-31\_23... & \num{ 16 } & \num{ 13 } & {\num[text-series-to-math=true]{ 9  }} & \textbf{\num[text-series-to-math=true]{8}}  & \num{ 8 } & \S

In [16]:
# Number of instances whose best baseline is at most 10 % above the best lower bound.
int((t["best_baseline"] / t["best_lb"] <= 1.1).sum())

9