# Solver Performance Comparisons

This notebook tests and compares the performance of different general-purpose solvers.

### Imports and Setup

In [173]:
# Third party libraries
import matplotlib.pyplot as plt
import matplotlib_inline

import numpy as np
import pandas as pd
import seaborn as sns 

# Local imports
import sets, loss, solver, plot

# Initialize notebook settings
# (the lines starting with % are IPython magics, so this cell only runs inside
# IPython/Jupyter, not as a plain Python script)
sns.set_theme() # set seaborn theme
matplotlib_inline.backend_inline.set_matplotlib_formats('svg') # vector plots
# Show figures inline in the notebook output.
%matplotlib inline 
# Load the autoreload extension; mode 2 reloads all modules before each cell is
# executed, so edits to the local modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [174]:
# --- Imports ------------------------------------------------------------------

import algorithms, sets, sys
from itertools import chain

import pandas as pd
import seaborn as sns
import gurobipy as gp
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from matplotlib.lines import Line2D
# Render figure text in Times New Roman: the font is placed in the sans-serif
# family list first, then sans-serif is selected as the active font family.
plt.rcParams.update({
    "font.sans-serif": "Times New Roman",
    "font.family": "sans-serif",
})

# Six-colour palette handed to seaborn as the hue palette for the scatter plots.
colors = [
    "#1b9e77",
    "#d95f02",
    "#7570b3",
    "#e7298a",
    "#66a61e",
    "#e6ab02",
]

# --- Experiments --------------------------------------------------------------

def computationPlot(name="Uniform", dataDir="../computationData"):
    """Plot loss against run time for each metric's benchmark results.

    Draws a 2x3 grid of scatter plots, one panel per metric, reading each
    panel's data from ``{dataDir}/{metric}LossTime_{name}.csv``. Points are
    coloured by the ``algorithm`` column and the figure is shown with
    ``plt.show()``.

    Args:
        name: Dataset name embedded in the CSV file names. Defaults to
            "Uniform", the value that used to be hard-coded.
        dataDir: Directory containing the CSV files. Defaults to
            "../computationData", the value that used to be hard-coded.
    """
    # (file-name prefix, panel title) for each of the six panels, in grid order.
    panels = [
        ("mean", "Mean"),
        ("range", "Range"),
        ("range_mean", "Range and Mean"),
        ("coverage", "Coverage"),
        ("distribution", "Distribution"),
        ("range_mean_coverage_distribution",
         "Range, Mean, Coverage and Distribution"),
    ]

    _, axs = plt.subplots(nrows=2, ncols=3, figsize=(24, 14))
    axs = axs.ravel()

    for ax, (metric, title) in zip(axs, panels):
        # The files written by computationData start with a free-form
        # "Header: ..." line followed by a blank line. pandas skips blank
        # lines when counting, so header=1 selects the column-name row.
        df = pd.read_csv(f"{dataDir}/{metric}LossTime_{name}.csv", header=1)

        sns.scatterplot(x="time",
                        y="loss",
                        alpha=0.5,
                        hue="algorithm",
                        palette=colors,
                        data=df,
                        ax=ax,
                        s=75)

        ax.set_title(title, fontsize=16)
        ax.set_ylabel("Loss")
        ax.set_xlabel("Time (s)")

    # Replace the automatic legend of the first panel with readable labels.
    # NOTE(review): this assumes the first three hue levels are the greedy,
    # integer-optimization and random-search results, in that order, and it
    # leaves the other panels' automatic legends untouched - confirm.
    legendLabels = ["Greedy Algorithm", "Integer Optimization Solver", "Random Search"]
    legendHandles = [Line2D([0], [0], color=color, lw=4, alpha=0.5)
                     for color in colors[:len(legendLabels)]]
    axs[0].legend(legendHandles, legendLabels)
    plt.show()

def computationData(dataset, path, numTrials):
    """Benchmark the solvers on ``dataset`` and write the results to ``path``.

    For each trial the function runs ``algorithms.greedySwap``,
    ``algorithms.optimization`` (on the "range" and "mean" objectives) and
    ``algorithms.bestOfRandom`` with two sample counts (2300 and
    ``dataset.s * dataset.n``), and appends one CSV row per run with the
    columns ``time,loss,algorithm,trial,final,k``. The ``final`` column is
    always ``True`` and ``k`` is always ``None``.

    The file begins with a tab-separated "Header:" metadata line and a blank
    line, followed by the column names.

    Args:
        dataset: Dataset object providing ``name``, ``n``, ``m``, ``s`` and
            ``k`` attributes; it is passed unchanged to the solver functions.
        path: Destination of the CSV file; an existing file is overwritten.
        numTrials: Number of times every solver is run.
    """
    # Hold the Gurobi environment in a context manager so it is disposed (and
    # its license released) even if one of the solvers raises.
    with gp.Env(empty=True) as env:
        env.setParam("OutputFlag", 0)  # silence solver logging
        env.start()
        rng = np.random.default_rng()

        # Sample counts used for the best-of-random baseline.
        randomSampleCounts = [2300, dataset.s * dataset.n]

        with open(path, 'w') as f:
            # NOTE(review): the metric and algorithm recorded in this header
            # are hard-coded and may not match the solvers actually run
            # below - confirm before relying on the metadata.
            f.write(f"Header: metric=coverage\tdataset={dataset.name}\t")
            f.write(f"algorithm=greedyImprove\tn={dataset.n}\tm={dataset.m}\t")
            f.write(f"s={dataset.s}\tk={dataset.k}\n\n")
            f.write("time,loss,algorithm,trial,final,k\n")

            for trial in range(numTrials):
                # Progress indicator: fraction of trials already completed.
                print(f"{trial/numTrials:.2%}")

                _, elapsed, loss = algorithms.greedySwap(dataset, seed=rng)
                f.write(f"{elapsed},{loss},greedy,{trial},True,None\n")

                _, elapsed, loss = algorithms.optimization(["range", "mean"], dataset, dataset.s, env)
                f.write(f"{elapsed},{loss},optimizationInteger,{trial},True,None\n")

                for sampleCount in randomSampleCounts:
                    _, elapsed, loss = algorithms.bestOfRandom(dataset, k=sampleCount, seed=rng)
                    f.write(f"{elapsed},{loss},bestOf{sampleCount}Random,{trial},True,None\n")

if __name__ == "__main__":
    # Script entry point. Builds a uniform synthetic dataset, attaches the
    # objective table, then runs the benchmark and/or the plot depending on
    # whether "compute" and/or "plot" appear in the command-line arguments.

    metricName = "range_mean"  # used only to build the output file name
    numTrials = 15
    name = "Uniform"

    path = f"../computationData/{metricName}LossTime_{name}.csv"

    # --- Dataset configuration and generation ---------------------------------
    dataset = sets.Dataset()
    dataset.name = name
    dataset.low = 1
    dataset.high = 10
    dataset.n = 1000
    dataset.m = 10
    dataset.s = 10
    dataset.numBins = 8

    dataset.generateUniform()
    dataset.data = algorithms.normalize(dataset.fullData, dataset.low, dataset.high)

    # Bin a copy of the data so dataset.data is not handed to the binning
    # step directly, then one-hot encode the binned copy.
    binned = algorithms.binning(dataset.data.copy(), dataset.numBins)
    dataset.oneHot = algorithms.oneHotEncoding(binned)

    dataset.pos = np.array(dataset.data)
    dataset.neg = algorithms.normalize(np.array(dataset.data), -dataset.high, -dataset.low)

    # --- Objective table ------------------------------------------------------
    # One row per objective: (metric, weight, format, function). Only "range"
    # and "mean" carry a non-zero weight here.
    # NOTE(review): "distrib" uses calcMean on the one-hot encoding - presumably
    # intentional, confirm.
    objectiveRows = [
        ("range",    1, "data",   algorithms.calcRange),
        ("mean",     1, "data",   algorithms.calcMean),
        ("var",      0, "data",   algorithms.calcVariance),
        ("cov",      0, "oneHot", algorithms.calcCoverage),
        ("distrib",  0, "oneHot", algorithms.calcMean),
        ("cluster",  0, "data",   algorithms.calcCluster),
        ("spread",   0, "data",   algorithms.calcSpread),
        ("distinct", 0, "data",   algorithms.calcDistinct),
        ("crossing", 0, "data",   algorithms.calcCrossing),
    ]
    dataset.objectives = pd.DataFrame(
        objectiveRows, columns=["metric", "weight", "format", "function"]
    ).set_index("metric")
    dataset.objectives["values"] = dataset.calcValues()

    # --- Command dispatch -----------------------------------------------------
    if "compute" in sys.argv: # run the solvers and write the benchmark CSV
        computationData(dataset, path, numTrials)
    if "plot" in sys.argv:
        computationPlot()