In [1]:
%pwd

'/Users/ryandevera/data-science/umn_environments/Deeplifting/deeplifting/notebooks'

In [2]:
%cd ../..

/Users/ryandevera/data-science/umn_environments/Deeplifting


In [3]:
%ls

README.md                       [1m[36mimages[m[m/
[1m[36m__pycache__[m[m/                    [1m[36mjobs[m[m/
[1m[36malgorithm_compare_results[m[m/      [1m[36mlow-dimension-search-results[m[m/
[1m[36mdata[m[m/                           [1m[36mmodels[m[m/
[1m[36mdata-queue[m[m/                     [1m[36mpaper-images[m[m/
[1m[36mdata-queue-2023-09-24[m[m/          requirements.txt
[1m[36mdata-queue-2023-09-27[m[m/          [1m[36mresults[m[m/
[1m[36mdeeplifting[m[m/                    [1m[36msearch_results[m[m/
deeplifting.png                 tasks.py
[1m[36mhigh-dimension-paper-results[m[m/   test-low-dimension-results.png
[1m[36mhigh-dimension-search-results[m[m/


In [4]:
import glob as glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm

In [18]:
# Reference minima, looked up below with index (dimensions // 3) - 2, so entry 0
# pairs with 6 dimensions, entry 1 with 9 dimensions, and so on.
# NOTE(review): these look like the published global-minimum energies of
# Lennard-Jones clusters with 2..20 atoms (3 coordinates per atom) — confirm.
minima = [
    -1.0,
    -3.0,
    -6.0,
    -9.103852,
    -12.712062,
    -16.505384,
    -19.821489,
    -24.113360,
    -28.422532,
    -32.765970,
    -37.967600,
    -44.326801,
    -47.845157,
    -52.322627,
    -56.815742,
    -61.317995,
    -66.530949,
    -72.659782,
    # Was -77.1777043: a stray extra digit (every other entry has 6 decimals).
    -77.177043,
]

dimensions = 39
# Floor division keeps k an int without a float round-trip.
k = dimensions // 3
print(minima[k - 2])

-44.326801


# Algorithm Comparison Results

In [12]:
# Columns to keep from the per-algorithm result files
algorithm_columns = [
    'problem_name',
    'algorithm',
    'dimensions',
    'hits',
    'time',
    'f',
]

# Results from comparison algorithms. glob.glob returns matches in arbitrary
# order, so sort them to make the file order deterministic across runs.
file_directory = './data-queue-2023-09-24/lennard-jones/*/*'
files = sorted(glob.glob(file_directory))
if not files:
    # Fail with a clear message instead of an opaque reader error on an empty list
    raise FileNotFoundError(f'No result files matched {file_directory!r}')
algorithm_df = pd.read_parquet(files)

# Column filter currently disabled:
# algorithm_df = algorithm_df[algorithm_columns]
algorithm_df.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x12,x13,x14,x15,f,algorithm,time,problem_name,hits,dimensions
0,0.527248,1.986389,0.386597,-0.207182,1.305686,0.373746,-0.409025,2.255508,0.618709,-0.17437,...,-0.322214,0.115064,1.678293,1.241582,-9.103852,IPOPT,7.833295,lennard_jones_15d,0,15
1,-1.100304,0.09449,-2.553394,-0.867722,-0.619329,-1.895985,-1.165724,0.275754,-1.570656,-1.819551,...,-2.010443,-1.855997,0.63795,-2.193677,-9.103852,IPOPT,7.829979,lennard_jones_15d,0,15
2,-1.192089,0.408755,-2.598269,-1.762302,0.374894,-1.780022,-1.144999,1.145976,-1.922106,-0.776442,...,-1.702718,-0.313385,0.822274,-2.368707,-9.103852,IPOPT,7.812895,lennard_jones_15d,0,15
3,-0.363832,-0.919277,0.283479,-0.390302,0.076759,0.228431,-0.320769,-1.297625,-0.638919,-0.851604,...,-0.467109,0.148831,-0.441751,-0.432101,-9.103852,IPOPT,7.74344,lennard_jones_15d,0,15
4,2.255344,0.752425,-0.53709,2.817851,0.892651,-1.353688,2.378404,0.011114,-1.193696,1.824055,...,-1.439462,2.219762,1.62137,-1.026463,-9.103852,IPOPT,7.77156,lennard_jones_15d,0,15


In [8]:
# Let's get the algorithm compare results!
algorithm_results_df = algorithm_df.groupby(
    ['problem_name', 'algorithm', 'dimensions']
).agg({'hits': ['count', 'mean'], 'time': 'mean'})
algorithm_results_df.columns = [
    '-'.join(column) for column in algorithm_results_df.columns
]
algorithm_results_df = algorithm_results_df.reset_index()

# Need to verify that all problems have the same number of trials in the end
algorithm_results_df = algorithm_results_df.drop(columns=['hits-count'])
algorithm_results_df.pivot_table(
    index='dimensions', columns='algorithm', values='hits-mean'
)

algorithm,Basinhopping,Differential Evolution,Dual Annealing,IPOPT
dimensions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6,1.0,1.0,1.0,0.4
9,0.9,1.0,1.0,0.8
12,0.9,1.0,1.0,1.0
15,0.0,0.0,0.0,0.0
30,0.0,0.0,0.0,0.0


In [None]:
# List the distinct values of the 'algorithm' column in the comparison results.
algorithm_df['algorithm'].unique()

# SCIP Results

At first, SCIP ran very quickly, but it then encountered problems on which it really struggled. I split the SCIP results out into their own section.

In [None]:
scip_file_directory = './data-queue-2023-09-24/high-dimension-scip/*/*'
scip_files = glob.glob(scip_file_directory)

# Read every matched parquet file into one frame
scip_df = pd.read_parquet(scip_files)

# 60 * 60 * 8 = 28800, i.e. an 8-hour cap if `time` is in seconds — TODO confirm units
scip_time_cap = 60 * 60 * 8

# Largest recorded time within each (problem, dimension) group
scip_df['time-violation'] = scip_df.groupby(['problem_name', 'dimensions'])[
    'time'
].transform('max')

# Every row of a group whose maximum reached the cap takes that maximum as its
# time and is counted as a miss.
timed_out = scip_df['time-violation'] >= scip_time_cap
scip_df['time'] = np.where(timed_out, scip_df['time-violation'], scip_df['time'])
scip_df['hits'] = np.where(timed_out, 0, scip_df['hits'])

# Keep only the shared result columns plus the violation marker
scip_df = scip_df[[*algorithm_columns, 'time-violation']]

# Summarise each (problem, algorithm, dimension) group
scip_results_df = (
    scip_df.groupby(['problem_name', 'algorithm', 'dimensions'])
    .agg(
        **{
            'hits-count': ('hits', 'count'),
            'hits-mean': ('hits', 'mean'),
            'time-mean': ('time', 'mean'),
        }
    )
    .reset_index()
)

# TODO: verify that all problems have the same number of trials in the end;
# the trial count is discarded here without being checked.
scip_results_df = scip_results_df.drop(columns=['hits-count'])

# Problems still missing because of time issues: add them as misses at the cap
scip_missing_problems = ['levy_500d', 'levy_1000d', 'schwefel_500d', 'schwefel_1000d']
scip_missing_df = pd.DataFrame(
    {
        'problem_name': scip_missing_problems,
        'algorithm': ['SCIP'] * 4,
        'dimensions': [500, 1000, 500, 1000],
        'hits-mean': [0.0] * 4,
        'time-mean': [scip_time_cap] * 4,
    }
)
scip_results_df = pd.concat([scip_results_df, scip_missing_df], ignore_index=True)

# Displayed sorted; the stored frame keeps its concatenation order
scip_results_df.sort_values(['problem_name', 'dimensions'])

# Full Results
Combine all of the results and create a facet grid.

In [None]:
# Stack the per-method summaries into one long frame.
# NOTE(review): `test_df` is not defined in any cell visible in this notebook —
# confirm where it comes from before relying on Restart & Run All.
results_df = pd.concat(
    (test_df, algorithm_results_df, scip_results_df), ignore_index=True
)

# Problem family = problem name with the '_<N>d' dimension suffix removed
problem_family = results_df['problem_name'].str.replace(
    r'_[0-9]+d', '', regex=True
)

# Both the misspelled 'reyonlds' and the correct 'reynolds' variants end up
# under the single display label 'Chung Reynolds'.
results_df['problem'] = np.where(
    problem_family.str.contains('reyonlds|reynolds'),
    'Chung Reynolds',
    problem_family,
)

# log(1 + x) versions of the mean time and the dimension count for plotting
results_df['time-mean-log'] = np.log1p(results_df['time-mean'])
results_df['dimensions-log'] = np.log1p(results_df['dimensions'])
results_df.head()

In [None]:
# Sanity check that all problems exists for each algorithm
results_df.groupby(['problem', 'algorithm']).size()

In [None]:
mask = (results_df['problem'] == 'rastrigin') & (
    results_df['algorithm'] == 'Deeplifting'
)
results_df.loc[mask]

In [None]:
# Create the FacetGrid
g = sns.FacetGrid(
    results_df, col="problem", col_wrap=3, sharex=True, sharey=True, height=3
)

# Map the data to the grid
g.map_dataframe(
    sns.stripplot, x='hits-mean', y='algorithm', hue='dimensions', palette='viridis'
)

# Add legends and other aesthetic improvements
g.add_legend()
g.set_axis_labels("Dimensions", "Time")
g.set_titles(col_template="{col_name} problem")

# Add grids for all axes
for ax in g.axes.flatten():
    ax.grid()

# Use matplotlib instead of seaborn because the jet palette is not available in seaborn

In [None]:
# 3 x 3 grid of log-time vs log-dimension curves, one panel per problem
fig, axes = plt.subplots(3, 3, figsize=(11, 5), sharex=True, sharey=True)
axes = axes.flatten()
problems = results_df['problem'].unique().tolist()

lines, labels = [], []
for index, problem in enumerate(problems):
    ax = axes[index]
    data = results_df.loc[results_df['problem'] == problem].reset_index(drop=True)
    # One line per algorithm: x = log1p(dimensions), y = log1p(mean time)
    data = data.pivot_table(
        index='dimensions-log', columns='algorithm', values='time-mean-log'
    )
    data.plot(ax=ax, cmap='jet', legend=False)
    # Upper-case only the first character; str.capitalize() would also
    # lower-case the rest and turn 'Chung Reynolds' into 'Chung reynolds'.
    ax.set_title(problem[:1].upper() + problem[1:], fontsize=14)
    ax.grid()
    ax.set_xlabel('Dimensions (Log Scale)', fontsize=14)
    # Raw string: '\l' is not a valid escape sequence in a normal string literal
    ax.set_ylabel(r'Avg $\log(t)$', fontsize=14)

    # The first panel supplies the handles for the shared figure legend
    if index == 0:
        lines, labels = ax.get_legend_handles_labels()

# Hide every panel that received no problem (not just a hard-coded axes[8])
for unused_ax in axes[len(problems):]:
    unused_ax.axis('off')

# Create a single legend for all the plots
fig.legend(lines, labels, bbox_to_anchor=(0.94, 0.0), ncol=6)
fig.tight_layout()
# fig.savefig('./paper-images/high-dimension-time-vs-dim.png', bbox_inches='tight', pad_inches=0.01)

In [None]:
# Mean of `hits` for Deeplifting: one row per problem, one column per
# dimension; combinations without a result are shown as 0.0.
pd.pivot_table(
    results_df[results_df['algorithm'] == 'Deeplifting'],
    values='hits-mean',
    index='problem',
    columns=['algorithm', 'dimensions'],
).fillna(0.0)

In [None]:
# Mean of `hits` for Differential Evolution: one row per problem, one column
# per dimension; combinations without a result are shown as 0.0.
pd.pivot_table(
    results_df[results_df['algorithm'] == 'Differential Evolution'],
    values='hits-mean',
    index='problem',
    columns=['algorithm', 'dimensions'],
).fillna(0.0)

In [None]:
# Mean of `hits` for Dual Annealing: one row per problem, one column per
# dimension; combinations without a result are shown as 0.0.
pd.pivot_table(
    results_df[results_df['algorithm'] == 'Dual Annealing'],
    values='hits-mean',
    index='problem',
    columns=['algorithm', 'dimensions'],
).fillna(0.0)

In [None]:
# Mean of `hits` for IPOPT: one row per problem, one column per dimension;
# combinations without a result are shown as 0.0.
pd.pivot_table(
    results_df[results_df['algorithm'] == 'IPOPT'],
    values='hits-mean',
    index='problem',
    columns=['algorithm', 'dimensions'],
).fillna(0.0)

In [None]:
# Mean of `hits` for PyGRANSO: one row per problem, one column per dimension;
# combinations without a result are shown as 0.0.
pd.pivot_table(
    results_df[results_df['algorithm'] == 'PyGRANSO'],
    values='hits-mean',
    index='problem',
    columns=['algorithm', 'dimensions'],
).fillna(0.0)

In [None]:
# Mean of `hits` for SCIP: one row per problem, one column per dimension;
# combinations without a result are shown as 0.0.
pd.pivot_table(
    results_df[results_df['algorithm'] == 'SCIP'],
    values='hits-mean',
    index='problem',
    columns=['algorithm', 'dimensions'],
).fillna(0.0)

In [None]:
# Overall ranking by hits: average the per-(problem, dimension) mean of `hits`
# across all columns for each algorithm, counting combinations without a
# result as 0.0, and list the best algorithm first.
pd.pivot_table(
    results_df,
    values=['hits-mean'],
    index='algorithm',
    columns=['problem', 'dimensions'],
).fillna(0.0).mean(axis='columns').sort_values(ascending=False)

In [None]:
# Average of the per-(problem, dimension) mean time for each algorithm,
# smallest first. Missing combinations are left as NaN so that mean() skips
# them: filling them with 0.0 would count a run that does not exist as taking
# zero time and make that algorithm look faster than it is.
(
    results_df.pivot_table(
        index='algorithm', columns=['problem', 'dimensions'], values='time-mean'
    )
    .mean(axis=1)
    .sort_values()
)

In [None]:
# Ackley only: log1p(mean time) per algorithm (columns) against
# log1p(dimensions) (rows)
mask = results_df['problem'] == 'ackley'

pd.pivot_table(
    results_df[mask],
    values='time-mean-log',
    index='dimensions-log',
    columns='algorithm',
)

# Create the plots side by side

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Heatmap input: the sampled low-dimension results, indexed by problem name
sampled_df = pd.read_parquet('./data/low-dimension-sampled-results.parquet')
sampled_df = sampled_df.set_index('problem_name')

# Panel A spans the full height on the left; B-E form a 2 x 2 block beside it
mosaic = """
    ABC
    ADE
"""

# fig = plt.figure(figsize=(14, 6), layout="constrained")
fig, ax_dict = plt.subplot_mosaic(mosaic, figsize=(14, 6))

# Marker/line styles handed to DataFrame.plot for the algorithm curves
styles = ['o-', '^-', 's-', 'D-', '*-', 'H-']

# Panel A: heatmap of the sampled results
axa = ax_dict['A']
sns.heatmap(
    data=sampled_df,
    cmap='jet',
    ax=axa,
)

# Panels B-E: log1p(mean time) vs log1p(dimensions), one line per algorithm.
# Each entry is (axes, value in results_df['problem'], panel title).
axb, axc, axd, axe = (ax_dict[key] for key in 'BCDE')
timing_panels = [
    (axb, 'ackley', 'Ackley Series'),
    (axc, 'Chung Reynolds', 'Chung Reynolds Series'),
    (axd, 'qing', 'Qing Series'),
    (axe, 'schwefel', 'Schwefel Series'),
]
for panel_ax, problem, title in timing_panels:
    mask = results_df['problem'] == problem
    df = results_df.loc[mask].pivot_table(
        index='dimensions-log', columns='algorithm', values='time-mean-log'
    )
    df.plot(ax=panel_ax, cmap='jet', legend=False, style=styles)
    panel_ax.set_title(title)
    panel_ax.grid()

# One shared legend built from panel B's handles only; collecting handles from
# every panel would repeat each algorithm once per panel.
handles, labels = axb.get_legend_handles_labels()
fig.legend(handles, labels, bbox_to_anchor=(0.995, 0.01), ncols=6)

fig.tight_layout()
# Relative to the repository root (the working directory after `%cd ../..`),
# like the './data/...' path read above, instead of a user-specific absolute path.
fig.savefig(
    './paper-images/combined-plots.png',
    bbox_inches='tight',
    pad_inches=0.01,
)