In [None]:
# Standard library
import pickle

# Third party
import matplotlib.pyplot as plt
import matplotlib_inline
import numpy as np
import pandas as pd
import seaborn as sns

# Local
import flexibleSubsetSelection as fss

# Initialize notebook settings: plot styling, inline vector rendering, and
# automatic reloading of edited modules. The statements below are kept in
# this order; the autoreload extension must be loaded before it is configured.
sns.set_theme() # apply seaborn's default theme to subsequent figures
matplotlib_inline.backend_inline.set_matplotlib_formats('svg') # render inline figures as SVG (vector plots)
# Show matplotlib figures inline in the cell output.
%matplotlib inline 
# Load IPython's autoreload extension and use mode 2, which re-imports
# modules before each cell runs so edits to local packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Number of grid cells per axis. It is also the upper bound of the interval
# handed to fss.Dataset (presumably the range the features are rescaled to --
# confirm against the fss.Dataset documentation).
bins = 20

# Read the embedding table and discard the "Unnamed: 0" column.
df = pd.read_csv("../data/exampleDatasets/umap.csv").drop(columns="Unnamed: 0")

# The two 2D embeddings (Specter and TFIDF), one x/y column pair each.
embeddingFeatures = ["X Specter", "Y Specter", "X TFIDF", "Y TFIDF"]
dataset = fss.Dataset(data=df,
                      features=embeddingFeatures,
                      interval=(0, bins))

In [None]:
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

# NOTE(review): `color` is not defined in any cell visible here; it must
# already exist in the kernel and support lookups such as color["green"].

# One 2D histogram per embedding, counted on integer bin edges 0..bins.
# To inspect the raw points instead, swap sns.histplot for
# sns.scatterplot(..., edgecolor=None, alpha=0.5) and drop the bins argument.
panels = (("X Specter", "Y Specter"), ("X TFIDF", "Y TFIDF"))
for ax, (xFeature, yFeature) in zip(axs, panels):
    sns.histplot(data=dataset.scaled,
                 x=xFeature,
                 y=yFeature,
                 bins=range(0, bins + 1),
                 color=color["green"],
                 ax=ax)
    # Square cells: one unit on x is drawn the same length as one unit on y.
    ax.set_aspect('equal', 'box')

In [None]:
# Size of the subset to select.
s = 300

# NOTE(review): `approximation`, `objectives`, `sets` and `k` are not defined
# in any cell visible here (the data cell uses the `fss` package instead), so
# this cell relies on names left over in the kernel -- confirm where they
# come from before running on a fresh kernel.

# Run the greedy-swap selection with the distinct embedding coverage
# objective, then wrap the returned selection `z` in a Subset.
z, timeTotal, loss = approximation.greedySwap(
    dataset,
    s=s,
    objective=objectives.embeddingCoverageDistinct,
)
subset = sets.Subset(dataset, z, length=s)
print(loss)

# Persist the subset; the file name records the run parameters.
subsetPath = f'size={s}_obj=coverageDistinct_k={k}_bins={bins}.pkl'
with open(subsetPath, 'wb') as handle:
    pickle.dump(subset, handle)

In [None]:
bins = 20

# Integer bin edges 0, 1, ..., bins (bins + 1 edges, i.e. `bins` unit cells).
bin_edges = np.arange(bins + 1)


def _binIndex(values, edges):
    """Return the zero-based index of the bin each value falls into.

    np.digitize uses half-open bins [edge_i, edge_i+1), so a value equal to
    the last edge is reported one past the final bin (and a value below the
    first edge one before the first bin). The result is clipped to
    [0, len(edges) - 2] so that such values land in the outermost real bins,
    matching the closed last bin of a histogram drawn on the same edges.

    Args:
        values: array-like of numeric coordinates.
        edges: increasing 1D array of bin edges.

    Returns:
        Integer ndarray of bin indices, same length as `values`.
    """
    return np.clip(np.digitize(values, edges) - 1, 0, len(edges) - 2)


# Assign bins for each dimension
x_specter_bins = _binIndex(dataset.scaled['X Specter'], bin_edges)
y_specter_bins = _binIndex(dataset.scaled['Y Specter'], bin_edges)
x_tfidf_bins = _binIndex(dataset.scaled['X TFIDF'], bin_edges)
y_tfidf_bins = _binIndex(dataset.scaled['Y TFIDF'], bin_edges)

# Combine the per-axis indices into one (x, y) cell label per row.
# Note this adds two columns to dataset.scaled in place.
dataset.scaled['Specter 2D Bin'] = list(zip(x_specter_bins, y_specter_bins))
dataset.scaled['TFIDF 2D Bin'] = list(zip(x_tfidf_bins, y_tfidf_bins))

# Coverage = number of distinct occupied cells, per embedding and summed.
specter = len(dataset.scaled['Specter 2D Bin'].unique())
tfidf = len(dataset.scaled['TFIDF 2D Bin'].unique())
coverage = specter + tfidf

print(specter, tfidf, coverage, coverage/2)

In [None]:
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 11))

# NOTE(review): `color` and `sets` are not defined in any cell visible here
# (the data cell builds its dataset with fss.Dataset); confirm where they
# come from before running on a fresh kernel.

# (panel title, x column, y column) for each embedding, left to right.
embeddings = (("Specter", "X Specter", "Y Specter"),
              ("TFIDF", "X TFIDF", "Y TFIDF"))

# Top row: all points of the dataset as it stands when this cell starts.
for ax, (title, xFeature, yFeature) in zip(axs[0], embeddings):
    sns.scatterplot(x=xFeature,
                    y=yFeature,
                    data=dataset.scaled,
                    color=color["darkGreen"],
                    edgecolor=None,
                    alpha=0.6,
                    ax=ax)
    ax.set_title(title)

# Rebuild the dataset on a 20-cell grid for the bottom row.
bins = 20
dataset = sets.Dataset(data=df,
                       features=["X Specter", "Y Specter", "X TFIDF", "Y TFIDF"],
                       bins=bins,
                       interval=(0, bins))

# Load a previously selected subset. pickle.load can execute arbitrary code,
# so only point this at files produced by this project.
with open(f'size=300_obj=coverage_k=5129_bins={bins}.pkl', 'rb') as f:
    subsetSmall = pickle.load(f)

# Bottom row: 2D histogram of the full dataset with the selected subset
# drawn on top (histogram first so the points stay visible).
for ax, (title, xFeature, yFeature) in zip(axs[1], embeddings):
    sns.histplot(x=xFeature,
                 y=yFeature,
                 data=dataset.scaled,
                 bins=range(0, bins + 1),
                 color=color["green"],
                 ax=ax)
    sns.scatterplot(x=xFeature,
                    y=yFeature,
                    data=subsetSmall.data,
                    color=color["yellow"],
                    edgecolor=None,
                    alpha=0.6,
                    ax=ax)
    ax.set_title(title)

# Shared axis formatting: square cells, a tick every 5 units, no axis labels.
for ax in axs.flatten():
    ax.set_aspect('equal', 'box')
    ax.set_yticks(range(0, bins + 1, 5))
    ax.set_xticks(range(0, bins + 1, 5))
    ax.set_ylabel(None)
    ax.set_xlabel(None)

# Row headings, positioned in figure coordinates above each row.
fig.text(0.5, 0.98, 'Text Embeddings', ha='center', fontsize=16)
fig.text(0.5, 0.49, 'Discrete Coverage', ha='center', fontsize=16)

# hspace/wspace are fractions of the average Axes height/width, not points
# or percent; a value of 25 shrinks the panels to a sliver. 0.25 is the
# assumed intent -- adjust if a different gap is wanted.
plt.subplots_adjust(hspace=0.25, wspace=0.25)
plt.savefig("embeddings.pdf", bbox_inches="tight", format="pdf")