In [1]:
import os
import glob
import humap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pandas as pd

from tqdm.notebook import tqdm


def load_parquet(parquet_path):
    """Load the parquet file at ``parquet_path`` with pandas and return the result."""
    frame = pd.read_parquet(parquet_path)
    return frame

def print_parquet_data(data):
    """Print the title, embedding (with its type) and DOI of the first row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame providing ``title``, ``embeddings`` and ``doi`` columns.

    Only the first row is printed, followed by a blank line; an empty frame
    prints nothing.
    """
    # head(1) keeps the "first row only" preview without a loop-and-break.
    for _, row in data.head(1).iterrows():
        print("Title:", row['title'])
        # Report the type of the stored embedding itself; the previous
        # type(['embeddings']) inspected a literal list and always said `list`.
        print("Embedding:", row['embeddings'], type(row['embeddings']))
        print("DOI:", row['doi'])
        print()

def read_arxiv_titles(sample_size=500_000, seed=None):
    """Load the arXiv title embeddings and draw a random sample of them.

    Every ``arxiv_titles/titles_*.parquet`` file under the base directory is
    read; each file must provide ``title`` and ``embeddings`` columns.

    Parameters
    ----------
    sample_size : int
        Number of rows to sample without replacement. When fewer rows are
        available, all of them are returned (in random order).
    seed : int or None
        Seed for the sampling. ``None`` (the default) keeps drawing from
        NumPy's global random state, as before.

    Returns
    -------
    embeddings : numpy.ndarray
        The sampled embeddings, one row per sampled title.
    prompts : pandas.DataFrame
        Single ``prompt`` column with the sampled titles, in the same row
        order as ``embeddings``.

    Raises
    ------
    FileNotFoundError
        If the parquet files yield no rows at all (e.g. no file matched).
    """
    directory_path = os.path.dirname(os.path.abspath('./../'))
    print(directory_path)

    pattern = os.path.join(directory_path, 'arxiv_titles/titles_*.parquet')
    arxiv_titles = []
    arxiv_embeddings = []
    for file_path in tqdm(glob.glob(pattern)):
        data = pd.read_parquet(file_path)
        # Extend column-wise: walking the frame with iterrows() builds a
        # Series per row and is needlessly slow for large files.
        arxiv_titles.extend(data['title'].tolist())
        arxiv_embeddings.extend(data['embeddings'].tolist())

    if not arxiv_embeddings:
        # Fail with a readable message instead of an IndexError further down.
        raise FileNotFoundError(f"No rows found in parquet files matching {pattern!r}")

    available = len(arxiv_embeddings)
    if sample_size > available:
        # choice(..., replace=False) raises when asked for more than exists.
        print(f"Requested {sample_size} samples but only {available} rows are available; using all of them.")
        sample_size = available

    # The module-level np.random and RandomState share the same choice() API.
    rng = np.random if seed is None else np.random.RandomState(seed)
    indices = rng.choice(available, sample_size, replace=False)

    embeddings = np.array(arxiv_embeddings)[indices, :]
    prompts = pd.DataFrame({'prompt': np.array(arxiv_titles)[indices]})

    return embeddings, prompts
    
# Cache the sampled dataset on disk so the parquet scan only has to run once.
# The cache is keyed on the two files themselves rather than on the directory:
# a directory left behind by an interrupted first run would otherwise send
# every later run into the load branch, where np.load fails on the missing file.
if (os.path.exists("use-case-dataset/use_case_embeddings.npy")
        and os.path.exists("use-case-dataset/prompts.csv")):
    dataset = np.load("use-case-dataset/use_case_embeddings.npy")
    dataset_prompts = pd.read_csv("use-case-dataset/prompts.csv")
else:
    # exist_ok: the directory may already be there without its contents.
    os.makedirs("use-case-dataset", exist_ok=True)

    dataset, dataset_prompts = read_arxiv_titles()

    np.save("use-case-dataset/use_case_embeddings.npy", dataset)
    dataset_prompts.to_csv("use-case-dataset/prompts.csv", index=False)

In [2]:
from sklearn.cluster import KMeans

def plot_with_mpl(x, y, s, c, alpha=0.3):
    """Show a matplotlib scatter plot with hidden axes and equal aspect ratio.

    ``x`` and ``y`` are the point coordinates, ``s`` the marker sizes, ``c``
    the colour values handed to ``scatter`` (drawn with the ``tab10``
    colormap) and ``alpha`` the marker opacity. Markers have no edge colour.
    """
    plt.figure()
    marker_style = dict(c=c, alpha=alpha, s=s, cmap="tab10", edgecolor="none")
    plt.scatter(x, y, **marker_style)
    axes = plt.gca()
    axes.set_aspect("equal")
    plt.axis("off")
    plt.show()

def get_sample_docs(x, y, docs, num_docs=10):
    """Pick up to ``num_docs`` representative documents from a 2-D point set.

    The points ``(x, y)`` are clustered with k-means and, for every cluster,
    the document whose point lies closest to the cluster centre is returned.

    Parameters
    ----------
    x, y : array-like
        Coordinates of the points, one entry per document.
    docs : sequence
        Documents aligned with ``x`` and ``y``; indexed by integer position.
    num_docs : int
        Requested number of representatives. Capped at the number of points,
        because k-means cannot form more clusters than there are samples.

    Returns
    -------
    list
        One document per cluster; empty when there are no points (for
        example when nothing is selected) or ``num_docs`` is not positive.
    """
    points = np.vstack((x, y)).T

    # Small or empty selections would otherwise make KMeans raise.
    n_clusters = min(num_docs, len(points))
    if n_clusters <= 0:
        return []

    km = KMeans(random_state=0, n_clusters=n_clusters).fit(points)

    representatives = []
    for center in km.cluster_centers_:
        # Distance of every point to this centre in one vectorised step;
        # argmin returns the first minimum, matching a strict "<" scan.
        distances = np.linalg.norm(points - center, axis=1)
        representatives.append(docs[int(np.argmin(distances))])

    return representatives
    
def prepare_data(x, y, influence, klass, documents):
    """Assemble the per-point values into a single DataFrame.

    The returned frame has the columns ``x``, ``y``, ``influence``, ``klass``
    and ``documents``, in that order, each filled from the argument of the
    same name.
    """
    column_names = ("x", "y", "influence", "klass", "documents")
    column_values = (x, y, influence, klass, documents)
    return pd.DataFrame(dict(zip(column_names, column_values)))

In [3]:
# Show the shapes of the embedding matrix and of the prompts table.
dataset.shape, dataset_prompts.shape

((500000, 768), (500000, 1))

In [4]:
# Preview the first rows of the prompts table.
dataset_prompts.head()

Unnamed: 0,prompt
0,Communities in C.elegans connectome through th...
1,Coded Computing for Federated Learning at the ...
2,SRF-GAN: Super-Resolved Feature GAN for Multi-...
3,The NASA/IPAC Teacher Archive Research Program...
4,Modeling Risk via Realized HYGARCH Model


In [5]:
from sklearn.preprocessing import normalize

# Rescale the embeddings with scikit-learn's normalize (default arguments).
# The result replaces `dataset`, so every following cell sees the rescaled values.
dataset = normalize(dataset)

In [None]:
# Configure HUMAP with "Random" initialisation, the "HNSW" k-NN algorithm and verbose output.
# NOTE(review): np.array([0.2, 0.2]) presumably configures the two hierarchy levels
# built on top of the data — confirm its exact meaning against the HUMAP documentation.
reducer = humap.HUMAP(np.array([0.2, 0.2]), init="Random", verbose=True, knn_algorithm="HNSW")
# Fit the reducer on the embedding matrix.
reducer.fit(dataset)

In [None]:
# Index arrays for hierarchy levels 2 and 1, as reported by the reducer.
# NOTE(review): presumably positions in the original dataset, going by the
# method name `original_indices` — confirm.
indices_2 = reducer.original_indices(2)
indices_1 = reducer.original_indices(1)
# Level 0 is represented by every row of `dataset`: 0 .. len(dataset) - 1.
indices_0 = np.arange(len(dataset)).astype(np.int_)

In [None]:
# Embedding for hierarchy level 2; later cells read column 0 as x and column 1 as y.
embeddings2 = reducer.transform(2)

In [None]:
# Gather the level-2 coordinates, influence values and prompt texts into one frame.
level2_x = embeddings2[:, 0]
level2_y = embeddings2[:, 1]
level2_influence = reducer.influence(2)
# Every point gets the same class value (0).
level2_klass = np.zeros(embeddings2.shape[0])
level2_prompts = dataset_prompts.loc[indices_2]['prompt'].values
df_2 = prepare_data(level2_x, level2_y, level2_influence, level2_klass, level2_prompts)

In [None]:
import jscatter

# Scatter plot of the level-2 frame: x/y position, colour by class, size by influence.
scatter = jscatter.Scatter(data=df_2, x='x', y='y', opacity=0.3)
scatter.color(by='klass')
# Point size follows the `influence` column, using map=[4, 50].
scatter.size(by='influence', map=[4, 50])
scatter.height(860)
scatter.width(1020)
# Last expression of the cell, so the plot is what the cell displays.
scatter.show()

In [12]:
# Look up the rows selected in `scatter` once, then pull representative documents from them.
selected_level2 = df_2.loc[scatter.selection()]
get_sample_docs(
    selected_level2['x'].values,
    selected_level2['y'].values,
    selected_level2['documents'].values,
    num_docs=10,
)

['Imagine a long Poem, where Serena Williams stumble upon a bleak xenon in Fukuoka, with dialogue and atmosphere inspired by P.D. James.',
 'Compose a long Poem set in Húsavík, where a ashamed tectonic encounters Benjamin Franklin, inspired by the works of Gertrude Stein.',
 'Compose a Poem set in Húsavík, where a cooked tectonic encounters Steve Jobs, inspired by the works of Bram Stoker.',
 'Imagine a Poem, where Charlie Chaplin stumble upon a embellished lilac in Montreal, with dialogue and atmosphere inspired by Hilary Mantel.',
 'Imagine a long Poem, where Frank Sinatra stumble upon a far nymph in Fukuoka, with dialogue and atmosphere inspired by Shakespeare.',
 'Craft a Poem in which Thomas Hobbes explore Jyväskylä and come across a far xenon, with literary elements drawn from Sigrid Undset.',
 'Create a Poem in which Sophocles encounter a feisty kiosk while traversing Jyväskylä, drawing inspiration from Sigrid Undset.',
 'Compose a long Poem set in Hanoi, where a devoted lilac e

In [13]:
# Drill down from the points currently selected in the level-2 scatter plot.
reducer.set_fixing_term(0.01)
indices_selected = scatter.selection()
# Level-2 coordinates of the selected points, handed to the reducer via fix_datapoints.
points_selected = embeddings2[indices_selected]
reducer.fix_datapoints(points_selected)

# With `indices` given, transform() is unpacked into four values here.
# NOTE(review): `indices_cluster1` is later used to index `indices_1`, so it
# presumably refers to level-1 points — confirm against the HUMAP documentation.
embedding_cluster1, y_cluster1, indices_cluster1, indices_fixed = reducer.transform(2, indices=indices_selected)

In [14]:

# Translate the drilled-down points into prompt rows and collect everything in one frame.
indices_prompt = indices_1[indices_cluster1]
cluster1_x = embedding_cluster1[:, 0]
cluster1_y = embedding_cluster1[:, 1]
cluster1_influence = reducer.influence_selected()
# Every point gets the same class value (0).
cluster1_klass = np.zeros(embedding_cluster1.shape[0])
cluster1_prompts = dataset_prompts.loc[indices_prompt]['prompt'].values
df_1_selected = prepare_data(cluster1_x, cluster1_y, cluster1_influence, cluster1_klass, cluster1_prompts)

In [15]:
# Scatter plot of the drilled-down frame, configured like the level-2 plot.
scatter1 = jscatter.Scatter(data=df_1_selected, x='x', y='y', opacity=0.3)
# scatter1.legend(True)
scatter1.color(by='klass')
# Point size follows the `influence` column, using map=[4, 50].
scatter1.size(by='influence', map=[4, 50])
scatter1.height(860)
scatter1.width(1020)
# Last expression of the cell, so the plot is what the cell displays.
scatter1.show()

HBox(children=(VBox(children=(Button(button_style='primary', icon='arrows', layout=Layout(width='36px'), style…

In [16]:
# Look up the rows selected in `scatter1` once, then pull representative documents from them.
selected_level1 = df_1_selected.loc[scatter1.selection()]
get_sample_docs(
    selected_level1['x'].values,
    selected_level1['y'].values,
    selected_level1['documents'].values,
    num_docs=10,
)

['Create a Poem in which Thomas Hobbes encounter a bleak orchard while traversing Hanoi, drawing inspiration from Boris Pasternak.',
 'Write a Poem that follows the adventures of Marie Curie in Petra as they seek a devoted xenon',
 'Create a Poem in which Steve Jobs encounter a feisty elevator while traversing Montreal, drawing inspiration from Shakespeare.',
 'Imagine a Poem, where Elon Musk stumble upon a bashful orchard in Tashkent, with dialogue and atmosphere inspired by Arto Paasilinna.',
 'Write me a Poem about a content xenon who meets Pyotr Ilyich Tchaikovsky in Húsavík in the style of Sigrid Undset',
 'Create a Poem in which Franz Kafka encounter a ashamed ball while traversing Húsavík, drawing inspiration from Heinrich Böll.',
 'Weave a Poem where Serena Williams uncovers a feisty ball in Varanasi, emulating the style of Vladimir Nabokov.',
 'Imagine a long Poem, where Dr. Seuss stumble upon a ashamed cup in Jyväskylä, with dialogue and atmosphere inspired by Boris Pasternak

Unnamed: 0,x,y,influence,klass,documents
159,4.653402,0.459190,5,0.0,<p>I am developing a storm topology locally. I...
1537,5.478681,0.452874,7,0.0,<p>I may have discovered one of the problems i...
1213,6.036171,2.741712,1,0.0,<p><strong>Background</strong></p>\n<p>I'm a n...
4483,6.511709,1.812715,2,0.0,<p>Suppose I have this table:</p>\n\n<pre><cod...
1328,6.382285,2.222886,3,0.0,<p>For every version of Eclipse I've used prio...
...,...,...,...,...,...
381,1.068406,-0.357208,1,0.0,<p>Is it possible to automate Log Shipping Fai...
4084,0.925639,-0.337113,7,0.0,<p>I have some sql that I want to pass into a ...
4681,0.969436,-0.341389,5,0.0,What are the advantages of using a solar panel...
1658,0.993729,-0.353595,1,0.0,<p>I am using Java ProcessBuilder to pass a St...
