In [None]:
import pandas as pd

# Location of the input CSV, relative to the notebook's working directory.
file_path = "../data/SAC_tokenized.csv"

# Read the whole file into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

In [None]:
# Measure each prompt in whitespace-separated words so that short prompts can
# be filtered out (the filter itself is applied in the following cell).
def _count_words(value):
    """Return the number of whitespace-separated tokens in ``str(value)``."""
    return len(str(value).split())


df["length"] = df["prompt"].map(_count_words)
df["length"].describe()

In [None]:
# Keep only the rows whose prompt has at least three words, then show how many remain.
df = df.loc[df["length"].ge(3)]
df.shape[0]

In [None]:
# Collect the distinct prompts in first-seen order.
# A plain set() is avoided on purpose: string hashing is randomized per
# interpreter session, so list(set(...)) yields a different order on every
# kernel restart. The row order determines the ids assigned later and which rows
# the seeded sample picks, so it has to be stable for the notebook to be
# reproducible. dict.fromkeys de-duplicates while preserving insertion order.
prompts = df["prompt"].tolist()
prompts = list(dict.fromkeys(prompts))

In [None]:
# Spot-check: display the first prompt in the de-duplicated list.
prompts[0]

In [None]:
import re
# Only keep ASCII letters, digits, whitespace, and commas in each prompt.
# Cleaning can make previously distinct prompts identical (e.g. "a cat!" and
# "a cat"), and can reduce a prompt made only of stripped characters to blank
# text, so after cleaning we:
#   * drop prompts that are empty or whitespace-only, and
#   * de-duplicate again, keeping first-seen order (dict.fromkeys preserves
#     insertion order, unlike set()).
# Re-running this cell is idempotent.
prompts = list(dict.fromkeys(
    cleaned
    for cleaned in (re.sub(r'[^a-zA-Z0-9\s,]', '', p) for p in prompts)
    if cleaned.strip()
))

In [None]:
import random

# Spot-check up to 10 distinct prompts.
# random.sample draws without replacement (random.choices could show the same
# prompt several times), the min() guard avoids an error when fewer than 10
# prompts exist, and a dedicated seeded Random instance makes the preview
# repeatable without touching the global random state.
random.Random(42).sample(prompts, k=min(10, len(prompts)))

In [None]:
# Build the prompt log: one row per prompt, with a sequential integer id
# (0 .. len(prompts) - 1) that matches the default row index.
ids = [*range(len(prompts))]
df_prompt = pd.DataFrame({"id": ids, "prompt": prompts})

# Draw a reproducible 2% random sample of rows (fixed random_state) and flag
# them: split_id is 1 for sampled rows and 0 for all the others.
split_ids = df_prompt.sample(frac=0.02, random_state=42).index
in_split = df_prompt.index.isin(split_ids)
df_prompt["split_id"] = in_split.astype(int)

df_prompt.head()


In [None]:
# Check number of prompts in each split
# (split_id is 1 for the rows picked by the 2% sample, 0 for the rest).
df_prompt["split_id"].value_counts()


In [None]:
# Persist the prompt log (id, prompt, split_id) as a tab-separated file;
# the DataFrame index is not written.
df_prompt.to_csv(
    path_or_buf="../data/prompt_log.tsv",
    index=False,
    sep="\t",
)

In [None]:
import numpy as np


# Load the saved embedding arrays from the .npz archive.
# np.load on an .npz returns a lazy NpzFile that keeps the underlying file
# open, so it is used as a context manager to guarantee the handle is closed.
# Each array is fully read into memory when indexed, so both remain usable
# after the block; `data` itself is closed once the block exits.
with np.load("../out/ada2_prompt.npz") as data:
    high_dim_embeddings = data["high_dim_embeddings"]
    low_dim_embeddings = data["low_dim_embeddings"]

# Sanity-check the array shapes.
print(high_dim_embeddings.shape)
print(low_dim_embeddings.shape)


In [None]:
import pandas as pd

# Reload the tab-separated prompt log from disk and report its row count.
# NOTE(review): the plot below pairs these rows positionally with the loaded
# embeddings, so the two presumably need the same length and order — confirm.
df_prompt_subset = pd.read_csv("../data/prompt_log.tsv", delimiter="\t")
print(df_prompt_subset.shape[0])

In [None]:
# Use plotly to visualize the low_dim_embeddings (2D)
import plotly.express as px

# Interactive scatter of the 2D embeddings on a square 800x800 canvas.
# The prompt text is attached as hover data so it is available to the hover
# template as customdata[0].
x_coords = low_dim_embeddings[:, 0]
y_coords = low_dim_embeddings[:, 1]

fig = px.scatter(
    x=x_coords,
    y=y_coords,
    hover_data={"prompt": df_prompt_subset["prompt"]},
    title="2D Embeddings of Prompts",
    width=800,
    height=800,
    # NOTE(review): size_max is documented as applying when `size` is given;
    # no `size` is passed here, so it likely has no effect — confirm.
    size_max=3,
)

# Use small fixed-size dots and show only the prompt text in the hover box.
fig.update_traces(
    marker={"size": 2},
    hovertemplate='<b>Prompt:</b> %{customdata[0]}',
)
fig.show()