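## Migrate an embedding signal from the old format to the new format

For one dataset and embedding, this notebook writes a `.spans.pkl` file next to the existing `.npy` embeddings and rewrites `signal_manifest.json` with an empty `files` list.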


In [13]:
import os
import lilac as ll

# Which dataset to migrate.
namespace, dataset_name = 'local', 'twitter-support'
# Which field of the dataset, and which embedding computed over it.
path, embedding = 'text', 'cohere'

# Directory that holds the signal manifest and the embedding files for this signal.
signal_dir = os.path.join(*('data', 'datasets', namespace, dataset_name, path, embedding))

In [14]:
from lilac.data.dataset_duckdb import SignalManifest

# Location of the manifest describing this signal's files.
signal_manifest_path = os.path.join(signal_dir, 'signal_manifest.json')

# Read the raw JSON first, then parse it once the file handle is closed.
with open(signal_manifest_path) as manifest_file:
  manifest_json = manifest_file.read()
signal_manifest = SignalManifest.parse_raw(manifest_json)

# Shared filename stem used below for the `.npy` and `.spans.pkl` files.
embedding_filename_prefix = signal_manifest.embedding_filename_prefix

In [15]:
from lilac.schema import PathKey
import pickle
import numpy as np

# Select the embedding column for every row of the dataset, aliased to 'val'.
ds = ll.get_dataset(namespace, dataset_name)
emd_path = (path, embedding)
df = ds.select_rows([ll.Column(emd_path, alias='val')]).df()

# For each row, collect the (start, end) offsets of its spans, keyed by a 1-tuple of the row id.
all_spans: list[tuple[PathKey, list[tuple[int, int]]]] = []
num_spans = 0
# Iterate the two needed columns directly rather than via `iterrows()`, which builds a Series
# per row. The loop variable is deliberately not called `id`: a module-level `id` would shadow
# the builtin for the rest of the kernel session.
for rowid, span_items in zip(df['__rowid__'], df['val']):
  row_key: PathKey = (rowid,)
  spans: list[tuple[int, int]] = [
    (int(item['__value__']['start']), int(item['__value__']['end'])) for item in span_items
  ]
  num_spans += len(spans)
  all_spans.append((row_key, spans))

# Make sure the embeddings length matches the number of spans before anything is written.
embeddings = np.load(
  os.path.join(signal_dir, f'{embedding_filename_prefix}.npy'), allow_pickle=False)
assert len(embeddings) == num_spans, (
  f'Embedding count ({len(embeddings)}) does not match the number of spans ({num_spans}).')

# Persist the spans next to the embeddings file, using the same filename prefix.
spans_fname = os.path.join(signal_dir, f'{embedding_filename_prefix}.spans.pkl')
with open(spans_fname, 'wb') as f:
  pickle.dump(all_spans, f)
print('Wrote', spans_fname)

# Empty the manifest's `files` list and rewrite the manifest in place.
# NOTE(review): presumably the new format no longer reads span values from those files -- confirm.
signal_manifest.files = []
with open(signal_manifest_path, 'w') as f:
  f.write(signal_manifest.json(exclude_none=True, indent=2))

print('Edited', signal_manifest_path)

Wrote data/datasets/local/twitter-support/text/cohere/embeddings-00000-of-00001.spans.pkl
Edited data/datasets/local/twitter-support/text/cohere/signal_manifest.json
