In [11]:
# Create text embeddings
from transformers import AutoTokenizer, AutoModel
import torch

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

def get_embedding(text):
    """Get embedding for a single text string."""
    result = get_embeddings_batch({"text": [text]})
    return result["embedding"][0]

In [12]:
from datasets import load_from_disk

embeddings_dataset = load_from_disk("../data/hf_github_issues/embeddings_dataset")

# Add FAISS index
import faiss
embeddings_dataset.add_faiss_index(column="embedding")




  0%|          | 0/9 [00:00<?, ?it/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text', 'embedding'],
    num_rows: 8050
})

In [15]:
# Simple search examples
question = "How can I load a dataset offline?"
question_embedding = get_embedding(question)  # No brackets, no .cpu().detach().numpy()
question_embedding

[-0.5819831490516663,
 0.187882199883461,
 -0.1997627317905426,
 0.25605908036231995,
 0.04811717942357063,
 0.1493137627840042,
 0.4367828369140625,
 0.1334896981716156,
 0.21081118285655975,
 0.1817542314529419,
 -0.10582804679870605,
 -0.04561541974544525,
 0.017933575436472893,
 0.33569419384002686,
 -0.06685853004455566,
 0.15630897879600525,
 -0.032135963439941406,
 0.232852503657341,
 -0.08596356958150864,
 -0.07701358199119568,
 0.03681779280304909,
 -0.1416911780834198,
 -0.10878720879554749,
 -0.09768678992986679,
 -0.0923842117190361,
 -0.2894090414047241,
 -0.021543769165873528,
 0.33573198318481445,
 -0.21782493591308594,
 -0.03038995899260044,
 0.24877575039863586,
 0.23496510088443756,
 0.30833733081817627,
 0.251161128282547,
 -0.00010162172111449763,
 0.04864766448736191,
 -0.17329850792884827,
 -0.1872214376926422,
 -0.37341925501823425,
 -0.09212183952331543,
 -0.4507195055484772,
 -0.1154232919216156,
 0.021488472819328308,
 -0.48447713255882263,
 -0.140022054314613

In [17]:
import numpy as np
import pandas as pd

# Convert to numpy array
question_embedding = np.array(get_embedding(question), dtype=np.float32)

scores, samples = embeddings_dataset.get_nearest_examples(
    "embedding", question_embedding, k=5
)

samples_df = pd.DataFrame(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

for _, row in samples_df.iterrows():
    print(f"Score: {row['scores']:.4f}")
    print(f"TITLE: {row['title']}")
    print(f"URL: {row['html_url']}")
    print(f"Comment: {row['comments']}")
    print("-" * 80)

Score: 25.5050
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824
Comment: Requiring online connection is a deal breaker in some cases unfortunately so it'd be great if offline mode is added similar to how `transformers` loads models offline fine.

@mandubian's second bullet point suggests that there's a workaround allowing you to use your offline (custom?) dataset with `datasets`. Could you please elaborate on how that should look like?
--------------------------------------------------------------------------------
Score: 24.5555
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824
Comment: The local dataset builders (csv, text , json and pandas) are now part of the `datasets` package since #1726 :)
You can now use them offline
```python
datasets = load_dataset('text', data_files=data_files)
```

We'll do a new release soon
--------------------------------------------------------