In [1]:
from datasets import load_dataset, Dataset
import faiss


#data of github issues
issues_dataset = load_dataset("lewtun/github-issues", split="train")
issues_dataset

  from .autonotebook import tqdm as notebook_tqdm
Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 3019
})

In [2]:
#exclude all data that are not extracted by pull request and no comment
issues_dataset = issues_dataset.filter(lambda x: (x["is_pull_request"] == False and len(x["comments"]) > 0))

issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 808
})

In [3]:
#remove useless columns
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
print(columns_to_remove)

issues_dataset = issues_dataset.remove_columns(columns_to_remove)
issues_dataset

{'events_url', 'author_association', 'assignees', 'labels', 'is_pull_request', 'node_id', 'created_at', 'repository_url', 'closed_at', 'comments_url', 'id', 'pull_request', 'active_lock_reason', 'locked', 'timeline_url', 'milestone', 'updated_at', 'number', 'state', 'labels_url', 'performed_via_github_app', 'assignee', 'user', 'url'}


Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 808
})

In [4]:
#data process via pandas
issues_dataset.set_format("pandas")
df = issues_dataset[:]

#see first row
df["comments"][0].tolist()

['Cool, I think we can do both :)',
 '@lhoestq now the 2 are implemented.\r\n\r\nPlease note that for the the second protection, finally I have chosen to protect the master branch only from **merge commits** (see update comment above), so no need to disable/re-enable the protection on each release (direct commits, different from merge commits, can be pushed to the remote master branch; and eventually reverted without messing up the repo history).']

In [5]:
#to get one row for one comment instead of one row for 4 comments
comments_df = df.explode("comments", ignore_index=True)
comments_df.head(4)

Unnamed: 0,html_url,title,comments,body
0,https://github.com/huggingface/datasets/issues...,Protect master branch,"Cool, I think we can do both :)",After accidental merge commit (91c55355b634d0d...
1,https://github.com/huggingface/datasets/issues...,Protect master branch,@lhoestq now the 2 are implemented.\r\n\r\nPle...,After accidental merge commit (91c55355b634d0d...
2,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,Hi ! I guess the caching mechanism should have...,## Describe the bug\r\nAfter upgrading to data...
3,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,"If it's easy enough to implement, then yes ple...",## Describe the bug\r\nAfter upgrading to data...


In [6]:
#convert back to dataset
comments_dataset = Dataset.from_pandas(comments_df)
comments_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 2964
})

In [7]:
#check comment length
comments_dataset = comments_dataset.map(lambda x: {"comment_length": len(x["comments"].split())})

#make sure all comments are more than 15 words
comments_dataset = comments_dataset.filter(lambda x: x["comment_length"] > 15)
comments_dataset

Map: 100%|██████████| 2964/2964 [00:00<00:00, 20084.39 examples/s]
Filter: 100%|██████████| 2964/2964 [00:00<00:00, 117765.52 examples/s]


Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length'],
    num_rows: 2175
})

In [8]:
#combine title, description and comments into one text
def concatenate_text(examples):
    
    return {"text":
            examples["title"] + "\n" + examples["body"] + "\n" + examples["comments"]
            }

In [9]:
comments_dataset = comments_dataset.map(concatenate_text)
comments_dataset

Map: 100%|██████████| 2175/2175 [00:00<00:00, 9374.69 examples/s]


Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text'],
    num_rows: 2175
})

In [10]:
from transformers import AutoTokenizer, AutoModel
import torch

#sentence-transformers to used for assymmetric semantic search (short query -> long doc)
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

device = torch.device("cuda")
model.to(device)
device

device(type='cuda')

In [11]:
#function to collect the [CLS] token of the last hidden state of the tokenizer
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0] #first token is [CLS] token


In [12]:
#to get embedding
def get_embeddings(text_list):

    encoded_input = tokenizer(text_list, padding=True, truncation=True, return_tensors="pt")
    
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)

    return cls_pooling(model_output)

In [13]:
#try with one text entry
embedding = get_embeddings(comments_dataset["text"][0])
embedding.shape

torch.Size([1, 768])

In [14]:
#apply to each row
embeddings_dataset = comments_dataset.map(lambda x: {"embeddings" : get_embeddings(x["text"]).detach().cpu().numpy()[0]}) #to numpy as FAISS indexing needs it

Map: 100%|██████████| 2175/2175 [01:20<00:00, 26.93 examples/s]


In [15]:
embeddings_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text', 'embeddings'],
    num_rows: 2175
})

In [16]:
#FAISS finds the most similar embeddings to input embeddings by a special data structure called index
embeddings_dataset.add_faiss_index(column="embeddings")
embeddings_dataset

100%|██████████| 3/3 [00:00<00:00, 272.44it/s]


Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text', 'embeddings'],
    num_rows: 2175
})

In [17]:
index = faiss.IndexFlatL2(4)
print(index.is_trained)

True


In [18]:
#perform queries on index by doing nearest neigbour lookup
question = "How can I load a dataset offline?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [19]:
#compare with the whole corpus  to find the nost similar embeddings
scores, samples = embeddings_dataset.get_nearest_examples("embeddings", question_embedding, k=5)

In [20]:
#sort them via pandas
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [21]:
#iterate over first few rowa
for _, row in samples_df.iterrows():
    print(f"COMMENT: {row.comments}")
    print(f"SCORE: {row.scores}")
    print(f"TITLE: {row.title}")
    print(f"URL: {row.html_url}")
    print("=" * 50)
    print()

COMMENT: Requiring online connection is a deal breaker in some cases unfortunately so it'd be great if offline mode is added similar to how `transformers` loads models offline fine.

@mandubian's second bullet point suggests that there's a workaround allowing you to use your offline (custom?) dataset with `datasets`. Could you please elaborate on how that should look like?
SCORE: 25.50501251220703
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824

COMMENT: The local dataset builders (csv, text , json and pandas) are now part of the `datasets` package since #1726 :)
You can now use them offline
```python
datasets = load_dataset('text', data_files=data_files)
```

We'll do a new release soon
SCORE: 24.555545806884766
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824

COMMENT: I opened a PR that allows to reload modules that have already been loaded once even if there's no interne