In [3]:
!pip install torch
!pip install transformers[sentencepiece]



In [4]:
# Import Necessary Libraries

from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import torch

In [5]:
# Load Dataset

github_dataset = load_dataset('lewtun/github-issues', split = 'train')
github_dataset

Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 3019
})

In [6]:
# Filtering out Null Pull requests and 0 comment length

github_dataset = github_dataset.filter(
    lambda x: (x['is_pull_request'] == False and len(x['comments']) > 0)
)

github_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 808
})

In [7]:
# Data manipulation and cleaning to remove unnecessary columns

columns = github_dataset.column_names
columns_to_keep = ['title','body','html_url','comments']
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)

github_dataset = github_dataset.remove_columns(columns_to_remove)
github_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 808
})

In [8]:
# Explode comments to get comments on each row

github_dataset.set_format('pandas')
df = github_dataset[:]
df['comments'][0].tolist()

comments_df = df.explode('comments', ignore_index = True)
comments_df.head(4)

Unnamed: 0,html_url,title,comments,body
0,https://github.com/huggingface/datasets/issues...,Protect master branch,"Cool, I think we can do both :)",After accidental merge commit (91c55355b634d0d...
1,https://github.com/huggingface/datasets/issues...,Protect master branch,@lhoestq now the 2 are implemented.\r\n\r\nPle...,After accidental merge commit (91c55355b634d0d...
2,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,Hi ! I guess the caching mechanism should have...,## Describe the bug\r\nAfter upgrading to data...
3,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,"If it's easy enough to implement, then yes ple...",## Describe the bug\r\nAfter upgrading to data...


In [9]:
# Convert DataFrame back to Dataset

comments_dataset = Dataset.from_pandas(comments_df)
comments_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 2964
})

In [10]:
# More preprocessing

comments_dataset = comments_dataset.map(
    lambda x: {'comment_length': len(x['comments'].split())}
)

comments_dataset = comments_dataset.filter(
    lambda x: x['comment_length'] > 15
)

comments_dataset

Map:   0%|          | 0/2964 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2964 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length'],
    num_rows: 2175
})

In [11]:
# Define concatenation function

def concatenate_text(examples):
  return {
      "text": examples["title"]
        + " \n "
        + examples["body"]
        + " \n "
        + examples["comments"]
  }

comments_dataset = comments_dataset.map(concatenate_text)

Map:   0%|          | 0/2175 [00:00<?, ? examples/s]

In [12]:
# Define model and tokenizer

checkpoint = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device) # Move the model to the selected device
print(f"Model moved to: {device}")

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

MPNetModel LOAD REPORT from: sentence-transformers/multi-qa-mpnet-base-dot-v1
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Model moved to: cuda


In [13]:
# Check whether processes are on CPU or GPU

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Device set to: {device}")

Device set to: cuda


In [14]:
# Define Getting embeddings for whole sentences

def cls_pooling(model_output):
  return model_output.last_hidden_state[:,0]


In [15]:
# Getting the actual Embeddings

def get_embeddings(text_list):
  encoded_input = tokenizer(
      text_list, padding=True, truncation=True, return_tensors='pt'
  )

  encoded_input = {k: v.to(device) for k,v in encoded_input.items()}
  model_output = model(**encoded_input)
  return cls_pooling(model_output)

In [16]:
# Testing the function

embeddings = get_embeddings(comments_dataset['text'][0])
embeddings.shape

torch.Size([1, 768])

In [17]:
# Mapping the function to the entire dataset

embeddings_dataset = comments_dataset.map(
    lambda x: {'embeddings': get_embeddings(x['text']).detach().cpu().numpy()[0]}
)

Map:   0%|          | 0/2175 [00:00<?, ? examples/s]

In [18]:
!pip install faiss-cpu



In [19]:
# Add FAISS Index

embeddings_dataset.add_faiss_index(column='embeddings')

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text', 'embeddings'],
    num_rows: 2175
})

In [20]:
# Define our question

question = "How can I load a dataset offline?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [21]:
# Define scores and samples

scores, samples = embeddings_dataset.get_nearest_examples(
    'embeddings', question_embedding, k=5
)

In [22]:
samples_df = pd.DataFrame.from_dict(samples)
samples_df['scores'] = scores
samples_df.sort_values('scores',ascending=False, inplace=True)

In [23]:
# Structuring the Answer

for _, row in samples_df.iterrows():
    print(f"COMMENT: {row.comments}")
    print(f"SCORE: {row.scores}")
    print(f"TITLE: {row.title}")
    print(f"URL: {row.html_url}")
    print("=" * 50)
    print()

COMMENT: Requiring online connection is a deal breaker in some cases unfortunately so it'd be great if offline mode is added similar to how `transformers` loads models offline fine.

@mandubian's second bullet point suggests that there's a workaround allowing you to use your offline (custom?) dataset with `datasets`. Could you please elaborate on how that should look like?
SCORE: 25.505016326904297
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824

COMMENT: The local dataset builders (csv, text , json and pandas) are now part of the `datasets` package since #1726 :)
You can now use them offline
```python
datasets = load_dataset('text', data_files=data_files)
```

We'll do a new release soon
SCORE: 24.555538177490234
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824

COMMENT: I opened a PR that allows to reload modules that have already been loaded once even if there's n