In [1]:
# !wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
# !wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz
# !gzip -dkv SQuAD_it-*.json.gz
# # !rm *gz

In [2]:
from datasets import load_dataset, Dataset, DatasetDict

# Load local and remote files

In [None]:
# Local
# One can also pass the .gz/.zip file for auto-decompression
data_files = {"train": "SQuAD_it-train.json", "test": "SQuAD_it-test.json"}
squad_it_dataset = load_dataset('json', data_files=data_files, field="data")

# # Online
# url = "https://github.com/crux82/squad-it/raw/master/"
# data_files = {
#     "train": url + "SQuAD_it-train.json.gz",
#     "test": url + "SQuAD_it-test.json.gz",
# }
# squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

In [None]:
squad_it_dataset

# Slicing, dicing, selecting, sorting, mapping, filtering, etc.

In [None]:
# !wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
# !unzip drugsCom_raw.zip

In [3]:
data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}
# \t is the tab character in Python
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

Found cached dataset csv (C:/Users/yuwei/.cache/huggingface/datasets/csv/default-024c4b2ee4d58497/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


  0%|          | 0/2 [00:00<?, ?it/s]

## Slicing and shuffle
`select` returns a `Dataset`; slicing using [:10] returns a dict

In [None]:
drug_sample = drug_dataset['train'].shuffle(seed=42).select(range(10))
print(type(drug_sample), type(drug_sample[:2]))

## Create, rename and remove columns

.map() needs a function that returns a dict

In [None]:
# Rename columns
# Check if the value of 'Unnamed: 0' is unique, if so it's an ID
for split in drug_dataset:
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique('Unnamed: 0'))
    
drug_dataset = drug_dataset.rename_column('Unnamed: 0', 'patient_id')
drug_dataset

In [None]:
# Add a column based on existing columns
def compute_review_length(example):
    return {"review_length": len(example['review'].split())}

drug_dataset = drug_dataset.map(compute_review_length)
drug_dataset['train'][0]

In [None]:
# Add a column based on new data
drug_dataset['train'].add_column('tmp', range(len(drug_dataset['train'])))

## Map and filter

In [None]:
drug_dataset = drug_dataset.filter(lambda x: x['condition'] is not None)

In [None]:
drug_dataset = drug_dataset.map(lambda x: {'condition': x['condition'].lower()})

## Sort...

In [None]:
# Sort reversely (descending)
drug_dataset['train'].sort('review_length', reverse=True)[-1]

## Train-test split

In [None]:
drug_dataset_train_val = drug_dataset['train'].train_test_split(train_size=0.8, seed=42)
drug_dataset_train_val['validation'] = drug_dataset_train_val.pop('test')

In [None]:
drug_dataset_train_val

# Map with batch

In [None]:
#unescape html characters
import html
text = "I&#039;m a transformer called BERT"
html.unescape(text) 

# Since it's batched, the input has to be a slice of dataset instead of one row
drug_dataset = drug_dataset.map(
    lambda x: {'review': [html.unescape(o) for o in x['review']]},
    batched=True,
)

In [None]:
checkpoint = "bert-base-cased"

## Batch the data

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(x):
    return tokenizer(x['review'], truncation=True)

In [None]:
# tokenizer alaready takes one row or a slice of dataset, so I don't have to pass a slice
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

## Use num_proc to make use of parallelization of .map() itself

All variables/functions have to be declared inside the function passed to map, otherwise, there will be an "undefined" error. 
One way is to declare everything in the function but it's too slow


In [None]:
from transformers import AutoTokenizer
slow_tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)

In [None]:
# See the error
def slow_tokenize_function(examples):
    return slow_tokenizer(examples["review"], truncation=True)
tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=True, num_proc=8)

A solution is to use the "partial" function  
Another solution is to use a wrapper  
https://discuss.huggingface.co/t/tokenizer-is-not-defined/39231  

**ERROR**  
Either solution gives me the following error no matter I use slow or fast tokenizer  
"AttributeError: Can't get attribute 'Dataset' on <module 'datasets.arrow_dataset'"  
The error disappears when I set num_proc=1 or num_proc=2, and only occurs at the end of the processing, so something wrong at the executor as opposed to the workers?


In [None]:
from functools import partial
def tokenize_function(tokenizer, examples):
    return tokenizer(examples["review"], truncation=True)
partial_tokenize_function = partial(tokenize_function, tokenizer)
tokenized_dataset = drug_dataset.map(partial_tokenize_function, batched=True, num_proc=2) 

In [None]:
class TokenizerWrapper:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    
    def tokenize_function(self, examples):
        return self.tokenizer(examples["review"], truncation=True)

tokenizer_wrapper = TokenizerWrapper(tokenizer)
%time tokenized_dataset = drug_dataset.map(
    tokenizer_wrapper.tokenize_function, batched=True, num_proc=2
)

## Extend samples when mapping the tokenizer
One can keep truncated text by setting `return_overflowing_tokens=True`. This results in extra element in fields such as `input_ids` in the output of `tokenizer`, which can cause error since the number of elements of `input_ids` are increased and conflict with other fields. This needs special treatment

In [None]:
def tokenize_and_split(examples):
    return tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
[len(i) for i in tokenize_and_split(drug_dataset['train'][1])['input_ids']]

### ERROR!

In [None]:
# ERROR! - some examples have input_ids as 1-element-list, some have multiple
tokenize_dataset = drug_dataset.map(tokenize_and_split, batched=True)

### SOLUTION1 - remove_columns

In [None]:
# SOLUTION! remove_columns
# This remove columns specified in the original dataset, 
# and only returns the result from the tokenizer
tokenized_dataset = drug_dataset.map(
    tokenize_and_split, batched=True, remove_columns=drug_dataset["train"].column_names
)

In [None]:
# Examples with the same 'overflow_to_sample_mapping' belong to the same sample before mapping
samples = tokenized_dataset['train'][:6]
print([len(x) for x in samples['input_ids']])
print(samples['overflow_to_sample_mapping'])

### SOLUTION 2 - Map back from the tokenized to the original
Use `overflow_to_sample_mapping` to map back from the tokenized dataset to the 
original dataset. Duplicate original examples the text of which is truncated


In [None]:
def tokenize_and_split(examples):
    # For batched=True, the input contains multiple rows and thus is a dict whose values are lists
    result = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # Extract mapping between new and old indices
    sample_map = result.pop("overflow_to_sample_mapping")
    # Extend the length of each element in result, with the value from
    # the original example
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result

In [None]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)

In [None]:
print(tokenized_dataset, drug_dataset)
# tmp = tokenized_dataset['train'][:20]
# print([len(x) for x in tmp['input_ids']])
# print([tmp['review_length']])

# to and from Pandas

## set_format - display only

In [None]:
# This only changes the __getitem__() method when displaying the dataset, 
# and doesn't change the underlying format. So if one wants to use the data as 
# pandas, one has to select all elements
drug_dataset.set_format('pandas')
drug_dataset['train'][:2]
df_train = drug_dataset['train'][:]

In [None]:
df_train.head(2)

## to_pandas

In [None]:
df_train = drug_dataset['train'].to_pandas()

In [None]:
frequency = (
    df_train['condition']
    .value_counts()
    .reset_index()
    .rename(columns={'index': 'condition', 'condition': 'frequency'})
)

freq_dataset = Dataset.from_pandas(frequency)
freq_dataset

# Saving a dataset

## Arrow - save_to_disk/load_from_disk

In [None]:
from datasets import DatasetDict
# Arraw is convinent for use. It saves to a folder with. 
tmp = DatasetDict()
for i in drug_dataset:
    tmp[i] = drug_dataset[i].select(range(10))

In [None]:
tmp.save_to_disk('test_save_dataset')

In [None]:
from datasets import load_from_disk
tmp2 = load_from_disk('test_save_dataset')

## CSV, JSON, Parquet  
Parquet is good for storage

In [None]:
for split, data in tmp2.items():
    data.to_csv(f"test_save_dataset_{split}.csv", index=None)

In [None]:
data_files = {
    "train": "test_save_dataset_train.csv",
    "test": "test_save_dataset_test.csv",
}
tmp3 = load_dataset('csv', data_files=data_files)

# Memory-mapping and Streaming

## Memory-mapping 

Datasets treats each dataset as a memory-mapped file, which provides a mapping between RAM and filesystem storage that allows the library to access and operate on elements of the dataset without needing to fully load it into memory.  
Below the file size on disk is far larger than that of memory taken

In [None]:
# The server is no longer available; Use another set
# data_files = "https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst"
data_files = "https://the-eye.eu/public/AI/pile_v2/data/EuroParliamentProceedings_1996_2011.jsonl.zst"
parl_dataset = load_dataset('json', data_files=data_files, split='train')
parl_dataset

In [7]:
import psutil
print(f'RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB')
print(f"Number of files in dataset : {parl_dataset.dataset_size}")
size_gb = parl_dataset.dataset_size / (1024**3)
print(f"Dataset size (cache file) : {size_gb:.2f} GB")

RAM used: 279.12 MB
Number of files in dataset : 4923828175
Dataset size (cache file) : 4.59 GB


In [9]:
import timeit

code_snippet = """batch_size = 1000

for idx in range(0, len(parl_dataset), batch_size):
    _ = parl_dataset[idx:idx + batch_size]
"""

time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())
print(
    f"Iterated over {len(parl_dataset)} examples (about {size_gb:.1f} GB) in "
    f"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s"
)

Iterated over 69814 examples (about 4.6 GB) in 11.0s, i.e. 0.415 GB/s


## Streaming
In case data can't even fit the disk
ERRORS: I can't get it working under proxy. The data streaming could be declared sudcessfully, but actual streaming won't start

In [5]:
# import os
# os.environ['http_proxy'] = "https://127.0.0.1:52304" 
# os.environ['https_proxy'] = "https://127.0.0.1:52304" 
# os.environ['HTTP_PROXY'] = "https://127.0.0.1:52304" 
# os.environ['HTTPS_PROXY'] = "https://127.0.0.1:52304" 

parl_dataset_streamed = load_dataset('json', data_files=data_files, split='train', streaming=True)
parl_dataset_streamed

<datasets.iterable_dataset.IterableDataset at 0x1edde4f63c8>

In [6]:
tmp = parl_dataset_streamed.take(1)
list(tmp)

HTTPS proxies https://127.0.0.1:52304 are not supported, ignoring
HTTPS proxies https://127.0.0.1:52304 are not supported, ignoring
HTTPS proxies https://127.0.0.1:52304 are not supported, ignoring
HTTPS proxies https://127.0.0.1:52304 are not supported, ignoring


[]

In [7]:
# Doing eveyrthing streaming
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_dataset = parl_dataset_streamed.map(lambda x: tokenizer(x["text"]))

shuffled_dataset = parl_dataset_streamed.shuffle(buffer_size=10_000, seed=42)
# Skip the first 1,000 examples and include the rest in the training set
train_dataset = shuffled_dataset.skip(1000)
# Take the first 1,000 examples for the validation set
validation_dataset = shuffled_dataset.take(1000)

In [None]:
# Combine two remote datasets
from itertools import islice
from datasets import interleave_datasets

combined_dataset = interleave_datasets([dataset1_streamed, dataset2_streamed])
list(islice(combined_dataset, 2))

# Create own dataset

## Fetch data online

In [4]:
import requests
url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url)
print(response.status_code)
# print(response.json())

200


In [133]:
GITHUB_TOKEN = "my token"
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

In [7]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm


def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=5_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(num_pages)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print(f"Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )

In [8]:
fetch_issues()

  0%|          | 0/50 [00:00<?, ?it/s]

Downloaded all the issues for datasets! Dataset stored at ./datasets-issues.jsonl


In [109]:
# Something is wrong if I read the whole dataset. Read only partial of it

# tmp = pd.read_json("datasets-issues.jsonl", orient='records', lines=True)
# tmp.iloc[0:3222].to_json("tmp.jsonl", orient="records", lines=True)

issues_dataset = load_dataset("json", data_files="tmp.jsonl", split="train")
issues_dataset

Found cached dataset json (C:/Users/yuwei/.cache/huggingface/datasets/json/default-46735dbdeb568ee7/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'draft', 'pull_request'],
    num_rows: 3222
})

## Clean it up

In [71]:
sample = issues_dataset.shuffle(seed=42).select(range(3))
for url, pr in zip(sample['html_url'], sample['pull_request']):
    print(f">> URL: {url}\n>> Pull request: {pr}\n")

Loading cached shuffled indices for dataset at C:\Users\yuwei\.cache\huggingface\datasets\json\default-46735dbdeb568ee7\0.0.0\8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96\cache-9101792ddde358f6.arrow


>> URL: https://github.com/huggingface/datasets/issues/4181
>> Pull request: None

>> URL: https://github.com/huggingface/datasets/pull/3726
>> Pull request: {'url': 'https://api.github.com/repos/huggingface/datasets/pulls/3726', 'html_url': 'https://github.com/huggingface/datasets/pull/3726', 'diff_url': 'https://github.com/huggingface/datasets/pull/3726.diff', 'patch_url': 'https://github.com/huggingface/datasets/pull/3726.patch', 'merged_at': datetime.datetime(2022, 2, 15, 16, 55, 44)}

>> URL: https://github.com/huggingface/datasets/issues/5967
>> Pull request: None



In [111]:
# It contains both issues and pull request
issues_dataset = issues_dataset.map(
    lambda x: {"is_pull_request": x["pull_request"] is None}
)

Loading cached processed dataset at C:\Users\yuwei\.cache\huggingface\datasets\json\default-46735dbdeb568ee7\0.0.0\8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96\cache-48d3dc794ef8b517.arrow


In [112]:
# Find out the close time
issues_dataset = issues_dataset.map(
    lambda x: {"time_to_close": 
               None if x["closed_at"] is None 
               else (x["closed_at"] - x['created_at']) / 1000 / 60  # ms to minutes
              }
)

Loading cached processed dataset at C:\Users\yuwei\.cache\huggingface\datasets\json\default-46735dbdeb568ee7\0.0.0\8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96\cache-353016e11dbd50b8.arrow


In [113]:
# Avg close time 
pd_closed = issues_dataset.filter(lambda x: x['time_to_close'] is not None).to_pandas()

print(f"Avg time to close issues: {pd_closed['time_to_close'].mean()} minutes")

print(f"Avg time to close pr: {pd_closed[pd_closed['is_pull_request']]['time_to_close'].mean()} minutes")

Loading cached processed dataset at C:\Users\yuwei\.cache\huggingface\datasets\json\default-46735dbdeb568ee7\0.0.0\8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96\cache-e9d63546e0953f95.arrow


## Get comments of issues

In [136]:
def get_comments(issue_number):
    url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
    response = requests.get(url, headers=headers)
    return [r["body"] for r in response.json()]

issues_with_comments_dataset = issues_dataset.map(
    lambda x: {"comments": get_comments(x["number"])}
)


Map:   0%|          | 0/3222 [00:00<?, ? examples/s]

ProxyError: HTTPSConnectionPool(host='api.github.com', port=443): Max retries exceeded with url: /repos/huggingface/datasets/issues/6002/comments (Caused by ProxyError('Cannot connect to proxy.', OSError(0, 'Error')))

## Upload the dataset

In [140]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
issues_with_comments_dataset.push_to_hub("github-issues")

# Semantic search

## Load data

In [141]:
from datasets import load_dataset

issues_dataset = load_dataset("lewtun/github-issues", split="train")
issues_dataset

Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading and preparing dataset json/lewtun--github-issues to C:/Users/yuwei/.cache/huggingface/datasets/lewtun___json/lewtun--github-issues-f3dae4585375b7e9/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/12.2M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to C:/Users/yuwei/.cache/huggingface/datasets/lewtun___json/lewtun--github-issues-f3dae4585375b7e9/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96. Subsequent calls will reuse this data.


Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 3019
})

In [148]:
issues_dataset = issues_dataset.filter(
    lambda x: ((not x['is_pull_request']) and len(x['comments'])) > 0
)
issues_dataset

Filter:   0%|          | 0/808 [00:00<?, ? examples/s]

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 808
})

In [153]:
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
# same as minus here since set 1 is a subset of set 2
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
issues_dataset = issues_dataset.remove_columns(columns_to_remove)
issues_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 808
})

## Explose rows

### Use pandas

In [208]:
df = issues_dataset.to_pandas()

comments_df = df.explode('comments', ignore_index=True)
comments_df.head()

comments_dataset = Dataset.from_pandas(comments_df)
comments_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 2964
})

### Use .map()

In [219]:
def map_to_explode(examples):
    result = {k: [] for k in examples}
    comments = examples.pop('comments')
    for i, comment_i in enumerate(comments):
        n_rows_to_explode = len(comment_i)
        for k, v in examples.items():
            result[k] += [v[i]] * n_rows_to_explode
        result['comments'] += comment_i
    return result

comments_dataset = issues_dataset.map(map_to_explode, batched=True)
comments_dataset

Loading cached processed dataset at C:\Users\yuwei\.cache\huggingface\datasets\lewtun___json\lewtun--github-issues-f3dae4585375b7e9\0.0.0\8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96\cache-21fed2d1b1df02c7.arrow


Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 2964
})

## Create some features

In [220]:
comments_dataset = comments_dataset.map(lambda x: {'comment_length': len(x['comments'].split())})
comments_dataset = comments_dataset.filter(lambda x: x['comment_length'] > 1)
comments_dataset

Loading cached processed dataset at C:\Users\yuwei\.cache\huggingface\datasets\lewtun___json\lewtun--github-issues-f3dae4585375b7e9\0.0.0\8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96\cache-146c530e18da32cb.arrow
Loading cached processed dataset at C:\Users\yuwei\.cache\huggingface\datasets\lewtun___json\lewtun--github-issues-f3dae4585375b7e9\0.0.0\8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96\cache-aefb5875810b9194.arrow


Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length'],
    num_rows: 2934
})

In [223]:
comments_dataset = comments_dataset.map(
    lambda x: {'text': " \n ".join([x['title'], x['body'], x['comments']])}
)
comments_dataset[0]

Map:   0%|          | 0/2934 [00:00<?, ? examples/s]

{'html_url': 'https://github.com/huggingface/datasets/issues/2945',
 'title': 'Protect master branch',
 'comments': 'Cool, I think we can do both :)',
 'body': 'After accidental merge commit (91c55355b634d0dc73350a7ddee1a6776dbbdd69) into `datasets` master branch, all commits present in the feature branch were permanently added to `datasets` master branch history, as e.g.:\r\n- 00cc036fea7c7745cfe722360036ed306796a3f2\r\n- 13ae8c98602bbad8197de3b9b425f4c78f582af1\r\n- ...\r\n\r\nI propose to protect our master branch, so that we avoid we can accidentally make this kind of mistakes in the future:\r\n- [x] For Pull Requests using GitHub, allow only squash merging, so that only a single commit per Pull Request is merged into the master branch\r\n  - Currently, simple merge commits are already disabled\r\n  - I propose to disable rebase merging as well\r\n- ~~Protect the master branch from direct pushes (to avoid accidentally pushing of merge commits)~~\r\n  - ~~This protection would rejec

## Create embeddings

Instructions: https://www.sbert.net/examples/applications/semantic-search/README.html#symmetric-vs-asymmetric-semantic-search

In [225]:
from transformers import AutoTokenizer, AutoModel

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [228]:
import torch

device = torch.device("cuda")
model.to(device)

AssertionError: Torch not compiled with CUDA enabled