In [1]:
# !wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
# !wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz
# !gzip -dkv SQuAD_it-*.json.gz
# # !rm *gz

In [1]:
from datasets import load_dataset, Dataset, DatasetDict
import datasets

# Load local and remote files

In [None]:
# Local
# One can also pass the .gz/.zip file for auto-decompression
# data_files = {"train": "SQuAD_it-train.json", "test": "SQuAD_it-test.json"}
# squad_it_dataset = load_dataset('json', data_files=data_files, field="data")

# Online
url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {
    "train": url + "SQuAD_it-train.json.gz",
    "test": url + "SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

In [None]:
squad_it_dataset

# Slicing, dicing, selecting, sorting, mapping, filtering, etc.

In [None]:
# !wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
# !unzip drugsCom_raw.zip

In [3]:
data_dir = "./data/"
data_files = {"train": data_dir + "drugsComTrain_raw.tsv", "test": data_dir + "drugsComTest_raw.tsv"}
# \t is the tab character in Python
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

## Slicing and shuffle
`select` returns a `Dataset`; slicing using [:10] returns a dict

In [3]:
drug_sample = drug_dataset['train'].shuffle(seed=42).select(range(10))
print(type(drug_sample), type(drug_sample[:2]))

<class 'datasets.arrow_dataset.Dataset'> <class 'dict'>


## Create, rename and remove columns

.map() needs a function that returns a dict

In [4]:
# Rename columns
# Check if the value of 'Unnamed: 0' is unique, if so it's an ID
for split in drug_dataset:
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique('Unnamed: 0'))
    
drug_dataset = drug_dataset.rename_column('Unnamed: 0', 'patient_id')
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [5]:
# Add a column based on existing columns
def compute_review_length(example):
    return {"review_length": len(example['review'].split())}

drug_dataset = drug_dataset.map(compute_review_length)
drug_dataset['train'][0]

Map:   0%|          | 0/161297 [00:00<?, ? examples/s]

Map:   0%|          | 0/53766 [00:00<?, ? examples/s]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'Left Ventricular Dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [6]:
# Add a column based on new data
drug_dataset['train'].add_column('tmp', range(len(drug_dataset['train'])))

Dataset({
    features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'tmp'],
    num_rows: 161297
})

## Map and filter

In [7]:
drug_dataset = drug_dataset.filter(lambda x: x['condition'] is not None)

Filter:   0%|          | 0/161297 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53766 [00:00<?, ? examples/s]

In [8]:
drug_dataset = drug_dataset.map(lambda x: {'condition': x['condition'].lower()})

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

## Sort

In [9]:
# Sort reversely (descending)
drug_dataset['train'].sort('review_length', reverse=True)[-1]

{'patient_id': 225461,
 'drugName': 'Bupropion',
 'condition': 'smoking cessation',
 'review': '"Nice"',
 'rating': 10.0,
 'date': 'October 27, 2014',
 'usefulCount': 20,
 'review_length': 1}

## Train-test split

In [10]:
drug_dataset_train_val = drug_dataset['train'].train_test_split(train_size=0.8, seed=42)
drug_dataset_train_val['validation'] = drug_dataset_train_val.pop('test')

In [11]:
drug_dataset_train_val

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 128318
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 32080
    })
})

# Map with batch

In [12]:
#unescape html characters
import html
text = "I&#039;m a transformer called BERT"
html.unescape(text) 

# Since it's batched, the input has to be a slice of dataset instead of one row
drug_dataset = drug_dataset.map(
    lambda x: {'review': [html.unescape(o) for o in x['review']]},
    batched=True,
)

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

In [13]:
checkpoint = "bert-base-cased"

## Batch the data

In [14]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(x):
    return tokenizer(x['review'], truncation=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [15]:
# tokenizer alaready takes one row or a slice of dataset, so I don't have to pass a slice
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

CPU times: total: 32.2 s
Wall time: 15.1 s


## Use num_proc to make use of parallelization of .map() itself

All variables/functions have to be declared inside the function passed to map, otherwise, there will be an "undefined" error. 
One way is to declare everything in the function but it's too slow


In [16]:
from transformers import AutoTokenizer
slow_tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)

In [17]:
# See the error
def slow_tokenize_function(examples):
    return slow_tokenizer(examples["review"], truncation=True)
tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=True, num_proc=8)

Map (num_proc=8):   0%|          | 0/160398 [00:00<?, ? examples/s]

NameError: name 'slow_tokenizer' is not defined

A solution is to use the "partial" function  
Another solution is to use a wrapper  
https://discuss.huggingface.co/t/tokenizer-is-not-defined/39231  

**ERROR**  
On my X1, Either solution gives me the following error no matter I use slow or fast tokenizer  
"AttributeError: Can't get attribute 'Dataset' on <module 'datasets.arrow_dataset'"  
The error disappears when I set num_proc=1 or num_proc=2, and only occurs at the end of the processing, so something wrong at the executor as opposed to the workers?  

On Legion there is no such error 


In [19]:
from functools import partial
def tokenize_function(tokenizer, examples):
    return tokenizer(examples["review"], truncation=True)
partial_tokenize_function = partial(tokenize_function, tokenizer)
tokenized_dataset = drug_dataset.map(partial_tokenize_function, batched=True, num_proc=4) 

Map (num_proc=4):   0%|          | 0/160398 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/53471 [00:00<?, ? examples/s]

In [26]:
class TokenizerWrapper:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    
    def tokenize_function(self, examples):
        return self.tokenizer(examples["review"], truncation=True)

tokenizer_wrapper = TokenizerWrapper(tokenizer)

In [29]:
%time tokenized_dataset = drug_dataset.map(tokenizer_wrapper.tokenize_function, batched=True, num_proc=8)

Map (num_proc=8):   0%|          | 0/160398 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/53471 [00:00<?, ? examples/s]

CPU times: total: 78.1 ms
Wall time: 11.7 s


## Extend samples when mapping the tokenizer
One can keep truncated text by setting `return_overflowing_tokens=True`. This results in extra element in fields such as `input_ids` in the output of `tokenizer`, which can cause error since the number of elements of `input_ids` are increased and conflict with other fields. This needs special treatment

In [30]:
def tokenize_and_split(examples):
    return tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
[len(i) for i in tokenize_and_split(drug_dataset['train'][1])['input_ids']]

[128, 49]

### ERROR!

In [31]:
# ERROR! - some examples have input_ids as 1-element-list, some have multiple
tokenize_dataset = drug_dataset.map(tokenize_and_split, batched=True)

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

ArrowInvalid: Column 8 named input_ids expected length 1000 but got length 1397

### SOLUTION1 - remove_columns

In [32]:
# SOLUTION! remove_columns
# This remove columns specified in the original dataset, 
# and only returns the result from the tokenizer
tokenized_dataset = drug_dataset.map(
    tokenize_and_split, batched=True, remove_columns=drug_dataset["train"].column_names
)

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

In [33]:
# Examples with the same 'overflow_to_sample_mapping' belong to the same sample before mapping
samples = tokenized_dataset['train'][:6]
print([len(x) for x in samples['input_ids']])
print(samples['overflow_to_sample_mapping'])

[25, 128, 49, 128, 55, 116]
[0, 1, 1, 2, 2, 3]


### SOLUTION 2 - Map back from the tokenized to the original
Use `overflow_to_sample_mapping` to map back from the tokenized dataset to the 
original dataset. Duplicate original examples the text of which is truncated


In [34]:
def tokenize_and_split(examples):
    # For batched=True, the input contains multiple rows and thus is a dict whose values are lists
    result = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # Extract mapping between new and old indices
    sample_map = result.pop("overflow_to_sample_mapping")
    # Extend the length of each element in result, with the value from
    # the original example
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result

In [35]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

In [36]:
print(tokenized_dataset, drug_dataset)
# tmp = tokenized_dataset['train'][:20]
# print([len(x) for x in tmp['input_ids']])
# print([tmp['review_length']])

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 228656
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 76239
    })
}) DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 53471
    })
})


# to and from Pandas

## set_format - display only

In [37]:
# This only changes the __getitem__() method when displaying the dataset, 
# and doesn't change the underlying format. So if one wants to use the data as 
# pandas, one has to select all elements
drug_dataset.set_format('pandas')
drug_dataset['train'][:2]
df_train = drug_dataset['train'][:]

In [38]:
df_train.head(2)

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,206461,Valsartan,left ventricular dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27,17
1,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141


## to_pandas

In [39]:
df_train = drug_dataset['train'].to_pandas()

In [40]:
frequency = (
    df_train['condition']
    .value_counts()
    .reset_index()
    .rename(columns={'index': 'condition', 'condition': 'frequency'})
)

freq_dataset = Dataset.from_pandas(frequency)
freq_dataset

Dataset({
    features: ['frequency', 'count'],
    num_rows: 884
})

# Saving a dataset

## Arrow - save_to_disk/load_from_disk

In [41]:
from datasets import DatasetDict
# Arraw is convinent for use. It saves to a folder with. 
tmp = DatasetDict()
for i in drug_dataset:
    tmp[i] = drug_dataset[i].select(range(10))

In [42]:
tmp.save_to_disk(data_dir + 'test_save_dataset')

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

In [43]:
from datasets import load_from_disk
tmp2 = load_from_disk(data_dir + 'test_save_dataset')

## CSV, JSON, Parquet  
Parquet is good for storage

In [44]:
for split, data in tmp2.items():
    data.to_csv(data_dir + f"test_save_dataset_{split}.csv", index=None)

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [48]:
data_files = {
    "train": data_dir + "test_save_dataset_train.csv",
    "test": data_dir + "test_save_dataset_test.csv",
}
tmp3 = load_dataset('csv', data_files=data_files)

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

# Memory-mapping and Streaming

## Memory-mapping 

Datasets treats each dataset as a memory-mapped file, which provides a mapping between RAM and filesystem storage that allows the library to access and operate on elements of the dataset without needing to fully load it into memory.  
Below the file size on disk is far larger than that of memory taken

In [49]:
# The server is no longer available; Use another set
# data_files = "https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst"
data_files = "https://the-eye.eu/public/AI/pile_v2/data/EuroParliamentProceedings_1996_2011.jsonl.zst"
parl_dataset = load_dataset('json', data_files=data_files, split='train')
parl_dataset

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetGenerationError: An error occurred while generating the dataset

In [50]:
import psutil
print(f'RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB')
print(f"Number of files in dataset : {parl_dataset.dataset_size}")
size_gb = parl_dataset.dataset_size / (1024**3)
print(f"Dataset size (cache file) : {size_gb:.2f} GB")

RAM used: 884.76 MB


NameError: name 'parl_dataset' is not defined

In [9]:
import timeit

code_snippet = """batch_size = 1000

for idx in range(0, len(parl_dataset), batch_size):
    _ = parl_dataset[idx:idx + batch_size]
"""

time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())
print(
    f"Iterated over {len(parl_dataset)} examples (about {size_gb:.1f} GB) in "
    f"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s"
)

Iterated over 69814 examples (about 4.6 GB) in 11.0s, i.e. 0.415 GB/s


## Streaming
In case data can't even fit the disk
ERRORS: I can't get it working under proxy. The data streaming could be declared sudcessfully, but actual streaming won't start

In [53]:
data_files

'https://the-eye.eu/public/AI/pile_v2/data/EuroParliamentProceedings_1996_2011.jsonl.zst'

In [51]:
# import os
# os.environ['http_proxy'] = "https://127.0.0.1:52304" 
# os.environ['https_proxy'] = "https://127.0.0.1:52304" 
# os.environ['HTTP_PROXY'] = "https://127.0.0.1:52304" 
# os.environ['HTTPS_PROXY'] = "https://127.0.0.1:52304" 

parl_dataset_streamed = load_dataset('json', data_files=data_files, split='train', streaming=True)
parl_dataset_streamed

<datasets.iterable_dataset.IterableDataset at 0x15826784e50>

In [52]:
tmp = parl_dataset_streamed.take(1)
list(tmp)

[]

In [54]:
# Doing eveyrthing streaming
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_dataset = parl_dataset_streamed.map(lambda x: tokenizer(x["text"]))

shuffled_dataset = parl_dataset_streamed.shuffle(buffer_size=10_000, seed=42)
# Skip the first 1,000 examples and include the rest in the training set
train_dataset = shuffled_dataset.skip(1000)
# Take the first 1,000 examples for the validation set
validation_dataset = shuffled_dataset.take(1000)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [55]:
# Combine two remote datasets
from itertools import islice
from datasets import interleave_datasets

combined_dataset = interleave_datasets([dataset1_streamed, dataset2_streamed])
list(islice(combined_dataset, 2))

NameError: name 'dataset1_streamed' is not defined

# Create own dataset

## Fetch data online

In [59]:
import requests
url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url)
print(response.status_code)
# print(response.json())

200


In [133]:
GITHUB_TOKEN = "my token"
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

In [7]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm


def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=5_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(num_pages)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print(f"Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )

In [8]:
fetch_issues()

  0%|          | 0/50 [00:00<?, ?it/s]

Downloaded all the issues for datasets! Dataset stored at ./datasets-issues.jsonl


In [109]:
# Something is wrong if I read the whole dataset. Read only partial of it

# tmp = pd.read_json("datasets-issues.jsonl", orient='records', lines=True)
# tmp.iloc[0:3222].to_json("tmp.jsonl", orient="records", lines=True)

issues_dataset = load_dataset("json", data_files="tmp.jsonl", split="train")
issues_dataset

Found cached dataset json (C:/Users/yuwei/.cache/huggingface/datasets/json/default-46735dbdeb568ee7/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'draft', 'pull_request'],
    num_rows: 3222
})

## Clean it up

In [71]:
sample = issues_dataset.shuffle(seed=42).select(range(3))
for url, pr in zip(sample['html_url'], sample['pull_request']):
    print(f">> URL: {url}\n>> Pull request: {pr}\n")

Loading cached shuffled indices for dataset at C:\Users\yuwei\.cache\huggingface\datasets\json\default-46735dbdeb568ee7\0.0.0\8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96\cache-9101792ddde358f6.arrow


>> URL: https://github.com/huggingface/datasets/issues/4181
>> Pull request: None

>> URL: https://github.com/huggingface/datasets/pull/3726
>> Pull request: {'url': 'https://api.github.com/repos/huggingface/datasets/pulls/3726', 'html_url': 'https://github.com/huggingface/datasets/pull/3726', 'diff_url': 'https://github.com/huggingface/datasets/pull/3726.diff', 'patch_url': 'https://github.com/huggingface/datasets/pull/3726.patch', 'merged_at': datetime.datetime(2022, 2, 15, 16, 55, 44)}

>> URL: https://github.com/huggingface/datasets/issues/5967
>> Pull request: None



In [111]:
# It contains both issues and pull request
issues_dataset = issues_dataset.map(
    lambda x: {"is_pull_request": x["pull_request"] is None}
)

Loading cached processed dataset at C:\Users\yuwei\.cache\huggingface\datasets\json\default-46735dbdeb568ee7\0.0.0\8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96\cache-48d3dc794ef8b517.arrow


In [112]:
# Find out the close time
issues_dataset = issues_dataset.map(
    lambda x: {"time_to_close": 
               None if x["closed_at"] is None 
               else (x["closed_at"] - x['created_at']) / 1000 / 60  # ms to minutes
              }
)

Loading cached processed dataset at C:\Users\yuwei\.cache\huggingface\datasets\json\default-46735dbdeb568ee7\0.0.0\8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96\cache-353016e11dbd50b8.arrow


In [113]:
# Avg close time 
pd_closed = issues_dataset.filter(lambda x: x['time_to_close'] is not None).to_pandas()

print(f"Avg time to close issues: {pd_closed['time_to_close'].mean()} minutes")

print(f"Avg time to close pr: {pd_closed[pd_closed['is_pull_request']]['time_to_close'].mean()} minutes")

Loading cached processed dataset at C:\Users\yuwei\.cache\huggingface\datasets\json\default-46735dbdeb568ee7\0.0.0\8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96\cache-e9d63546e0953f95.arrow


## Get comments of issues

In [136]:
def get_comments(issue_number):
    url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
    response = requests.get(url, headers=headers)
    return [r["body"] for r in response.json()]

issues_with_comments_dataset = issues_dataset.map(
    lambda x: {"comments": get_comments(x["number"])}
)


Map:   0%|          | 0/3222 [00:00<?, ? examples/s]

ProxyError: HTTPSConnectionPool(host='api.github.com', port=443): Max retries exceeded with url: /repos/huggingface/datasets/issues/6002/comments (Caused by ProxyError('Cannot connect to proxy.', OSError(0, 'Error')))

## Upload the dataset

In [140]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
issues_with_comments_dataset.push_to_hub("github-issues")

# Semantic search with sentence embedding

## Load data

In [2]:
from datasets import load_dataset

issues_dataset = load_dataset("lewtun/github-issues", split="train")
issues_dataset

Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 3019
})

In [3]:
issues_dataset = issues_dataset.filter(
    lambda x: ((not x['is_pull_request']) and len(x['comments'])) > 0
)
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 808
})

In [4]:
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
# same as minus here since set 1 is a subset of set 2
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
issues_dataset = issues_dataset.remove_columns(columns_to_remove)
issues_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 808
})

## Explode rows

### Use pandas

In [5]:
df = issues_dataset.to_pandas()

comments_df = df.explode('comments', ignore_index=True)
comments_df.head()

comments_dataset = Dataset.from_pandas(comments_df)
comments_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 2964
})

### Use .map()

In [6]:
def map_to_explode(examples):
    result = {k: [] for k in examples}
    comments = examples.pop('comments')
    for i, comment_i in enumerate(comments):
        n_rows_to_explode = len(comment_i)
        for k, v in examples.items():
            result[k] += [v[i]] * n_rows_to_explode
        result['comments'] += comment_i
    return result

comments_dataset = issues_dataset.map(map_to_explode, batched=True)
comments_dataset

Map:   0%|          | 0/808 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 2964
})

## Create some features

In [7]:
comments_dataset = comments_dataset.map(lambda x: {'comment_length': len(x['comments'].split())})
comments_dataset = comments_dataset.filter(lambda x: x['comment_length'] > 1)
comments_dataset

Map:   0%|          | 0/2964 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2964 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length'],
    num_rows: 2934
})

In [8]:
comments_dataset = comments_dataset.map(
    lambda x: {'text': " \n ".join([x['title'], x['body'], x['comments']])}
)
comments_dataset[0]

Map:   0%|          | 0/2934 [00:00<?, ? examples/s]

{'html_url': 'https://github.com/huggingface/datasets/issues/2945',
 'title': 'Protect master branch',
 'comments': 'Cool, I think we can do both :)',
 'body': 'After accidental merge commit (91c55355b634d0dc73350a7ddee1a6776dbbdd69) into `datasets` master branch, all commits present in the feature branch were permanently added to `datasets` master branch history, as e.g.:\r\n- 00cc036fea7c7745cfe722360036ed306796a3f2\r\n- 13ae8c98602bbad8197de3b9b425f4c78f582af1\r\n- ...\r\n\r\nI propose to protect our master branch, so that we avoid we can accidentally make this kind of mistakes in the future:\r\n- [x] For Pull Requests using GitHub, allow only squash merging, so that only a single commit per Pull Request is merged into the master branch\r\n  - Currently, simple merge commits are already disabled\r\n  - I propose to disable rebase merging as well\r\n- ~~Protect the master branch from direct pushes (to avoid accidentally pushing of merge commits)~~\r\n  - ~~This protection would rejec

## Create embeddings

Instructions: https://www.sbert.net/examples/applications/semantic-search/README.html#symmetric-vs-asymmetric-semantic-search

### Preprocessing

In [9]:
from transformers import AutoTokenizer, AutoModel

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [10]:
import torch

device = torch.device("cuda")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

#### CLS pooling
Collect the last hidden state for the special [CLS] token  
CLS token: Append a special <CLS> token to the start of every sequence. This special token is meant to capture the sequence-level information. During the training process, some sentence-level classification task based on this CLS embedding will tune the CLS token representation via backpropagation.  
  
From [article of pooling methods](https://blog.ml6.eu/the-art-of-pooling-embeddings-c56575114cf8)


In [11]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

### Embedding

In [12]:
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

text_input = comments_dataset['text'][0]
embedding = get_embeddings(text_input)
# Detach from the computational graph, copy it to host memory, and then convert to numpy array
embedding = embedding.detach().cpu().numpy()

print(text_input, '\n', embedding.shape)

Protect master branch 
 After accidental merge commit (91c55355b634d0dc73350a7ddee1a6776dbbdd69) into `datasets` master branch, all commits present in the feature branch were permanently added to `datasets` master branch history, as e.g.:
- 00cc036fea7c7745cfe722360036ed306796a3f2
- 13ae8c98602bbad8197de3b9b425f4c78f582af1
- ...

I propose to protect our master branch, so that we avoid we can accidentally make this kind of mistakes in the future:
- [x] For Pull Requests using GitHub, allow only squash merging, so that only a single commit per Pull Request is merged into the master branch
  - Currently, simple merge commits are already disabled
  - I propose to disable rebase merging as well
- ~~Protect the master branch from direct pushes (to avoid accidentally pushing of merge commits)~~
  - ~~This protection would reject direct pushes to master branch~~
  - ~~If so, for each release (when we need to commit directly to the master branch), we should previously disable the protection an

In [13]:
# Compute everything
embeddings_dataset = comments_dataset.map(
    lambda x: {'embeddings': get_embeddings(x['text']).detach().cpu().numpy()[0]}
)

Map:   0%|          | 0/2934 [00:00<?, ? examples/s]

## FAISS similarity search
Using [FAISS](https://faiss.ai/) for efficient similarity search


In [14]:
# !pip install faiss-cpu
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text', 'embeddings'],
    num_rows: 2934
})

In [15]:
question = "How can I load a dataset offline?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [16]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [18]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [26]:
for _, row in samples_df.iterrows():
    print(f"COMMENT: {row.comments}")
    print(f"SCORE: {row.scores}")
    print(f"TITLE: {row.title}")
    print(f"URL: {row.html_url}")
    print("=" * 50)
    print()

COMMENT: Requiring online connection is a deal breaker in some cases unfortunately so it'd be great if offline mode is added similar to how `transformers` loads models offline fine.

@mandubian's second bullet point suggests that there's a workaround allowing you to use your offline (custom?) dataset with `datasets`. Could you please elaborate on how that should look like?
SCORE: 25.505020141601562
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824

COMMENT: The local dataset builders (csv, text , json and pandas) are now part of the `datasets` package since #1726 :)
You can now use them offline
```python
datasets = load_dataset('text', data_files=data_files)
```

We'll do a new release soon
SCORE: 24.555545806884766
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824

COMMENT: I opened a PR that allows to reload modules that have already been loaded once even if there's no intern