In [1]:
# Restore the datasets that were written to disk in an earlier session
from datasets import Dataset, load_from_disk
import pandas as pd

# Raw issues: JSON Lines file -> DataFrame -> Dataset
df = pd.read_json(
    "../data/hf_github_issues/datasets-issues.jsonl",
    lines=True,
)
issues_dataset = Dataset.from_pandas(df)

# Issues together with their comments, as previously saved to disk
issues_with_comments_dataset = load_from_disk(
    "../data/hf_github_issues/issues_with_comments"
)

In [2]:
# Save dataset to the Hub
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment
load_dotenv()

# Fail fast when the write token is missing: os.getenv would return None, and
# push_to_hub(token=None) silently falls back to locally cached credentials,
# which is exactly what this cell is meant to avoid.
hf_write_token = os.getenv("HF_TOKEN_WRITE")
if not hf_write_token:
    raise RuntimeError(
        "HF_TOKEN_WRITE is not set; define it in the environment or in the "
        ".env file before pushing to the Hub."
    )

issues_with_comments_dataset.push_to_hub(
    "tensor-polinomics/hf-datasets-github-issues-with-comments",
    token=hf_write_token,  # explicit token, so cached credentials are not used
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/tensor-polinomics/hf-datasets-github-issues-with-comments/commit/9140b3e90a78161925b8909b78607824ce673170', commit_message='Upload dataset', commit_description='', oid='9140b3e90a78161925b8909b78607824ce673170', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/tensor-polinomics/hf-datasets-github-issues-with-comments', endpoint='https://huggingface.co', repo_type='dataset', repo_id='tensor-polinomics/hf-datasets-github-issues-with-comments'), pr_revision=None, pr_num=None)

In [3]:
# Fetch the published dataset back from the Hub (train split only)
from datasets import load_dataset

remote_dataset = load_dataset(
    "tensor-polinomics/hf-datasets-github-issues-with-comments",
    split="train",
)
remote_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'type', 'active_lock_reason', 'sub_issues_summary', 'issue_dependencies_summary', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'draft', 'pull_request', 'is_pull_request'],
    num_rows: 7818
})

In [4]:
# Keep only genuine issues (not pull requests) that have at least one comment
def _is_commented_issue(example):
    """Return True when the row is not a pull request and has comments."""
    return (example["is_pull_request"] is False) and (len(example["comments"]) > 0)


issues_dataset = remote_dataset.filter(_is_commented_issue)
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'type', 'active_lock_reason', 'sub_issues_summary', 'issue_dependencies_summary', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'draft', 'pull_request', 'is_pull_request'],
    num_rows: 2732
})

In [5]:
# Drop every column except the four used in the rest of the notebook
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
columns_to_drop = [name for name in columns if name not in columns_to_keep]
issues_dataset = issues_dataset.remove_columns(columns_to_drop)
issues_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 2732
})

In [6]:
# Pretty-print the first remaining issue to eyeball the kept columns
import pprint
pprint.pprint(issues_dataset[0])

{'body': '## Summary\n'
         '\n'
         '`embed_table_storage` crashes with SIGKILL (exit code 137) when '
         'processing sharded datasets containing `Sequence()` nested types '
         'like `Sequence(Nifti())`. Likely affects `Sequence(Image())` and '
         '`Sequence(Audio())` as well.\n'
         '\n'
         'The crash occurs at the C++ level with no Python traceback.\n'
         '\n'
         '### Related Issues\n'
         '\n'
         '- #7852 - Problems with NifTI (closed, but related embedding '
         'issues)\n'
         "- #6790 - PyArrow 'Memory mapping file failed' (potentially "
         'related)\n'
         '- #7893 - OOM issue (separate bug, but discovered together)\n'
         '\n'
         '### Context\n'
         '\n'
         'Discovered while uploading the [Aphasia Recovery Cohort '
         '(ARC)](https://openneuro.org/datasets/ds004884) neuroimaging dataset '
         'to HuggingFace Hub. Even after fixing the OOM issue (#7893), this '
  

In [7]:
# Switch to pandas for data processing
# NOTE: set_format() is called for its side effect on issues_dataset (no new
# object is assigned), so indexing issues_dataset after this cell yields
# pandas objects as well.
issues_dataset.set_format("pandas")
# Slice the whole dataset to materialise it as a single DataFrame
df_issues = issues_dataset[:]

In [8]:
# Show all comments of the first issue; the cell value is array-like, so
# convert it to a plain list for readable printing
pprint.pprint(df_issues["comments"][0].tolist())

["I wasn't able to reproduce the crash on my side (macos arm 54, pyarrow 22 "
 'and a nifti file I found '
 '[online](https://s3.amazonaws.com/openneuro.org/ds004884/sub-M2001/ses-1076/anat/sub-M2001_ses-1076_acq-tfl3_run-4_T1w.nii.gz?versionId=9aVGb3C.VcoBgxrhNzFnL6O0MvxQsXX7&AWSAccessKeyId=AKIARTA7OOV5WQ3DGSOB&Signature=LQMLzjsuzSV7MtNAdQaFdqWqmbM%3D&Expires=1765473937))\n'
 '\n'
 'could the issue be specific to your env ? have you tried on other '
 'environments like colab maybe ?',
 'Hi @lhoestq,\n'
 '\n'
 'Thank you so much for taking the time to investigate this. Your comment '
 'about not being able to reproduce it with a single NIfTI file actually '
 'helped me understand the bug better.\n'
 '\n'
 '**Key finding:** This bug is scale-dependent. It only manifests with real, '
 'full-scale data, and not with synthetic test files.\n'
 '\n'
 'I created a sandbox branch that isolates the exact state before the '
 'workaround:\n'
 '\n'
 '**üîó Reproduction branch:** '
 'https://githu

In [9]:
# Give every comment its own row: explode() repeats the other columns for each
# element of the list-like "comments" cell, and ignore_index=True renumbers
# the resulting rows from 0
df_issues_full = df_issues.explode("comments", ignore_index=True)
df_issues_full.head()

Unnamed: 0,html_url,title,comments,body
0,https://github.com/huggingface/datasets/issues...,embed_table_storage crashes (SIGKILL) on shard...,I wasn't able to reproduce the crash on my sid...,## Summary\n\n`embed_table_storage` crashes wi...
1,https://github.com/huggingface/datasets/issues...,embed_table_storage crashes (SIGKILL) on shard...,"Hi @lhoestq,\n\nThank you so much for taking t...",## Summary\n\n`embed_table_storage` crashes wi...
2,https://github.com/huggingface/datasets/issues...,embed_table_storage crashes (SIGKILL) on shard...,@lhoestq Brief update - I've added a reproduct...,## Summary\n\n`embed_table_storage` crashes wi...
3,https://github.com/huggingface/datasets/issues...,push_to_hub OOM: _push_parquet_shards_to_hub a...,`preupload_lfs_files` removes the parquet byte...,## Summary\n\nLarge dataset uploads crash or h...
4,https://github.com/huggingface/datasets/issues...,push_to_hub OOM: _push_parquet_shards_to_hub a...,@lhoestq Thank you for pushing back on this an...,## Summary\n\nLarge dataset uploads crash or h...


In [10]:
# Switch back to Dataset
from datasets import Dataset  # also imported in cell In [1]
# One Dataset row per (issue, comment) pair from the exploded DataFrame
comments_dataset = Dataset.from_pandas(df_issues_full)
comments_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 10498
})

In [11]:
# Annotate each row with the whitespace-delimited word count of its comment
def _count_comment_words(example):
    """Build the ``comment_length`` value for a single row."""
    words = example["comments"].split()
    return {"comment_length": len(words)}


comments_dataset = comments_dataset.map(_count_comment_words)
comments_dataset

Map:   0%|          | 0/10498 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length'],
    num_rows: 10498
})

In [12]:
# Inspect the first row, which now includes the comment_length column
comments_dataset[0]

{'html_url': 'https://github.com/huggingface/datasets/issues/7894',
 'title': 'embed_table_storage crashes (SIGKILL) on sharded datasets with Sequence() nested types',
 'comments': "I wasn't able to reproduce the crash on my side (macos arm 54, pyarrow 22 and a nifti file I found [online](https://s3.amazonaws.com/openneuro.org/ds004884/sub-M2001/ses-1076/anat/sub-M2001_ses-1076_acq-tfl3_run-4_T1w.nii.gz?versionId=9aVGb3C.VcoBgxrhNzFnL6O0MvxQsXX7&AWSAccessKeyId=AKIARTA7OOV5WQ3DGSOB&Signature=LQMLzjsuzSV7MtNAdQaFdqWqmbM%3D&Expires=1765473937))\n\ncould the issue be specific to your env ? have you tried on other environments like colab maybe ?",
 'body': '## Summary\n\n`embed_table_storage` crashes with SIGKILL (exit code 137) when processing sharded datasets containing `Sequence()` nested types like `Sequence(Nifti())`. Likely affects `Sequence(Image())` and `Sequence(Audio())` as well.\n\nThe crash occurs at the C++ level with no Python traceback.\n\n### Related Issues\n\n- #7852 - Prob

In [13]:
# Discard short comments: keep only rows whose comment has at least 14 words
MIN_COMMENT_WORDS = 14
comments_dataset = comments_dataset.filter(
    lambda example: example["comment_length"] >= MIN_COMMENT_WORDS
)
comments_dataset

Filter:   0%|          | 0/10498 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length'],
    num_rows: 8050
})

In [17]:
def concatenate_text(examples):
    """Merge a row's title, body and comment(s) into a single ``text`` field.

    Args:
        examples: mapping with "title", "body" and "comments" keys. Falsy
            values (``None``, ``""``, ``[]``) are treated as empty strings;
            a list of comments is joined with single spaces.

    Returns:
        ``{"text": "<title> \\n <body> \\n <comments>"}``.
    """
    title, body, comments = (
        examples[key] or "" for key in ("title", "body", "comments")
    )

    # A row may still carry all of its comments as a list; flatten it
    if isinstance(comments, list):
        comments = " ".join(comments)

    return {"text": "{} \n {} \n {}".format(title, body, comments)}

In [18]:
# Apply the function to the dataset
# map() calls concatenate_text once per row and adds the returned "text" column
comments_dataset = comments_dataset.map(concatenate_text)
comments_dataset

Map:   0%|          | 0/8050 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text'],
    num_rows: 8050
})

In [None]:
# Inspect the first row again after adding the "text" column
comments_dataset[0]

{'html_url': 'https://github.com/huggingface/datasets/issues/7894',
 'title': 'embed_table_storage crashes (SIGKILL) on sharded datasets with Sequence() nested types',
 'comments': "I wasn't able to reproduce the crash on my side (macos arm 54, pyarrow 22 and a nifti file I found [online](https://s3.amazonaws.com/openneuro.org/ds004884/sub-M2001/ses-1076/anat/sub-M2001_ses-1076_acq-tfl3_run-4_T1w.nii.gz?versionId=9aVGb3C.VcoBgxrhNzFnL6O0MvxQsXX7&AWSAccessKeyId=AKIARTA7OOV5WQ3DGSOB&Signature=LQMLzjsuzSV7MtNAdQaFdqWqmbM%3D&Expires=1765473937))\n\ncould the issue be specific to your env ? have you tried on other environments like colab maybe ?",
 'body': '## Summary\n\n`embed_table_storage` crashes with SIGKILL (exit code 137) when processing sharded datasets containing `Sequence()` nested types like `Sequence(Nifti())`. Likely affects `Sequence(Image())` and `Sequence(Audio())` as well.\n\nThe crash occurs at the C++ level with no Python traceback.\n\n### Related Issues\n\n- #7852 - Prob

In [19]:
# Create text embeddings
from transformers import AutoTokenizer, AutoModel

# Checkpoint identifier; tokenizer and model are both loaded from it so that
# they stay consistent with each other
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [22]:
import torch

# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Pooling step: represent each sequence by the hidden state of its first (CLS) token
def cls_pooling(model_output):
    """Return the first-position slice of the model's last hidden state.

    Args:
        model_output: object exposing a ``last_hidden_state`` attribute that
            supports ``[:, 0]`` indexing (assumed to be shaped
            (batch, seq_len, hidden) -- TODO confirm).

    Returns:
        ``last_hidden_state[:, 0]``, i.e. one vector per batch entry.
    """
    hidden_states = model_output.last_hidden_state
    first_token_states = hidden_states[:, 0]
    return first_token_states

In [23]:
# Create a helper function to compute embeddings
def get_embedding(text_list):
    """Embed a batch of texts with the module-level ``tokenizer`` and ``model``.

    Args:
        text_list: texts to embed, in whatever form ``tokenizer`` accepts
            (the notebook passes a slice of the "text" column).

    Returns:
        The result of ``cls_pooling`` on the model output: one embedding per
        input text, located on ``model.device`` and not attached to any
        autograd graph.
    """
    encoded_input = tokenizer(
        text_list,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    # Move every input tensor to the device the model lives on
    encoded_input = {k: v.to(model.device) for k, v in encoded_input.items()}
    # Inference only: without no_grad() each returned embedding keeps its
    # autograd graph (and the intermediate activations) alive, which wastes
    # memory for every batch whose result is retained.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [24]:
# Smoke test: embed the first 10 texts and check the resulting shape
embeddings = get_embedding(comments_dataset["text"][:10])
embeddings.shape

torch.Size([10, 768])