In [1]:
import requests

url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url)
response.json()

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/7900',
  'repository_url': 'https://api.github.com/repos/huggingface/datasets',
  'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/7900/labels{/name}',
  'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/7900/comments',
  'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/7900/events',
  'html_url': 'https://github.com/huggingface/datasets/issues/7900',
  'id': 3711751590,
  'node_id': 'I_kwDODunzps7dPNWm',
  'number': 7900,
  'title': '`Permission denied` when sharing cache between users',
  'user': {'login': 'qthequartermasterman',
   'id': 19497738,
   'node_id': 'MDQ6VXNlcjE5NDk3NzM4',
   'avatar_url': 'https://avatars.githubusercontent.com/u/19497738?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/qthequartermasterman',
   'html_url': 'https://github.com/qthequartermasterman',
   'followers_url': 'https://api.github.com/users/qt

In [2]:
response.status_code

200

In [8]:
import os
from dotenv import load_dotenv
import requests

load_dotenv()  # reads .env into environment

token = os.getenv("GITHUB_TOKEN")

# Example GET request
headers = {"Authorization": f"Bearer {token}"}

In [9]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm

response = requests.get("https://api.github.com/repos/huggingface/datasets/issues?per_page=1", headers=headers)
print(response.status_code)
print(response.json())

def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("../data/hf_github_issues"),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(num_pages)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print(f"Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )

200
[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/7900', 'repository_url': 'https://api.github.com/repos/huggingface/datasets', 'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/7900/labels{/name}', 'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/7900/comments', 'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/7900/events', 'html_url': 'https://github.com/huggingface/datasets/issues/7900', 'id': 3711751590, 'node_id': 'I_kwDODunzps7dPNWm', 'number': 7900, 'title': '`Permission denied` when sharing cache between users', 'user': {'login': 'qthequartermasterman', 'id': 19497738, 'node_id': 'MDQ6VXNlcjE5NDk3NzM4', 'avatar_url': 'https://avatars.githubusercontent.com/u/19497738?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/qthequartermasterman', 'html_url': 'https://github.com/qthequartermasterman', 'followers_url': 'https://api.github.com/users/qthequartermasterman/followers', 'follo

In [11]:
# Depending on your internet connection, this can take several minutes to run...
fetch_issues()

  0%|          | 0/100 [00:00<?, ?it/s]

Reached GitHub rate limit. Sleeping for one hour ...
Downloaded all the issues for datasets! Dataset stored at ../data/hf_github_issues/datasets-issues.jsonl


In [15]:
# Convert to Dataset
from datasets import Dataset
import pandas as pd

df = pd.read_json("../data/hf_github_issues/datasets-issues.jsonl", lines=True)
issues_dataset = Dataset.from_pandas(df)
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'type', 'active_lock_reason', 'sub_issues_summary', 'issue_dependencies_summary', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'draft', 'pull_request'],
    num_rows: 7818
})

In [16]:
# Examine HTML and pull requests
sample = issues_dataset.shuffle(seed=42).select(range(5))
for url, pr in zip(sample["html_url"], sample["pull_request"]):
    print(f">> URL: {url}")
    print(f">> Pull Request: {pr}\n")

>> URL: https://github.com/huggingface/datasets/pull/316
>> Pull Request: {'diff_url': 'https://github.com/huggingface/datasets/pull/316.diff', 'html_url': 'https://github.com/huggingface/datasets/pull/316', 'merged_at': '2020-06-30T08:31:55Z', 'patch_url': 'https://github.com/huggingface/datasets/pull/316.patch', 'url': 'https://api.github.com/repos/huggingface/datasets/pulls/316'}

>> URL: https://github.com/huggingface/datasets/issues/854
>> Pull Request: None

>> URL: https://github.com/huggingface/datasets/pull/3647
>> Pull Request: {'diff_url': 'https://github.com/huggingface/datasets/pull/3647.diff', 'html_url': 'https://github.com/huggingface/datasets/pull/3647', 'merged_at': '2022-01-28T15:35:57Z', 'patch_url': 'https://github.com/huggingface/datasets/pull/3647.patch', 'url': 'https://api.github.com/repos/huggingface/datasets/pulls/3647'}

>> URL: https://github.com/huggingface/datasets/issues/5855
>> Pull Request: None

>> URL: https://github.com/huggingface/datasets/pull/579

In [17]:
# Create a new field indicating whether the issue is a pull request
issue_dataset = issues_dataset.map(
    lambda example: {"is_pull_request": False if example["pull_request"] is None else True}
)

Map:   0%|          | 0/7818 [00:00<?, ? examples/s]

In [18]:
# Examine comments for a specific issue
issue_number = 2792
url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
response = requests.get(url, headers=headers)
response.json()

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/comments/897594128',
  'html_url': 'https://github.com/huggingface/datasets/pull/2792#issuecomment-897594128',
  'issue_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792',
  'id': 897594128,
  'node_id': 'IC_kwDODunzps41gDMQ',
  'user': {'login': 'bhavitvyamalik',
   'id': 19718818,
   'node_id': 'MDQ6VXNlcjE5NzE4ODE4',
   'avatar_url': 'https://avatars.githubusercontent.com/u/19718818?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/bhavitvyamalik',
   'html_url': 'https://github.com/bhavitvyamalik',
   'followers_url': 'https://api.github.com/users/bhavitvyamalik/followers',
   'following_url': 'https://api.github.com/users/bhavitvyamalik/following{/other_user}',
   'gists_url': 'https://api.github.com/users/bhavitvyamalik/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}',
   'subscriptions_url': 'https://api.github.com/users/

In [20]:
# Get comments
def get_comments(issue_number):
    url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
    response = requests.get(url, headers=headers)
    return [r["body"] for r in response.json()]
issue_number = 2792
comments = get_comments(issue_number)
comments

["@albertvillanova my tests are failing here:\r\n```\r\ndataset_name = 'gooaq'\r\n\r\n    def test_load_dataset(self, dataset_name):\r\n        configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\r\n>       self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\r\n\r\ntests/test_dataset_common.py:234: \r\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \r\ntests/test_dataset_common.py:187: in check_load_dataset\r\n    self.parent.assertTrue(len(dataset[split]) > 0)\r\nE   AssertionError: False is not true\r\n```\r\nWhen I try loading dataset on local machine it works fine. Any suggestions on how can I avoid this error?",
 'Thanks for the help, @albertvillanova! All tests are passing now.']

In [None]:
# Add a new comments column to the dataset
issues_with_comments_dataset = issue_dataset.map(
    lambda example: {"comments": get_comments(example["number"])}
)

Map:   0%|          | 0/7818 [00:00<?, ? examples/s]

In [None]:
# Save dataset to the Hub
issues_with_comments_dataset.push_to_hub("hf-datasets-github-issues-with-comments")


In [None]:
# Load the dataset
from datasets import load_dataset
remote_dataset = load_dataset("tensor-polinomics/hf-datasets-github-issues-with-comments")
remote_dataset