### Create Github issues corpus

In [1]:
# install requests package if not installed
! pip install requests



In [2]:
# import requests package
import requests

In [3]:
# Download the github issues from dataset repository
# first let's get the first issue on first page
url = 'https://api.github.com/repos/huggingface/datasets/issues/1'
response = requests.get(url)

In [4]:
response.status_code

200

In [5]:
response.json()

{'url': 'https://api.github.com/repos/huggingface/datasets/issues/1',
 'repository_url': 'https://api.github.com/repos/huggingface/datasets',
 'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/1/labels{/name}',
 'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/1/comments',
 'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/1/events',
 'html_url': 'https://github.com/huggingface/datasets/pull/1',
 'id': 599457467,
 'node_id': 'MDExOlB1bGxSZXF1ZXN0NDAzMDk1NDYw',
 'number': 1,
 'title': 'changing nlp.bool to nlp.bool_',
 'user': {'login': 'mariamabarham',
  'id': 38249783,
  'node_id': 'MDQ6VXNlcjM4MjQ5Nzgz',
  'avatar_url': 'https://avatars.githubusercontent.com/u/38249783?v=4',
  'gravatar_id': '',
  'url': 'https://api.github.com/users/mariamabarham',
  'html_url': 'https://github.com/mariamabarham',
  'followers_url': 'https://api.github.com/users/mariamabarham/followers',
  'following_url': 'https://api.github.com/u

In [7]:
# get the github token from env file
! pip install python-dotenv
import os

from dotenv import load_dotenv
load_dotenv()





True

In [8]:
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
headers = {'Authorization': f'token {GITHUB_TOKEN}'}

In [9]:
# get the github issues from dataset repository

import time
import math
from pathlib import Path
import pandas as pd
from tqdm import tqdm

In [18]:
# define a function to get the github issues from dataset repository
all_issues = [] # debugging

def fetch_issues(
        owner='huggingface',
        repo='datasets',
        num_issues=10_000,
        rate_limit=5000,
        issues_path=Path("../data"),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    # all_issues = []
    per_page = 100 # number of issues to return per page
    num_pages = math.ceil(num_issues / per_page) # number of pages to request
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(1, num_pages)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        # if we reach the rate limit, save the batch and wait until we can make more requests
        if len(batch) >= rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = [] # flush batch for next iteration
            print(f"Reached Github rate limit. Sleeping for 1 min...")
            time.sleep(60) # sleep for 1 minute 

    # save the remaining issues
    all_issues.extend(batch)
    df = pd.Dataframe.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl, orient='records', lines=True")
    print (f"Downloaded all the issues for {repo}! Dataset saved at {issues_path}/{repo}-issues.jsonl")

In [19]:
# fetch the issues from the dataset repository
fetch_issues()

 49%|████▉     | 49/99 [01:00<00:56,  1.13s/it]

Reached Github rate limit. Sleeping for 1 min...


100%|██████████| 99/99 [02:23<00:00,  1.45s/it]


AttributeError: module 'pandas' has no attribute 'Dataframe'

In [21]:
len(all_issues)


5841

In [23]:
# create a dataframe from all_issues list
df = pd.DataFrame.from_records(all_issues)
df.head()

Unnamed: 0,url,repository_url,labels_url,comments_url,events_url,html_url,id,node_id,number,title,...,closed_at,author_association,active_lock_reason,body,reactions,timeline_url,performed_via_github_app,state_reason,draft,pull_request
0,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1754359316,I_kwDODunzps5okWYU,5947,Return the audio filename when decoding fails ...,...,,NONE,,### Feature request\r\n\r\nReturn the audio fi...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,,
1,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1754234469,I_kwDODunzps5oj35l,5946,IndexError Not Solving -> IndexError: Invalid ...,...,,NONE,,### Describe the bug\n\nin <cell line: 1>:1 ...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,,
2,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1754084577,I_kwDODunzps5ojTTh,5945,Failing to upload dataset to the hub,...,,NONE,,### Describe the bug\n\nTrying to upload a dat...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,,
3,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/pull/5944,1752882200,PR_kwDODunzps5Sx7O4,5944,Arrow dataset builder to be able to load and s...,...,,CONTRIBUTOR,,This adds a Arrow dataset builder to be able t...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,False,{'url': 'https://api.github.com/repos/huggingf...
4,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1752824336,I_kwDODunzps5oefoQ,5943,Language `lzh` is not shown on the web interface,...,,NONE,,### Describe the bug\r\n\r\nDespite its popula...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,,


In [29]:
df.tail()

Unnamed: 0,url,repository_url,labels_url,comments_url,events_url,html_url,id,node_id,number,title,...,closed_at,author_association,active_lock_reason,body,reactions,timeline_url,performed_via_github_app,state_reason,draft,pull_request
5836,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues/5,600295889,MDU6SXNzdWU2MDAyOTU4ODk=,5,ValueError when a split is empty,...,2020-04-29T09:23:05Z,CONTRIBUTOR,,"When a split is empty either TEST, VALIDATION ...",{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,completed,,
5837,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues/4,600185417,MDU6SXNzdWU2MDAxODU0MTc=,4,[Feature] Keep the list of labels of a dataset...,...,2020-05-04T06:11:57Z,CONTRIBUTOR,,It would be useful to keep the list of the lab...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,completed,,
5838,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues/3,600180050,MDU6SXNzdWU2MDAxODAwNTA=,3,[Feature] More dataset outputs,...,2020-05-04T06:12:27Z,CONTRIBUTOR,,Add the following dataset outputs:\r\n\r\n- Sp...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,completed,,
5839,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues/2,599767671,MDU6SXNzdWU1OTk3Njc2NzE=,2,Issue to read a local dataset,...,2020-05-11T18:55:22Z,CONTRIBUTOR,,"Hello,\r\n\r\nAs proposed by @thomwolf, I open...",{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,completed,,
5840,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/pull/1,599457467,MDExOlB1bGxSZXF1ZXN0NDAzMDk1NDYw,1,changing nlp.bool to nlp.bool_,...,2020-04-14T12:01:40Z,CONTRIBUTOR,,,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,False,{'url': 'https://api.github.com/repos/huggingf...


In [25]:
owner='huggingface'
repo='datasets'
num_issues=10_000
rate_limit=5000
issues_path=Path("../data")
df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient='records', lines=True)
print (f"Downloaded all the issues for {repo}! Dataset saved at {issues_path}/{repo}-issues.jsonl")

Downloaded all the issues for datasets! Dataset saved at ..\data/datasets-issues.jsonl


In [26]:
from datasets import load_dataset

In [27]:
# load the issues from local file
issues_dataset = load_dataset('json', data_files='../data/datasets-issues.jsonl', split='train')

Downloading and preparing dataset json/default to C:/Users/Raj/.cache/huggingface/datasets/json/default-db71a5ded63c27a6/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetGenerationError: An error occurred while generating the dataset

In [28]:
issues_dataset

NameError: name 'issues_dataset' is not defined