### Create Github issues corpus

In [5]:
# install requests package if not installed
! pip install requests



In [6]:
# import requests package
import requests

In [7]:
# Download the github issues from dataset repository
# first let's get the first issue on first page
url = 'https://api.github.com/repos/huggingface/datasets/issues/1'
response = requests.get(url)

In [8]:
response.status_code

200

In [9]:
response.json()

{'url': 'https://api.github.com/repos/huggingface/datasets/issues/1',
 'repository_url': 'https://api.github.com/repos/huggingface/datasets',
 'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/1/labels{/name}',
 'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/1/comments',
 'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/1/events',
 'html_url': 'https://github.com/huggingface/datasets/pull/1',
 'id': 599457467,
 'node_id': 'MDExOlB1bGxSZXF1ZXN0NDAzMDk1NDYw',
 'number': 1,
 'title': 'changing nlp.bool to nlp.bool_',
 'user': {'login': 'mariamabarham',
  'id': 38249783,
  'node_id': 'MDQ6VXNlcjM4MjQ5Nzgz',
  'avatar_url': 'https://avatars.githubusercontent.com/u/38249783?v=4',
  'gravatar_id': '',
  'url': 'https://api.github.com/users/mariamabarham',
  'html_url': 'https://github.com/mariamabarham',
  'followers_url': 'https://api.github.com/users/mariamabarham/followers',
  'following_url': 'https://api.github.com/u

In [10]:
# get the github token from env file
! pip install python-dotenv
import os

from dotenv import load_dotenv
load_dotenv()





True

In [11]:
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
headers = {'Authorization': f'token {GITHUB_TOKEN}'}

In [12]:
# get the github issues from dataset repository

import time
import math
from pathlib import Path
import pandas as pd
from tqdm import tqdm

In [14]:
# define a function to get the github issues from dataset repository
all_issues = [] # debugging

def fetch_issues(
        owner='huggingface',
        repo='datasets',
        num_issues=5000,
        rate_limit=5000,
        issues_path=Path("../data"),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    # all_issues = []
    per_page = 100 # number of issues to return per page
    num_pages = math.ceil(num_issues / per_page) # number of pages to request
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(1, num_pages)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        # if we reach the rate limit, save the batch and wait until we can make more requests
        if len(batch) >= rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = [] # flush batch for next iteration
            print(f"Reached Github rate limit. Sleeping for 1 min...")
            time.sleep(60) # sleep for 1 minute 

    # save the remaining issues
    all_issues.extend(batch)
    df = pd.Dataframe.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl, orient='records', lines=True")
    print (f"Downloaded all the issues for {repo}! Dataset saved at {issues_path}/{repo}-issues.jsonl")

In [15]:
# fetch the issues from the dataset repository
fetch_issues()

100%|██████████| 49/49 [00:54<00:00,  1.11s/it]


AttributeError: module 'pandas' has no attribute 'Dataframe'

In [16]:
len(all_issues)


4900

In [17]:
# create a dataframe from all_issues list
df = pd.DataFrame.from_records(all_issues)
df.head()

Unnamed: 0,url,repository_url,labels_url,comments_url,events_url,html_url,id,node_id,number,title,...,closed_at,author_association,active_lock_reason,body,reactions,timeline_url,performed_via_github_app,state_reason,draft,pull_request
0,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1755197946,I_kwDODunzps5onjH6,5950,Support for data with instance-wise dictionary...,...,,NONE,,### Feature request\n\nI notice that when load...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,,
1,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/pull/5949,1754843717,PR_kwDODunzps5S4oPC,5949,Replace metadata utils with `huggingface_hub`'...,...,,CONTRIBUTOR,,Use `huggingface_hub`'s RepoCard API instead o...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,False,{'url': 'https://api.github.com/repos/huggingf...
2,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/pull/5948,1754794611,PR_kwDODunzps5S4dUt,5948,Fix sequence of array support for most dtype,...,,CONTRIBUTOR,,"Fixes #5936 \r\nAlso, a related fix to #5927",{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,False,{'url': 'https://api.github.com/repos/huggingf...
3,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1754359316,I_kwDODunzps5okWYU,5947,Return the audio filename when decoding fails ...,...,,NONE,,### Feature request\r\n\r\nReturn the audio fi...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,,
4,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1754234469,I_kwDODunzps5oj35l,5946,IndexError Not Solving -> IndexError: Invalid ...,...,,NONE,,### Describe the bug\n\nin <cell line: 1>:1 ...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,,


In [18]:
df.tail()

Unnamed: 0,url,repository_url,labels_url,comments_url,events_url,html_url,id,node_id,number,title,...,closed_at,author_association,active_lock_reason,body,reactions,timeline_url,performed_via_github_app,state_reason,draft,pull_request
4895,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/pull/959,754418610,MDExOlB1bGxSZXF1ZXN0NTMwMzIxOTM1,959,Add Tunizi Dataset,...,2020-12-03T14:21:40Z,MEMBER,,,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,False,{'url': 'https://api.github.com/repos/huggingf...
4896,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/pull/958,754404095,MDExOlB1bGxSZXF1ZXN0NTMwMzA5ODkz,958,dataset(ncslgr): add initial loading script,...,2020-12-07T16:35:39Z,CONTRIBUTOR,,clean #789,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,False,{'url': 'https://api.github.com/repos/huggingf...
4897,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/pull/957,754380073,MDExOlB1bGxSZXF1ZXN0NTMwMjg5OTk4,957,Isixhosa ner corpus,...,2020-12-01T18:14:58Z,CONTRIBUTOR,,,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,False,{'url': 'https://api.github.com/repos/huggingf...
4898,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/pull/956,754368378,MDExOlB1bGxSZXF1ZXN0NTMwMjgwMzU1,956,Add Norwegian NER,...,2020-12-01T18:09:21Z,CONTRIBUTOR,,This PR adds the [Norwegian NER](https://githu...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,False,{'url': 'https://api.github.com/repos/huggingf...
4899,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/pull/955,754367291,MDExOlB1bGxSZXF1ZXN0NTMwMjc5NDQw,955,Added PragmEval benchmark,...,2020-12-03T09:36:47Z,CONTRIBUTOR,,,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,False,{'url': 'https://api.github.com/repos/huggingf...


In [19]:
# list all null values in the dataframe
df.isnull().sum()

url                            0
repository_url                 0
labels_url                     0
comments_url                   0
events_url                     0
html_url                       0
id                             0
node_id                        0
number                         0
title                          0
user                           0
labels                         0
state                          0
locked                         0
assignee                    4293
assignees                      0
milestone                   4839
comments                       0
created_at                     0
updated_at                     0
closed_at                    597
author_association             0
active_lock_reason          4900
body                         180
reactions                      0
timeline_url                   0
performed_via_github_app    4900
state_reason                3515
draft                       1901
pull_request                1901
dtype: int

In [20]:
# dataframe outside of the function definition works fine
# owner='huggingface'
repo='datasets'
# num_issues=10_000
# rate_limit=5000
issues_path=Path("../data")
df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient='records', lines=True)
print (f"Downloaded all the issues for {repo}! Dataset saved at {issues_path}/{repo}-issues.jsonl")

Downloaded all the issues for datasets! Dataset saved at ..\data/datasets-issues.jsonl


In [23]:
from datasets import load_dataset
from datasets import load_from_disk

In [30]:
# load the local file from disk
# issues_dataset = load_from_disk('../data/datasets-issues.jsonl')
issues_dataset = load_dataset('json', data_files='../data/datasets-issues.jsonl', split='train', streaming=True)

In [35]:
# get the next example from the dataset
next(iter(issues_dataset))

{'url': 'https://api.github.com/repos/huggingface/datasets/issues/5950',
 'repository_url': 'https://api.github.com/repos/huggingface/datasets',
 'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/5950/labels{/name}',
 'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/5950/comments',
 'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/5950/events',
 'html_url': 'https://github.com/huggingface/datasets/issues/5950',
 'id': 1755197946,
 'node_id': 'I_kwDODunzps5onjH6',
 'number': 5950,
 'title': 'Support for data with instance-wise dictionary as features',
 'user': {'login': 'richardwth',
  'id': 33274336,
  'node_id': 'MDQ6VXNlcjMzMjc0MzM2',
  'avatar_url': 'https://avatars.githubusercontent.com/u/33274336?v=4',
  'gravatar_id': '',
  'url': 'https://api.github.com/users/richardwth',
  'html_url': 'https://github.com/richardwth',
  'followers_url': 'https://api.github.com/users/richardwth/followers',
  'following_url': 'h

#### Cleaning up data

In [37]:
from itertools import islice

In [43]:
# sample the iterable dataset
sample = list(islice(issues_dataset, 5))
sample_df = pd.DataFrame(sample)
sample_df.head()

Unnamed: 0,url,repository_url,labels_url,comments_url,events_url,html_url,id,node_id,number,title,...,closed_at,author_association,active_lock_reason,body,reactions,timeline_url,performed_via_github_app,state_reason,draft,pull_request
0,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1755197946,I_kwDODunzps5onjH6,5950,Support for data with instance-wise dictionary...,...,,NONE,,### Feature request\n\nI notice that when load...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,,
1,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/pull/5949,1754843717,PR_kwDODunzps5S4oPC,5949,Replace metadata utils with `huggingface_hub`'...,...,,CONTRIBUTOR,,Use `huggingface_hub`'s RepoCard API instead o...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,False,{'url': 'https://api.github.com/repos/huggingf...
2,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/pull/5948,1754794611,PR_kwDODunzps5S4dUt,5948,Fix sequence of array support for most dtype,...,,CONTRIBUTOR,,"Fixes #5936 \r\nAlso, a related fix to #5927",{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,False,{'url': 'https://api.github.com/repos/huggingf...
3,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1754359316,I_kwDODunzps5okWYU,5947,Return the audio filename when decoding fails ...,...,,NONE,,### Feature request\r\n\r\nReturn the audio fi...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,,
4,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1754234469,I_kwDODunzps5oj35l,5946,IndexError Not Solving -> IndexError: Invalid ...,...,,NONE,,### Describe the bug\n\nin <cell line: 1>:1 ...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,,


In [45]:
# print out the url and pull request entries
for url, pr in zip(sample_df['html_url'], sample_df['pull_request']):
    print(f"URL: {url}")
    print(f"Pull Request: {pr}")
    print()

URL: https://github.com/huggingface/datasets/issues/5950
Pull Request: None

URL: https://github.com/huggingface/datasets/pull/5949
Pull Request: {'url': 'https://api.github.com/repos/huggingface/datasets/pulls/5949', 'html_url': 'https://github.com/huggingface/datasets/pull/5949', 'diff_url': 'https://github.com/huggingface/datasets/pull/5949.diff', 'patch_url': 'https://github.com/huggingface/datasets/pull/5949.patch', 'merged_at': None}

URL: https://github.com/huggingface/datasets/pull/5948
Pull Request: {'url': 'https://api.github.com/repos/huggingface/datasets/pulls/5948', 'html_url': 'https://github.com/huggingface/datasets/pull/5948', 'diff_url': 'https://github.com/huggingface/datasets/pull/5948.diff', 'patch_url': 'https://github.com/huggingface/datasets/pull/5948.patch', 'merged_at': None}

URL: https://github.com/huggingface/datasets/issues/5947
Pull Request: None

URL: https://github.com/huggingface/datasets/issues/5946
Pull Request: None



In [48]:
# create a new column to store is_pull_request (pandas version)
cleanup_df = df.copy()
cleanup_df['is_pull_request'] = cleanup_df['pull_request'].apply(lambda x: False if pd.isnull(x) else True)
cleanup_df.head()

Unnamed: 0,url,repository_url,labels_url,comments_url,events_url,html_url,id,node_id,number,title,...,author_association,active_lock_reason,body,reactions,timeline_url,performed_via_github_app,state_reason,draft,pull_request,is_pull_request
0,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1755197946,I_kwDODunzps5onjH6,5950,Support for data with instance-wise dictionary...,...,NONE,,### Feature request\n\nI notice that when load...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,,,False
1,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/pull/5949,1754843717,PR_kwDODunzps5S4oPC,5949,Replace metadata utils with `huggingface_hub`'...,...,CONTRIBUTOR,,Use `huggingface_hub`'s RepoCard API instead o...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,False,{'url': 'https://api.github.com/repos/huggingf...,True
2,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/pull/5948,1754794611,PR_kwDODunzps5S4dUt,5948,Fix sequence of array support for most dtype,...,CONTRIBUTOR,,"Fixes #5936 \r\nAlso, a related fix to #5927",{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,False,{'url': 'https://api.github.com/repos/huggingf...,True
3,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1754359316,I_kwDODunzps5okWYU,5947,Return the audio filename when decoding fails ...,...,NONE,,### Feature request\r\n\r\nReturn the audio fi...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,,,False
4,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1754234469,I_kwDODunzps5oj35l,5946,IndexError Not Solving -> IndexError: Invalid ...,...,NONE,,### Describe the bug\n\nin <cell line: 1>:1 ...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,,,False


In [49]:
# create a new column to store is_pull_request (datasets version)
issues_dataset = issues_dataset.map(lambda x: {'is_pull_request': False if pd.isnull(x['pull_request']) else True})

In [52]:
# calculate the number of pull requests
cleanup_df['is_pull_request'].value_counts()

is_pull_request
True     2999
False    1901
Name: count, dtype: int64

In [53]:
# testing the github comments endpoint
issue_number = 5950
url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
response = requests.get(url, headers=headers)
response.json()

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/comments/1591075911',
  'html_url': 'https://github.com/huggingface/datasets/issues/5950#issuecomment-1591075911',
  'issue_url': 'https://api.github.com/repos/huggingface/datasets/issues/5950',
  'id': 1591075911,
  'node_id': 'IC_kwDODunzps5e1eRH',
  'user': {'login': 'lhoestq',
   'id': 42851186,
   'node_id': 'MDQ6VXNlcjQyODUxMTg2',
   'avatar_url': 'https://avatars.githubusercontent.com/u/42851186?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/lhoestq',
   'html_url': 'https://github.com/lhoestq',
   'followers_url': 'https://api.github.com/users/lhoestq/followers',
   'following_url': 'https://api.github.com/users/lhoestq/following{/other_user}',
   'gists_url': 'https://api.github.com/users/lhoestq/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/lhoestq/starred{/owner}{/repo}',
   'subscriptions_url': 'https://api.github.com/users/lhoestq/subscriptions',
   'organizations_ur

In [54]:
# define a function to get the github comments from an issue
def get_comments(issue_number):
    url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
    response = requests.get(url, headers=headers)
    return [r["body"] for r in response.json()]

In [55]:
# test the function works
get_comments(5950)

['Hi ! We use the Arrow columnar format under the hood, which doesn\'t support such dictionaries: each field must have a fixed type and exist in each sample.\r\n\r\nInstead you can restructure your data like\r\n```\r\n{\r\n    "index": 0,\r\n    "keys": ["2 * x + y >= 3"],\r\n    "values": [["2 * x + y >= 3", "4 * x + 2 * y >= 6"]],\r\n    }\r\n},\r\n...\r\n{\r\n    "index": 9999,\r\n    "keys": ["x >= 6"],\r\n    "values": [["x >= 6", "x >= 0", "x >= -1"]],\r\n},\r\n...\r\n```']

In [56]:
# create a new column to store the comments
cleanup_df['comments'] = cleanup_df['number'].apply(get_comments)

In [57]:
cleanup_df.head()

Unnamed: 0,url,repository_url,labels_url,comments_url,events_url,html_url,id,node_id,number,title,...,author_association,active_lock_reason,body,reactions,timeline_url,performed_via_github_app,state_reason,draft,pull_request,is_pull_request
0,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1755197946,I_kwDODunzps5onjH6,5950,Support for data with instance-wise dictionary...,...,NONE,,### Feature request\n\nI notice that when load...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,,,False
1,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/pull/5949,1754843717,PR_kwDODunzps5S4oPC,5949,Replace metadata utils with `huggingface_hub`'...,...,CONTRIBUTOR,,Use `huggingface_hub`'s RepoCard API instead o...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,False,{'url': 'https://api.github.com/repos/huggingf...,True
2,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/pull/5948,1754794611,PR_kwDODunzps5S4dUt,5948,Fix sequence of array support for most dtype,...,CONTRIBUTOR,,"Fixes #5936 \r\nAlso, a related fix to #5927",{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,False,{'url': 'https://api.github.com/repos/huggingf...,True
3,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1754359316,I_kwDODunzps5okWYU,5947,Return the audio filename when decoding fails ...,...,NONE,,### Feature request\r\n\r\nReturn the audio fi...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,,,False
4,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1754234469,I_kwDODunzps5oj35l,5946,IndexError Not Solving -> IndexError: Invalid ...,...,NONE,,### Describe the bug\n\nin <cell line: 1>:1 ...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,,,False


In [58]:
# check for null values in the comments column
cleanup_df['comments'].isnull().sum()

0

In [59]:
# sample comments column
cleanup_df['comments'].sample(5)

3208                                                   []
536     [_The documentation is not available anymore a...
2999    [@lhoestq, thanks for your review.\r\n\r\nI ad...
3102                                                   []
4082    [We are very actively working on this. How doe...
Name: comments, dtype: object

In [62]:
# show all the columns of the dataframe
cleanup_df.columns

Index(['url', 'repository_url', 'labels_url', 'comments_url', 'events_url',
       'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels',
       'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments',
       'created_at', 'updated_at', 'closed_at', 'author_association',
       'active_lock_reason', 'body', 'reactions', 'timeline_url',
       'performed_via_github_app', 'state_reason', 'draft', 'pull_request',
       'is_pull_request'],
      dtype='object')