In [6]:

import requests
import dotenv 
import os
from tqdm import tqdm
dotenv.load_dotenv()
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

In [7]:
ENDPOINT = 'https://api.github.com/graphql'
HEADERS_QGL = {
    "Authorization": f"Bearer {GITHUB_TOKEN}",
    "Accept": "application/vnd.github.starfire-preview+json",
}

In [8]:


def search_repos_query(query, after_cursor=None):
    if after_cursor is None:
        query = """
        {
            search(query: """ + query  + """ , type: REPOSITORY, first: 100) {
                pageInfo {
                    hasNextPage
                    endCursor
                }
                nodes {
                    ... on Repository {
                        nameWithOwner
                        stargazerCount
                        primaryLanguage {
                            name
                        }
                        issueTemplates {
                            filename
                            body
                            name
                            title
                        }
                        owner {
                            login
                        }
                        pullRequestTemplates {
                            filename
                            body
                        }
                        primaryLanguage {
                            name
                        }
                        hasWikiEnabled
                        description
                    }
                }
            }
        }
        """
    else:
        query = '''
        {
            search(query: "stars:>1000", type: REPOSITORY, first: 100, after: ''' + after_cursor + ''') {
                pageInfo {
                    hasNextPage
                    endCursor
                }
                nodes {
                    ... on Repository {
                        nameWithOwner
                        stargazerCount
                        primaryLanguage {
                            name
                        }
                        issueTemplates {
                            filename
                            body
                            name
                            title
                        }
                        owner {
                            login
                        }
                        pullRequestTemplates {
                            filename
                            body
                        }
                        primaryLanguage {
                            name
                        }
                        hasWikiEnabled
                        description
                    }
                }
            }
        }
        '''
    return query

def form_query(query, after_cursor=None):
    if after_cursor is None:
        return search_repos_query(query)
    else:
        return search_repos_query(query,after_cursor=after_cursor)        

In [9]:
def get_repos_from_query(query):
    templates_repo = []
    individual_issue_contents = []
    individual_pr_contents = []
    after_cursor = None
    total_projects = 0
    for i in range(1, 11):
        print(f"Page {i}")
        query = form_query(query, after_cursor=after_cursor)
        response = requests.post(ENDPOINT, json={'query': query}, headers=HEADERS_QGL)
        data = response.json()
        after_cursor = '"' + data["data"]["search"]["pageInfo"]["endCursor"] + '"'
        total_projects += len(data["data"]["search"]["nodes"])
        for node in data["data"]["search"]["nodes"]:
            # check if the repo has issue templates and pull request templates
            if node["issueTemplates"] and node["pullRequestTemplates"]:
                # add the repo to the list
                # form a json object
                json_data = {}
                issue_files = []
                for templates in node["issueTemplates"]:
                    issue_contents_data = {}
                    issue_contents_data['repo'] = node["nameWithOwner"]
                    issue_contents_data['owner'] = node["owner"]["login"]
                    issue_contents_data['issue_file'] = templates['filename']
                    issue_contents_data['title'] = templates['title']
                    issue_contents_data['body'] = templates['body']
                    issue_contents_data['name'] = templates['name']
                    issue_files.append(templates['filename'])
                    individual_issue_contents.append(issue_contents_data)
                file_names = []
                for templates in node["pullRequestTemplates"]:
                    pr_contents_data = {}
                    pr_contents_data['repo'] = node["nameWithOwner"]
                    pr_contents_data['owner'] = node["owner"]["login"]
                    pr_contents_data['pr_file'] = templates['filename']
                    pr_contents_data['body'] = templates['body']
                    file_names.append(templates['filename'])
                    individual_pr_contents.append(pr_contents_data)
                if node['primaryLanguage'] is None:
                    if "language:" in query:
                        query_lang = query.split("language:")[1].split(" ")[0]
                        json_data['primaryLanguage'] = query_lang
                    else:
                        json_data['primaryLanguage'] = "None"
                else:
                    json_data['primaryLanguage'] = node['primaryLanguage']['name']
                if node['description'] is None:
                    json_data['description'] = "None"
                else:
                    json_data['description'] = node['description']
                json_data['hasWikiEnabled'] = node['hasWikiEnabled']
                json_data['stargazerCount'] = node['stargazerCount']
                json_data['pull_request_files'] = file_names
                json_data['issue_files'] = issue_files
                json_data['nameWithOwner'] = node['nameWithOwner']
                json_data['owner'] = node['owner']['login']
                templates_repo.append(json_data)
        if not data["data"]["search"]["pageInfo"]["hasNextPage"]:
            break

    print(len(templates_repo))
    print(total_projects)
    return templates_repo, individual_issue_contents, individual_pr_contents, total_projects

In [10]:
all_templates_repo = []
all_issue_contents = []
all_pr_contents = []
all_total_projects = 0
# get stars:>1000 repos 
query = '"stars:>1000"'
templates_repo, individual_issue_contents, individual_pr_contents, total_projects = get_repos_from_query(query=query)
all_templates_repo.extend(templates_repo)
all_issue_contents.extend(individual_issue_contents)
all_pr_contents.extend(individual_pr_contents)
all_total_projects += total_projects
# get top python repos
query = '"stars:>1000 language:python"'
templates_repo, individual_issue_contents, individual_pr_contents, total_projects = get_repos_from_query(query=query)
all_templates_repo.extend(templates_repo)
all_issue_contents.extend(individual_issue_contents)
all_pr_contents.extend(individual_pr_contents)
all_total_projects += total_projects


# get top java repos
query = '"stars:>1000 language:java"'
templates_repo, individual_issue_contents, individual_pr_contents, total_projects = get_repos_from_query(query=query)
all_templates_repo.extend(templates_repo)
all_issue_contents.extend(individual_issue_contents)
all_pr_contents.extend(individual_pr_contents)
all_total_projects += total_projects


# get top javascript repos
query = '"stars:>1000 language:javascript"'
templates_repo, individual_issue_contents, individual_pr_contents, total_projects = get_repos_from_query(query=query)
all_templates_repo.extend(templates_repo)
all_issue_contents.extend(individual_issue_contents)
all_pr_contents.extend(individual_pr_contents)
all_total_projects += total_projects


# get top c++ repos
query = '"stars:>1000 language:c++"'
templates_repo, individual_issue_contents, individual_pr_contents, total_projects = get_repos_from_query(query=query)
all_templates_repo.extend(templates_repo)
all_issue_contents.extend(individual_issue_contents)
all_pr_contents.extend(individual_pr_contents)
all_total_projects += total_projects

query = '"stars:>1000 language:c"'
templates_repo, individual_issue_contents, individual_pr_contents, total_projects = get_repos_from_query(query=query)
all_templates_repo.extend(templates_repo)
all_issue_contents.extend(individual_issue_contents)
all_pr_contents.extend(individual_pr_contents)
all_total_projects += total_projects

query = '"stars:>1000 language:go"'
templates_repo, individual_issue_contents, individual_pr_contents, total_projects = get_repos_from_query(query=query)
all_templates_repo.extend(templates_repo)
all_issue_contents.extend(individual_issue_contents)
all_pr_contents.extend(individual_pr_contents)
all_total_projects += total_projects

query = '"stars:>1000 language:ruby"'
templates_repo, individual_issue_contents, individual_pr_contents, total_projects = get_repos_from_query(query=query)
all_templates_repo.extend(templates_repo)
all_issue_contents.extend(individual_issue_contents)
all_pr_contents.extend(individual_pr_contents)
all_total_projects += total_projects

query = '"stars:>1000 language:php"'
templates_repo, individual_issue_contents, individual_pr_contents, total_projects = get_repos_from_query(query=query)
all_templates_repo.extend(templates_repo)
all_issue_contents.extend(individual_issue_contents)
all_pr_contents.extend(individual_pr_contents)
all_total_projects += total_projects

query = '"stars:>1000 language:rust"'
templates_repo, individual_issue_contents, individual_pr_contents, total_projects = get_repos_from_query(query=query)
all_templates_repo.extend(templates_repo)
all_issue_contents.extend(individual_issue_contents)
all_pr_contents.extend(individual_pr_contents)
all_total_projects += total_projects

query = '"stars:>1000 language:swift"'
templates_repo, individual_issue_contents, individual_pr_contents, total_projects = get_repos_from_query(query=query)
all_templates_repo.extend(templates_repo)
all_issue_contents.extend(individual_issue_contents)
all_pr_contents.extend(individual_pr_contents)
all_total_projects += total_projects

query = '"stars:>1000 language:typescript"'
templates_repo, individual_issue_contents, individual_pr_contents, total_projects = get_repos_from_query(query=query)
all_templates_repo.extend(templates_repo)
all_issue_contents.extend(individual_issue_contents)
all_pr_contents.extend(individual_pr_contents)
all_total_projects += total_projects

print(all_total_projects)
print(len(all_templates_repo))

Page 1
Page 2
Page 3
Page 4
Page 5
Page 6
Page 7
Page 8
Page 9
Page 10
182
1000
Page 1
Page 2
Page 3
Page 4
Page 5
Page 6
Page 7
Page 8
Page 9
Page 10
186
1000
Page 1
Page 2
Page 3
Page 4
Page 5
Page 6
Page 7
Page 8
Page 9
Page 10
181
1000
Page 1
Page 2
Page 3
Page 4
Page 5
Page 6
Page 7
Page 8
Page 9
Page 10
188
1000
Page 1
Page 2
Page 3
Page 4
Page 5
Page 6
Page 7
Page 8
Page 9
Page 10
185
1000
Page 1
Page 2
Page 3
Page 4
Page 5
Page 6
Page 7
Page 8
Page 9
Page 10
173
1000
Page 1
Page 2
Page 3
Page 4
Page 5
Page 6
Page 7
Page 8
Page 9
Page 10
197
1000
Page 1
Page 2
Page 3
Page 4
Page 5
Page 6
Page 7
Page 8
Page 9
Page 10
182
1000
Page 1
Page 2
Page 3
Page 4
Page 5
Page 6
Page 7
Page 8
Page 9
Page 10
178
1000
Page 1
Page 2
Page 3
Page 4
Page 5
Page 6
Page 7
Page 8
Page 9
Page 10
194
1000
Page 1
Page 2
Page 3
Page 4
Page 5
Page 6
Page 7
Page 8
Page 9
Page 10
176
1000
Page 1
Page 2
Page 3
Page 4
Page 5
Page 6
Page 7
Page 8
Page 9
Page 10
187
1000
12000
2209


In [1]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

df = pd.DataFrame(all_templates_repo)

for repo in all_templates_repo:
    repo['nameWithOwner'] = repo['nameWithOwner'].replace('/', '_')
    repo['owner'] = repo['owner'].replace('/', '_')
    repo['description'] = repo['description']
    repo['issue_files_to_str'] = ",".join(repo['issue_files']) 
    repo['pull_request_files_to_str'] = ",".join(repo['pull_request_files'])
    df = df.append(repo, ignore_index=True)

df.to_csv('templates_repo.csv', index=False)

NameError: name 'all_templates_repo' is not defined

In [12]:
issue_df = pd.DataFrame(all_issue_contents)

for issue_content in individual_issue_contents:
    issue_content['repo'] = issue_content['repo'].replace('/', '_')
    issue_content['owner'] = issue_content['owner'].replace('/', '_')
    issue_content['issue_file'] = issue_content['issue_file'].replace('/', '_')
    issue_content['body'] = issue_content['body'].replace('/', '_')
    issue_content['name'] = issue_content['name'].replace('/', '_')
    issue_content['issue_file'] = issue_content['issue_file'].replace('/', '_')
    issue_df = issue_df.append(issue_content, ignore_index=True)

issue_df.to_csv('individual_issue_contents.csv', index=False)

In [13]:
issue_df

Unnamed: 0,repo,owner,issue_file,title,body,name
0,facebook/react,facebook,bug_report.md,Bug:,<!--\n Please provide a clear and concise des...,🐛 Bug Report
1,getify/You-Dont-Know-JS,getify,content-question.md,,"**Yes, I promise I've read the [Contributions ...",Content Question
2,getify/You-Dont-Know-JS,getify,foreign-translation-request.md,,"Foreign translations are appreciated. However,...",Foreign Translation Request
3,getify/You-Dont-Know-JS,getify,report-technical-mistake.md,,"**Yes, I promise I've read the [Contributions ...",Report Technical Mistake
4,getify/You-Dont-Know-JS,getify,textual-grammar-typo.md,,"**Yes, I promise I've read the [Contributions ...",Textual/Grammar Typo
...,...,...,...,...,...,...
6569,forem_forem,forem,bug_report.md,,"<!-- Before creating a bug report, try disabli...",Bug report
6570,go-kratos_kratos,go-kratos,bug-report.md,,<!--\nPlease answer these questions before sub...,🐛 Bug Report
6571,go-kratos_kratos,go-kratos,feature-request.md,[Feature],Please see the FAQ in our main README.md befor...,💡 Feature Request
6572,go-kratos_kratos,go-kratos,proposal.md,[Proposal],Please see the FAQ in our main README.md befor...,🧱 Proposal Request


In [14]:
pr_df = pd.DataFrame(all_pr_contents)

for pr_content in individual_pr_contents:
    pr_content['repo'] = pr_content['repo'].replace('/', '_')
    pr_content['owner'] = pr_content['owner'].replace('/', '_')
    pr_content['pr_file'] = pr_content['pr_file'].replace('/', '_')
    pr_content['body'] = pr_content['body'].replace('/', '_')
    pr_df = pr_df.append(pr_content, ignore_index=True)

pr_df.to_csv('individual_pr_contents.csv', index=False)

In [15]:


# load cvs file 
df = pd.read_csv('templates_repo.csv')

# remove issue_files and pull_request_files columns that have nan values and just ',' in them
df = df.dropna(subset=['issue_files', 'pull_request_files'], how='all')
df = df[(df['issue_files'] != ',') & (df['pull_request_files'] != ',')]
df = df.reset_index(drop=True)

df = df.drop_duplicates(subset=['nameWithOwner', 'owner'], keep='first')
df = df.reset_index(drop=True)

df['issue_files_to_str'] = df['issue_files'].apply(lambda x: x.replace('[', ''))
df['issue_files_to_str'] = df['issue_files_to_str'].apply(lambda x: x.replace(']', ''))
df['issue_files_to_str'] = df['issue_files_to_str'].apply(lambda x: x.replace("'", ''))
df['issue_files_to_str'] = df['issue_files_to_str'].apply(lambda x: x.replace(" ", ''))

df['pull_request_files_to_str'] = df['pull_request_files'].apply(lambda x: x.replace('[', ''))
df['pull_request_files_to_str'] = df['pull_request_files_to_str'].apply(lambda x: x.replace(']', ''))
df['pull_request_files_to_str'] = df['pull_request_files_to_str'].apply(lambda x: x.replace("'", ''))
df['pull_request_files_to_str'] = df['pull_request_files_to_str'].apply(lambda x: x.replace(" ", ''))

df['len_of_issue_files'] = df['issue_files_to_str'].apply(lambda x: len(x.split(',')))
df['len_of_pull_request_files'] = df['pull_request_files_to_str'].apply(lambda x: len(x.split(',')))

    primaryLanguage                                        description  \
1              None    A book series on JavaScript. @YDKJS on twitter.   
3              Dart  Flutter makes it easy and fast to build beauti...   
4        TypeScript                                 Visual Studio Code   
6            Python  Command-line program to download videos from Y...   
7          Markdown  刷算法全靠套路，认准 labuladong 就够了！English version supp...   
..              ...                                                ...   
530           Swift                                    Firefox for iOS   
533           Swift                Websockets in swift for iOS and OSX   
534           Swift  A tool for defining design systems and using t...   
536           Swift                          Windows alt-tab on macOS    
537           Swift  SwiftEntryKit is a presentation library for iO...   

     hasWikiEnabled  stargazerCount            pull_request_files  \
1             False          167317  ['PUL

In [16]:
len(df)

538

In [17]:

issue_contents_df = pd.read_csv('individual_issue_contents.csv')

issue_contents_df = issue_contents_df.dropna(subset=['issue_file', 'body'], how='all')
issue_contents_df = issue_contents_df[(issue_contents_df['issue_file'] != '') & (issue_contents_df['body'] != '')]
issue_contents_df = issue_contents_df.reset_index(drop=True)
issue_contents_df = issue_contents_df.reset_index(drop=True)

issue_contents_df.head()

Unnamed: 0,repo,owner,issue_file,title,body,name
0,facebook/react,facebook,bug_report.md,Bug:,<!--\n Please provide a clear and concise des...,🐛 Bug Report
1,getify/You-Dont-Know-JS,getify,content-question.md,,"**Yes, I promise I've read the [Contributions ...",Content Question
2,getify/You-Dont-Know-JS,getify,foreign-translation-request.md,,"Foreign translations are appreciated. However,...",Foreign Translation Request
3,getify/You-Dont-Know-JS,getify,report-technical-mistake.md,,"**Yes, I promise I've read the [Contributions ...",Report Technical Mistake
4,getify/You-Dont-Know-JS,getify,textual-grammar-typo.md,,"**Yes, I promise I've read the [Contributions ...",Textual/Grammar Typo


In [18]:
# print same repo and owner but different issue_file
different_issue_file = issue_contents_df[issue_contents_df.duplicated(subset=['repo', 'owner'], keep=False)]
print(different_issue_file)


                         repo      owner                      issue_file  \
0              facebook/react   facebook                   bug_report.md   
1     getify/You-Dont-Know-JS     getify             content-question.md   
2     getify/You-Dont-Know-JS     getify  foreign-translation-request.md   
3     getify/You-Dont-Know-JS     getify     report-technical-mistake.md   
4     getify/You-Dont-Know-JS     getify         textual-grammar-typo.md   
...                       ...        ...                             ...   
6569              forem_forem      forem                   bug_report.md   
6570         go-kratos_kratos  go-kratos                   bug-report.md   
6571         go-kratos_kratos  go-kratos              feature-request.md   
6572         go-kratos_kratos  go-kratos                     proposal.md   
6573         go-kratos_kratos  go-kratos                     question.md   

           title                                               body  \
0          Bug: 

In [19]:

pr_contents_df = pd.read_csv('individual_pr_contents.csv')

pr_contents_df = pr_contents_df.dropna(subset=['pr_file', 'body'], how='all')
pr_contents_df = pr_contents_df[(pr_contents_df['pr_file'] != '') & (pr_contents_df['body'] != '')]
pr_contents_df = pr_contents_df.reset_index(drop=True)
pr_contents_df = pr_contents_df.reset_index(drop=True)

pr_contents_df.head()


Unnamed: 0,repo,owner,pr_file,body
0,facebook/react,facebook,PULL_REQUEST_TEMPLATE.md,<!--\n Thanks for submitting a pull request!\...
1,getify/You-Dont-Know-JS,getify,PULL_REQUEST_TEMPLATE.md,"**Yes, I promise I've read the [Contributions ..."
2,vinta/awesome-python,vinta,PULL_REQUEST_TEMPLATE.md,## What is this Python project?\n\nDescribe fe...
3,flutter/flutter,flutter,PULL_REQUEST_TEMPLATE.md,*Replace this paragraph with a description of ...
4,microsoft/vscode,microsoft,pull_request_template.md,<!-- Thank you for submitting a Pull Request. ...


In [20]:


different_pr_file = pr_contents_df[pr_contents_df.duplicated(subset=['repo', 'owner'], keep=False)]


# print the owner and repo and pr_file
print(different_pr_file[['owner', 'repo', 'pr_file']])

                      owner                                         repo  \
0                  facebook                               facebook/react   
2                     vinta                         vinta/awesome-python   
4                 microsoft                             microsoft/vscode   
6                  ytdl-org                          ytdl-org/youtube-dl   
8                    golang                                    golang/go   
...                     ...                                          ...   
2511               markedjs                              markedjs_marked   
2526         ianstormtaylor                         ianstormtaylor_slate   
2528                 mobxjs                                  mobxjs_mobx   
2554  react-native-elements  react-native-elements_react-native-elements   
2561                 doczjs                                  doczjs_docz   

                       pr_file  
0     PULL_REQUEST_TEMPLATE.md  
2     PULL_REQUEST_TE