In [6]:
import os
import time

import pandas as pd
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport

In [None]:
# Read the GitHub token from the environment so the credential never has to be
# written into (and committed with) the notebook. An unset or empty variable
# falls back to the original placeholder value.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN') or 'TODO_TOKEN'
if GITHUB_TOKEN == 'TODO_TOKEN':
    print("⚠️ GITHUB_TOKEN environment variable is not set; using placeholder token.")

In [8]:
# HTTP transport for GitHub's GraphQL endpoint, authenticated with GITHUB_TOKEN.
transport = RequestsHTTPTransport(
    url='https://api.github.com/graphql',
    headers={'Authorization': f'bearer {GITHUB_TOKEN}'},
    retries=3,
    # Without a timeout a stalled connection would block the notebook forever.
    timeout=30,
)

# Shared client used by every query in this notebook.
# NOTE(review): fetch_schema_from_transport=True presumably triggers a schema
# introspection request on first use — confirm the extra round-trip is wanted.
client = Client(transport=transport, fetch_schema_from_transport=True)

In [9]:
def fetch_basic_repo_info(limit=300, max_retries=5, retry_delay=10):
    """Fetch basic metadata for highly starred GitHub repositories.

    Pages through the GraphQL ``search`` connection (search string
    ``stars:>1000  sort:stars``) using the module-level ``client`` until
    ``limit`` repositories have been collected or no further page exists.

    Args:
        limit: Maximum number of repositories to return.
        max_retries: Number of consecutive failed requests tolerated before
            giving up.
        retry_delay: Seconds to sleep after a failed request before retrying.

    Returns:
        pandas.DataFrame with at most ``limit`` rows, one per repository,
        holding the fields selected in the query (name, owner,
        stargazerCount, createdAt, updatedAt, url).

    Raises:
        RuntimeError: If ``max_retries`` consecutive requests fail. The
            original exception is attached as ``__cause__``.
    """
    # Parse the document once instead of on every page.
    query = gql("""
    query ($cursor: String, $pageSize: Int!) {
      search(query: "stars:>1000  sort:stars", type: REPOSITORY, first: $pageSize, after: $cursor) {
        pageInfo {
          hasNextPage
          endCursor
        }
        nodes {
          ... on Repository {
            name
            owner { login }
            stargazerCount
            createdAt
            updatedAt
            url
          }
        }
      }
    }
    """)

    page_cap = 100  # page size the original query always requested
    repos = []
    cursor = None
    consecutive_failures = 0

    while len(repos) < limit:
        # Only ask for what is still missing so `limit` is honoured exactly.
        page_size = min(page_cap, limit - len(repos))
        print(f"🔄 Fetching repositories {len(repos) + 1} to {len(repos) + page_size}...")
        variables = {"cursor": cursor, "pageSize": page_size}

        try:
            response = client.execute(query, variable_values=variables)
            search = response["search"]
        except Exception as e:
            consecutive_failures += 1
            print(f"⚠️ Error fetching repositories: {e}")
            # A persistent failure (e.g. an invalid token) must not loop forever.
            if consecutive_failures >= max_retries:
                raise RuntimeError(
                    f"Giving up after {consecutive_failures} consecutive failed requests"
                ) from e
            time.sleep(retry_delay)
            continue

        consecutive_failures = 0
        # Drop null/empty nodes so they do not become all-NaN rows.
        repos.extend(node for node in search["nodes"] if node)

        page_info = search["pageInfo"]
        if not page_info["hasNextPage"]:
            break
        cursor = page_info["endCursor"]

    return pd.DataFrame(repos[:limit])

In [10]:
# Basic metadata for the repositories returned by the star-sorted search.
df_repos = fetch_basic_repo_info(limit=300)

🔄 Fetching repositories 1 to 100...
🔄 Fetching repositories 101 to 200...
🔄 Fetching repositories 201 to 300...


In [11]:
def fetch_extra_repo_info(repo_owner, repo_name):
    """Look up the merged and closed pull-request totals of one repository.

    Sends a GraphQL query through the module-level ``client``. Any exception
    raised while querying or reading the response is caught and reported
    with ``print``; in that case both counts are returned as ``None``.

    Args:
        repo_owner: Value passed as the ``$repoOwner`` query variable.
        repo_name: Value passed as the ``$repoName`` query variable.

    Returns:
        dict with the keys ``"mergedPRs"`` and ``"closedPRs"``, each holding
        the corresponding ``totalCount`` or ``None`` on failure.
    """
    pr_aliases = ("mergedPRs", "closedPRs")
    pr_count_query = """
    query ($repoOwner: String!, $repoName: String!) {
      repository(owner: $repoOwner, name: $repoName) {
        mergedPRs: pullRequests(states: MERGED) {
          totalCount
        }
        closedPRs: pullRequests(states: CLOSED) {
          totalCount
        }
      }
    }
    """
    params = dict(repoOwner=repo_owner, repoName=repo_name)

    try:
        result = client.execute(gql(pr_count_query), variable_values=params)
        repository = result['repository']

        # Logged before the counts are read, so this line appears even when
        # the lookup below fails.
        print(f"Fetched PRs for {repo_owner}/{repo_name}")

        return {alias: repository[alias]['totalCount'] for alias in pr_aliases}
    except Exception as e:
        print(f"⚠️ Error fetching PRs for {repo_owner}/{repo_name}: {e}")
        return dict.fromkeys(pr_aliases)


In [12]:
# One PR-count lookup per repository, in the same row order as df_repos.
df_extra = pd.DataFrame([
    fetch_extra_repo_info(record['owner']['login'], record['name'])
    for record in df_repos.to_dict('records')
])

# Both frames carry a fresh 0..n-1 index, so the column-wise concat lines
# each repository up with its own PR counts.
df_final = pd.concat([df_repos.reset_index(drop=True), df_extra], axis=1)


Fetched PRs for freeCodeCamp/freeCodeCamp
Fetched PRs for codecrafters-io/build-your-own-x
Fetched PRs for sindresorhus/awesome
Fetched PRs for EbookFoundation/free-programming-books
Fetched PRs for public-apis/public-apis
Fetched PRs for jwasham/coding-interview-university
Fetched PRs for kamranahmedse/developer-roadmap
Fetched PRs for donnemartin/system-design-primer
Fetched PRs for 996icu/996.ICU
Fetched PRs for vinta/awesome-python
Fetched PRs for facebook/react
Fetched PRs for awesome-selfhosted/awesome-selfhosted
Fetched PRs for practical-tutorials/project-based-learning
Fetched PRs for vuejs/vue
Fetched PRs for TheAlgorithms/Python
Fetched PRs for torvalds/linux
Fetched PRs for trekhleb/javascript-algorithms
Fetched PRs for tensorflow/tensorflow
Fetched PRs for getify/You-Dont-Know-JS
Fetched PRs for CyC2018/CS-Notes
Fetched PRs for ossu/computer-science
Fetched PRs for ohmyzsh/ohmyzsh
Fetched PRs for Significant-Gravitas/AutoGPT
Fetched PRs for twbs/bootstrap
Fetched PRs for mi

In [13]:
# Combine the two PR counters into a single column on df_final.
df_final["totalPRs"] = df_final["mergedPRs"].add(df_final["closedPRs"])

# Keep repositories with at least 200 PRs, order them by star count
# (highest first) and export the first 200 rows.
df_filtered = df_final.loc[df_final["totalPRs"].ge(200)]
df_sorted = df_filtered.sort_values("stargazerCount", ascending=False)
df_top_200 = df_sorted.iloc[:200]
df_top_200.to_csv("filtered_top_200_repos.csv", index=False)
