In [31]:
import os
import time

import pandas as pd
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport

In [None]:
# GitHub personal access token used for the Authorization header.
# Read it from the GITHUB_TOKEN environment variable so the secret never has to
# be pasted into (and committed with) the notebook; when the variable is not
# set, the original 'TOKEN' placeholder is kept.
# TODO: export GITHUB_TOKEN with a real token before running the notebook.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', 'TOKEN')

In [33]:
# HTTP transport pointed at the GitHub GraphQL endpoint. Every request carries
# the token from GITHUB_TOKEN as a bearer Authorization header; `retries=3` is
# handed to the transport (see the gql docs for the exact retry semantics).
transport = RequestsHTTPTransport(
    retries=3,
    headers={'Authorization': f'bearer {GITHUB_TOKEN}'},
    url='https://api.github.com/graphql',
)

# Shared GraphQL client used by the fetch_* helpers below in this notebook.
# `fetch_schema_from_transport=True` asks gql to obtain the schema through the
# transport rather than from a local definition.
client = Client(
    fetch_schema_from_transport=True,
    transport=transport,
)

In [None]:
def fetch_basic_repo_info(limit=300, max_retries=5, retry_delay=10):
    """Fetch basic metadata for the most-starred GitHub repositories.

    Pages through the GraphQL `search` connection (repositories matching
    ``stars:>1000  sort:stars``, 100 per page) using the module-level
    ``client`` until ``limit`` repositories have been collected or the API
    reports there is no next page.

    Args:
        limit: Maximum number of repositories to return. The result never
            contains more than ``limit`` rows, even when ``limit`` is not a
            multiple of the page size.
        max_retries: Number of consecutive failed requests tolerated for a
            page before giving up.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        pandas.DataFrame with one row per repository and the columns requested
        in the query (``name``, ``owner``, ``stargazerCount``, ``createdAt``,
        ``updatedAt``, ``url``). ``owner`` holds the nested ``{'login': ...}``
        mapping as returned by the API.

    Raises:
        RuntimeError: When ``max_retries`` consecutive requests fail; the last
            underlying exception is chained as the cause.
    """
    query = """
    query ($cursor: String) {
      search(query: "stars:>1000  sort:stars", type: REPOSITORY, first: 100, after: $cursor) {
        pageInfo {
          hasNextPage
          endCursor
        }
        nodes {
          ... on Repository {
            name
            owner { login }
            stargazerCount
            createdAt
            updatedAt
            url
          }
        }
      }
    }
    """
    # Parse the document once instead of on every page request.
    document = gql(query)

    repos = []
    cursor = None
    failures = 0  # consecutive failures for the page currently being fetched

    while len(repos) < limit:
        first_index = len(repos) + 1
        last_index = min(len(repos) + 100, limit)
        print(f"🔄 Fetching repositories {first_index} to {last_index}...")

        try:
            response = client.execute(document, variable_values={"cursor": cursor})
            # Read everything needed from the response before mutating `repos`,
            # so a malformed response cannot leave a page appended and then be
            # appended again by the retry.
            search = response["search"]
            new_repos = search["nodes"]
            page_info = search["pageInfo"]
        except Exception as e:
            failures += 1
            print(f"⚠️ Error fetching repositories: {e}")
            if failures >= max_retries:
                # Bounded retries: a persistent error (bad token, no network)
                # must not spin forever.
                raise RuntimeError(
                    f"Giving up after {failures} consecutive failed attempts "
                    "to fetch repositories"
                ) from e
            time.sleep(retry_delay)
            continue

        failures = 0
        repos.extend(new_repos)

        if not page_info["hasNextPage"]:
            break
        cursor = page_info["endCursor"]

    # Pages arrive in chunks of up to 100; trim so `limit` is actually honoured.
    return pd.DataFrame(repos[:limit])

In [None]:
# Collect basic metadata for the most-starred repositories, passing a limit of 300.
df_repos = fetch_basic_repo_info(300)

🔄 Fetching repositories 1 to 100...
🔄 Fetching repositories 101 to 200...


In [36]:
def fetch_extra_repo_info(repo_owner, repo_name):
    """Look up pull-request counts for a single repository.

    Runs one GraphQL query through the module-level ``client`` asking for the
    ``totalCount`` of pull requests in the MERGED state and in the CLOSED
    state for ``repo_owner/repo_name``.

    Args:
        repo_owner: Login of the repository owner.
        repo_name: Repository name.

    Returns:
        dict with keys ``"mergedPRs"`` and ``"closedPRs"``. Both values are
        the counts from the response, or ``None`` for both when anything goes
        wrong (the error is printed, not raised).
    """
    query = """
    query ($repoOwner: String!, $repoName: String!) {
      repository(owner: $repoOwner, name: $repoName) {
        mergedPRs: pullRequests(states: MERGED) {
          totalCount
        }
        closedPRs: pullRequests(states: CLOSED) {
          totalCount
        }
      }
    }
    """
    # The result keys double as the aliases used inside the query.
    count_aliases = ("mergedPRs", "closedPRs")
    params = {"repoOwner": repo_owner, "repoName": repo_name}

    try:
        payload = client.execute(gql(query), variable_values=params)
        repository = payload['repository']

        # Progress line is emitted before the counts are extracted.
        print(f"Fetched PRs for {repo_owner}/{repo_name}")

        return {alias: repository[alias]['totalCount'] for alias in count_aliases}
    except Exception as err:
        # Best effort: report the failure and hand back empty counts so the
        # caller can keep going with the remaining repositories.
        print(f"⚠️ Error fetching PRs for {repo_owner}/{repo_name}: {err}")
        return dict.fromkeys(count_aliases)


In [37]:
# Query PR counts for each repository, walking df_repos in row order.
# `owner` holds the nested {'login': ...} mapping returned by the search query.
pr_count_records = [
    fetch_extra_repo_info(owner['login'], repo_name)
    for owner, repo_name in zip(df_repos['owner'], df_repos['name'])
]

# One row per repository with the mergedPRs / closedPRs columns.
df_extra = pd.DataFrame(pr_count_records)

# Column-wise concat aligns on the index, so df_repos is re-indexed 0..n-1 to
# line up with the freshly built df_extra.
df_final = pd.concat([df_repos.reset_index(drop=True), df_extra], axis=1)


Fetched PRs for freeCodeCamp/freeCodeCamp
Fetched PRs for codecrafters-io/build-your-own-x
Fetched PRs for sindresorhus/awesome
Fetched PRs for EbookFoundation/free-programming-books
Fetched PRs for public-apis/public-apis
Fetched PRs for jwasham/coding-interview-university
Fetched PRs for kamranahmedse/developer-roadmap
Fetched PRs for donnemartin/system-design-primer
Fetched PRs for 996icu/996.ICU
Fetched PRs for vinta/awesome-python
Fetched PRs for facebook/react
Fetched PRs for awesome-selfhosted/awesome-selfhosted
Fetched PRs for practical-tutorials/project-based-learning
Fetched PRs for vuejs/vue
Fetched PRs for TheAlgorithms/Python
Fetched PRs for torvalds/linux
Fetched PRs for trekhleb/javascript-algorithms
Fetched PRs for tensorflow/tensorflow
Fetched PRs for getify/You-Dont-Know-JS
Fetched PRs for CyC2018/CS-Notes
Fetched PRs for ossu/computer-science
Fetched PRs for ohmyzsh/ohmyzsh
Fetched PRs for Significant-Gravitas/AutoGPT
Fetched PRs for twbs/bootstrap
Fetched PRs for fl

In [39]:
# Total PRs per repository: element-wise sum of the merged and closed counts.
df_final["totalPRs"] = df_final["mergedPRs"].add(df_final["closedPRs"])
df_final

Unnamed: 0,name,owner,stargazerCount,createdAt,updatedAt,url,mergedPRs,closedPRs,totalPRs
0,freeCodeCamp,{'login': 'freeCodeCamp'},415131,2014-12-24T17:49:19Z,2025-04-04T14:35:06Z,https://github.com/freeCodeCamp/freeCodeCamp,24704,14847,39551
1,build-your-own-x,{'login': 'codecrafters-io'},367973,2018-05-09T12:03:18Z,2025-04-04T14:30:43Z,https://github.com/codecrafters-io/build-your-...,140,138,278
2,awesome,{'login': 'sindresorhus'},354598,2014-07-11T13:42:37Z,2025-04-04T14:31:57Z,https://github.com/sindresorhus/awesome,664,1408,2072
3,free-programming-books,{'login': 'EbookFoundation'},354432,2013-10-11T06:50:37Z,2025-04-04T13:48:19Z,https://github.com/EbookFoundation/free-progra...,6796,3613,10409
4,public-apis,{'login': 'public-apis'},334875,2016-03-20T23:49:42Z,2025-04-04T14:25:54Z,https://github.com/public-apis/public-apis,1872,1089,2961
...,...,...,...,...,...,...,...,...,...
195,scikit-learn,{'login': 'scikit-learn'},61649,2010-08-17T09:43:38Z,2025-04-04T13:24:57Z,https://github.com/scikit-learn/scikit-learn,13032,4982,18014
196,Apollo-11,{'login': 'chrislgarry'},61542,2014-04-03T15:45:02Z,2025-04-04T13:35:39Z,https://github.com/chrislgarry/Apollo-11,342,120,462
197,awesome-scalability,{'login': 'binhnguyennus'},61282,2017-12-27T03:46:40Z,2025-04-04T14:34:48Z,https://github.com/binhnguyennus/awesome-scala...,36,42,78
198,Java,{'login': 'TheAlgorithms'},61235,2016-07-16T10:21:02Z,2025-04-04T14:24:34Z,https://github.com/TheAlgorithms/Java,1868,2969,4837


In [40]:
# Number of repositories whose total PR count is at least 200 (inclusive bound).
how_many_over_200 = df_final["totalPRs"].ge(200).sum()
how_many_over_200

np.int64(181)

In [41]:
# Write the combined table (repository metadata plus PR counts) to CSV,
# omitting the DataFrame index column.
df_final.to_csv("top_200_repositories.csv", index=False)
# Confirmation message, in Portuguese: "Repositories saved to 'top_200_repositories.csv'".
print("✅ Repositórios salvos em 'top_200_repositories.csv'")

✅ Repositórios salvos em 'top_200_repositories.csv'
