In [1]:
!pip install gql requests

Collecting gql
  Downloading gql-3.5.0-py2.py3-none-any.whl.metadata (9.2 kB)
Collecting graphql-core<3.3,>=3.2 (from gql)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Collecting backoff<3.0,>=1.11.1 (from gql)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Downloading gql-3.5.0-py2.py3-none-any.whl (74 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.0/74.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading backoff-2.2.1-py3-none-any.whl (15 kB)
Downloading graphql_core-3.2.6-py3-none-any.whl (203 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.4/203.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: graphql-core, backoff, gql
Successfully installed backoff-2.2.1 gql-3.5.0 graphql-core-3.2.6


In [2]:
import json
import os
import time

import pandas as pd
import requests

In [3]:
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport

# Read the GitHub token from the GITHUB_TOKEN environment variable so the real
# secret never has to be typed into (or committed with) the notebook.
# When the variable is not set, the original placeholder is kept as the value;
# presumably GitHub will reject requests made with it — set the variable first.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "ADICIONARTOKEN")

# HTTP transport pointing at GitHub's GraphQL endpoint; the token is sent as a
# Bearer credential in the Authorization header of every request.
transport = RequestsHTTPTransport(
    url="https://api.github.com/graphql",
    headers={"Authorization": f"Bearer {GITHUB_TOKEN}"},
    use_json=True,
    timeout=60,
)

# Shared client used by every query cell below.
# NOTE(review): fetch_schema_from_transport=True should make gql fetch the
# remote schema for local query validation — confirm against the gql docs.
client = Client(transport=transport, fetch_schema_from_transport=True)


In [4]:
# GraphQL search query for repositories matching "stars:>1", 100 per page.
# `$cursor` is passed as `after:` to request the page following a previous
# `endCursor`; `pageInfo` is returned so the caller can keep paginating.
# Per repository it selects: name, owner login, createdAt, updatedAt,
# primary language name, and the total count of MERGED pull requests.
query_basic_info = """
query ($cursor: String) {
  search(query: "stars:>1", type: REPOSITORY, first: 100, after: $cursor) {
    pageInfo {
      hasNextPage
      endCursor
    }
    nodes {
      ... on Repository {
        name
        owner {
          login
        }
        createdAt
        updatedAt
        primaryLanguage {
          name
        }
        pullRequests(states: MERGED) {
          totalCount
        }
      }
    }
  }
}
"""

In [5]:
# Page through the repository search until TARGET_REPO_COUNT results are
# collected or the API reports that there are no more pages.
TARGET_REPO_COUNT = 100      # total number of repositories to collect
PAGE_SIZE = 100              # must match `first:` in query_basic_info
MAX_CONSECUTIVE_ERRORS = 5   # give up instead of retrying forever
RETRY_DELAY_SECONDS = 10     # pause before retrying a failed request

# Parse the query once rather than on every loop iteration.
basic_info_document = gql(query_basic_info)

repos = []
cursor = None                # None requests the first page
total_repos = 0
consecutive_errors = 0

while total_repos < TARGET_REPO_COUNT:
    print(f"🔄 Fetching repositories {total_repos + 1} to {total_repos + PAGE_SIZE}...")

    try:
        response = client.execute(
            basic_info_document, variable_values={"cursor": cursor}
        )
        # Pull everything needed out of the response *before* mutating `repos`,
        # so a malformed response cannot leave a half-applied page behind that
        # would then be appended a second time on retry.
        search_result = response["search"]
        page_nodes = search_result["nodes"]
        has_next_page = search_result["pageInfo"]["hasNextPage"]
        next_cursor = search_result["pageInfo"]["endCursor"]
    except Exception as e:
        consecutive_errors += 1
        print(f"⚠️ Error fetching repositories: {e}")
        if consecutive_errors >= MAX_CONSECUTIVE_ERRORS:
            # A persistent failure (e.g. invalid token) would otherwise loop forever.
            raise RuntimeError(
                f"Giving up after {consecutive_errors} consecutive failed requests"
            ) from e
        time.sleep(RETRY_DELAY_SECONDS)
        continue

    consecutive_errors = 0
    repos += page_nodes
    total_repos = len(repos)

    if not has_next_page:
        break

    cursor = next_cursor

print(f"✅ Repository collection complete! Total: {total_repos}")

🔄 Fetching repositories 1 to 100...
✅ Repository collection complete! Total: 100


In [6]:
# GraphQL query for one repository, identified by `$repoOwner` / `$repoName`.
# It selects only `totalCount` for: releases, all issues, issues in state
# CLOSED (aliased as `closedIssues`), and stargazers.
query_extra_info = """
query ($repoOwner: String!, $repoName: String!) {
  repository(owner: $repoOwner, name: $repoName) {
    releases(first: 1) {
      totalCount
    }
    issues(first: 1) {
      totalCount
    }
    closedIssues: issues(states: CLOSED, first: 1) {
      totalCount
    }
    stargazers(first: 1) {
      totalCount
    }
  }
}
"""

In [7]:
# Enrich every collected repository (in place) with the counts returned by
# query_extra_info: releases, issues, closedIssues and stargazers.
EXTRA_INFO_MAX_ATTEMPTS = 3    # tries per repository before moving on
EXTRA_INFO_RETRY_DELAY = 10    # seconds to wait between attempts

# Parse the query once rather than once per repository.
extra_info_document = gql(query_extra_info)

for repo in repos:
    repo_owner = repo["owner"]["login"]
    repo_name = repo["name"]

    print(f"🔄 Fetching extra details for {repo_owner}/{repo_name}...")

    variables = {"repoOwner": repo_owner, "repoName": repo_name}

    for attempt in range(1, EXTRA_INFO_MAX_ATTEMPTS + 1):
        try:
            extra_data = client.execute(extra_info_document, variable_values=variables)
        except Exception as e:
            print(f"⚠️ Error fetching details for {repo_owner}/{repo_name}: {e}")
            # Only wait when another attempt will actually follow; after the
            # last attempt the repository is left without the extra fields.
            if attempt < EXTRA_INFO_MAX_ATTEMPTS:
                time.sleep(EXTRA_INFO_RETRY_DELAY)
            continue

        # `repository` may be null in the payload; merge nothing in that case
        # instead of failing inside dict.update.
        repo.update(extra_data["repository"] or {})
        break

print("✅ Extra data collection complete!")

🔄 Fetching extra details for freeCodeCamp/freeCodeCamp...
🔄 Fetching extra details for EbookFoundation/free-programming-books...
🔄 Fetching extra details for sindresorhus/awesome...
🔄 Fetching extra details for codecrafters-io/build-your-own-x...
🔄 Fetching extra details for public-apis/public-apis...
🔄 Fetching extra details for jwasham/coding-interview-university...
🔄 Fetching extra details for kamranahmedse/developer-roadmap...
🔄 Fetching extra details for donnemartin/system-design-primer...
🔄 Fetching extra details for 996icu/996.ICU...
🔄 Fetching extra details for vinta/awesome-python...
🔄 Fetching extra details for facebook/react...
🔄 Fetching extra details for practical-tutorials/project-based-learning...
🔄 Fetching extra details for awesome-selfhosted/awesome-selfhosted...
🔄 Fetching extra details for vuejs/vue...
🔄 Fetching extra details for TheAlgorithms/Python...
🔄 Fetching extra details for trekhleb/javascript-algorithms...
🔄 Fetching extra details for tensorflow/tensorflow

In [8]:
# Build one row per collected repository; nested GraphQL objects stay as dicts.
df = pd.DataFrame(data=repos)

# Preview the first five rows.
df.head(n=5)


Unnamed: 0,name,owner,createdAt,updatedAt,primaryLanguage,pullRequests,releases,issues,closedIssues,stargazers
0,freeCodeCamp,{'login': 'freeCodeCamp'},2014-12-24T17:49:19Z,2025-02-16T23:36:00Z,{'name': 'TypeScript'},{'totalCount': 24325},{'totalCount': 0},{'totalCount': 18893},{'totalCount': 18727},{'totalCount': 410077}
1,free-programming-books,{'login': 'EbookFoundation'},2013-10-11T06:50:37Z,2025-02-17T00:20:07Z,{'name': 'HTML'},{'totalCount': 6787},{'totalCount': 0},{'totalCount': 1159},{'totalCount': 1130},{'totalCount': 350510}
2,awesome,{'login': 'sindresorhus'},2014-07-11T13:42:37Z,2025-02-17T00:21:27Z,,{'totalCount': 661},{'totalCount': 0},{'totalCount': 347},{'totalCount': 335},{'totalCount': 347292}
3,build-your-own-x,{'login': 'codecrafters-io'},2018-05-09T12:03:18Z,2025-02-17T00:33:42Z,{'name': 'Markdown'},{'totalCount': 140},{'totalCount': 0},{'totalCount': 726},{'totalCount': 536},{'totalCount': 335323}
4,public-apis,{'login': 'public-apis'},2016-03-20T23:49:42Z,2025-02-17T00:22:10Z,{'name': 'Python'},{'totalCount': 1872},{'totalCount': 0},{'totalCount': 651},{'totalCount': 603},{'totalCount': 327653}
