In [None]:
!pip install gql requests



In [None]:
import json
import os
import time
from getpass import getpass

import pandas as pd
import requests

In [None]:
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport

# The GitHub personal access token is read from the GITHUB_TOKEN environment
# variable so that no secret is ever stored in (or committed with) the
# notebook. When the variable is unset or empty, fall back to a hidden
# interactive prompt.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN") or getpass("GitHub personal access token: ")
if not GITHUB_TOKEN:
    raise RuntimeError("A GitHub token is required to query the GitHub GraphQL API.")

# HTTP transport for the GitHub GraphQL endpoint: every request carries the
# token as a Bearer credential and is bounded by a 60-second timeout.
transport = RequestsHTTPTransport(
    url="https://api.github.com/graphql",
    headers={"Authorization": f"Bearer {GITHUB_TOKEN}"},
    use_json=True,
    timeout=60,
)

# GraphQL client shared by all query cells below. The schema is fetched
# through the transport (fetch_schema_from_transport=True).
client = Client(transport=transport, fetch_schema_from_transport=True)


In [None]:
# GraphQL search query used to page through repositories matching "stars:>1".
# Each request returns up to 100 repositories and resumes after the $cursor
# variable; pageInfo exposes hasNextPage / endCursor for the next request.
# Per repository it selects: name, owner login, createdAt, updatedAt, the
# primary language name and the total number of MERGED pull requests.
# NOTE(review): the search string sets no explicit sort order - confirm that
# the default ordering returned by the API is the one intended.
query_basic_info = """
query ($cursor: String) {
  search(query: "stars:>1", type: REPOSITORY, first: 100, after: $cursor) {
    pageInfo {
      hasNextPage
      endCursor
    }
    nodes {
      ... on Repository {
        name
        owner {
          login
        }
        createdAt
        updatedAt
        primaryLanguage {
          name
        }
        pullRequests(states: MERGED) {
          totalCount
        }
      }
    }
  }
}
"""

In [None]:
# Collect basic information for the target number of repositories by walking
# the paginated search results one page (100 repositories) at a time.
#
# Results: `repos` (list of repository dicts), `cursor` (last page cursor
# used) and `total_repos` (len(repos)).
# Raises RuntimeError when the same page fails too many times in a row, so a
# permanent problem (invalid token, revoked access, ...) cannot make the cell
# loop forever.
TARGET_REPO_COUNT = 1000             # stop once this many repositories are collected
MAX_CONSECUTIVE_PAGE_FAILURES = 5    # give up on a page after this many failed attempts
PAGE_RETRY_DELAY_SECONDS = 10        # pause before retrying a failed page request

repos = []
cursor = None
total_repos = 0
consecutive_failures = 0

# Parse the GraphQL document once instead of on every page request.
basic_info_document = gql(query_basic_info)

while total_repos < TARGET_REPO_COUNT:
    print(f"🔄 Fetching repositories {total_repos + 1} to {total_repos + 100}...")

    variables = {"cursor": cursor}

    try:
        response = client.execute(basic_info_document, variable_values=variables)
        search_result = response["search"]
        # Skip null/empty nodes defensively so later cells can index every entry.
        repos += [node for node in search_result["nodes"] if node]
        total_repos = len(repos)
        consecutive_failures = 0

        page_info = search_result["pageInfo"]
        if not page_info["hasNextPage"]:
            break

        cursor = page_info["endCursor"]

    except Exception as e:
        consecutive_failures += 1
        print(f"⚠️ Error fetching repositories: {e}")
        if consecutive_failures >= MAX_CONSECUTIVE_PAGE_FAILURES:
            # The cursor is unchanged after a failure, so retrying again would
            # just repeat the same failing request indefinitely.
            raise RuntimeError(
                f"Giving up after {consecutive_failures} consecutive failed requests "
                f"({total_repos} repositories collected so far)."
            ) from e
        time.sleep(PAGE_RETRY_DELAY_SECONDS)
        continue

print(f"✅ Repository collection complete! Total: {total_repos}")

🔄 Fetching repositories 1 to 100...
🔄 Fetching repositories 101 to 200...
🔄 Fetching repositories 201 to 300...
🔄 Fetching repositories 301 to 400...
🔄 Fetching repositories 401 to 500...
🔄 Fetching repositories 501 to 600...
🔄 Fetching repositories 601 to 700...
🔄 Fetching repositories 701 to 800...
🔄 Fetching repositories 801 to 900...
🔄 Fetching repositories 901 to 1000...
✅ Repository collection complete! Total: 1000


In [None]:
# GraphQL query for a single repository identified by $repoOwner / $repoName.
# Only the totalCount of each connection is selected: releases, all issues,
# closed issues (aliased as closedIssues) and stargazers.
query_extra_info = """
query ($repoOwner: String!, $repoName: String!) {
  repository(owner: $repoOwner, name: $repoName) {
    releases(first: 1) {
      totalCount
    }
    issues(first: 1) {
      totalCount
    }
    closedIssues: issues(states: CLOSED, first: 1) {
      totalCount
    }
    stargazers(first: 1) {
      totalCount
    }
  }
}
"""

In [None]:
# Enrich every collected repository dict, in place, with the counts returned
# by `query_extra_info` (releases, issues, closedIssues, stargazers).
#
# Each repository gets a bounded number of attempts. A repository that still
# fails is reported and left without the extra keys (best effort), so one bad
# repository never aborts the whole collection.
MAX_ATTEMPTS_PER_REPO = 3          # attempts before moving on to the next repository
DETAIL_RETRY_DELAY_SECONDS = 10    # pause after a failed request before the next one

# Parse the GraphQL document once instead of once per repository.
extra_info_document = gql(query_extra_info)

for repo in repos:
    repo_owner = repo["owner"]["login"]
    repo_name = repo["name"]

    print(f"🔄 Fetching extra details for {repo_owner}/{repo_name}...")

    variables = {"repoOwner": repo_owner, "repoName": repo_name}

    for attempt in range(1, MAX_ATTEMPTS_PER_REPO + 1):
        try:
            extra_data = client.execute(extra_info_document, variable_values=variables)
        except Exception as e:
            print(f"⚠️ Error fetching details for {repo_owner}/{repo_name}: {e}")
            # Back off before the next request, whether it is a retry or the next repository.
            time.sleep(DETAIL_RETRY_DELAY_SECONDS)
            continue

        repository = extra_data.get("repository")
        if repository is None:
            # A null result will not change on retry, so report it and move on
            # instead of sleeping and trying again.
            print(f"⚠️ Error fetching details for {repo_owner}/{repo_name}: repository not found")
        else:
            repo.update(repository)
        break

print("✅ Extra data collection complete!")

🔄 Fetching extra details for freeCodeCamp/freeCodeCamp...
🔄 Fetching extra details for EbookFoundation/free-programming-books...
🔄 Fetching extra details for sindresorhus/awesome...
🔄 Fetching extra details for codecrafters-io/build-your-own-x...
🔄 Fetching extra details for public-apis/public-apis...
🔄 Fetching extra details for jwasham/coding-interview-university...
🔄 Fetching extra details for kamranahmedse/developer-roadmap...
🔄 Fetching extra details for donnemartin/system-design-primer...
🔄 Fetching extra details for 996icu/996.ICU...
🔄 Fetching extra details for vinta/awesome-python...
🔄 Fetching extra details for facebook/react...
🔄 Fetching extra details for practical-tutorials/project-based-learning...
🔄 Fetching extra details for awesome-selfhosted/awesome-selfhosted...
🔄 Fetching extra details for vuejs/vue...
🔄 Fetching extra details for TheAlgorithms/Python...
🔄 Fetching extra details for trekhleb/javascript-algorithms...
🔄 Fetching extra details for torvalds/linux...
🔄 F

In [None]:
# Build a table with one row per collected repository dict and preview the
# first five rows.
df = pd.DataFrame(data=repos)
df.head(n=5)

In [None]:
# Flatten the nested GraphQL values in `df` into plain scalars.
#
# Every conversion keeps values that are already flattened, so the cell can be
# re-run safely: previously a second run replaced every language with None and
# every count with 0, because the values were no longer dicts.


def _language_name(value):
    """Return the language name from a ``{"name": ...}`` dict, an already-extracted name unchanged, or None when there is no language."""
    if isinstance(value, dict):
        return value["name"]
    if isinstance(value, str):
        return value
    return None


def _total_count(value):
    """Return ``totalCount`` from a connection dict, an already-extracted number as an int, or 0 when the value is missing."""
    if isinstance(value, dict):
        return value["totalCount"]
    if pd.api.types.is_number(value) and not pd.isna(value):
        return int(value)
    return 0


# Reduce the 'owner' column to just the login.
df["owner"] = df["owner"].apply(lambda x: x["login"] if isinstance(x, dict) else x)

# Reduce the 'primaryLanguage' column to just the language name.
df["primaryLanguage"] = df["primaryLanguage"].apply(_language_name)

# Reduce the columns holding {'totalCount': n} dicts to the number itself.
columns_to_fix = ["pullRequests", "releases", "issues", "closedIssues", "stargazers"]
for col in columns_to_fix:
    df[col] = df[col].apply(_total_count)

df


In [None]:
# Persist the flattened table as a CSV file, leaving out the DataFrame index.
df.to_csv(path_or_buf="formatted_github_data.csv", index=False)