In [2]:
!pip install gql requests

Collecting gql
  Downloading gql-3.5.2-py2.py3-none-any.whl.metadata (9.4 kB)
Collecting graphql-core<3.2.5,>=3.2 (from gql)
  Downloading graphql_core-3.2.4-py3-none-any.whl.metadata (10 kB)
Collecting backoff<3.0,>=1.11.1 (from gql)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Downloading gql-3.5.2-py2.py3-none-any.whl (74 kB)
   ---------------------------------------- 0.0/74.3 kB ? eta -:--:--
   ---------------------- ----------------- 41.0/74.3 kB 2.0 MB/s eta 0:00:01
   ---------------------------------------- 74.3/74.3 kB 2.1 MB/s eta 0:00:00
Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Downloading graphql_core-3.2.4-py3-none-any.whl (203 kB)
   ---------------------------------------- 0.0/203.2 kB ? eta -:--:--
   ------------------ --------------------- 92.2/203.2 kB 2.6 MB/s eta 0:00:01
   ---------------------------------------- 203.2/203.2 kB 2.5 MB/s eta 0:00:00
Installing collected packages: graphql-core, backoff, gql
Successfully installed backoff

In [3]:
import getpass
import os
import time

import pandas as pd
import requests
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport

In [None]:
# Read the GitHub personal access token from the environment so the secret is
# never stored in the notebook; prompt for it interactively when it is not set.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN") or getpass.getpass("GitHub token: ")

# HTTP transport for GitHub's GraphQL endpoint. The token is sent as a Bearer
# credential in the Authorization header; timeout=60 is applied to each request.
transport = RequestsHTTPTransport(
    url="https://api.github.com/graphql",
    headers={"Authorization": f"Bearer {GITHUB_TOKEN}"},
    use_json=True,
    timeout=60
)

In [5]:
# GraphQL client shared by every query in this notebook. It sends requests
# through `transport` and is asked to fetch the schema from that transport.
client = Client(
    fetch_schema_from_transport=True,
    transport=transport,
)

In [6]:
# Basic query used to collect the initial repository information
def fetch_basic_repo_info(max_repos=1000, max_retries=5, retry_delay=10):
    """Collect basic data for Java repositories from GitHub's GraphQL search.

    Pages through the search ``language:Java sort:stars`` 100 results at a
    time using the module-level ``client``, following ``endCursor`` until
    enough repositories were collected or no further page exists.

    Args:
        max_repos: Maximum number of repositories to return (default 1000).
        max_retries: Number of consecutive failed requests tolerated before
            giving up and returning whatever has been collected so far.
        retry_delay: Seconds to wait before retrying a failed request.

    Returns:
        pandas.DataFrame with one row per repository (at most ``max_repos``)
        holding the fields requested by the query: ``name``, ``owner`` (a dict
        with ``login``), ``createdAt`` and ``updatedAt``. The frame is empty
        when nothing could be fetched.
    """
    # Parse the document once instead of once per page.
    query = gql("""
    query ($cursor: String) {
      search(query: "language:Java sort:stars", type: REPOSITORY, first: 100, after: $cursor) {
        pageInfo {
          hasNextPage
          endCursor
        }
        nodes {
          ... on Repository {
            name
            owner { login }
            createdAt
            updatedAt
          }
        }
      }
    }
    """)

    repos = []
    cursor = None
    failures = 0  # consecutive failed requests for the current page

    while len(repos) < max_repos:
        total_repos = len(repos)
        print(f"🔄 Fetching repositories {total_repos + 1} to {total_repos + 100}...")

        try:
            response = client.execute(query, variable_values={"cursor": cursor})
            # Unpack everything before touching `repos`, so a malformed
            # response is retried without appending the same page twice.
            search = response["search"]
            nodes = search["nodes"]
            has_next_page = search["pageInfo"]["hasNextPage"]
            end_cursor = search["pageInfo"]["endCursor"]
        except Exception as e:
            failures += 1
            print(f"⚠️ Error fetching repositories: {e}")
            if failures >= max_retries:
                # Bounded retries: a persistent error (e.g. an invalid token)
                # must not keep the loop spinning forever.
                print(f"⚠️ Giving up after {failures} consecutive failures; "
                      f"returning {len(repos)} repositories.")
                break
            time.sleep(retry_delay)
            continue

        failures = 0
        repos.extend(nodes)

        if not has_next_page:
            break
        cursor = end_cursor

    return pd.DataFrame(repos[:max_repos])

In [7]:
def fetch_extra_repo_info(repo_owner, repo_name):
    """Look up the star and release counts of a single repository.

    Runs one GraphQL query through the module-level ``client``.

    Args:
        repo_owner: Login of the repository owner.
        repo_name: Name of the repository.

    Returns:
        dict with the keys ``"stargazers"`` and ``"releases"`` holding the
        respective ``totalCount`` values. If the request or the unpacking of
        its response raises, the error is printed and both counts are 0.
    """
    details_query = """
    query ($repoOwner: String!, $repoName: String!) {
      repository(owner: $repoOwner, name: $repoName) {
        stargazers { totalCount }
        releases { totalCount }
      }
    }
    """

    try:
        repository = client.execute(
            gql(details_query),
            variable_values={"repoOwner": repo_owner, "repoName": repo_name},
        )["repository"]
        counts = {
            field: repository[field]["totalCount"]
            for field in ("stargazers", "releases")
        }
    except Exception as e:
        # Best effort: report the failure and fall back to zero counts.
        print(f"⚠️ Error fetching details for {repo_owner}/{repo_name}: {e}")
        return {"stargazers": 0, "releases": 0}

    return counts


In [8]:
# Collect the basic repository data (name, owner, creation/update timestamps)
df = fetch_basic_repo_info()

🔄 Fetching repositories 1 to 100...
🔄 Fetching repositories 101 to 200...
🔄 Fetching repositories 201 to 300...
🔄 Fetching repositories 301 to 400...
🔄 Fetching repositories 401 to 500...
🔄 Fetching repositories 501 to 600...
🔄 Fetching repositories 601 to 700...
🔄 Fetching repositories 701 to 800...
🔄 Fetching repositories 801 to 900...
🔄 Fetching repositories 901 to 1000...


In [9]:
# Fetch star and release counts for every collected repository, one request
# per row; results are appended in row order.
df_extra = []
for _, repo in df.iterrows():
    owner_login, repo_name = repo['owner']['login'], repo['name']
    df_extra.append(fetch_extra_repo_info(owner_login, repo_name))

KeyboardInterrupt: 

In [None]:
# Turn the per-repository detail records into a frame and attach them to the
# basic data column-wise. Any earlier copy of these columns is dropped first,
# so re-running this cell does not duplicate "stargazers"/"releases".
df_extra = pd.DataFrame(df_extra)
df = pd.concat([df.drop(columns=df_extra.columns, errors="ignore"), df_extra], axis=1)

In [None]:
# Save the combined data to CSV; index=False leaves the row index out of the file
df.to_csv("top_1000_java_repos.csv", index=False)
print("✅ CSV file saved!")