In [None]:
import ast
import os
import time
from datetime import datetime

import pandas as pd
import requests
from tqdm import tqdm

In [None]:
# ========= CONFIGURATION =========
# The token is read from the GITHUB_TOKEN environment variable so that a real
# credential is never stored in (or committed with) the notebook.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
# Only send an Authorization header when a token is available: a placeholder
# value is rejected by the API, while anonymous requests are accepted
# (at a much lower rate limit).
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}
if not GITHUB_TOKEN:
    print("⚠️ GITHUB_TOKEN is not set; requests will be unauthenticated and heavily rate-limited.")
BASE_URL = "https://api.github.com"

In [None]:
def _parse_github_timestamp(value):
    """Convert a GitHub ISO-8601 timestamp ending in ``Z`` into an aware ``datetime``."""
    # datetime.fromisoformat() only accepts a literal "Z" suffix from Python 3.11 on.
    return datetime.fromisoformat(value.replace("Z", "+00:00"))


def _github_get(url, params=None, timeout=30):
    """Send one GET with the shared HEADERS; return the response, or None on a transport error."""
    try:
        return requests.get(url, headers=HEADERS, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"⚠️ Request failed for {url}: {exc}")
        return None


def _login_of(item):
    """Return ``item["user"]["login"]``, or None when the user is missing or null."""
    user = item.get("user") if isinstance(item, dict) else None
    return user.get("login") if user else None


def fetch_prs_for_repo(owner, repo, max_pages=1, min_hours=1, timeout=30):
    """Collect metrics for closed, reviewed pull requests of one repository.

    A pull request is kept only when it stayed open for more than
    ``min_hours`` hours and has at least one review.  Pull requests whose
    review or detail request does not return HTTP 200 are skipped silently.

    Args:
        owner: Login of the repository owner.
        repo: Repository name.
        max_pages: Maximum number of 100-item listing pages to read.
        min_hours: Pull requests open for this many hours or fewer are dropped.
        timeout: Per-request timeout in seconds.

    Returns:
        A list of dicts, one per retained pull request, with the keys
        ``repo``, ``pr_number``, ``state``, ``created_at``, ``closed_at``,
        ``time_hours``, ``title_length``, ``body_length``, ``comments``,
        ``review_comments``, ``additions``, ``deletions``, ``changed_files``,
        ``participants`` and ``author``.  ``author`` is None when the API
        reports no user for the pull request.
    """
    pr_data = []
    list_url = f"{BASE_URL}/repos/{owner}/{repo}/pulls"

    for page in range(1, max_pages + 1):
        params = {"state": "closed", "per_page": 100, "page": page}
        r = _github_get(list_url, params=params, timeout=timeout)
        if r is None:
            break
        if r.status_code != 200:
            print(f"⚠️ Error fetching PRs for {owner}/{repo}: {r.status_code}")
            break

        prs = r.json()
        if not prs:
            break

        for pr in prs:
            if not pr["closed_at"]:
                continue

            created_at = _parse_github_timestamp(pr["created_at"])
            closed_at = _parse_github_timestamp(pr["closed_at"])
            time_diff_hours = (closed_at - created_at).total_seconds() / 3600

            if time_diff_hours <= min_hours:
                continue

            # Fetch reviews.  The URL is derived from the PR's own URL; rewriting
            # "comments" -> "reviews" in the review-comments link would also
            # rewrite an owner or repository name that contains "comments".
            reviews_r = _github_get(
                pr["url"] + "/reviews", params={"per_page": 100}, timeout=timeout
            )
            if reviews_r is None or reviews_r.status_code != 200:
                continue
            reviews = reviews_r.json()
            if len(reviews) < 1:
                continue

            # Fetch the full PR for the size and discussion counters, which the
            # listing payload does not provide.
            pr_details_r = _github_get(pr["url"], timeout=timeout)
            if pr_details_r is None or pr_details_r.status_code != 200:
                continue
            pr_details = pr_details_r.json()

            # Participants.  The REST API documents no
            # /pulls/{number}/participants endpoint, so the count is derived
            # from the distinct logins of the author, the reviewers and the
            # conversation commenters (first page of each list only).
            comments_r = _github_get(
                pr["comments_url"], params={"per_page": 100}, timeout=timeout
            )
            if comments_r is not None and comments_r.status_code == 200:
                issue_comments = comments_r.json()
            else:
                issue_comments = []

            author = _login_of(pr)
            participants = {author}
            participants.update(_login_of(review) for review in reviews)
            participants.update(_login_of(comment) for comment in issue_comments)
            participants.discard(None)  # deleted accounts come back as a null user

            pr_data.append({
                "repo": f"{owner}/{repo}",
                "pr_number": pr["number"],
                "state": "merged" if pr["merged_at"] else "closed",
                "created_at": pr["created_at"],
                "closed_at": pr["closed_at"],
                "time_hours": round(time_diff_hours, 2),
                "title_length": len(pr["title"]) if pr["title"] else 0,
                "body_length": len(pr["body"]) if pr["body"] else 0,
                "comments": pr_details.get("comments", 0),
                "review_comments": pr_details.get("review_comments", 0),
                "additions": pr_details.get("additions", 0),
                "deletions": pr_details.get("deletions", 0),
                "changed_files": pr_details.get("changed_files", 0),
                "participants": len(participants),
                "author": author
            })

        # Pause between listing pages to stay gentle on the API.
        time.sleep(1)

    return pr_data


In [None]:

df_repos = pd.read_csv("filtered_top_200_repos.csv")

all_prs = []

for _, row in tqdm(df_repos.iterrows(), total=len(df_repos)):
    # The "owner" cell is either already a login or the text of a dict with a
    # "login" key.  Dict text is parsed with ast.literal_eval, which accepts
    # literals only, instead of eval(), which would execute whatever the CSV
    # contains and fail on a bare login.
    owner = row["owner"]
    if isinstance(owner, str) and owner.lstrip().startswith("{"):
        owner = ast.literal_eval(owner)["login"]
    repo = row["name"]
    prs = fetch_prs_for_repo(owner, repo)
    all_prs.extend(prs)

# Save the dataset with the collected metrics
df_prs = pd.DataFrame(all_prs)
df_prs.to_csv("prs_com_metricas_tail_100.csv", index=False)
print("✅ PRs salvos com sucesso!")