In [1]:
import os
import requests
from dotenv import load_dotenv
import clickhouse_connect
import itertools
import pandas as pd
import json

# Load variables from a .env file so the lookups below can see them.
load_dotenv()

# Connection settings and tokens come from the environment, never from literals.
clickhouse_host = os.environ.get("CLICKHOUSE_HOST")
username = os.environ.get("CLICKHOUSE_USER")
password = os.environ.get("CLICKHOUSE_PASSWORD")
github_token = os.environ.get("GITHUB_TOKEN")

# Shared ClickHouse client used by the query helpers defined in later cells.
client = clickhouse_connect.get_client(
    host=clickhouse_host,
    port=8123,
    username=username,
    password=password,
)

# Token-based Authorization header (presumably for GitHub API calls made with
# `requests`; no such call is visible in these cells).
headers = {"Authorization": f"token {github_token}"}



In [2]:
# search most related projects
def search_projects(project_ids, limit):
    """Find the repositories sharing the most active developers with the given ones.

    The query first collects every actor with issue / pull-request / comment /
    review events on ``project_ids`` since 2025-01-01, then counts how many of
    those actors were active (same event types, same period) in every *other*
    repository, and returns the repositories with the highest counts.

    Args:
        project_ids: Iterable of repository IDs (ints or numeric strings).
        limit: Maximum number of related repositories to return.

    Returns:
        Whatever ``client.query`` returns; rows are
        ``(repo_id, repo_name, active_count)`` ordered by ``active_count``
        descending.

    Raises:
        ValueError: If an ID or ``limit`` is a string that is not an integer.
        TypeError: If an ID or ``limit`` cannot be converted to ``int`` at all.
    """
    sql_search_projects = """
    WITH
        -- 获取仓库在指定时间段内的活跃开发者
        active_developers AS (
            SELECT DISTINCT actor_id, repo_id, repo_name
            FROM opensource.events
            WHERE repo_id IN (%s)
            AND (type IN ('IssuesEvent', 'PullRequestEvent', 'IssueCommentEvent', 'PullRequestReviewEvent','PullRequestReviewCommentEvent'))
            AND created_at >= '2025-01-01'
        ),
        -- 计算其他仓库中的活跃开发者
        repo_activity AS (
            SELECT repo_id, repo_name, COUNT(DISTINCT actor_id) AS active_count
            FROM opensource.events
            WHERE actor_id IN (SELECT actor_id FROM active_developers)
            AND repo_id NOT IN (%s)
            AND (type IN ('IssuesEvent', 'PullRequestEvent', 'IssueCommentEvent','PullRequestReviewEvent', 'PullRequestReviewCommentEvent'))
            AND created_at >= '2025-01-01'
            GROUP BY repo_id, repo_name
        )
        -- 获取关联度最高的 x 个仓库
        SELECT repo_id, repo_name, active_count
        FROM repo_activity
        ORDER BY active_count DESC
        LIMIT %s
        """
    # The query is assembled by string interpolation, so force every value
    # through int() first: anything that is not a plain integer raises here
    # instead of being spliced into the SQL text.
    id_list = ', '.join(f"'{int(project_id)}'" for project_id in project_ids)
    formatted_query = sql_search_projects % (id_list, id_list, int(limit))
    return client.query(formatted_query)

In [12]:
# Seed repository IDs (stored as strings).
repo_ids = [
    "646410686", "725205304", "178626720", "23496542", "523007292", "669331976", "308638334", "494232964",
    "46153892", "933928679", "658928958", "679366051", "653496050", "151636194", "178075572", "634081686",
    "819733173", "178976529", "615882673", "942206898", "813348592", "619959033", "599547518", "740303686",
    "612354784", "156939672", "833984877", "654122609", "547806116", "153097643", "1460385", "5108051",
    "843222", "16587283", "102692863", "73328905", "145670234", "112647343", "707869465", "881221486",
    "534011733", "200722670", "136202695", "86031674", "205706595", "448599559", "216628419", "209120637",
    "314197645", "155220641", "498011141", "517552648", "342728683", "65600975", "65711522", "154739597",
    "45717250", "33015583", "422274596", "235860204", "176982014", "114185437", "753490180"
]

# Query the 5 most related repositories for each seed on its own and pool the
# returned repo IDs; the set removes repositories found via several seeds.
projects = set()
for repo_id in repo_ids:
    results = search_projects([repo_id], 5)
    projects.update(row[0] for row in results.result_rows)
projects = list(projects)

# Integer form of the seeds, for comparison with the IDs returned by the query
# (assumes the query yields repo_id as int -- TODO confirm).
repo_ids_int = list(map(int, repo_ids))

# Each query only excludes its own seed, so other seeds can still show up as
# "related"; drop them here.
projects_ids = list(set(projects) - set(repo_ids_int))

In [14]:
# Number of related repositories left after removing the seed repositories.
len(projects_ids)

142

In [15]:
def get_projects_openrank(project_ids, min_openrank=50):
  """Return the given repositories whose summed OpenRank reaches a threshold.

  Sums ``openrank`` from ``opensource.global_openrank`` for rows created on or
  after 2025-01-01, grouped by ``(repo_id, repo_name)``, and keeps the groups
  whose total is at least ``min_openrank``.

  Args:
    project_ids: Iterable of repository IDs (ints or numeric strings).
    min_openrank: Minimum total OpenRank a repository needs to be kept.
      Defaults to 50.

  Returns:
    A list of ``(repo_id, repo_name, total_openrank)`` tuples with
    ``total_openrank`` as ``float``, sorted by it in descending order.
    Empty when ``project_ids`` is empty.

  Raises:
    ValueError: If an ID is a string that is not an integer.
    TypeError: If an ID cannot be converted to ``int`` at all.
  """
  # The query is assembled by string interpolation, so force every ID through
  # int() first: a non-integer value raises here instead of reaching the SQL.
  id_list = ', '.join(f"'{int(project_id)}'" for project_id in project_ids)
  if not id_list:
    # Nothing to look up; do not send an empty `IN ()` list to the server.
    return []

  sql_get_openrank = """
  SELECT repo_id, repo_name, SUM(openrank) as total_openrank
  FROM opensource.global_openrank
  WHERE repo_id IN (%s)
  AND created_at >= '2025-01-01'
  GROUP BY repo_id, repo_name
  """
  results = client.query(sql_get_openrank % id_list)

  # Normalise the aggregate to float once, then filter on the converted value.
  scored_projects = (
    (row[0], row[1], float(row[2])) for row in results.result_rows
  )
  filtered_projects = [
    project for project in scored_projects if project[2] >= min_openrank
  ]

  filtered_projects.sort(key=lambda project: project[2], reverse=True)
  return filtered_projects
  
# Look up OpenRank for the related repositories; only those passing the
# helper's threshold come back, highest total first.
projects_with_openrank = get_projects_openrank(projects_ids)
print(f"Found {len(projects_with_openrank)} projects with openrank >= 50")

# Split the (repo_id, repo_name, total_openrank) tuples into parallel lists.
filtered_projects_ids = [repo_id for repo_id, _name, _score in projects_with_openrank]
filtered_projects_names = [repo_name for _id, repo_name, _score in projects_with_openrank]

# repo_id -> repo_name lookup table built from the two lists above.
filtered_projects_dict = dict(zip(filtered_projects_ids, filtered_projects_names))

Found 72 projects with openrank >= 50


In [16]:
# Convert the repo_id -> repo_name dictionary to a two-column DataFrame
projects_df = pd.DataFrame(list(filtered_projects_dict.items()), columns=['repo_id', 'repo_name'])

# Save the DataFrame to a CSV file. The path lives in one variable so the
# confirmation message below always names the file that was actually written
# (it previously reported "filtered_projects.csv").
output_csv = 'searched_projects.csv'
projects_df.to_csv(output_csv, index=False)

print(f"Saved {len(filtered_projects_dict)} projects to {output_csv}")

Saved 72 projects to filtered_projects.csv


In [18]:
# Repository IDs to exclude from the filtered list; the variable name suggests
# these were classified as AI applications (classification source not shown).
ai_application = [
  656099147, 703411624, 710601088, 615510678, 946380199, 660551251, 643445235,
  605673387, 691347156, 621799276, 730534580, 626805178, 193215554, 552661142,
  560704231, 680120071, 607289185, 676672661, 621803253, 614765452, 627480054,
  221654678, 599524116, 870189331, 761974775, 784181462, 839372398, 747030811,
  834688418, 782539945, 576642715, 671269505, 890668799, 631254164, 762304524,
  954873280, 862449441, 644686905, 824874689, 771302083, 664592117, 384219990,
  614764248, 662766482, 638629097, 775250190, 156332497, 679237742, 634224458,
  653574784, 675431271, 943934999, 942114501, 622737738, 612344730, 610260322,
  701547123, 589831718, 580642043, 527591471, 910418754, 357241735, 644461337,
  881458615, 958830659, 860252770, 666299222, 464415161, 881448795, 522158088,
  553119108
]

# Keep the OpenRank-filtered repositories that are not in the exclusion list.
remaining_ids = set(filtered_projects_ids) - set(ai_application)
filtered_infra_application_ids = list(remaining_ids)

# Show how many repositories remain.
len(filtered_infra_application_ids)

66