In [None]:
# Static seed-project parameters: repository IDs, kept as strings.
# The larger lists extend the smaller ones, so they are derived from them.
seed_projects1 = ['65600975', '599547518', '552661142']  # pytorch, vllm, langchain
seed_projects2 = [*seed_projects1, '614765452']  # pytorch, vllm, langchain, autogpt
seed_projects3 = [*seed_projects2, '621799276']  # pytorch, vllm, langchain, autogpt, langchain-chatchat
seed_projects4 = ['614765452', '612354784', '621799276']  # autogpt, llama.cpp, langchain-chatchat
seed_projects5 = ['235860204', '23496542', '156939672']  # deepspeed, triton, onnxruntime
seed_projects6 = [*seed_projects5, '175592968']  # deepspeed, triton, onnxruntime, volcano


# Seeds for the data-for-AI graph: elastic/kibana, delta-io/delta, apache/hudi, ...
# NOTE(review): the list holds ten IDs but only three names were ever recorded
# here; confirm which repository each remaining ID refers to.
seed_projects_data4ai = [
    '7833168',
    '15111821',
    '60246359',
    '99919302',
    '507775',
    '283046497',
    '402945349',
    '33884891',
    '30203935',
    '71932349',
]


In [2]:
import os
from dotenv import load_dotenv
# 连接 Clickhouse 数据库
import clickhouse_connect
import itertools
import pandas as pd
import json

# Pull connection settings from the environment (load_dotenv() is called first
# so that values kept in a local .env file are presumably picked up as well).
load_dotenv()
clickhouse_host = os.environ.get("CLICKHOUSE_HOST")
username = os.environ.get("CLICKHOUSE_USER")
password = os.environ.get("CLICKHOUSE_PASSWORD")

# Shared ClickHouse client; the query helpers in this notebook all go through it.
client = clickhouse_connect.get_client(
    host=clickhouse_host,
    port=8123,
    username=username,
    password=password,
)


def execute_search_projects(project_ids, limit):
    """Find the repositories that share the most active developers with the given ones.

    "Active developers" are the distinct ``actor_id`` values that produced an
    issue, pull-request, comment or review event on any of ``project_ids``
    between 2024-01-01 (inclusive) and 2024-11-01 (exclusive). The query then
    counts, for every *other* repository, how many of those developers were
    active there in the same window with the same event types.

    Args:
        project_ids: Iterable of repository IDs. Each one is embedded in the
            SQL as a single-quoted literal. Any iterable works, including a
            one-shot generator.
        limit: Maximum number of rows to return; coerced with ``int()``.

    Returns:
        The object returned by ``client.query``. Its rows are
        ``(repo_id, repo_name, active_count)`` ordered by ``active_count``
        descending.

    Raises:
        ValueError: If ``project_ids`` is empty, which would otherwise render
            an invalid ``IN ()`` clause.
    """
    sql_search_projects = """ 
    WITH
        -- 获取仓库在指定时间段内的活跃开发者
        active_developers AS (
            SELECT DISTINCT actor_id, repo_id, repo_name
            FROM opensource.events
            WHERE repo_id IN (%s)
            AND created_at >= '2024-01-01'
            AND created_at < '2024-11-01'
            AND (type IN ('IssuesEvent', 'PullRequestEvent', 'IssueCommentEvent', 'PullRequestReviewEvent','PullRequestReviewCommentEvent'))
        ),
        -- 计算其他仓库中的活跃开发者
        repo_activity AS (
            SELECT repo_id, repo_name, COUNT(DISTINCT actor_id) AS active_count
            FROM opensource.events
            WHERE actor_id IN (SELECT actor_id FROM active_developers)
            AND created_at >= '2024-01-01'
            AND created_at < '2024-11-01'
            AND repo_id NOT IN (%s)
            AND (type IN ('IssuesEvent', 'PullRequestEvent', 'IssueCommentEvent','PullRequestReviewEvent', 'PullRequestReviewCommentEvent'))
            GROUP BY repo_id, repo_name
        )
        -- 获取关联度最高的 x 个仓库
        SELECT repo_id, repo_name, active_count
        FROM repo_activity
        ORDER BY active_count DESC
        LIMIT %s
        """
    # Render the ID list once: the same text fills both IN (...) placeholders,
    # and iterating project_ids twice would leave the second one empty when a
    # generator is passed in.
    quoted_ids = ', '.join(f"'{project_id}'" for project_id in project_ids)
    if not quoted_ids:
        raise ValueError("project_ids must contain at least one repository id")

    # int() keeps anything that is not a plain number out of the LIMIT clause.
    formatted_query = sql_search_projects % (quoted_ids, quoted_ids, int(limit))
    return client.query(formatted_query)
        

def execute_query_relations(repo_id1, repo_id2):
    """Count the developers who were active in both repositories.

    A developer counts as active in a repository when they produced an issue,
    pull-request, comment or review event there between 2024-01-01 (inclusive)
    and 2024-11-01 (exclusive).

    Args:
        repo_id1: ID of the first repository (embedded as a quoted literal).
        repo_id2: ID of the second repository (embedded as a quoted literal).

    Returns:
        The object returned by ``client.query``; the query selects a single
        ``common_developer_count`` column.
    """
    relations_sql = """ 
        -- 计算两个仓库之间的共同开发者数量
        WITH
            -- 获取第一个仓库在指定时间段内的活跃开发者
            active_developers_1 AS (
                SELECT DISTINCT actor_id
                FROM opensource.events
                WHERE repo_id = %s
                AND created_at >= '2024-01-01'
                AND created_at < '2024-11-01'
                AND (type IN ('IssuesEvent', 'PullRequestEvent', 'IssueCommentEvent','PullRequestReviewEvent','PullRequestReviewCommentEvent'))
            ),
            -- 获取第二个仓库在指定时间段内的活跃开发者
            active_developers_2 AS (
                SELECT DISTINCT actor_id
                FROM opensource.events
                WHERE repo_id = %s
                AND created_at >= '2024-01-01'
                AND created_at < '2024-11-01'
                AND (type IN ('IssuesEvent', 'PullRequestEvent', 'IssueCommentEvent', 'PullRequestReviewEvent', 'PullRequestReviewCommentEvent'))
            )
        -- 计算共同开发者数量
        SELECT COUNT(DISTINCT a.actor_id) AS common_developer_count
        FROM active_developers_1 a
        JOIN active_developers_2 b ON a.actor_id = b.actor_id
        """
    # One quoted literal per placeholder, in argument order.
    quoted_pair = tuple(f"'{repo_id}'" for repo_id in (repo_id1, repo_id2))
    return client.query(relations_sql % quoted_pair)

def execute_nodes_openrank(repo_id):
    """Query the average OpenRank of one repository.

    The average is taken over the rows of ``opensource.global_openrank`` for
    platform 'GitHub' whose ``created_at`` lies between 2024-01-01 (inclusive)
    and 2024-11-01 (exclusive).

    Args:
        repo_id: Repository ID (embedded as a quoted literal).

    Returns:
        The object returned by ``client.query``; the query selects
        ``(repo_id, average_openrank)`` grouped by ``repo_id``.
    """
    openrank_sql = """
        -- 查询仓库的 openrank 均值
        SELECT repo_id, avg(openrank) AS average_openrank
        FROM opensource.global_openrank
        WHERE repo_id = %s
        AND platform = 'GitHub'
        AND created_at >= '2024-01-01'
        AND created_at < '2024-11-01'
        GROUP BY repo_id
    """
    quoted_id = f"'{repo_id}'"
    return client.query(openrank_sql % (quoted_id,))

def execute_query_projects_name(repo_id):
    """Look up the most recent name recorded for a repository.

    Selects ``(repo_id, repo_name)`` from ``opensource.events`` for the given
    ID, newest event first, limited to one row.

    Args:
        repo_id: Repository ID (embedded as a quoted literal).

    Returns:
        The object returned by ``client.query``.
    """
    project_name_sql = """
        -- 根据 repo_id 查询最新的 repo_name
        SELECT repo_id, repo_name
        FROM opensource.events
        WHERE repo_id IN (%s)
        ORDER BY created_at DESC
        LIMIT 1
        """
    quoted_id = f"'{repo_id}'"
    return client.query(project_name_sql % (quoted_id,))




In [3]:
def get_repo_ids(results):
    """Return the distinct first elements of the given rows as a list.

    Duplicates are removed through a set, so the order of the returned list
    is not defined.
    """
    unique_ids = {row[0] for row in results}
    return list(unique_ids)

# Expand the seed projects two hops outward through shared active developers.

# Hop 1: for every seed, take the top 10 repositories returned by the search.
first_hop_ids = set()
for seed_id in seed_projects_data4ai:
    first_hop_ids.update(row[0] for row in execute_search_projects([seed_id], 10).result_rows)
first_projects = list(first_hop_ids)

# Hop 2: for every first-hop repository, take the top 5 repositories returned.
second_hop_ids = set()
for first_hop_id in first_projects:
    second_hop_ids.update(row[0] for row in execute_search_projects([first_hop_id], 5).result_rows)
second_projects = list(second_hop_ids)

# Resolve every second-hop repo_id to its most recent repo_name.
final_projects = {}
for repo_id in second_projects:
    project_names = execute_query_projects_name(repo_id)
    # Check the returned rows, not the result object: the object's truthiness
    # does not say whether any row came back, and indexing result_rows[0] on an
    # empty result raises IndexError.
    # NOTE(review): confirm against the clickhouse_connect QueryResult API.
    if project_names.result_rows:
        final_projects[repo_id] = {"name": project_names.result_rows[0][1], "openrank": None}

# Manually add DB-GPT (currently disabled)
# final_projects[627480054] = {"name":'eosphoros-ai/DB-GPT',"openrank":None}

# Attach the average OpenRank (truncated to int) and drop projects below 40.
# Iterate over a snapshot of the keys because entries are deleted in the loop.
for repo_id in list(final_projects.keys()):
    openrank = execute_nodes_openrank(repo_id)
    # Same emptiness check as above. A project with no OpenRank rows is left in
    # place with "openrank" still None, matching the original branch structure.
    if openrank.result_rows:
        average_openrank = int(openrank.result_rows[0][1])
        if average_openrank >= 40:
            final_projects[repo_id]["openrank"] = average_openrank
        else:
            del final_projects[repo_id]

# One [name, openrank] pair per surviving project.
nodes = [[info["name"], info["openrank"]] for info in final_projects.values()]

# One [name_a, name_b, common_developer_count] triple per unordered project pair.
# This issues one relation query per pair.
edges = []
for (id_a, info_a), (id_b, info_b) in itertools.combinations(final_projects.items(), 2):
    name_a = info_a['name']
    name_b = info_b['name']
    shared_developers = execute_query_relations(id_a, id_b).result_rows[0][0]
    edges.append([name_a, name_b, shared_developers])

# Keep only the edges backed by at least 40 common developers.
filtered_edges = list(filter(lambda edge: edge[2] >= 40, edges))

print(final_projects)
print(len(nodes), len(filtered_edges))

{20929025: {'name': 'microsoft/TypeScript', 'openrank': 160}, 93444615: {'name': 'hashicorp/terraform-provider-aws', 'openrank': 181}, 570384908: {'name': 'huggingface/peft', 'openrank': 51}, 858127: {'name': 'pandas-dev/pandas', 'openrank': 135}, 30203935: {'name': 'metabase/metabase', 'openrank': 223}, 436282917: {'name': 'elastic/elastic-agent', 'openrank': 48}, 78938149: {'name': 'nix-community/home-manager', 'openrank': 57}, 106462765: {'name': 'microsoft/vscode-python', 'openrank': 53}, 185861173: {'name': 'open-telemetry/opentelemetry-collector', 'openrank': 55}, 224762936: {'name': 'dbt-labs/docs.getdbt.com', 'openrank': 76}, 193215554: {'name': 'n8n-io/n8n', 'openrank': 105}, 364003912: {'name': 'opensearch-project/documentation-website', 'openrank': 67}, 7833168: {'name': 'elastic/kibana', 'openrank': 606}, 39464018: {'name': 'apache/superset', 'openrank': 143}, 20587599: {'name': 'apache/flink', 'openrank': 120}, 177839194: {'name': 'microsoft/vscode-remote-release', 'openra

In [4]:
# Bundle the node list and the filtered edge list into a single graph object.
graph = dict(nodes=nodes, edges=filtered_edges)

# Write the graph to graph_data4ai.json, pretty-printed with a 4-space indent.
with open('graph_data4ai.json', 'w') as json_file:
    json_file.write(json.dumps(graph, indent=4))

In [None]:
# Recompute the second hop on its own: for every first-hop repository, take
# the top 5 repositories returned by the search, de-duplicated through a set.
second_hop_ids = set()
for first_hop_id in first_projects:
    second_hop_ids.update(row[0] for row in execute_search_projects([first_hop_id], 5).result_rows)
second_projects = list(second_hop_ids)

print("First Projects:", first_projects)
print("Second Projects:", second_projects)

In [None]:
# Resolve every second-hop repo_id to its most recent repo_name, printing the
# raw rows for inspection.
final_projects = {}
for repo_id in second_projects:
    name_rows = execute_query_projects_name(repo_id).result_rows
    print(name_rows)
    # Guard on the rows themselves so an ID without any event is skipped
    # instead of raising IndexError on name_rows[0].
    if name_rows:
        # Store the same {"name", "openrank"} record the main pipeline cell
        # builds; the edge-building cell reads final_projects[key]['name'],
        # which fails on a bare string.
        final_projects[repo_id] = {"name": name_rows[0][1], "openrank": None}

In [None]:
# Build one [name_a, name_b, common_developer_count] edge per unordered pair
# of projects. This issues one relation query per pair.
edges = []
for key1, key2 in itertools.combinations(final_projects.keys(), 2):
    project1 = final_projects[key1]['name']
    project2 = final_projects[key2]['name']

    query_result = execute_query_relations(key1, key2)
    value = query_result.result_rows[0][0]
    edges.append([project1, project2, value])
    # Show progress with the edge just added; printing the whole list on every
    # iteration produced quadratic output.
    print(edges[-1])


# Keep only the edges backed by at least 40 common developers.
filtered_edges = [edge for edge in edges if edge[2] >= 40]

In [None]:
# Compare the number of edges before and after the weight filter.
print(len(edges))

# Keep only the edges whose weight (third element) is at least 40.
filtered_edges = list(filter(lambda edge: edge[2] >= 40, edges))
count = len(filtered_edges)
count

In [None]:
# Use PageRank to compute a centrality value for every node.
import networkx as nx

# Placeholder graph. It uses its own names so that the real `nodes` / `edges`
# lists built by the earlier cells are not overwritten; re-running the JSON
# export after this cell would otherwise save this placeholder data.
example_nodes = list(range(69))  # assume 69 nodes, numbered 0 to 68
example_edges = [
    (0, 1), (1, 2), (2, 0),  # sample edges; replace with your own edge data
    (1, 3), (3, 4), (4, 1),
    # add more edges here
]

# Undirected graph; nodes without any edge are still part of it.
G = nx.Graph()
G.add_nodes_from(example_nodes)
G.add_edges_from(example_edges)

# Mapping of node -> PageRank value.
pagerank_values = nx.pagerank(G)

for node, value in pagerank_values.items():
    print(f"Node {node}: PageRank = {value}")
