In [1]:
import json
import pandas as pd

### 查询 22、23、24 年大数据领域项目 Top 20

In [4]:
import requests

# Read the project CSV; the 'repository' column supplies the names that are
# used to build the OpenRank request URLs further down in this cell.
df = pd.read_csv('big_data_projects_full.csv')
repo_names = df['repository'].tolist()

# Fetch the OpenRank data for a single repo_name
def get_openrank_data(repo_name, timeout=10):
  """Download the OpenRank JSON document for one repository.

  Args:
    repo_name: Repository identifier inserted into the URL path
      (presumably of the form "owner/name" - verify against the CSV).
    timeout: Seconds to wait for the server before giving up. Without a
      timeout a single unresponsive request could block the whole batch
      indefinitely.

  Returns:
    The decoded JSON body when the server answers with HTTP 200 and the
    body is valid JSON; otherwise None. None is also returned when the
    request fails at the transport level (connection error, timeout, ...),
    so callers can treat every kind of failure as "no data".
  """
  url = f'https://oss.x-lab.info/open_digger/github/{repo_name}/openrank.json'
  try:
    response = requests.get(url, timeout=timeout)
  except requests.RequestException:
    # Transport-level failure: handled like a non-200 answer.
    return None
  if response.status_code != 200:
    return None
  try:
    return response.json()
  except ValueError:
    # A 200 answer whose body is not valid JSON carries no usable data.
    return None

# Collect the OpenRank data of every project, keyed by repository name.
# Repositories for which nothing (or an empty result) came back are skipped.
openrank_data = {}
for name in repo_names:
  fetched = get_openrank_data(name)
  if not fetched:
    continue
  openrank_data[name] = fetched

# Get the highest-ranked projects for a given year
def get_top_projects(year, top_n=20):
  """Return the projects with the largest OpenRank value for ``year``.

  Reads the module-level ``openrank_data`` mapping (repository name ->
  per-period values).

  Args:
    year: Year to look up; converted with ``str()`` before being used as a
      key, so both ``2022`` and ``'2022'`` work.
    top_n: Maximum number of projects to return (default 20). ``None``
      returns every project that has a value for the year.

  Returns:
    A list of ``(repo_name, value)`` tuples sorted by value in descending
    order. Projects without an entry for the year are left out.
  """
  year_key = str(year)
  projects = [
    (repo_name, data[year_key])
    for repo_name, data in openrank_data.items()
    if year_key in data
  ]
  projects.sort(key=lambda item: item[1], reverse=True)
  return projects[:top_n]

# # 打印出 2022、2023、2024 年每一年排名 Top 20 的项目和对应的值
# for year in [2022, 2023, 2024]:
#   top_projects = get_top_projects(year)
#   print(f"Top 20 projects for {year}:")
#   for repo_name, value in top_projects:
#     print(f"{repo_name}: {value}")
#   print("\n")

In [5]:

# Extract the 2022, 2023 and 2024 OpenRank values from openrank_data.
def _openrank_for_year(year_key):
  """Map repository name -> value stored under ``year_key``; repositories without that key are omitted."""
  return {
    name: values[year_key]
    for name, values in openrank_data.items()
    if year_key in values
  }

openrank_2022 = _openrank_for_year('2022')
openrank_2023 = _openrank_for_year('2023')
openrank_2024 = _openrank_for_year('2024')

# Add one column per year to the DataFrame; repositories missing from a
# year's mapping get NaN from Series.map.
for column, yearly_values in (
  ('openrank_2022', openrank_2022),
  ('openrank_2023', openrank_2023),
  ('openrank_2024', openrank_2024),
):
  df[column] = df['repository'].map(yearly_values)

# Write the updated DataFrame back to the CSV file it was read from.
df.to_csv('big_data_projects_full.csv', index=False)


### 根据输入的 repo 列表，拼接项目的 GitHub url 地址

In [None]:
# Load the project table; its 'repo_name' column is the input for building URLs.
df = pd.read_csv('temp_projects.csv')
repo_list = df['repo_name'].tolist()
# Build the GitHub URL for every entry of the given repo list.
def get_repo_info(repo_list):
    """Return a list with 'https://github.com/' prepended to each repo name.

    The result has the same length and order as ``repo_list``.
    """
    return ['https://github.com/' + repo for repo in repo_list]
# Store the URLs in a new 'repo_url' column and write the table back to the
# same CSV file it was loaded from.
df['repo_url'] = get_repo_info(repo_list)
df.to_csv('temp_projects.csv', index=False)

### 从 json 文件中读数据，保存到 csv 文件中

In [None]:
# Read label data from top_projects_labels.json and store each repository's
# label names in a 'labels' column of top_projects.csv.
df = pd.read_csv('top_projects.csv')
repo_names = df['repo_name'].tolist()
with open('top_projects_labels.json', 'r', encoding='utf-8') as f:
  data = json.load(f)

# Index the JSON entries by repository name once instead of rescanning the
# whole list for every repository. Each entry is accessed as
# entry[0] (repository name) and entry[1] (label objects with a 'name' key).
# setdefault keeps the first entry when a name occurs more than once, which
# matches a first-match linear search.
projects_by_name = {}
for project in data:
  projects_by_name.setdefault(project[0], project)

# Look up the label names for each repo_name; repositories without a
# matching entry get an empty list.
all_labels = []
for repo_name in repo_names:
  project = projects_by_name.get(repo_name)
  if project is None:
    labels = []
  else:
    labels = [label['name'] for label in project[1]]
  all_labels.append(labels)

# Join every label list into one '; '-separated string per repository.
all_labels_str = ['; '.join(map(str, labels)) for labels in all_labels]
df['labels'] = all_labels_str

df.to_csv('top_projects.csv', index=False)

### 过滤 nix 家族的项目节点

In [None]:
import json

# Read the graph JSON file. The encoding is given explicitly so reading does
# not depend on the platform's default locale encoding.
with open('graph_data4ai.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# If the top-level JSON value is a list, continue with its first element.
if isinstance(data, list):
    data = data[0]

# Substrings that mark a project name as belonging to the nix family.
NIX_MARKERS = ('nix-community', 'NixOS')


def _is_nix_related(name):
    """Return True when ``name`` contains any of the NIX_MARKERS substrings."""
    return any(marker in name for marker in NIX_MARKERS)


# Filter nodes: the name checked is the first element of each node.
filtered_nodes = [node for node in data['nodes'] if not _is_nix_related(node[0])]

# Filter edges: drop an edge when either of its first two elements matches.
filtered_edges = [
    edge for edge in data['edges']
    if not (_is_nix_related(edge[0]) or _is_nix_related(edge[1]))
]

# Update the data
data['nodes'] = filtered_nodes
data['edges'] = filtered_edges

# Write the result back to the same JSON file, overwriting the original.
with open('graph_data4ai.json', 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4)

print("过滤完成")