In [None]:
import json
import csv
import pandas as pd
import sys

# Input JSON export and the CSV file it gets flattened into
json_file_path = 'hf_top_models.json'
csv_file_path = 'hf_top_models.csv'
try:
  with open(json_file_path, 'r', encoding='utf-8') as file:
    data = json.loads(file.read())

  # A top-level list is handed straight to DataFrame; any other JSON value
  # (single object / nested structure) goes through json_normalize instead.
  df = pd.DataFrame(data) if isinstance(data, list) else pd.json_normalize(data)

  # Persist the table without the index column, encoded as utf-8-sig
  df.to_csv(csv_file_path, index=False, encoding='utf-8-sig')
  print(f"Successfully converted {json_file_path} to {csv_file_path}")

except Exception as e:
  # Best-effort cell: report the failure and let the notebook continue
  print(f"Error: {e}")

In [None]:
import os
import requests
from dotenv import load_dotenv
import clickhouse_connect
import itertools
import pandas as pd
import json

# Load variables from a local .env file, then read the connection settings
# and credentials from the environment (each is None when unset).
load_dotenv()
clickhouse_host = os.getenv("CLICKHOUSE_HOST")
username = os.getenv("CLICKHOUSE_USER")
password = os.getenv("CLICKHOUSE_PASSWORD")
github_token = os.getenv("GITHUB_TOKEN")

# Shared ClickHouse client used by the query helpers below (port 8123).
client = clickhouse_connect.get_client(host=clickhouse_host, port=8123, username=username, password=password)

# Only attach the Authorization header when a token is actually configured.
# Formatting a missing token would send the literal string "token None" as a
# credential on every GitHub API call; an empty header dict makes the calls
# anonymous instead.
headers = {"Authorization": f"token {github_token}"} if github_token else {}

# Query the list of top GitHub projects ranked by OpenRank
def execute_query_top_openrank(created_at='2025-01-01'):
  """Return the 500 GitHub repositories with the highest average OpenRank.

  Rows of ``opensource.global_openrank`` with ``platform = 'GitHub'`` and
  ``created_at`` on or after the given bound are grouped by
  ``(repo_id, repo_name)``; the per-repository average of ``openrank`` is
  rounded to two decimals (column ``avg_openrank_25``) and the 500 largest
  averages are returned in descending order.

  Args:
    created_at: Inclusive lower bound compared against the ``created_at``
      column. Defaults to ``'2025-01-01'``.

  Returns:
    The result object produced by ``client.query`` for this statement.
  """
  sql_query_top_openrank = """
    SELECT
        repo_id,
        repo_name,
        ROUND(AVG(openrank), 2) AS avg_openrank_25
    FROM
        opensource.global_openrank
    WHERE
        platform = 'GitHub' AND
        created_at >= %(created_at)s
    GROUP BY
        repo_id, repo_name
    ORDER BY
        avg_openrank_25 DESC
    LIMIT 500
  """
  # Let the driver quote and escape the bound value instead of splicing a
  # hand-quoted string into the SQL text, so a value containing quotes can
  # neither break nor alter the statement.
  results = client.query(sql_query_top_openrank, parameters={"created_at": created_at})
  return results

# Look up basic repository information through the GitHub API by repo_name
def get_repo_info(repo_name, headers):
    """Fetch basic metadata for one repository from the GitHub REST API.

    Args:
        repo_name: Repository in ``owner/name`` form; interpolated into
            ``https://api.github.com/repos/{repo_name}``.
        headers: HTTP headers passed through to ``requests.get``.

    Returns:
        A dict with the keys ``repo_id``, ``repo_name``, ``stargazers_count``,
        ``forks_count``, ``language``, ``created_at`` (the part before ``T``,
        or ``None`` when the field is absent), ``description`` and ``topics``
        (``[]`` when absent). ``None`` is returned, after printing a message,
        when the request fails or the status code is not 200.
    """
    url = f"https://api.github.com/repos/{repo_name}"
    try:
        # A timeout keeps one stalled connection from hanging the whole loop.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # Network-level failures are reported like HTTP errors so callers only
        # ever have to handle the None result.
        print(f"Error fetching data for {repo_name}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching data for {repo_name}: {response.status_code}")
        return None

    data = response.json()
    created_at = data.get("created_at")
    return {
        "repo_id": data.get("id"),
        "repo_name": data.get("full_name"),
        "stargazers_count": data.get("stargazers_count"),
        "forks_count": data.get("forks_count"),
        "language": data.get("language"),
        # Keep only the part before "T"; guard against a missing field
        # instead of calling .split on None.
        "created_at": created_at.split("T")[0] if created_at else None,
        "description": data.get("description"),
        "topics": data.get("topics", []),
    }



In [None]:
import pandas as pd
from datetime import datetime
import json

# Repositories to look up, as "owner/name" strings (fill in before running)
repo_names = [
  
]

# Column layout of the exported CSV
columns_order = ["repo_name", "stars", "forks", "language", "created_at", "description", 'topics']

results = []

for repo_name in repo_names:
  print(f"Processing {repo_name}...")

  # Get repo info from GitHub API
  repo_info = get_repo_info(repo_name, headers)

  if not repo_info:
    print(f"Failed to get info for {repo_name}")
    continue

  # Compile all information into one flat row
  created_at = repo_info["created_at"]
  result = {
    "repo_name": repo_info["repo_name"],
    "stars": repo_info["stargazers_count"],
    "forks": repo_info["forks_count"],
    "language": repo_info["language"],
    # Keep only the part before "T"; a missing value is passed through as-is
    "created_at": created_at.split("T")[0] if created_at else created_at,
    "description": repo_info["description"],
    # Flatten the topic list into one comma-separated cell ("" when empty)
    "topics": ", ".join(repo_info.get("topics") or [])
  }

  results.append(result)

# Passing the columns explicitly keeps the frame well-formed even when no
# repository could be fetched; selecting columns from a column-less frame
# would raise KeyError.
df_results = pd.DataFrame(results, columns=columns_order)

csv_filename = f"repo_info_with_openrank_{datetime.now().strftime('%Y%m%d')}.csv"
df_results.to_csv(csv_filename, index=False)

print(f"Successfully processed {len(results)} repositories")
print(f"Results saved to {csv_filename}")

Processing NixOS/nixpkgs...
Processing llvm/llvm-project...
Processing home-assistant/core...
Processing CypherV2/Donarev-API...
Error fetching data for CypherV2/Donarev-API: 404
Failed to get info for CypherV2/Donarev-API
Processing pytorch/pytorch...
Processing odoo/odoo...
Processing 7Sence/Rise-Beta...
Error fetching data for 7Sence/Rise-Beta: 404
Failed to get info for 7Sence/Rise-Beta
Processing zephyrproject-rtos/zephyr...
Processing DXVVAY/hcaptcha-reverse...
Error fetching data for DXVVAY/hcaptcha-reverse: 404
Failed to get info for DXVVAY/hcaptcha-reverse
Processing microsoft/vscode...
Processing microsoft/winget-pkgs...
Processing elastic/kibana...
Processing DigitalPlatDev/FreeDomain...
Processing vllm-project/vllm...
Processing godotengine/godot...
Processing justachillcoder/binance-captcha-deobfuscator...
Error fetching data for justachillcoder/binance-captcha-deobfuscator: 404
Failed to get info for justachillcoder/binance-captcha-deobfuscator
Processing flutter/flutter.

### 通过 OpenDigger OSS 读取指标数据

In [None]:
# Imports are repeated here so this cell can also be run on its own.
# `time` is required by the delay between requests in fetch_repo_info.
import time

import requests
import pandas as pd

def fetch_repo_info(repo_names, year='2025', delay=0.5):
  """Collect contributor and participant counts for a list of repositories.

  For every repository two JSON documents are downloaded from
  ``oss.open-digger.cn`` (``contributors.json`` and ``participants.json``)
  and the value stored under the ``year`` key is read from each.
  # NOTE(review): assumes both documents are JSON objects keyed by period
  # strings such as "2025" - confirm against the data source.

  Args:
    repo_names: Iterable of repositories in ``owner/name`` form.
    year: Key looked up in both documents; also used as the suffix of the
      output keys. Defaults to ``'2025'``.
    delay: Seconds to sleep after each repository; a falsy value disables
      the pause. Defaults to ``0.5``.

  Returns:
    A list with one dict per successfully fetched repository, holding
    ``repo_name``, ``contributors_<year>`` and ``participants_<year>``.
    A missing or falsy value is reported as ``0``. Repositories whose
    requests fail or return a non-200 status are skipped after a message
    is printed.
  """
  year = str(year)
  repo_data = []

  for repo_name in repo_names:
    # Format the URLs with the repo name for each iteration
    contributors_url = f"https://oss.open-digger.cn/github/{repo_name}/contributors.json"
    participants_url = f"https://oss.open-digger.cn/github/{repo_name}/participants.json"

    try:
      contributors_response = requests.get(contributors_url, timeout=10)
      participants_response = requests.get(participants_url, timeout=10)

      if contributors_response.status_code == 200 and participants_response.status_code == 200:
        contributors_count = contributors_response.json().get(year)
        participants_count = participants_response.json().get(year)

        repo_data.append({
          'repo_name': repo_name,
          f'contributors_{year}': contributors_count if contributors_count else 0,
          f'participants_{year}': participants_count if participants_count else 0
        })
        print(f"Processed {repo_name}")
      else:
        print(f"Error fetching data for {repo_name}")
    except Exception as e:
      # Best effort: one failing repository must not abort the whole batch
      print(f"Exception for {repo_name}: {e}")

    # Small delay to avoid overwhelming the API
    if delay:
      time.sleep(delay)

  return repo_data

# Repositories whose 2025 contributor / participant metrics are collected
repo_names = [
  "pytorch/pytorch",
  "vllm-project/vllm",
  "langgenius/dify",
  "elastic/elasticsearch",
  "ollama/ollama",
  "openvinotoolkit/openvino",
  "apache/airflow",
  "sgl-project/sglang",
  "open-webui/open-webui",
  "ggml-org/llama.cpp",
  "BerriAI/litellm",
  "ray-project/ray",
  "stackblitz/bolt.new",
  "PaddlePaddle/Paddle",
  "airbytehq/airbyte",
  "apache/spark",
  "microsoft/onnxruntime",
  "n8n-io/n8n",
  "infiniflow/ragflow",
  "NVIDIA/NeMo"
]

# Output file, named once so the write and the log message cannot drift apart
metrics_csv_path = 'repo_metrics_2025.csv'

# Call the function with repo_names
collected_data = fetch_repo_info(repo_names)

# Convert to dataframe and save to CSV. The columns are listed explicitly so
# the file still gets a header row when no repository could be fetched.
result_df = pd.DataFrame(
  collected_data,
  columns=['repo_name', 'contributors_2025', 'participants_2025'],
)
result_df.to_csv(metrics_csv_path, index=False)
print(f"Saved data for {len(result_df)} repositories to {metrics_csv_path}")
