In [11]:
import os
import requests
from dotenv import load_dotenv
import clickhouse_connect
import itertools
import pandas as pd
import json
# Read connection settings and credentials from the environment
# (load_dotenv() is called first so values from a .env file are picked up).
load_dotenv()
clickhouse_host = os.getenv("CLICKHOUSE_HOST")
username = os.getenv("CLICKHOUSE_USER")
password = os.getenv("CLICKHOUSE_PASSWORD")
github_token = os.getenv("GITHUB_TOKEN")

# ClickHouse client shared by the query cells below.
# Port 8123 remains the default; set CLICKHOUSE_PORT to override it.
client = clickhouse_connect.get_client(
    host=clickhouse_host,
    port=int(os.getenv("CLICKHOUSE_PORT", "8123")),
    username=username,
    password=password,
)

# Headers for GitHub API calls. Only attach Authorization when a token is
# configured: formatting a missing token would send the literal "token None",
# which is an invalid credential rather than an anonymous request.
headers = {"Authorization": f"token {github_token}"} if github_token else {}

In [6]:
# Read the v1 and v2 landscape tables.
df_v1 = pd.read_csv('landscapev1.csv', encoding='utf-8-sig')
df_v2 = pd.read_csv('landscapev2.csv', encoding='utf-8-sig')

# Select the v1 rows whose repo_id is absent from v2 with a single vectorised
# filter (instead of re-scanning v1 once per missing id). Rows keep their v1
# order, so the output no longer depends on set iteration order, and only the
# first v1 row per repo_id is carried over.
carried_columns = ['repo_id', 'repo_name', 'classification']
df_new = (
    df_v1.loc[~df_v1['repo_id'].isin(df_v2['repo_id']), carried_columns]
    .drop_duplicates(subset='repo_id', keep='first')
)
missing_repo_ids = set(df_new['repo_id'])

# Append the carried-over rows to v2; columns that exist only in v2 stay empty for them.
if not df_new.empty:
    df_v2 = pd.concat([df_v2, df_new], ignore_index=True)

# Save the merged table.
df_v2.to_csv('landscape.csv', index=False, encoding='utf-8-sig')


## 获取 Landscape 所需的 GitHub 仓库信息

In [7]:
import requests  # Import requests for making API calls

# Repository names (the "repo_name" column) listed in the merged landscape file
repo_names = pd.read_csv('landscape.csv').loc[:, 'repo_name'].tolist()

# Start from an empty list of per-repository records
repo_data = list()
# Fetch stars, forks, language, description, topics and OpenRank for each repo_name.
def fetch_repo_info(repo_names, headers, openrank_month="2025-07", timeout=30):
  """Collect GitHub metadata and one OpenRank value for each repository.

  Args:
    repo_names: Iterable of "owner/repo" strings; each is appended to the
      GitHub repos API URL and substituted into the OpenDigger URL.
    headers: HTTP headers sent with the GitHub request (the OpenDigger
      request is sent without them).
    openrank_month: Key looked up in the OpenDigger ``openrank.json`` payload.
    timeout: Seconds passed to every ``requests.get`` call so a stalled
      connection cannot hang the notebook.

  Returns:
    A new list with one dict per repository whose GitHub request returned
    HTTP 200. ``openrank_25`` is the rounded OpenRank value, or None when the
    OpenDigger request failed or the month is absent from the payload.
    Repositories whose GitHub request fails are reported with ``print`` and
    skipped.
  """
  github_repo_url = "https://api.github.com/repos/"
  openrank_repo_url = "https://oss.open-digger.cn/github/{repo_name}/openrank.json"

  # Accumulate locally: appending to a module-level list would duplicate rows
  # every time the cell is re-run.
  records = []
  for repo_name in repo_names:
    response = requests.get(github_repo_url + repo_name, headers=headers, timeout=timeout)
    if response.status_code != 200:
      print(f"Failed to fetch data for {repo_name}")
      continue
    data = response.json()

    openrank_url = openrank_repo_url.format(repo_name=repo_name)
    openrank_response = requests.get(openrank_url, timeout=timeout)
    if openrank_response.status_code == 200:
      openrank = openrank_response.json().get(openrank_month)
    else:
      openrank = None

    records.append({
      'repo_id': data['id'],
      'repo_name': repo_name,
      'stars': data['stargazers_count'],
      'forks': data['forks_count'],
      # Compare against None so a genuine OpenRank of 0 is kept as 0.
      'openrank_25': round(openrank) if openrank is not None else None,
      'language': data['language'],
      # Keep only the date part of the ISO timestamp.
      'created_at': data['created_at'].split("T")[0],
      'description': data['description'],
      # Flatten the topics list into a comma-separated string.
      'topics': ','.join(data.get("topics") or [])
    })

  return records

In [9]:
repo_data = fetch_repo_info(repo_names, headers)

# Write the fetched records to CSV; an empty result writes nothing.
repo_df = pd.DataFrame(repo_data)
if repo_df.empty:
    print("No repository data to save")
else:
    repo_df.to_csv('repository_data.csv', index=False)
    print("Repository data saved to repository_data.csv")

Failed to fetch data for sourcegraph/cody
Repository data saved to repository_data.csv


## 获取 OpenRank > 50 的项目列表

In [None]:

# Query GitHub repositories whose average OpenRank falls inside a score band.
def execute_query_top_openrank(created_at='2025-01-01', min_openrank=30, max_openrank=50):
  """Return repositories whose rounded average OpenRank lies in [min_openrank, max_openrank).

  Rows of ``opensource.global_openrank`` with platform 'GitHub' and
  ``created_at >= created_at`` are averaged per (repo_id, repo_name).

  NOTE(review): the defaults select 30 <= avg < 50, while the surrounding
  notebook text talks about OpenRank > 50 - confirm which band is intended.

  Args:
    created_at: Lower bound (inclusive) applied to the ``created_at`` column.
    min_openrank: Inclusive lower bound on the rounded average OpenRank.
    max_openrank: Exclusive upper bound on the rounded average OpenRank.

  Returns:
    Whatever ``client.query`` returns; each result row is
    (repo_id, repo_name, avg_openrank_25), ordered by avg_openrank_25 descending.
  """
  sql_query_top_openrank = """
    SELECT
        repo_id,
        repo_name,
        ROUND(AVG(openrank)) AS avg_openrank_25
    FROM
        opensource.global_openrank
    WHERE
        platform = 'GitHub' AND
        created_at >= %(created_at)s
    GROUP BY
        repo_id, repo_name
    HAVING
        avg_openrank_25 >= %(min_openrank)s and avg_openrank_25 < %(max_openrank)s
    ORDER BY
        avg_openrank_25 DESC

  """
  # Hand the values to the driver instead of splicing them into the SQL text,
  # so quoting and escaping are done by clickhouse-connect.
  query_parameters = {
    "created_at": created_at,
    "min_openrank": min_openrank,
    "max_openrank": max_openrank,
  }
  results = client.query(sql_query_top_openrank, parameters=query_parameters)
  return results


results_openrank = execute_query_top_openrank()
print(f"Found {len(results_openrank.result_rows)} repositories with high OpenRank scores")

# Collect the repository names (second field of each result row)
repo_names = [repo_name for _repo_id, repo_name, _avg_openrank in results_openrank.result_rows]
print(f"First 5 repositories: {repo_names[:5]}")


## 添加项目基本信息

In [None]:
import pandas as pd
from datetime import datetime

def get_repo_info(repo_name, headers, timeout=30):
    """Fetch basic metadata for one repository from the GitHub repos API.

    Args:
        repo_name: "owner/repo" string inserted into the API URL.
        headers: HTTP headers sent with the request.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook.

    Returns:
        A dict with the keys id, repo_name, stargazers_count, forks_count,
        language, created_at, description and topics (always a list), or
        None when the response status is not 200.
    """
    url = f"https://api.github.com/repos/{repo_name}"
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        # Explicit failure value; callers test the result for truthiness.
        return None

    data = response.json()
    return {
        "id": data["id"],
        "repo_name": data["full_name"],
        "stargazers_count": data["stargazers_count"],
        "forks_count": data["forks_count"],
        "language": data["language"],
        "created_at": data["created_at"],
        "description": data["description"],
        # Normalise a missing or null topics field to an empty list.
        "topics": data.get("topics") or []
    }

results = []

for repo_name in repo_names:
  print(f"Processing {repo_name}...")

  # Get repo info from the GitHub API; a falsy result means the lookup failed.
  repo_info = get_repo_info(repo_name, headers)
  if not repo_info:
    print(f"Failed to get info for {repo_name}")
    continue

  # Flatten the API fields into one output record.
  results.append({
    "repo_id": repo_info["id"],
    "repo_name": repo_info["repo_name"],
    "stars": repo_info["stargazers_count"],
    "forks": repo_info["forks_count"],
    "language": repo_info["language"],
    # Keep only the date part of the ISO timestamp.
    "created_at": repo_info["created_at"].split("T")[0],
    "description": repo_info["description"],
    "topics": ", ".join(repo_info.get("topics") or [])
  })

# Passing the column list to the constructor fixes the column order and keeps
# the frame well-formed (all columns, zero rows) when no repository was fetched;
# selecting the columns afterwards would raise KeyError on an empty frame.
columns_order = ["repo_id", "repo_name", "stars", "forks", "language", "created_at", "description", 'topics']
df_results = pd.DataFrame(results, columns=columns_order)

## 添加 OpenRank 值并存储

In [None]:
# Index the ClickHouse rows (repo_id, repo_name, avg_openrank_25) by repo_name.
repo_id_avg_openrank = {row[1]: (row[0], row[2]) for row in results_openrank.result_rows}
avg_openrank_by_name = {name: avg for name, (_repo_id, avg) in repo_id_avg_openrank.items()}

# Build a new frame instead of mutating df_results, so re-running the cell
# gives the same result. The repo_id column already filled from the GitHub API
# is kept as is (resetting it to None would discard the id for every repo
# whose name has no match), and only the OpenRank column is added; names
# without a match get a missing value.
df_results_new = df_results.assign(
    avg_openrank_25=df_results['repo_name'].map(avg_openrank_by_name)
)

# Output column order without duplicates: appending 'repo_id' a second time
# would write the column twice to the CSV.
output_columns = list(dict.fromkeys(list(columns_order) + ['avg_openrank_25']))
df_results_new = df_results_new[output_columns]

csv_filename = f"repo_info_{datetime.now().strftime('%Y%m%d')}.csv"
df_results_new.to_csv(csv_filename, index=False)

print(f"Successfully processed {len(results)} repositories")
print(f"Results saved to {csv_filename}")

## 标注出已有的项目和相应的分类

In [None]:
# Load both landscape tables (read with pandas' default encoding).
landscape_full = pd.read_csv('landscape-full-08.csv')
landscape527 = pd.read_csv('landscape527_full.csv')

print(f"Loaded landscape-full-08.csv with {landscape_full.shape[0]} entries")
print(f"Loaded landscape527_full.csv with {landscape527.shape[0]} entries")

# Check the column names
print("\nColumns in landscape-full-08.csv:")
print(landscape_full.columns.tolist())
print("\nColumns in landscape527_full.csv:")
print(landscape527.columns.tolist())

# Create a mapping from repo_name to classification in landscape527.
# The column check is done once up front, rows missing either value are
# skipped, and a later duplicate repo_name overrides an earlier one.
classification_map = {}
if {'repo_name', 'classification'}.issubset(landscape527.columns):
  labelled = landscape527.dropna(subset=['repo_name', 'classification'])
  classification_map = dict(zip(labelled['repo_name'], labelled['classification']))
else:
  print("landscape527_full.csv lacks 'repo_name' or 'classification'; no classifications applied")

# Update the llm field in landscape_full with classifications from landscape527.
# Every mapped value is non-null, so a non-null lookup result marks a match.
mapped_classification = landscape_full['repo_name'].map(classification_map)
matched = mapped_classification.notna()
matches = int(matched.sum())
if matches:
  landscape_full.loc[matched, 'llm'] = mapped_classification[matched]
print(f"Updated llm for {matches} of {len(landscape_full)} entries")

# Save the updated dataframe
output_file = 'landscape_full_updated.csv'
landscape_full.to_csv(output_file, index=False)
print(f"Updated dataframe saved to {output_file}")

## 获取 Star 数 Top 1K 的 Rust 项目信息

In [None]:
import requests
import pandas as pd
from time import sleep

# GitHub repository search endpoint
api_url = "https://api.github.com/search/repositories"

# Headers for the search requests. They live under their own name so that the
# authenticated `headers` dict used by other cells is not overwritten.
# For a higher rate limit, add an "Authorization": "token <GITHUB_TOKEN>" entry
# taken from the environment rather than hard-coding a token here.
search_headers = {
    "Accept": "application/vnd.github.v3+json",
}

# Search parameters: Rust repositories, most-starred first
PER_PAGE = 100
params = {
    "q": "language:rust",  # restrict the search to Rust repositories
    "sort": "stars",       # sort by star count
    "order": "desc",       # descending
    "per_page": PER_PAGE   # results per page
}

all_repos = []
pages = 10  # 10 pages x 100 results = 1000 repositories

for page in range(1, pages + 1):
    try:
        response = requests.get(
            api_url,
            headers=search_headers,
            params={**params, "page": page},
            timeout=30,  # do not hang forever on a stalled connection
        )

        if response.status_code == 200:
            repos = response.json().get("items", [])

            # `repo_record` (not `repo_data`) so the module-level list used by
            # fetch_repo_info is not rebound to a dict.
            for repo in repos:
                repo_record = {
                    "repo_name": f"{repo['owner']['login']}/{repo['name']}",
                    "stars": repo["stargazers_count"],
                    "forks": repo["forks_count"],
                    "created_at": repo["created_at"],
                    "description": repo["description"],
                    "topics": ",".join(repo.get("topics", []))
                }
                all_repos.append(repo_record)

            print(f"成功获取第 {page} 页数据，当前已获取 {len(all_repos)} 个仓库")

            # A short page means there is nothing left to fetch
            if len(repos) < PER_PAGE:
                print("已获取所有可用数据")
                break

            sleep(2)  # pause between pages to stay under the API rate limit

        else:
            print(f"获取第 {page} 页数据失败: {response.status_code}")
            if response.status_code == 403:
                print("可能达到API访问限制，请稍后再试或使用GitHub Token")
            break

    except Exception as e:
        # Best effort: report the problem and keep whatever was fetched so far.
        print(f"处理第 {page} 页时发生错误: {str(e)}")
        break

# Convert to a DataFrame and save. Naming the columns keeps the frame (and the
# column selection below) valid even when nothing was fetched.
repo_columns = ["repo_name", "stars", "forks", "created_at", "description", "topics"]
top_repos_df = pd.DataFrame(all_repos, columns=repo_columns)
output_file = "top_rust_repos.csv"
top_repos_df.to_csv(output_file, index=False)
print(f"已保存 {len(all_repos)} 个Rust仓库信息到 {output_file}")

# Show the first 10 repositories
print("\n前10个最受欢迎的Rust仓库:")
print(top_repos_df[["repo_name", "stars", "forks"]].head(10))

## 获取 OpenRank 2025 Trend

In [10]:
# 读取 repository_data.csv 并获取 OpenRank 趋势数据
import pandas as pd
import requests
import json
from time import sleep

# Year whose monthly OpenRank values are collected
TREND_YEAR = 2025

# Read repository_data.csv
df = pd.read_csv('repository_data.csv')

# Collect one trend string per repository, in row order, and assign the column
# once at the end instead of writing into the frame while iterating over it.
trend_column = []
for repo_name in df['repo_name']:
    url = f"https://oss.open-digger.cn/github/{repo_name}/openrank.json"

    trends = []
    try:
        # The timeout keeps one stalled request from hanging the whole run.
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            data = response.json()

            # Rounded monthly values for TREND_YEAR; months absent from the payload are skipped
            month_keys = [f"{TREND_YEAR}-{month:02d}" for month in range(1, 13)]
            trends = [round(data[month_key]) for month_key in month_keys if month_key in data]

            print(f"成功获取 {repo_name} 的OpenRank趋势数据")

        else:
            print(f"获取 {repo_name} 数据失败: {response.status_code}")

    except Exception as e:
        # Best effort: record an empty trend for this repository and continue.
        print(f"处理 {repo_name} 时发生错误: {str(e)}")
        trends = []

    # Stored as the list's string form, e.g. '[]' when nothing was retrieved
    trend_column.append(str(trends))

df['trends'] = trend_column

# Save the updated data
df.to_csv('repository_data.csv', index=False)
print("已完成所有仓库的OpenRank趋势数据获取和保存")


成功获取 pytorch/pytorch 的OpenRank趋势数据
成功获取 vllm-project/vllm 的OpenRank趋势数据
成功获取 langgenius/dify 的OpenRank趋势数据
成功获取 tenstorrent/tt-metal 的OpenRank趋势数据
成功获取 microsoft/vscode-copilot-release 的OpenRank趋势数据
成功获取 huggingface/transformers 的OpenRank趋势数据
成功获取 CherryHQ/cherry-studio 的OpenRank趋势数据
成功获取 sgl-project/sglang 的OpenRank趋势数据
成功获取 ollama/ollama 的OpenRank趋势数据
成功获取 apache/doris 的OpenRank趋势数据
成功获取 apache/airflow 的OpenRank趋势数据
成功获取 openvinotoolkit/openvino 的OpenRank趋势数据
成功获取 BerriAI/litellm 的OpenRank趋势数据
成功获取 open-webui/open-webui 的OpenRank趋势数据
成功获取 ggml-org/llama.cpp 的OpenRank趋势数据
成功获取 ray-project/ray 的OpenRank趋势数据
成功获取 n8n-io/n8n 的OpenRank趋势数据
成功获取 ultralytics/ultralytics 的OpenRank趋势数据
成功获取 PaddlePaddle/Paddle 的OpenRank趋势数据
成功获取 infiniflow/ragflow 的OpenRank趋势数据
成功获取 pytorch/executorch 的OpenRank趋势数据
成功获取 airbytehq/airbyte 的OpenRank趋势数据
成功获取 NVIDIA/NeMo 的OpenRank趋势数据
成功获取 microsoft/onnxruntime 的OpenRank趋势数据
成功获取 supabase/supabase 的OpenRank趋势数据
成功获取 stackblitz/bolt.new 的OpenRank趋势数据
成功获取 apache/