In [None]:
import os
import requests
from dotenv import load_dotenv
import clickhouse_connect
import itertools
import pandas as pd
import json
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()
clickhouse_host = os.getenv("CLICKHOUSE_HOST")
username = os.getenv("CLICKHOUSE_USER")
password = os.getenv("CLICKHOUSE_PASSWORD")
github_token = os.getenv("GITHUB_TOKEN")

# ClickHouse client shared by the query cells below (HTTP interface, port 8123).
client = clickhouse_connect.get_client(host=clickhouse_host, port=8123, username=username, password=password)

# Only send an Authorization header when a token is actually configured.
# Formatting a missing token would produce "token None", which GitHub rejects
# as bad credentials instead of serving the request anonymously.
if github_token:
  headers = {
    "Authorization": f"token {github_token}"
  }
else:
  headers = {}
  print("GITHUB_TOKEN is not set; GitHub API requests will be unauthenticated")

In [None]:
# Load both landscape versions.
df_v1 = pd.read_csv('landscapev1.csv', encoding='utf-8-sig')
df_v2 = pd.read_csv('landscapev2.csv', encoding='utf-8-sig')

# repo_ids present in v1 but absent from v2. Missing (NaN) ids are ignored:
# they can never be matched between the two files, and looking one up by
# equality returns no rows at all.
v1_repo_ids = set(df_v1['repo_id'].dropna())
v2_repo_ids = set(df_v2['repo_id'].dropna())
missing_repo_ids = v1_repo_ids - v2_repo_ids

# Select the missing rows in a single pass, keeping v1's row order and only the
# first row for each repo_id (one row per id, as before).
df_new = (
    df_v1.loc[df_v1['repo_id'].isin(missing_repo_ids), ['repo_id', 'repo_name', 'classification']]
    .drop_duplicates(subset='repo_id', keep='first')
)

# Append the recovered rows to v2; columns they do not carry are left empty.
if not df_new.empty:
    df_v2 = pd.concat([df_v2, df_new], ignore_index=True)

# Save the merged data.
df_v2.to_csv('landscape.csv', index=False, encoding='utf-8-sig')


## 获取 Landscape 所需的 GitHub 仓库信息

In [None]:
import requests  # Import requests for making API calls

# Repositories ("owner/name") whose metadata should be fetched.
# To process the whole landscape instead, load the names from the CSV:
# repo_names = pd.read_csv('landscape.csv')['repo_name'].tolist()
repo_names = [
    'openai/codex',
    '78/xiaozhi-esp32',
    'deepseek-ai/DeepEP',
    'THUDM/slime',
    'inclusionAI/AReaL',
]

# Start with an empty list of per-repository records.
repo_data = list()
def fetch_repo_info(repo_names, headers, openrank_month="2025-07", timeout=30):
  """Fetch GitHub metadata and one monthly OpenRank value for each repository.

  Args:
    repo_names: iterable of "owner/name" strings.
    headers: HTTP headers sent with the GitHub API request.
    openrank_month: "YYYY-MM" key looked up in the OpenDigger openrank.json.
    timeout: per-request timeout in seconds.

  Returns:
    A new list with one dict per successfully fetched repository, holding
    repo_id, repo_name, stars, forks, openrank_25, language, created_at
    (date part only), description and topics (comma-separated string).
    Repositories whose GitHub request fails are reported and skipped.
  """
  github_repo_url = "https://api.github.com/repos/"
  openrank_repo_url = "https://oss.open-digger.cn/github/{repo_name}/openrank.json"

  # Collect into a local list: appending to a module-level list made every
  # re-run of the calling cell duplicate the previously fetched rows.
  fetched = []
  for repo_name in repo_names:
    try:
      response = requests.get(github_repo_url + repo_name, headers=headers, timeout=timeout)
    except requests.RequestException:
      # A transport error for one repository must not discard the others.
      print(f"Failed to fetch data for {repo_name}")
      continue
    if response.status_code != 200:
      print(f"Failed to fetch data for {repo_name}")
      continue
    data = response.json()

    # The OpenRank value is optional: any failure simply leaves it as None.
    openrank = None
    try:
      openrank_response = requests.get(openrank_repo_url.format(repo_name=repo_name), timeout=timeout)
      if openrank_response.status_code == 200:
        openrank = openrank_response.json().get(openrank_month)
    except requests.RequestException:
      openrank = None

    fetched.append({
      'repo_id': data['id'],
      'repo_name': repo_name,
      'stars': data['stargazers_count'],
      'forks': data['forks_count'],
      # Compare against None so a genuine OpenRank of 0 is kept as 0.
      'openrank_25': round(openrank) if openrank is not None else None,
      'language': data['language'],
      'created_at': data['created_at'].split("T")[0],
      'description': data['description'],
      # Join the topic list into a comma-separated string; tolerate a null value.
      'topics': ','.join(data.get("topics") or [])
    })

  return fetched

In [None]:
# Fetch metadata for every name in repo_names using the shared request headers.
repo_data = fetch_repo_info(repo_names, headers)

# Persist the records as CSV, unless nothing could be fetched.
repo_df = pd.DataFrame(repo_data)
if repo_df.empty:
    print("No repository data to save")
else:
    repo_df.to_csv('repository_data.csv', index=False)
    print(f"Repository data saved to repository_data.csv")

## 获取 OpenRank 在 30（含）到 50 之间的项目列表

In [None]:

def execute_query_top_openrank(created_at='2025-01-01', min_openrank=30, max_openrank=50):
  """Query GitHub repositories whose rounded average OpenRank lies in a range.

  Averages `openrank` per (repo_id, repo_name) over rows of
  opensource.global_openrank with platform 'GitHub' and
  created_at >= `created_at`, and keeps repositories with
  min_openrank <= average < max_openrank, highest first.

  Args:
    created_at: lower bound compared against the table's created_at column.
    min_openrank: inclusive lower bound on the rounded average.
    max_openrank: exclusive upper bound on the rounded average.

  Returns:
    The clickhouse_connect query result; each row is
    (repo_id, repo_name, avg_openrank_25).
  """
  sql_query_top_openrank = """
    SELECT
        repo_id,
        repo_name,
        ROUND(AVG(openrank)) AS avg_openrank_25
    FROM
        opensource.global_openrank
    WHERE
        platform = 'GitHub' AND
        created_at >= %(created_at)s
    GROUP BY
        repo_id, repo_name
    HAVING
        avg_openrank_25 >= %(min_openrank)s and avg_openrank_25 < %(max_openrank)s
    ORDER BY
        avg_openrank_25 DESC

  """
  # Let the driver bind and escape the values instead of interpolating them
  # into the SQL text, so a quote inside an argument cannot alter the query.
  query_parameters = {
    'created_at': created_at,
    'min_openrank': min_openrank,
    'max_openrank': max_openrank,
  }
  results = client.query(sql_query_top_openrank, parameters=query_parameters)
  return results


# Run the query with its default arguments; later cells reuse results_openrank.
results_openrank = execute_query_top_openrank()
openrank_rows = results_openrank.result_rows
print(f"Found {len(openrank_rows)} repositories with high OpenRank scores")

# The second field of every result row is the repository name.
repo_names = [fields[1] for fields in openrank_rows]
first_five = repo_names[:5]
print(f"First 5 repositories: {first_five}")


## 添加项目基本信息

In [None]:
import pandas as pd
from datetime import datetime

def get_repo_info(repo_name, headers, timeout=30):
    """Fetch basic metadata for one repository from the GitHub REST API.

    Args:
        repo_name: repository in "owner/name" form.
        headers: HTTP headers sent with the request.
        timeout: request timeout in seconds.

    Returns:
        A dict with id, repo_name (GitHub's full_name), stargazers_count,
        forks_count, language, created_at, description and topics (a list),
        or None when the request fails or the status is not 200.
    """
    url = f"https://api.github.com/repos/{repo_name}"
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report the transport error and signal failure the same way as a
        # non-200 status, so one unreachable repository does not abort a batch.
        print(f"Request for {repo_name} failed: {exc}")
        return None

    if response.status_code != 200:
        return None

    data = response.json()
    return {
        "id": data["id"],
        "repo_name": data["full_name"],
        "stargazers_count": data["stargazers_count"],
        "forks_count": data["forks_count"],
        "language": data["language"],
        "created_at": data["created_at"],
        "description": data["description"],
        "topics": data.get("topics", [])
    }

# Column order of the resulting table.
columns_order = ["repo_id", "repo_name", "stars", "forks", "language", "created_at", "description", 'topics']

results = []

for repo_name in repo_names:
  print(f"Processing {repo_name}...")

  # Get repo info from the GitHub API; a falsy result means the lookup failed.
  repo_info = get_repo_info(repo_name, headers)
  if not repo_info:
    print(f"Failed to get info for {repo_name}")
    continue

  # Flatten the API fields into one output record.
  results.append({
    "repo_id": repo_info["id"],
    "repo_name": repo_info["repo_name"],
    "stars": repo_info["stargazers_count"],
    "forks": repo_info["forks_count"],
    "language": repo_info["language"],
    # Keep only the date part of the ISO timestamp.
    "created_at": repo_info["created_at"].split("T")[0],
    "description": repo_info["description"],
    # A missing or empty topic list becomes an empty string.
    "topics": ", ".join(repo_info.get("topics") or [])
  })

# Passing the columns to the constructor keeps the frame well-formed even when
# no repository could be fetched; selecting them from a column-less empty frame
# afterwards would raise KeyError.
df_results = pd.DataFrame(results, columns=columns_order)

## 添加 OpenRank 值并存储

In [None]:
# Map repo_name -> (repo_id, avg_openrank_25) from the OpenRank query rows.
repo_id_avg_openrank = {row[1]: (row[0], row[2]) for row in results_openrank.result_rows}

# Attach the average OpenRank by repository name; names without a match get NaN.
# The repo_id column already present in df_results is left untouched: resetting
# it to None and selecting it a second time lost the id for unmatched names and
# wrote a duplicated repo_id column to the CSV.
# NOTE(review): the lookup is an exact, case-sensitive name match — confirm
# both sources spell repository names identically.
openrank_by_name = {name: pair[1] for name, pair in repo_id_avg_openrank.items()}
df_results['avg_openrank_25'] = df_results['repo_name'].map(openrank_by_name)

# Extend the column order without duplicates so re-running the cell is harmless.
columns_order = list(dict.fromkeys(columns_order + ['avg_openrank_25']))
df_results_new = df_results[columns_order]

# Date-stamped output file, e.g. repo_info_20250801.csv.
csv_filename = f"repo_info_{datetime.now().strftime('%Y%m%d')}.csv"
df_results_new.to_csv(csv_filename, index=False)

print(f"Successfully processed {len(results)} repositories")
print(f"Results saved to {csv_filename}")

## 通过 OSS 获取 OpenRank 当月值 和 Trend 并存储

In [None]:
# Fetch OpenRank trend data for the hard-coded repository list below (repository_data.csv is not read here)
import pandas as pd
import requests
import json
from time import sleep

# Repositories ("owner/name") whose OpenRank history is fetched below.
repo_names = [
    "microsoft/graphrag",
    "letta-ai/letta",
    "FoundationAgents/MetaGPT",
    "Significant-Gravitas/AutoGPT",
    "eosphoros-ai/DB-GPT",
    "deepset-ai/haystack",
    "chatchat-space/Langchain-Chatchat",
    "openxla/xla",
    "searxng/searxng",
    "ItzCrazyKns/Perplexica",
    "zaidmukaddam/scira",
    "google/A2A",
    "ComposioHQ/composio",
    "songquanpeng/one-api",
    "SillyTavern/SillyTavern",
    "ChatGPTNextWeb/NextChat",
    "chatboxai/chatbox",
    "oobabooga/text-generation-webui",
    "AUTOMATIC1111/stable-diffusion-webui",
    "stackblitz/bolt.new",
    "Aider-AI/aider",
    "sourcegraph/cody",
    "TabbyML/tabby",
    "qodo-ai/pr-agent",
    "PrefectHQ/prefect",
    "Zipstack/unstract",
    "iterative/datachain",
    "dbt-labs/dbt-core",
    "Unstructured-IO/unstructured",
    "dask/dask",
    "open-compass/opencompass",
    "lm-sys/FastChat",
    "mannaandpoem/OpenManus",
    "camel-ai/owl",
    "mindverse/Second-Me",
    "NVIDIA/nccl",
    "triton-inference-server/server",
    "nomic-ai/gpt4all",
    "kserve/kserve",
    "kvcache-ai/Mooncake",
    "vllm-project/aibrix",
    "mlc-ai/mlc-llm",
    "bentoml/BentoML",
    "microsoft/onnxruntime",
    "kvcache-ai/ktransformers",
    "InternLM/lmdeploy",
    "huggingface/text-generation-inference",
    "deepflowio/deepflow",
    "flyteorg/flyte",
    "Netflix/metaflow",
    "zenml-io/zenml",
    "Farama-Foundation/Gymnasium",
    "tensorflow/tensorflow",
    "keras-team/keras",
    "hpcaitech/ColossalAI",
    "microsoft/OmniParser",
    "unitycatalog/unitycatalog",
    "elastic/elasticsearch",
    "opensearch-project/OpenSearch",
    "lancedb/lancedb",
    "pgvector/pgvector",
]


# Month reported as the headline OpenRank value, and the year whose monthly
# values make up the trend.
target_month = "2025-07"
trend_year = 2025

results = []

# Fetch the OpenRank history of every repository.
for repo_name in repo_names:
    url = f"https://oss.open-digger.cn/github/{repo_name}/openrank.json"

    # Record written when the request fails. It uses the same keys as a
    # successful record so the CSV ends up with a single set of columns.
    record = {
        'repo_name': repo_name,
        'july_2025_openrank': None,
        'trends_2025': '[]'
    }

    try:
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            data = response.json()

            # Headline value: test for the same month key that is read.
            july_openrank = None
            if target_month in data:
                july_openrank = round(data[target_month])

            # Rounded monthly values of the trend year, in calendar order;
            # months absent from the data are skipped.
            trends_2025 = [
                round(data[month_key])
                for month_key in (f"{trend_year}-{month:02d}" for month in range(1, 13))
                if month_key in data
            ]

            record['july_2025_openrank'] = july_openrank
            record['trends_2025'] = str(trends_2025)

            print(f"成功获取 {repo_name} 的OpenRank数据 - 7月值: {july_openrank}, 2025年趋势: {len(trends_2025)}个月")

        else:
            print(f"获取 {repo_name} 数据失败: {response.status_code}")

    except Exception as e:
        # Best effort: report the problem and keep the placeholder record.
        print(f"处理 {repo_name} 时发生错误: {str(e)}")

    results.append(record)

# Build the DataFrame and save it.
df_openrank = pd.DataFrame(results)
df_openrank.to_csv('repository_openrank_data.csv', index=False)
print(f"已完成所有{len(repo_names)}个仓库的OpenRank数据获取和保存")
print(f"数据已保存到 repository_openrank_data.csv")


## 标注出已有的项目和相应的分类

In [None]:
# Load both files (pandas' default encoding is used).
landscape_full = pd.read_csv('landscape-full-08.csv')
landscape527 = pd.read_csv('landscape527_full.csv')

print(f"Loaded landscape-full-08.csv with {landscape_full.shape[0]} entries")
print(f"Loaded landscape527_full.csv with {landscape527.shape[0]} entries")

# Check the column names
print("\nColumns in landscape-full-08.csv:")
print(landscape_full.columns.tolist())
print("\nColumns in landscape527_full.csv:")
print(landscape527.columns.tolist())

# Create a mapping from repo_name to classification in landscape527.
# The column check is done once rather than per row; rows missing either value
# are ignored, and for a repeated repo_name the last row wins.
classification_map = {}
if {'repo_name', 'classification'}.issubset(landscape527.columns):
  labelled = landscape527.dropna(subset=['repo_name', 'classification'])
  classification_map = dict(zip(labelled['repo_name'], labelled['classification']))
else:
  print("landscape527_full.csv has no repo_name/classification columns; nothing to map")

# Update the llm field in landscape_full with classifications from landscape527.
# Rows without a match keep their current llm value.
mapped = landscape_full['repo_name'].map(classification_map)
matched = mapped.notna()
matches = int(matched.sum())
if 'llm' in landscape_full.columns:
  landscape_full['llm'] = mapped.where(matched, landscape_full['llm'])
elif matches:
  landscape_full['llm'] = mapped
print(f"Updated llm for {matches} of {len(landscape_full)} entries")

# Save the updated dataframe
output_file = 'landscape_full_updated.csv'
landscape_full.to_csv(output_file, index=False)
print(f"Updated dataframe saved to {output_file}")

## 获取 Star 数 Top 1K 的 Rust 项目信息

In [None]:
import requests
import pandas as pd
from time import sleep

# GitHub API endpoint
api_url = "https://api.github.com/search/repositories"

# 设置请求头,需要替换为你的GitHub token
headers = {
    "Accept": "application/vnd.github.v3+json",
    # "Authorization": "token YOUR_GITHUB_TOKEN"  # 如果需要更高的API限制,请取消注释并填入token
}

# 搜索参数 - 专门搜索Rust语言的项目
params = {
    "q": "language:rust",  # 指定搜索Rust语言的仓库
    "sort": "stars",       # 按star数排序
    "order": "desc",       # 降序排列
    "per_page": 100       # 每页100条结果
}

all_repos = []
pages = 10  # 获取10页,总共1000个仓库

for page in range(1, pages + 1):
    try:
        params["page"] = page
        response = requests.get(api_url, headers=headers, params=params)
        
        if response.status_code == 200:
            data = response.json()
            repos = data["items"]
            
            for repo in repos:
                repo_data = {
                    "repo_name": f"{repo['owner']['login']}/{repo['name']}",
                    "stars": repo["stargazers_count"],
                    "forks": repo["forks_count"],
                    "created_at": repo["created_at"],
                    "description": repo["description"],
                    "topics": ",".join(repo.get("topics", []))
                }
                all_repos.append(repo_data)
            
            print(f"成功获取第 {page} 页数据，当前已获取 {len(all_repos)} 个仓库")
            
            # 检查是否还有更多数据
            if len(repos) < 100:
                print("已获取所有可用数据")
                break
                
            sleep(2)  # 避免触发API限制
            
        else:
            print(f"获取第 {page} 页数据失败: {response.status_code}")
            if response.status_code == 403:
                print("可能达到API访问限制，请稍后再试或使用GitHub Token")
            break
            
    except Exception as e:
        print(f"处理第 {page} 页时发生错误: {str(e)}")
        break

# 转换为DataFrame并保存
top_repos_df = pd.DataFrame(all_repos)
output_file = "top_rust_repos.csv"
top_repos_df.to_csv(output_file, index=False)
print(f"已保存 {len(all_repos)} 个Rust仓库信息到 {output_file}")

# 显示前10个仓库的基本信息
print("\n前10个最受欢迎的Rust仓库:")
print(top_repos_df[["repo_name", "stars", "forks"]].head(10))

## 获取新增/拿掉的项目

In [None]:
# Load both landscape versions.
df_v1 = pd.read_csv('landscapev1.csv')
df_v2 = pd.read_csv('landscapev2.csv')

# repo_ids that occur in v1 but no longer in v2.
removed_repos = set(df_v1['repo_id']).difference(set(df_v2['repo_id']))

# Add (or reset) the 'removed' column in v1: 'x' for removed projects,
# an empty string for every other row.
is_removed = df_v1['repo_id'].isin(removed_repos)
df_v1['removed'] = is_removed.map({True: 'x', False: ''})

# Write the annotated table back over the v1 file.
df_v1.to_csv('landscapev1.csv', index=False)

print(f"已标记 {len(removed_repos)} 个在v2中被移除的项目")


## 根据 Landscape2.0 中所有项目的 description，获得词云图

In [None]:

# 将所有描述文本合并成一个字符串
text = """Milvus is a high-performance, cloud-native vector database built for scalable vector ANN search	anns,cloud-native,diskann,distributed,embedding-database,embedding-similarity,embedding-store,faiss,golang,hnsw,image-search,llm,nearest-neighbor-search,rag,vector-database,vector-search,vector-similarity,vector-store
Open-source search and retrieval database for AI applications.	document-retrieval,embeddings,llms
Weaviate is an open-source vector database that stores both objects and vectors, allowing for the combination of vector search with structured filtering with the fault tolerance and scalability of a cloud-native database​.	approximate-nearest-neighbor-search,generative-search,grpc,hnsw,hybrid-search,image-search,information-retrieval,mlops,nearest-neighbor-search,neural-search,recommender-system,search-engine,semantic-search,semantic-search-engine,similarity-search,vector-database,vector-search,vector-search-engine,vectors,weaviate
Qdrant - High-performance, massive-scale Vector Database and Vector Search Engine for the next generation of AI. Also available in the cloud https://cloud.qdrant.io/	ai-search,ai-search-engine,embeddings-similarity,hnsw,image-search,knn-algorithm,machine-learning,mlops,nearest-neighbor-search,neural-network,neural-search,recommender-system,search,search-engine,search-engines,similarity-search,vector-database,vector-search,vector-search-engine
Tensors and Dynamic neural networks in Python with strong GPU acceleration	autograd,deep-learning,gpu,machine-learning,neural-network,numpy,python,tensor
PArallel Distributed Deep LEarning: Machine Learning Framework from Industrial Practice （『飞桨』核心框架，深度学习&机器学习高性能单机、分布式训练和跨平台部署）	deep-learning,distributed-training,efficiency,machine-learning,neural-network,paddlepaddle,python,scalability
Composable transformations of Python+NumPy programs: differentiate, vectorize, JIT to GPU/TPU, and more	jax
verl: Volcano Engine Reinforcement Learning for LLMs	
Distributed RL System for LLM Reasoning	llm,llm-reasoning,machine-learning-systems,mlsys,reinforcement-learning,rl
slime is a LLM post-training framework aiming for RL Scaling.	
Full-stack framework for building Multi-Agent Systems with memory, knowledge and reasoning.	agents,agi,ai,developer-tools,framework,python
🐫 CAMEL: The first and the best multi-agent framework. Finding the Scaling Law of Agents. https://www.camel-ai.org	agent,ai-societies,artificial-intelligence,communicative-ai,cooperative-ai,deep-learning,large-language-models,multi-agent-systems,natural-language-processing
A lightweight, powerful framework for multi-agent workflows	agents,ai,framework,llm,openai,python
Autonomous agents for everyone	agent,agentic,ai,autonomous,chatbot,crypto,discord,eliza,elizaos,framework,plugins,rag,slack,swarm,telegram
Framework for orchestrating role-playing, autonomous AI agents. By fostering collaborative intelligence, CrewAI empowers agents to work together seamlessly, tackling complex tasks.	agents,ai,ai-agents,aiagentframework,llms
The open source developer platform to build AI/LLM applications and models with confidence. Enhance your AI applications with end-to-end tracking, observability, and evaluations, all in one integrated platform.	agentops,agents,ai,ai-governance,apache-spark,evaluation,langchain,llm-evaluation,llmops,machine-learning,ml,mlflow,mlops,model-management,observability,open-source,openai,prompt-engineering
🔥 1Panel provides an intuitive web interface and MCP Server to manage websites, files, containers, databases, and LLMs on a Linux server.	1panel,cockpit,docker,docker-ui,lamp,linux,lnmp,ollama,webmin
🪢 Open source LLM engineering platform: LLM Observability, metrics, evals, prompt management, playground, datasets. Integrates with OpenTelemetry, Langchain, OpenAI SDK, LiteLLM, and more. 🍊YC W23	analytics,autogen,evaluation,langchain,large-language-models,llama-index,llm,llm-evaluation,llm-observability,llmops,monitoring,observability,open-source,openai,playground,prompt-engineering,prompt-management,self-hosted,ycombinator
The AI developer platform. Use Weights & Biases to train and fine-tune models, and manage models from experimentation to production.	ai,collaboration,data-science,data-versioning,deep-learning,experiment-track,hyperparameter-optimization,hyperparameter-search,hyperparameter-tuning,jax,keras,machine-learning,ml-platform,mlops,model-versioning,pytorch,reinforcement-learning,reproducibility,tensorflow
Debug, evaluate, and monitor your LLM applications, RAG systems, and agentic workflows with comprehensive tracing, automated evaluations, and production-ready dashboards.	langchain,llama-index,llm,llm-evaluation,llm-observability,llmops,open-source,openai,playground,prompt-engineering
AI Observability & Evaluation	agents,ai-monitoring,ai-observability,aiengineering,anthropic,datasets,evals,langchain,llamaindex,llm-eval,llm-evaluation,llmops,llms,openai,prompt-engineering,smolagents
MLRun is an open source MLOps platform for quickly building and managing continuous ML applications across their lifecycle. MLRun integrates into your development and CI/CD environment and automates the delivery of production data, ML pipelines, and online applications.	data-engineering,data-science,experiment-tracking,kubernetes,machine-learning,mlops,mlops-workflow,model-serving,python,workflow
Test your prompts, agents, and RAGs. AI Red teaming, pentesting, and vulnerability scanning for LLMs. Compare performance of GPT, Claude, Gemini, Llama, and more. Simple declarative configs with command line and CI/CD integration.	ci,ci-cd,cicd,evaluation,evaluation-framework,llm,llm-eval,llm-evaluation,llm-evaluation-framework,llmops,pentesting,prompt-engineering,prompt-testing,prompts,rag,red-teaming,testing,vulnerability-scanners
An open-source runtime for composable workflows. Great for AI agents and CI/CD.	agents,ai,caching,ci-cd,containers,continuous-deployment,continuous-integration,dag,dagger,devops,docker,graphql,workflows
A high-throughput and memory-efficient inference and serving engine for LLMs	amd,cuda,deepseek,gpt,hpu,inference,inferentia,llama,llm,llm-serving,llmops,mlops,model-serving,pytorch,qwen,rocm,tpu,trainium,transformer,xpu
SGLang is a fast serving framework for large language models and vision language models.	blackwell,cuda,deepseek,deepseek-r1,deepseek-v3,inference,kimi,llama,llama3,llama4,llama5,llava,llm,llm-serving,moe,openai,pytorch,qwen3,transformer,vlm
TensorRT-LLM provides users with an easy-to-use Python API to define Large Language Models (LLMs) and support state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs. TensorRT-LLM also contains components to create Python and C++ runtimes that orchestrate the inference execution in performant way.	blackwell,cuda,llm-serving,moe,pytorch
LLM inference in C/C++	ggml
OpenVINO™ is an open source toolkit for optimizing and deploying AI inference	ai,computer-vision,deep-learning,deploy-ai,diffusion-models,generative-ai,good-first-issue,inference,llm-inference,natural-language-processing,nlp,openvino,optimize-ai,performance-boost,recommendation-system,speech-recognition,stable-diffusion,transformers,yolo
Get up and running with Llama 3.3, DeepSeek-R1, Phi-4, Gemma 3, Mistral Small 3.1 and other large language models.	deepseek,gemma,gemma3,gemma3n,go,golang,llama,llama2,llama3,llava,llm,llms,mistral,ollama,phi4,qwen
A Datacenter Scale Distributed Inference Serving Framework	
Replace OpenAI GPT with another LLM in your app by changing a single line of code. Xinference gives you the freedom to use any LLM you need. With Xinference, you're empowered to run inference with any open-source language models, speech recognition models, and multimodal models, whether in the cloud, on-premises, or even on your laptop.	artificial-intelligence,chatglm,deployment,flan-t5,gemma,ggml,glm4,inference,llama,llama3,llamacpp,llm,machine-learning,mistral,openai-api,pytorch,qwen,vllm,whisper,wizardlm
RamaLama is an open-source developer tool that simplifies the local serving of AI models from any source and facilitates their use for inference in production, all through the familiar language of containers.	ai,containers,cuda,hip,inference-server,intel,llamacpp,llm,podman,vllm
Simple, scalable AI model deployment on GPU clusters	ascend,cuda,deepseek,distributed-inference,genai,heterogeneous-cluster,inference,llama,llamacpp,llm,llm-inference,llm-serving,local-ai,maas,metal,mindie,openai,qwen,rocm,vllm
Use PEFT or Full-parameter to CPT/SFT/DPO/GRPO 500+ LLMs (Qwen3, Qwen3-MoE, Llama4, GLM4.5, InternLM3, DeepSeek-R1, ...) and 200+ MLLMs (Qwen2.5-VL, Qwen2.5-Omni, Qwen2-Audio, Ovis2, InternVL3, Llava, GLM4v, Phi4, ...) (AAAI 2025).	deepseek-r1,deploy,embedding,grpo,internvl,liger,llama,llama4,llm,lora,megatron,multimodal,omni,open-r1,peft,qwen2-vl,qwen3,qwen3-moe,rft,sft
Fine-tuning & Reinforcement Learning for LLMs. 🦥 Train Qwen3, Llama 4, DeepSeek-R1, Gemma 3, TTS 2x faster with 70% less VRAM.	agent,ai,deepseek,deepseek-r1,fine-tuning,gemma,gemma3,llama,llama3,llm,llms,lora,mistral,openai,pytorch,qwen,qwen3,text-to-speech,tts,unsloth
Unified Efficient Fine-Tuning of 100+ LLMs & VLMs (ACL 2024)	agent,ai,deepseek,fine-tuning,gemma,gpt,instruction-tuning,large-language-models,llama,llama3,llm,lora,moe,nlp,peft,qlora,quantization,qwen,rlhf,transformers
An MCP-based chatbot | 一个基于MCP的聊天机器人	chatbot,esp32,mcp
A generative world for general-purpose robotics & embodied AI learning.	
A scalable generative AI framework built for researchers and developers working on Large Language Models, Multimodal, and Speech AI (Automatic Speech Recognition and Text-to-Speech)	asr,deeplearning,generative-ai,large-language-models,machine-translation,multimodal,neural-networks,speaker-diariazation,speaker-recognition,speech-synthesis,speech-translation,tts
DeepSpeed is a deep learning optimization library that makes distributed training and inference easy, efficient, and effective.	billion-parameters,compression,data-parallelism,deep-learning,gpu,inference,machine-learning,mixture-of-experts,model-parallelism,pipeline-parallelism,pytorch,trillion-parameters,zero
Ongoing research training transformer models at scale	large-language-models,model-para,transformers
Ray is an AI compute engine. Ray consists of a core distributed runtime and a set of AI Libraries for accelerating ML workloads.	data-science,deep-learning,deployment,distributed,hyperparameter-optimization,hyperparameter-search,large-language-models,llm,llm-inference,llm-serving,machine-learning,optimization,parallel,python,pytorch,ray,reinforcement-learning,rllib,serving,tensorflow
Apache Spark - A unified analytics engine for large-scale data processing	big-data,java,jdbc,python,r,scala,spark,sql
A Cloud Native Batch System (Project under CNCF)	ai,batch-systems,bigdata,gene,golang,hpc,kubernetes,machine-learning,serving,training
Label Studio is a multi-type data labeling and annotation tool with standardized output format	annotation,annotation-tool,annotations,boundingbox,computer-vision,data-labeling,dataset,datasets,deep-learning,image-annotation,image-classification,image-labeling,image-labelling-tool,label-studio,labeling,labeling-tool,mlops,semantic-segmentation,text-annotation,yolo
Annotate better with CVAT, the industry-leading data engine for machine learning. Used and trusted by teams at any scale, for data of any scale.	annotation,annotation-tool,annotations,boundingbox,computer-vision,computer-vision-annotation,dataset,deep-learning,image-annotation,image-classification,image-labeling,image-labelling-tool,imagenet,labeling,labeling-tool,object-detection,pytorch,semantic-segmentation,tensorflow,video-annotation
AI + Data, online. https://vespa.ai	ai,big-data,cpp,java,machine-learning,search-engine,server,serving,serving-recommendation,tensorflow,vector-search,vespa
Apache Airflow - A platform to programmatically author, schedule, and monitor workflows	airflow,apache,apache-airflow,automation,dag,data-engineering,data-integration,data-orchestrator,data-pipelines,data-science,elt,etl,machine-learning,mlops,orchestration,python,scheduler,workflow,workflow-engine,workflow-orchestration
The leading data integration platform for ETL / ELT data pipelines from APIs, databases & files to data warehouses, data lakes & data lakehouses. Both self-hosted and Cloud-hosted.	bigquery,change-data-capture,data,data-analysis,data-collection,data-engineering,data-integration,data-pipeline,elt,etl,java,mssql,mysql,pipeline,postgresql,python,redshift,s3,self-hosted,snowflake
An orchestration platform for the development, production, and observation of data assets.	analytics,dagster,data-engineering,data-integration,data-orchestrator,data-pipelines,data-science,etl,metadata,mlops,orchestration,python,scheduler,workflow,workflow-automation
Apache Iceberg	apache,hacktoberfest,iceberg
OpenMetadata is a unified metadata platform for data discovery, data observability, and data governance powered by a central metadata repository, in-depth column level lineage, and seamless team collaboration.	data-catalog,data-collaboration,data-contracts,data-discovery,data-governance,data-lineage,data-observability,data-profiling,data-quality,data-quality-checks,data-science,data-validation,datadiscovery,dataengineering,dataquality,dbt,metadata,metadata-management,snowflake
The Metadata Platform for your Data and AI Stack	data-catalog,data-discovery,data-governance,datahub,metadata
World's most powerful open data catalog for building a high-performance, geo-distributed and federated metadata lake.	ai-catalog,data-catalog,datalake,federated-query,lakehouse,metadata,metalake,model-catalog,opendatacatalog,skycomputing,stratosphere
An open-source storage framework that enables building a Lakehouse architecture with compute engines including Spark, PrestoDB, Flink, Trino, and Hive and APIs	acid,analytics,big-data,delta-lake,spark
Apache Paimon is a lake format that enables building a Realtime Lakehouse Architecture with Flink and Spark for both streaming and batch operations.	big-data,data-ingestion,flink,paimon,real-time-analytics,spark,streaming-datalake,table-store
Upserts, Deletes And Incremental Processing on Big Data.	apacheflink,apachehudi,apachespark,bigdata,data-integration,datalake,hudi,incremental-processing,stream-processing
🍒 Cherry Studio is a desktop client that supports for multiple LLM providers.	agent,anthropic,assistant,chatbot,chatbotai,electron,llm,mcp-client,openai
User-friendly AI Interface (Supports Ollama, OpenAI API, ...)	ai,llm,llm-ui,llm-webui,llms,mcp,ollama,ollama-webui,open-webui,openai,openapi,rag,self-hosted,ui,webui
🤯 Lobe Chat - an open-source, modern design AI chat framework. Supports multiple AI providers (OpenAI / Claude 4 / Gemini / DeepSeek / Ollama / Qwen), Knowledge Base (file upload / RAG ), one click install MCP Marketplace and Artifacts / Thinking. One-click FREE deployment of your private AI Agent application.	agent,ai,artifacts,chat,chatgpt,claude,deepseek,deepseek-r1,function-calling,gemini,gpt,knowledge-base,mcp,nextjs,ollama,openai,rag
Enhanced ChatGPT Clone: Features Agents, DeepSeek, Anthropic, AWS, OpenAI, Responses API, Azure, Groq, o1, GPT-4o, Mistral, OpenRouter, Vertex AI, Gemini, Artifacts, AI model switching, message search, Code Interpreter, langchain, DALL-E-3, OpenAPI Actions, Functions, Secure Multi-User Auth, Presets, open-source for self-hosting. Active project.	ai,anthropic,artifacts,aws,azure,chatgpt,chatgpt-clone,claude,clone,dall-e-3,deepseek,gemini,google,librechat,o1,openai,plugins,responses-api,vision,webui
✨ 易上手的多平台 LLM 聊天机器人及开发框架 ✨ 支持 QQ、QQ频道、Telegram、企微、飞书、钉钉 | 知识库、MCP 服务器、OpenAI、DeepSeek、Gemini、硅基流动、月之暗面、Ollama、OneAPI、Dify	agent,ai,chatbot,chatgpt,docker,gemini,gpt,llama,llm,mcp,openai,python,qq,qqbot,qqchannel,telegram
A privacy-first, self-hosted, fully open source personal knowledge management software, written in typescript and golang.	anki,chatgpt,deepseek,electron,evernote,knowledge-base,local-first,markdown,note-taking,notes-app,notion,obsidian,ocr,ollama,openai,pdf,s3,self-hosted,webdav
Get your documents ready for gen AI	ai,convert,document-parser,document-parsing,documents,docx,html,markdown,pdf,pdf-converter,pdf-to-json,pdf-to-text,pptx,tables,xlsx
The all-in-one Desktop & Docker AI application with built-in RAG, AI agents, No-code agent builder, MCP compatibility,  and more.	ai-agents,custom-ai-agents,deepseek,kimi,llama3,llm,lmstudio,local-llm,localai,mcp,mcp-servers,moonshot,multimodal,no-code,ollama,qwen3,rag,vector-database,web-scraping
Streamlit — A faster way to build and share data apps.	data-analysis,data-science,data-visualization,deep-learning,developer-tools,machine-learning,python,streamlit
Build and share delightful machine learning apps, all in Python. 🌟 Star to support our work!	data-analysis,data-science,data-visualization,deep-learning,deploy,gradio,gradio-interface,hacktoberfest,interface,machine-learning,models,python,python-notebook,ui,ui-components
cuDF - GPU DataFrame Library	arrow,cpp,cuda,cudf,dask,data-analysis,data-science,dataframe,gpu,pandas,pydata,python,rapids
A library for accelerating Transformer models on NVIDIA GPUs, including using 8-bit floating point (FP8) precision on Hopper, Ada and Blackwell GPUs, to provide better performance with lower memory utilization in both training and inference.	cuda,deep-learning,fp8,gpu,jax,machine-learning,python,pytorch
FlashInfer: Kernel Library for LLM Serving	attention,cuda,distributed-inference,gpu,jit,large-large-models,llm-inference,moe,nvidia,pytorch
CUDA Templates for Linear Algebra Subroutines	cpp,cuda,deep-learning,deep-learning-library,gpu,nvidia
MLX: An array framework for Apple silicon	mlx
DeepEP: an efficient expert-parallel communication library	
Fast and memory-efficient exact attention	
Development repository for the Triton language and compiler	
The Modular Platform (includes MAX & Mojo)	ai,language,machine-learning,max,modular,mojo,programming-language
An open-source AI agent that brings the power of Gemini directly into your terminal.	gemini,gemini-api
AI coding agent, built for the terminal.	
Autonomous coding agent right in your IDE, capable of creating/editing files, executing commands, using the browser, and more with your permission every step of the way.	
an open source, extensible AI agent that goes beyond code suggestions - install, execute, edit, and test with any LLM	
⏩ Create, share, and use custom AI code assistants with our open-source IDE extensions and hub of rules, tools, and models	ai,chatgpt,copilot,developer-tools,intellij,jetbrains,llm,open-source,openai,pycharm,software-development,visual-studio-code,vscode
🙌 OpenHands: Code Less, Make More	agent,artificial-intelligence,chatgpt,claude-ai,cli,developer-tools,gpt,llm,openai
A reactive notebook for Python — run reproducible experiments, query with SQL, execute as a script, deploy as an app, and version with git. All in a modern, AI-native editor.	artificial-intelligence,dag,data-science,data-visualization,dataflow,developer-tools,machine-learning,notebooks,pipeline,python,reactive,sql,web-app
Lightweight coding agent that runs in your terminal	
Use your Neovim like using Cursor AI IDE!	
Production-ready platform for agentic workflow development.	agent,agentic-ai,agentic-framework,agentic-workflow,ai,automation,gemini,genai,gpt,gpt-4,llm,low-code,mcp,nextjs,no-code,openai,orchestration,python,rag,workflow
Fair-code workflow automation platform with native AI capabilities. Combine visual building with custom code, self-host or cloud, 400+ integrations.	ai,apis,automation,cli,data-flow,development,integration-framework,integrations,ipaas,low-code,low-code-platform,mcp,mcp-client,mcp-server,n8n,no-code,self-hosted,typescript,workflow,workflow-automation
The Postgres development platform. Supabase gives you a dedicated Postgres database to build your web, mobile, and AI applications.	ai,alternative,auth,database,deno,embeddings,example,firebase,nextjs,oauth2,pgvector,postgis,postgres,postgresql,postgrest,realtime,supabase,vectors,websockets
RAGFlow is an open-source RAG (Retrieval-Augmented Generation) engine based on deep document understanding.	agent,agentic,agentic-ai,agentic-workflow,ai,ai-search,deep-learning,deep-research,deepseek,deepseek-r1,document-parser,document-understanding,graphrag,llm,mcp,multi-agent,ollama,openai,rag,retrieval-augmented-generation
Langflow is a powerful tool for building and deploying AI-powered agents and workflows.	agents,chatgpt,generative-ai,large-language-models,multiagent,react-flow
The TypeScript AI agent framework. ⚡ Assistants, RAG, observability. Supports any LLM: GPT-4, Claude, Gemini, Llama.	agents,ai,chatbots,evals,javascript,llm,mcp,nextjs,nodejs,reactjs,tts,typescript,workflows
AI Agents & MCPs & AI Workflow Automation • (280+ MCP servers for AI agents) • AI Automation / AI Agent with MCPs • AI Workflows & AI Agents • MCPs for AI Agents	ai-agent,ai-agent-tools,ai-agents,ai-agents-framework,mcp,mcp-server,mcp-tools,mcps,n8n-alternative,no-code-automation,workflow,workflow-automation,workflows
🔥 MaxKB is an open-source platform for building enterprise-grade agents.  MaxKB 是强大易用的开源企业级智能体平台。	agent,agentic-ai,chatbot,deepseek-r1,knowledgebase,langchain,llama3,llm,maxkb,mcp-server,ollama,pgvector,qwen3,rag
FastGPT is a knowledge-based platform built on the LLMs, offers a comprehensive suite of out-of-the-box capabilities such as data processing, RAG retrieval, and visual AI workflow orchestration, letting you easily develop and deploy complex question-answering systems without the need for extensive setup or configuration.	agent,claude,deepseek,llm,mcp,nextjs,openai,qwen,rag,workflow
Build AI Agents, Visually	agentic-ai,agentic-workflow,agents,artificial-intelligence,chatbot,chatgpt,javascript,langchain,large-language-models,low-code,multiagent-systems,no-code,openai,rag,react,typescript,workflow-automation
Python SDK, Proxy Server (LLM Gateway) to call 100+ LLM APIs in OpenAI format - [Bedrock, Azure, OpenAI, VertexAI, Cohere, Anthropic, Sagemaker, HuggingFace, Replicate, Groq]	ai-gateway,anthropic,azure-openai,bedrock,gateway,langchain,litellm,llm,llm-gateway,llmops,mcp-gateway,openai,openai-proxy,vertex-ai
The AI Toolkit for TypeScript. From the creators of Next.js, the AI SDK is a free open-source library for building AI-powered applications and agents	anthropic,artificial-intelligence,gemini,generative-ai,generative-ui,javascript,language-model,llm,nextjs,openai,react,svelte,typescript,vercel,vue
The most powerful and modular diffusion model GUI, api and backend with a graph/nodes interface.	ai,python,pytorch,stable-diffusion
An open-source, code-first Python toolkit for building, evaluating, and deploying sophisticated AI agents with flexibility and control.	agent,agentic,agentic-ai,agents,agents-sdk,ai,ai-agents,aiagentframework,genai,genai-chatbot,llm,llms,multi-agent,multi-agent-systems,multi-agents,multi-agents-collaboration
🌐 Make websites accessible for AI agents. Automate tasks online with ease.	ai-agents,ai-tools,browser-automation,browser-use,llm,playwright,python
Model Context Protocol Servers	
Universal memory layer for AI Agents; Announcing OpenMemory MCP - local and secure memory management.	agent,ai,aiagent,application,chatbots,chatgpt,embeddings,llm,long-term-memory,memory,memory-management,python,rag,state-management,vector-database
Build resilient language agents as graphs.	
Agent Framework / shim to use Pydantic with LLMs	agent-framework,llms,pydantic,python
🦜🔗 Build context-aware reasoning applications	ai,anthropic,gemini,langchain,llm,openai,python
A powerful framework for building realtime voice AI agents 🤖🎙️📹	agents,ai,openai,real-time,video,voice
An Application Framework for AI Engineering	artificial-intelligence,java,spring-ai
LlamaIndex is the leading framework for building LLM-powered agents over your data.	agents,application,data,fine-tuning,framework,llamaindex,llm,multi-agents,rag,vector-database
Integrate cutting-edge LLM technology quickly and easily into your apps	ai,artificial-intelligence,llm,openai,sdk
Open Source framework for voice and multimodal conversational AI	ai,chatbot-framework,chatbots,real-time,voice,voice-assistant
A programming framework for agentic AI 🤖 PyPi: autogen-agentchat Discord: https://aka.ms/autogen-discord Office Hour: https://aka.ms/autogen-officehour	agentic,agentic-agi,agents,ai,autogen,autogen-ecosystem,chatgpt,framework,llm-agent,llm-framework"""

# Stop words that are excluded from the frequency count (articles,
# prepositions, auxiliary verbs, pronouns, determiners and possessives).
common_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 
                'of', 'with', 'by', 'from', 'up', 'about', 'into', 'over', 'after',
                'is', 'are', 'was', 'were', 'be', 'been', 'being',
                'i', 'you', 'he', 'she', 'it', 'we', 'they',
                'this', 'that', 'these', 'those',
                'your', 'my', 'his', 'her', 'its', 'our', 'their'}

# Step 1: tokenize. Lower-case the text, split on whitespace, then break every
# token on hyphens and on commas so that topic lists such as
# "ai-agents,llm" yield the individual words. str.split returns the token
# unchanged when the separator is absent, so no membership test is needed.
# Empty strings produced by adjacent/trailing separators are kept in `words`
# and filtered out while counting.
words = [
    part
    for token in text.lower().split()
    for piece in token.split('-')
    for part in piece.split(',')
]

# Step 2: count every cleaned token that is not a stop word.
raw_freq = {}
for word in words:
    # Trim surrounding punctuation; characters inside the token are kept.
    word = word.strip('.,!?()[]{}":;-/&')
    if word and word not in common_words:
        raw_freq[word] = raw_freq.get(word, 0) + 1

# Step 3: fold simple plurals ("agents") into their singular ("agent").
# This is done in a separate pass over the complete counts so the result does
# not depend on whether the singular or the plural happened to appear first in
# the text. It is only a heuristic: a trailing "s" is dropped solely when the
# shorter form also occurs as a token on its own.
word_freq = {}
for word, count in raw_freq.items():
    if word.endswith('s') and word[:-1] in raw_freq:
        word = word[:-1]
    word_freq[word] = word_freq.get(word, 0) + count

print(words)

In [None]:

# Rank the words from most to least frequent. The sort is stable, so words
# with the same count keep the order in which they appear in word_freq.
ranked_pairs = sorted(word_freq.items(), key=lambda pair: -pair[1])
sorted_word_freq = dict(ranked_pairs)

# Show only the 100 most frequent words.
print("去除常用词后的词频统计结果（前100个）：")
for word, freq in ranked_pairs[:100]:
    print(f"{word}: {freq}")
