In [3]:
import os
import requests
from dotenv import load_dotenv
import clickhouse_connect
import itertools
import pandas as pd
import json

# load_dotenv() runs first so that any variables it loads are visible to the
# environment lookups below.
load_dotenv()

# Connection settings and API tokens are read from environment variables;
# each value is None when the corresponding variable is not set.
clickhouse_host = os.environ.get("CLICKHOUSE_HOST")
username = os.environ.get("CLICKHOUSE_USER")
password = os.environ.get("CLICKHOUSE_PASSWORD")
github_token = os.environ.get("GITHUB_TOKEN")
hf_token = os.environ.get("HF_TOKEN")

# ClickHouse client for the configured host on port 8123.
client = clickhouse_connect.get_client(
  host=clickhouse_host,
  port=8123,
  username=username,
  password=password,
)

# Request headers for the GitHub REST API. When no token is configured the
# header is left out entirely, so requests are sent anonymously instead of
# carrying the literal string "token None".
headers_github = {"Authorization": f"token {github_token}"} if github_token else {}

# Request headers for the Hugging Face Hub API. The Hub documents the
# "Bearer <token>" scheme for user access tokens (not "token <token>").
# As above, the header is omitted when no token is configured.
headers_hf = {"Authorization": f"Bearer {hf_token}"} if hf_token else {}

### 获取 landscape 所需的 GitHub 仓库信息


In [None]:
import requests  # Import requests for making API calls

# Repositories to fetch. Each entry is appended to the GitHub API base URL by
# fetch_repo_info, so entries are expected to be "owner/name" strings.
# NOTE(review): the list is empty as written; it must be filled in (or defined
# elsewhere before the fetch cell runs) for any data to be collected.
repo_names = [
]

# Module-level list for the collected repository records (starts empty).
repo_data = []  # Initialize empty list for repo data
# Fetch stars, language and description (plus 2025 OpenRank) for each repo name.
def fetch_repo_info(repo_names, headers):
  """
  Collect basic GitHub metadata and the 2025 OpenRank value for repositories.

  For every name, the GitHub REST API is queried for the repository record and
  OpenDigger is queried for the repository's OpenRank JSON. Repositories whose
  GitHub request does not return HTTP 200, or whose requests raise a
  `requests.RequestException`, are reported with a message and skipped so one
  failure does not discard the records already collected.

  Args:
    repo_names (list): Repository names in "owner/name" form.
    headers (dict): HTTP headers sent with the GitHub API request.

  Returns:
    list: A new list with one dict per successfully fetched repository, with
      keys 'repo_id', 'repo_name', 'stars', 'openrank_2025', 'language',
      'created_at' and 'description'. 'openrank_2025' is the rounded value
      under the "2025" key of the OpenRank JSON, or None when it is unavailable.
  """
  github_repo_url = "https://api.github.com/repos/"
  openrank_repo_url = "https://oss.open-digger.cn/github/{repo_name}/openrank.json"

  # Accumulate into a local list so repeated calls (e.g. re-running the cell)
  # do not keep appending duplicates to shared module-level state.
  collected = []

  for repo_name in repo_names:
    try:
      response = requests.get(github_repo_url + repo_name, headers=headers)
      if response.status_code != 200:
        print(f"Failed to fetch data for {repo_name}")
        continue
      data = response.json()

      # OpenRank is optional: a non-200 answer simply leaves the value unset.
      openrank_response = requests.get(openrank_repo_url.format(repo_name=repo_name))
      if openrank_response.status_code == 200:
        openrank_2025 = openrank_response.json().get("2025")
      else:
        openrank_2025 = None
    except requests.RequestException as e:
      print(f"Error fetching data for {repo_name}: {e}")
      continue

    collected.append({
      'repo_id': data['id'],
      'repo_name': repo_name,
      'stars': data['stargazers_count'],
      # Compare against None explicitly so a real OpenRank of 0 is kept as 0.
      'openrank_2025': round(openrank_2025) if openrank_2025 is not None else None,
      'language': data['language'],
      # Keep only the date part of the ISO timestamp.
      'created_at': data['created_at'].split("T")[0],
      'description': data['description']
    })

  return collected




In [4]:
# Fetch the metadata for every configured repository.
repo_data = fetch_repo_info(repo_names, headers_github)

# Build the table once and write it out only when at least one row came back.
repo_df = pd.DataFrame(repo_data)
if repo_df.empty:
    print("No repository data to save")
else:
    repo_df.to_csv('repository_data.csv', index=False)
    print(f"Repository data saved to repository_data.csv")

Repository data saved to repository_data.csv


### 获取 landscape 所需的 huggingface 模型信息

In [None]:
def fetch_huggingface_models_info(model_names, headers=None):
  """
  Fetch information about Hugging Face models via their API.

  Each model is requested from https://huggingface.co/api/models/<name>.
  Failures (HTTP errors raised by `raise_for_status`, network errors, bad
  payloads) are printed and the model is skipped; they never abort the loop.

  Args:
    model_names (list): List of model repository names (e.g., 'deepseek-ai/DeepSeek-R1')
    headers (dict, optional): HTTP headers for the requests. Defaults to the
      module-level `headers_hf` when omitted.

  Returns:
    list: List of dictionaries containing model information, with keys
      'model_name', 'pipeline_tag', 'tags', 'downloads', 'likes',
      'lastModified', 'createdAt' and 'inference'.
  """
  if headers is None:
    headers = headers_hf

  models_data = []
  base_url = "https://huggingface.co/api/models/"

  for model_name in model_names:
    try:
      response = requests.get(base_url + model_name, headers=headers)
      response.raise_for_status()
      if response.status_code == 200:
        data = response.json()
        # Extract the required fields
        model_info = {
          # Use the id reported by the API, falling back to the requested name.
          # (The bare name `id` would store Python's builtin function here.)
          'model_name': data.get('id', model_name),
          'pipeline_tag': data.get('pipeline_tag'),
          'tags': data.get('tags', []),
          'downloads': data.get('downloads'),
          'likes': data.get('likes'),
          'lastModified': data.get('lastModified'),
          'createdAt': data.get('createdAt'),
          'inference': data.get('inference')
        }

        models_data.append(model_info)
        print(f"Successfully fetched data for {model_name}")
      else:
        # Only reachable for non-error statuses other than 200.
        print(f"Failed to fetch data for {model_name}. Status code: {response.status_code}")
    except Exception as e:
      print(f"Error fetching data for {model_name}: {str(e)}")

  return models_data

# Example usage: model repositories to look up.
huggingface_model_names = [
  "deepseek-ai/DeepSeek-R1",
  "meta-llama/Llama-3-70b-chat-hf",
  "mistralai/Mistral-7B-Instruct-v0.3",
  "Qwen/Qwen2-72B-Instruct",
]

models_data = fetch_huggingface_models_info(huggingface_model_names)

# Turn the records into a table; save and show it only when it has rows.
models_df = pd.DataFrame(models_data)
if models_df.empty:
  print("No model data to save")
else:
  models_df.to_csv('huggingface_models_data.csv', index=False)
  print(f"Model data saved to huggingface_models_data.csv")

  # Display the data
  print("\nHuggingFace Models Data:")
  print(models_df)

In [3]:
import pandas as pd
from datetime import datetime
import requests
# Repositories ("owner/name") whose monthly OpenRank values are collected below.
repo_names = [
  "langgenius/dify",
  "n8n-io/n8n",
  "infiniflow/ragflow",
  "langchain-ai/langchain",
  "run-llama/llama_index",
  "microsoft/autogen",
  "microsoft/semantic-kernel",
  "labring/FastGPT",
  "langchain-ai/langgraph",
  "crewAIInc/crewAI",
]

# URL template for a repository's OpenRank JSON; {repo_name} is filled in per repository.
openrank_repo_url = "https://oss.open-digger.cn/github/{repo_name}/openrank.json"

def _month_keys(start_date, end_date):
  """Return the "YYYY-MM" key of every calendar month from start_date to end_date, inclusive."""
  keys = []
  year, month = start_date.year, start_date.month
  while (year, month) <= (end_date.year, end_date.month):
    keys.append(f"{year:04d}-{month:02d}")
    # After December, roll over to January of the following year.
    year, month = (year + 1, 1) if month == 12 else (year, month + 1)
  return keys


def fetch_monthly_openrank(repo_names, start_date=None, end_date=None,
                           output_path='monthly_openrank_data.csv'):
  """
  Build a table of monthly OpenRank values, one row per repository.

  For each repository the OpenRank JSON is downloaded from the module-level
  `openrank_repo_url` template, and the values stored under "YYYY-MM" keys
  inside the requested range are copied into the table. Months with no value
  stay empty (NaN). A failed or erroring request is reported with a message
  and leaves that repository's row empty.

  Args:
    repo_names (list): Repository names in "owner/name" form.
    start_date: First month to include; any object with `year` and `month`
      attributes (e.g. `datetime`). Defaults to March 2023.
    end_date: Last month to include (inclusive); same type as `start_date`.
      Defaults to April 2025.
    output_path: CSV file the table is written to. Pass None to skip saving.

  Returns:
    pandas.DataFrame: A 'repo_name' column followed by one column per month.
  """
  # Default range: Mar 2023 to Apr 2025
  if start_date is None:
    start_date = datetime(2023, 3, 1)
  if end_date is None:
    end_date = datetime(2025, 4, 30)

  # Month keys in the same "YYYY-MM" format the OpenRank JSON uses
  months = _month_keys(start_date, end_date)

  # One row per repository, one (initially empty) column per month
  result_df = pd.DataFrame(index=repo_names, columns=months)

  # Fetch data for each repository
  for repo_name in repo_names:
    url = openrank_repo_url.format(repo_name=repo_name)
    try:
      response = requests.get(url)
      if response.status_code == 200:
        data = response.json()

        # Copy only the monthly entries that fall inside the requested range
        for month in months:
          if month in data:
            result_df.loc[repo_name, month] = data[month]
      else:
        print(f"Failed to fetch OpenRank data for {repo_name}: Status {response.status_code}")
    except Exception as e:
      print(f"Error processing {repo_name}: {str(e)}")

  # Reset index to make repo_name a column
  result_df = result_df.reset_index().rename(columns={'index': 'repo_name'})

  # Save to CSV unless the caller opted out
  if output_path is not None:
    result_df.to_csv(output_path, index=False)
    print(f"Monthly OpenRank data saved to {output_path}")

  return result_df

# Build the monthly OpenRank table for the repo_names list defined above.
# As called here, fetch_monthly_openrank also writes monthly_openrank_data.csv.
monthly_openrank_df = fetch_monthly_openrank(repo_names)
# Last expression of the cell: preview the first rows of the table.
monthly_openrank_df.head()

Monthly OpenRank data saved to monthly_openrank_data.csv


Unnamed: 0,repo_name,2023-03,2023-04,2023-05,2023-06,2023-07,2023-08,2023-09,2023-10,2023-11,...,2024-07,2024-08,2024-09,2024-10,2024-11,2024-12,2025-01,2025-02,2025-03,2025-04
0,langgenius/dify,,,30.07,44.26,54.52,71.47,66.2,69.8,78.26,...,298.05,320.44,345.89,377.59,387.0,384.22,355.18,415.8,560.84,560.04
1,n8n-io/n8n,109.88,103.67,92.95,93.75,89.53,90.26,91.33,91.59,95.39,...,104.96,101.24,111.24,124.79,136.57,147.71,159.36,196.77,233.24,261.67
2,infiniflow/ragflow,,,,,,,,,,...,86.35,94.26,106.17,112.63,131.0,137.03,133.13,189.65,260.49,246.89
3,langchain-ai/langchain,196.34,365.61,487.19,546.28,557.17,524.13,492.3,468.57,493.84,...,364.1,323.09,300.74,266.89,232.77,209.34,186.25,169.92,143.15,137.12
4,run-llama/llama_index,81.43,116.29,212.33,199.85,152.81,143.9,153.85,163.11,178.64,...,221.36,227.53,205.79,184.75,164.41,143.96,134.93,121.01,113.59,110.12
