In [18]:
import os
import xml.etree.ElementTree as ET
from xml.dom import minidom
from datetime import datetime

def generate_sitemap(directory, base_url):
    """
    Collect the public URL of every HTML and PDF file below a directory.

    :param directory: Root directory of the site; it is walked recursively.
    :param base_url: URL the root directory is served from. A trailing
        slash is optional.
    :return: Sorted list of absolute URLs, one per matching file, with the
        path portion percent-encoded.
    """
    # Imported locally so this function does not rely on another cell's imports.
    from urllib.parse import quote

    # Avoid "//" in the result when the caller passes a base URL ending in "/".
    prefix = base_url.rstrip("/")
    urls = []

    for dirpath, _dirnames, filenames in os.walk(directory):
        for filename in filenames:
            # Only common web document types are listed; extend the tuple as needed.
            if filename.endswith(('.html', '.pdf')):
                # Path of the file relative to the site root.
                file_path = os.path.relpath(os.path.join(dirpath, filename), directory)
                # Turn OS path separators into "/" and percent-encode the rest so
                # names containing spaces or non-ASCII characters form valid URLs.
                url_path = quote(file_path.replace(os.sep, "/"))
                urls.append(prefix + "/" + url_path)

    # os.walk order depends on the file system; sort for reproducible output.
    urls.sort()
    return urls

def create_sitemap(urls, output_path="sitemap.xml", changefreq="monthly", priority="0.8"):
    """
    Write a pretty-printed sitemap XML file for the given URLs.

    Every entry receives the same <lastmod> (the current local date),
    <changefreq> and <priority> values.

    :param urls: Iterable of absolute URL strings, one per <url> entry.
    :param output_path: File the sitemap is written to; defaults to
        "sitemap.xml" in the current working directory.
    :param changefreq: Text of each <changefreq> element.
    :param priority: Text of each <priority> element (converted with str()).
    :return: None. The XML is written to ``output_path`` as UTF-8 bytes.
    """
    # Computed once so all entries carry the same date, even if the loop
    # happens to run across midnight.
    today = datetime.now().strftime("%Y-%m-%d")

    # Root node carrying the sitemap namespace.
    urlset = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")

    for url in urls:
        url_element = ET.SubElement(urlset, "url")
        ET.SubElement(url_element, "loc").text = url
        ET.SubElement(url_element, "lastmod").text = today
        ET.SubElement(url_element, "changefreq").text = changefreq
        ET.SubElement(url_element, "priority").text = str(priority)

    # Serialise, then re-parse with minidom purely to get indented output.
    rough_string = ET.tostring(urlset, encoding="utf-8", xml_declaration=True)
    pretty_xml = minidom.parseString(rough_string).toprettyxml(indent="  ", encoding="utf-8")

    # toprettyxml(encoding=...) returns bytes, hence the binary mode.
    with open(output_path, "wb") as f:
        f.write(pretty_xml)

# Build sitemap.xml (written to the current working directory by
# create_sitemap) for the site rooted at `directory`.
if __name__ == "__main__":
    directory = "."  # Replace with your site directory
    base_url = "https://zhangrui4041.github.io/awesome-paper-test.github.io"  # Replace with your site's base URL
    urls = generate_sitemap(directory, base_url)
    create_sitemap(urls)
    print("站点地图已生成：sitemap.xml")


站点地图已生成：sitemap.xml


In [15]:
import os

def extract_title(file_path):
    """
    Extract the title from an HTML file.

    The first <title> element is located case-insensitively; it may carry
    attributes and may span several lines. Runs of whitespace inside the
    title are collapsed to single spaces. The text is returned as written
    in the file (character entities are not decoded).

    :param file_path: Path to the HTML file (read as UTF-8).
    :return: The content of the <title> tag, or the file name if the file
        cannot be read or has no non-empty title.
    """
    # Imported locally so this function does not rely on another cell's imports.
    import re

    fallback = os.path.basename(file_path)

    try:
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()
    except (OSError, ValueError) as e:
        # Best effort: unreadable or non-UTF-8 files (UnicodeDecodeError is a
        # ValueError) are reported and listed under their file name.
        print(f"Error reading file {file_path}: {e}")
        return fallback

    match = re.search(r"<title\b[^>]*>(.*?)</title\s*>", content, re.IGNORECASE | re.DOTALL)
    if match:
        title = " ".join(match.group(1).split())
        if title:
            return title
    return fallback

def generate_index_html(directory, base_url="https://zhangrui4041.github.io/awesome-paper-test.github.io/"):
    """
    Generate an index.html file listing all HTML files in the given directory.

    The directory is walked recursively. Every ``.html`` file except those
    named ``index.html`` becomes one list item linking to
    ``base_url`` + relative path, labelled with the page's title as returned
    by ``extract_title``. Entries are sorted by relative path.

    :param directory: Path to the directory containing HTML files; the
        resulting index.html is written into this directory.
    :param base_url: URL the directory is served from; it is prefixed to
        every link. A trailing slash is optional.
    """
    # Imported locally so this function does not rely on another cell's imports.
    import html
    from urllib.parse import quote

    # (relative URL path, title) pairs for every page to list.
    html_files = []

    for root, _, files in os.walk(directory):
        for file in files:
            # Files named index.html are skipped at every depth, which also
            # keeps a previously generated index out of its own listing.
            if file.endswith(".html") and file != "index.html":
                full_path = os.path.join(root, file)
                relative_path = os.path.relpath(full_path, directory).replace(os.sep, "/")
                html_files.append((relative_path, extract_title(full_path)))

    # os.walk order depends on the file system; sort for a stable listing.
    html_files.sort()

    # Create the index.html content
    index_content = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <meta name="description" content="A curated collection of high-quality academic papers across various fields of computer science, showcasing groundbreaking research and innovative ideas.">
    <meta name="keywords" content="academic papers, computer science, artificial intelligence, computation and language, computational complexity, computational engineering, finance, science, computational geometry, game theory, computer vision, pattern recognition, cryptography, security, data structures, algorithms, databases, digital libraries, discrete mathematics, distributed computing, formal languages, automata theory, hardware architecture, human-computer interaction, information retrieval, information theory, machine learning, multimedia, networking, operating systems, robotics, software engineering, systems and control">
    <meta name="author" content="Awesome Paper Test">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta name="robots" content="index, follow">
    <title>High-Quality Computer Science Papers - Index</title>

    <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
    <link rel="stylesheet" href="./static/css/bulma.min.css">
    <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
    <link rel="stylesheet" href="./static/css/index.css">
</head>
<body>
    <section class="section">
        <div class="container">
            <h1 class="title">High-Quality Computer Science Papers Index</h1>
            <ul>
"""

    # Exactly one "/" between the base URL and the relative path.
    link_prefix = base_url.rstrip("/") + "/"

    for file, title in html_files:
        # Percent-encode the path, then escape it for use inside href="...".
        href = html.escape(link_prefix + quote(file), quote=True)
        # Titles and fallback file names come from arbitrary files, so they
        # must not be written as raw markup. Unescaping first keeps entities
        # already present in a title (e.g. "&amp;") from being double-escaped.
        text = html.escape(html.unescape(title), quote=False)
        index_content += f'                <li><a href="{href}" class="is-link">{text}</a></li>\n'

    index_content += """
            </ul>
        </div>
    </section>

    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
    <script defer src="./static/js/fontawesome.all.min.js"></script>
</body>
</html>
"""

    # Write the index.html file
    index_file_path = os.path.join(directory, "index.html")
    with open(index_file_path, "w", encoding="utf-8") as index_file:
        index_file.write(index_content)

    print(f"Index file generated at: {index_file_path}")

# Specify the directory containing HTML files.
# Empty input falls back to the current directory: os.walk("") yields nothing,
# so an empty path would silently produce an index with no entries.
directory_path = input("Enter the directory path containing HTML files: ").strip() or "."
generate_index_html(directory_path)


Index file generated at: .\index.html


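# IndexNow

Build an IndexNow request body listing every HTML page of the site, then submit it to the IndexNow API.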

In [None]:
import os
import json

# 定义函数生成IndexNow的JSON请求体
def generate_indexnow_payload(directory, host, key, key_location):
    """
    Build the JSON request body for an IndexNow bulk URL submission.

    :param directory: Local root directory of the site; walked recursively
        for ``.html`` files.
    :param host: Host the files are served from, optionally followed by a
        path prefix (e.g. ``"example.com/site"``). Leading and trailing
        slashes are ignored.
    :param key: IndexNow key, copied into the payload as ``key``.
    :param key_location: URL of the key file, copied into the payload as
        ``keyLocation``.
    :return: JSON string (4-space indent) with the fields ``host``, ``key``,
        ``keyLocation`` and ``urlList``; ``urlList`` is sorted and its paths
        are percent-encoded.
    """
    # Imported locally so this function does not rely on another cell's imports.
    from urllib.parse import quote

    # Stray slashes around the host would otherwise produce "//" in the URLs.
    site_root = host.strip("/")

    html_urls = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".html"):
                relative_path = os.path.relpath(os.path.join(root, file), directory)
                # Convert the file path to URL form and percent-encode it so
                # names with spaces or non-ASCII characters stay valid URLs.
                url_path = quote(relative_path.replace(os.sep, "/"))
                html_urls.append(f"https://{site_root}/{url_path}")

    # os.walk order depends on the file system; sort for reproducible output.
    html_urls.sort()

    payload = {
        # IndexNow's documented examples use a bare host name here, so any
        # path prefix the caller included is dropped for this field only.
        "host": site_root.split("/", 1)[0],
        "key": key,
        "keyLocation": key_location,
        "urlList": html_urls
    }
    return json.dumps(payload, indent=4)

# Example usage
directory = "./"  # Root directory of the local HTML files
# Note: this value contains a path segment after the host name; it is used
# below as the URL prefix for the key file and passed on as `host`.
host = "zhangrui4041.github.io/awesome-paper-test.github.io"
# NOTE(review): presumably this must match the <key>.txt file published at
# key_location — confirm before rotating or sharing the notebook.
key = "4e269251d6bf46a39ccbd8abb1ec76f3"
key_location = f"https://{host}/{key}.txt"

# JSON string; it is also the request body sent by the requests.post cell below.
indexnow_payload = generate_indexnow_payload(directory, host, key, key_location)

# Print the generated request body
print(indexnow_payload)

# Save it to a file
with open("indexnow_payload.json", "w") as f:
    f.write(indexnow_payload)

print("IndexNow 请求体已保存为 indexnow_payload.json")


In [None]:
import requests

# Submit the request body to IndexNow. The documented endpoint is the
# /indexnow path on the API host, not the bare host.
url = "https://api.indexnow.org/indexnow"
headers = {"Content-Type": "application/json; charset=utf-8"}
# `indexnow_payload` is the JSON string built by the previous cell.
# requests applies no timeout by default; set one so the cell cannot hang forever.
response = requests.post(url, headers=headers, data=indexnow_payload, timeout=30)

print("Response status:", response.status_code)
print("Response body:", response.text)


请求成功！响应已保存为 response_body.json
