In [1]:
#!/usr/bin/env python3
"""
论文摘要生成脚本
使用 Gemini API 分析 PDF 论文并生成 Hugo 格式的 markdown 文件
"""

import os
import re
import google.generativeai as genai
from pathlib import Path
import time

# Configure the Gemini API.
# The key is taken from the GEMINI_API_KEY environment variable when it is
# set, so the secret does not have to be written into this script. When the
# variable is absent the placeholder below is used and must be replaced with
# a real API key.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "YOUR_GEMINI_API_KEY")
genai.configure(api_key=GEMINI_API_KEY)

# File path configuration
PAPERS_DIR = "/mnt/e/weifangweb/static/papers"
PUBLICATION_DIR = "/mnt/e/weifangweb/content/publication"

# Mapping from PDF file name to the folder name used for that paper.
# Keys are exact file names (irregular spacing and the upper-case ".PDF"
# suffix are kept as-is); values are short folder slugs.
# NOTE(review): presumably the PDFs live in PAPERS_DIR and the folders are
# created under PUBLICATION_DIR — confirm against the code that iterates
# this mapping.
PDF_TO_FOLDER = {
    "2014 UAR Neighborhood Change in Metropolitan America, 1990 to 2010.pdf": "2014-uar-neighborhood-change",
    "2015 EnvironPlanA Spatial transformation of metropolitan cities.pdf": "2015-epa-spatial-transformation",
    "2016 国际城市规划 美国开放空间规划控制研究与启示.pdf": "2016-gjcsgh-open-space-usa",
    "2016 城市规划 城市开放空间规划标准研究.pdf": "2016-csgh-open-space-standard",
    "2019 Sustainability Delineating Urban Growth Boundaries with Ecosystem Service Evaluation.pdf": "2019-sustainability-ugb",
    "2019 城市规划 基于2SFCA法的杭州体育活力空间可达性评价.pdf": "2019-csgh-2sfca-hangzhou",
    "2019 建筑与文化  城市空间管控中生境质量的影响要素及评估方法.pdf": "2019-jzywh-habitat-quality",
    "2019浙大学报(工学版) 基于生境质量与碳储量的城市刚性开发边界划定.pdf": "2019-zju-habitat-carbon",
    "2021 城市发展研究 空间规划中生态系统服务评估方法研究.pdf": "2021-csfzyj-es-evaluation",
    "2021 西南大学学报 基于 POI的城市功能区及其混合度识别研究.PDF": "2021-xndxxb-poi-functional-zone",
    "2021 西部人居环境学刊  基于PCA-ESDA的成渝城市群经济发展空间差异研究.pdf": "2021-xbrjhj-chengyu-esda",
    "2022  land A Multi-Objective Optimization of Physical Activity Spaces.pdf": "2022-land-physical-activity",
    "2022 Land The Effect of Flood Risk on Residential Land Prices.pdf": "2022-land-flood-risk",
    "2022 sustainability Contributions of Natural Carbon Sink Capacity and Carbon Neutrality in the Context of Net-Zero Carbon Cities.pdf": "2022-sustainability-carbon-neutrality",
    "2022 建筑与文化 建成环境与体力活动关系的研究及应用.pdf": "2022-jzywh-built-environment",
    "2022 西部人居环境学刊 城市体力活动空间供需均衡与空间优化研究.pdf": "2022-xbrjhj-physical-activity-optimization",
    "2023 Environmental Science and Policy  A review of ES knowledge use in spatial planning.pdf": "2023-esp-es-spatial-planning",
    "2023 Sustainability Commercial Culture as a Key Impetus in Shaping and Transforming Urban Structure.pdf": "2023-sustainability-commercial-culture",
    "2024 Journal of Cleaner Production. Impact of territorial spatial landscape pattern on PM2.5 and O3   concentrations in the Yangtze River delta urban agglomeration.pdf": "2024-jcp-pm25-o3",
    "2024 sustainability.  Evaluation of Urban Land Suitability under Multiple Sea Level Rise Scenarios.pdf": "2024-sustainability-sea-level-rise",
    "2025 Air Quality, Atmosphere  Health. Influence of urban forest size and form on PM2.5 and O3 concentrations- A perspective of size threshold.pdf": "2025-aqah-forest-pm25-o3",
    "2025 Atmospheric Pollution Research. Reducing PM2.5 and O3 through optimizing urban ecological land form.pdf": "2025-apr-ecological-land",
    "2025 Building and Environment Combined effects of urban morphology on land surface temperature.pdf": "2025-be-urban-morphology-lst",
    "2025 农业资源与环境学报 基于SD PLUS耦合模型的杭州市土地利用多情景模拟.pdf": "2025-nyzyyhjxb-sd-plus",
}

def upload_pdf_to_gemini(pdf_path):
    """Upload the PDF at *pdf_path* via ``genai.upload_file`` and return the result.

    A progress line is printed before the upload starts and another, showing
    the uploaded object's ``uri``, once the call has returned.
    """
    print(f"上传文件: {pdf_path}")
    uploaded = genai.upload_file(pdf_path)
    print(f"上传完成: {uploaded.uri}")
    return uploaded

def generate_paper_summary(pdf_file, model_name="gemini-1.5-flash"):
    """Ask a Gemini model to extract a paper's metadata and abstracts.

    Args:
        pdf_file: The uploaded PDF object; it is sent to the model together
            with the prompt as the request content.
        model_name: Name of the Gemini model to instantiate. Defaults to the
            model this script was originally written against, so existing
            callers are unaffected; pass another name to switch models
            without editing this function.

    Returns:
        The ``text`` of the model response. The prompt asks for a bare JSON
        object, but nothing here validates that the model complied.
    """
    model = genai.GenerativeModel(model_name)

    prompt = """
请仔细阅读这篇学术论文PDF，提取以下信息并以JSON格式返回（只返回JSON，不要其他文字）：

{
  "title": "论文标题（原文）",
  "title_cn": "论文标题（中文翻译，如果原文是中文则相同）",
  "authors": ["作者1", "作者2", "作者3"],
  "publication": "期刊名称",
  "publication_short": "期刊缩写",
  "date": "YYYY-MM-DD",
  "doi": "DOI号（如果有）",
  "abstract": "英文摘要（100-200词）",
  "abstract_cn": "中文摘要（100-200字）",
  "tags": ["关键词1", "关键词2", "关键词3"],
  "categories": ["学科分类1", "学科分类2"]
}

请确保：
1. 日期格式为 YYYY-MM-DD（如果只有年份，使用 YYYY-01-01）
2. 摘要要准确简洁，突出研究目的、方法和主要发现
3. 标签应该是具体的研究主题关键词
4. 分类应该是更广泛的学科领域
"""

    response = model.generate_content([pdf_file, prompt])
    return response.text

def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if present."""
    cleaned = text.strip()
    if cleaned.startswith("```json"):
        cleaned = cleaned[7:]
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def _text(value):
    """Return *value* as a string, mapping a JSON ``null`` to ``""``."""
    return "" if value is None else str(value)


def _as_list(value):
    """Normalise a JSON value to a list of strings, dropping ``null`` items."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return [_text(item) for item in value if item is not None]
    # A bare scalar (e.g. a single author given as a string) is one item.
    return [_text(value)]


def _yaml_str(value):
    """Render *value* as a double-quoted YAML scalar with proper escaping."""
    import json

    # A JSON string literal is also a valid YAML double-quoted scalar, so
    # json.dumps takes care of quotes, backslashes and line breaks.
    return json.dumps(_text(value), ensure_ascii=False)


def _yaml_list(key, items):
    """Render *key* as a YAML block list, or ``key: []`` when *items* is empty."""
    if not items:
        return f"{key}: []"
    lines = [f"{key}:"]
    lines.extend(f"- {_yaml_str(item)}" for item in items)
    return "\n".join(lines)


def create_markdown_file(folder_name, paper_info, pdf_filename):
    """Create the Hugo-format ``index.md`` for one paper.

    Args:
        folder_name: Folder created under ``PUBLICATION_DIR`` for this paper;
            with ``-`` replaced by ``_`` it is also the BibTeX citation key.
        paper_info: Raw text returned by the model. It must contain a JSON
            object and may be wrapped in a Markdown code fence.
        pdf_filename: PDF file name, linked as ``/papers/<pdf_filename>``.

    Returns:
        True once ``index.md`` has been written. False, after printing a
        diagnostic, when *paper_info* is not valid JSON or is not a JSON
        object; nothing is written in that case.
    """
    import json

    # Strip an optional Markdown code fence around the JSON payload.
    json_text = _strip_code_fence(paper_info)

    try:
        data = json.loads(json_text)
    except json.JSONDecodeError as e:
        print(f"JSON解析错误: {e}")
        print(f"原始内容: {json_text}")
        return False

    if not isinstance(data, dict):
        # Valid JSON that is not an object (list, string, ...) has no fields.
        print(f"JSON解析错误: 期望JSON对象，实际为 {type(data).__name__}")
        print(f"原始内容: {json_text}")
        return False

    # Normalise every field once so a JSON null never renders as "None".
    title = _text(data.get('title'))
    authors = _as_list(data.get('authors'))
    date = _text(data.get('date'))
    doi = _text(data.get('doi'))
    publication = _text(data.get('publication'))
    abstract_cn = _text(data.get('abstract_cn'))

    # Front-matter values are pre-rendered so the template only holds names.
    title_yaml = _yaml_str(title)
    authors_yaml = _yaml_list("authors", authors)
    date_yaml = _yaml_str(date)
    doi_yaml = _yaml_str(doi)
    publication_yaml = _yaml_str(publication)
    publication_short_yaml = _yaml_str(data.get('publication_short'))
    abstract_yaml = _yaml_str(data.get('abstract'))
    summary_yaml = _yaml_str(abstract_cn)
    tags_yaml = _yaml_list("tags", _as_list(data.get('tags')))
    # Inside a single-quoted YAML scalar a quote is escaped by doubling it.
    pdf_url = f"/papers/{pdf_filename}".replace("'", "''")

    cite_key = folder_name.replace('-', '_')
    bib_authors = ' and '.join(authors)
    year = date[:4]

    # Build the markdown content.
    markdown_content = f"""---
title: {title_yaml}
{authors_yaml}
date: {date_yaml}
doi: {doi_yaml}

# Schedule page publish date (NOT publication's date).
publishDate: {date_yaml}

# Publication type.
# Accepts a single type but formatted as a YAML list (for Hugo requirements).
# Enter a publication type from the CSL standard.
publication_types: ["article-journal"]

# Publication name and optional abbreviated publication name.
publication: {publication_yaml}
publication_short: {publication_short_yaml}

abstract: {abstract_yaml}

# Summary. An optional shortened abstract.
summary: {summary_yaml}

{tags_yaml}

# Display this page in the Featured widget?
featured: true

# Custom links (uncomment lines below)
# links:
# - name: Custom Link
#   url: http://example.org

url_pdf: '{pdf_url}'
url_code: ''
url_dataset: ''
url_poster: ''
url_project: ''
url_slides: ''
url_source: ''
url_video: ''

# Featured image
# To use, add an image named `featured.jpg/png` to your page's folder.
image:
  caption: ''
  focal_point: ''
  preview_only: false

# Associated Projects (optional).
#   Associate this publication with one or more of your projects.
#   Simply enter your project's folder or file name without extension.
#   E.g. `internal-project` references `content/project/internal-project/index.md`.
#   Otherwise, set `projects: []`.
projects: []

# Slides (optional).
#   Associate this publication with Markdown slides.
#   Simply enter your slide deck's filename without extension.
#   E.g. `slides: "example"` references `content/slides/example/index.md`.
#   Otherwise, set `slides: ""`.
slides: ""
---

## 摘要

{abstract_cn}

## 研究亮点

- 待补充

## 引用格式

```bibtex
@article{{{cite_key},
  title={{{title}}},
  author={{{bib_authors}}},
  journal={{{publication}}},
  year={{{year}}},
  doi={{{doi}}}
}}
```
"""

    # Create the folder and save the file.
    folder_path = Path(PUBLICATION_DIR) / folder_name
    folder_path.mkdir(parents=True, exist_ok=True)

    output_file = folder_path / "index.md"
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(markdown_content)

    print(f"✓ 已创建: {output_file}")
    return True