<a href="https://colab.research.google.com/github/youkiti/ARE/blob/main/extract_study_info_Claude_API_with_citations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PDF Study Information Extractor for Google Colab

このノートブックは学術論文のPDFから以下の情報を抽出します：
1. 出版年（受理年）
2. 包含基準
3. 分析対象となった文書の総数

# 必要なもの
1. AnthropicのAPIキー
2. 読み込ませる論文のPDF（コピー保護されていないもの）

# 参考 Citations機能
https://docs.anthropic.com/en/docs/build-with-claude/citations

## セットアップ

In [2]:
# Install the required library (the `anthropic` package used by the cells below)
!pip install anthropic

Collecting anthropic
  Downloading anthropic-0.45.0-py3-none-any.whl.metadata (23 kB)
Downloading anthropic-0.45.0-py3-none-any.whl (222 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m222.3/222.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Successfully installed anthropic-0.45.0


In [11]:
import os
import json
import base64
from datetime import datetime
import glob
import anthropic
from google.colab import userdata
from google.colab import files
from google.colab.userdata import SecretNotFoundError

## PDFファイルのアップロード
以下のセルを実行して、分析したいPDFファイルをアップロードしてください。

In [4]:
# Open the Colab upload dialog and list the names of the files received.
# `uploaded` is keyed by file name and is read again by the main cell.
uploaded = files.upload()
print(f"アップロードされたファイル: {list(uploaded)}")

Saving bmj-2024-081820.full.pdf to bmj-2024-081820.full.pdf
アップロードされたファイル: ['bmj-2024-081820.full.pdf']


## ユーティリティ関数の定義

In [20]:
def load_pdf_file(filepath):
    """Read a PDF from disk and return its contents as base64 text.

    Args:
        filepath: Path of the file to read (opened in binary mode).

    Returns:
        The file's bytes encoded as a base64 UTF-8 string, or ``None`` when
        the file does not exist or any other error occurs. A message is
        printed in both failure cases.
    """
    try:
        with open(filepath, 'rb') as pdf_file:
            raw_bytes = pdf_file.read()
        encoded = base64.b64encode(raw_bytes).decode('utf-8')
    except FileNotFoundError:
        print(f"エラー: ファイル {filepath} が見つかりません")
        encoded = None
    except Exception as err:
        # Any other failure (permissions, directory path, ...) is reported
        # the same way: print and signal failure with None.
        print(f"PDF読み込みエラー: {err}")
        encoded = None
    return encoded

def create_document_message(base64_pdf, title):
    """Build the ``document`` content block for a base64-encoded PDF.

    Args:
        base64_pdf: Base64 text of the PDF, stored under ``source.data``.
        title: Title stored under the ``title`` key.

    Returns:
        A dict with ``type`` "document", a base64 ``application/pdf``
        source, the given title, and citations enabled.
    """
    pdf_source = {"type": "base64", "media_type": "application/pdf"}
    pdf_source["data"] = base64_pdf

    document_block = {"type": "document", "source": pdf_source}
    document_block["title"] = title
    document_block["citations"] = {"enabled": True}
    return document_block

def _coerce_to_int(value):
    """Convert *value* to ``int`` without silently losing information.

    Booleans and floats that are fractional or non-finite are rejected with
    ``ValueError``; everything else is handed to ``int()`` (so numeric
    strings such as ``"2024"`` are still accepted).
    """
    if isinstance(value, bool):
        # bool is an int subclass; int(True) == 1 would pass as a "year".
        raise ValueError(f"boolean is not an integer: {value!r}")
    if isinstance(value, float) and not value.is_integer():
        # Covers fractional values as well as NaN and +/-infinity.
        raise ValueError(f"float is not integral: {value!r}")
    return int(value)


def validate_json_structure(data):
    """Validate the structure and types of the extracted study data.

    Each of ``publication_year``, ``inclusion_criteria`` and
    ``total_documents`` must be a mapping holding ``value`` and ``citation``.
    The two numeric values are converted to ``int`` in place.

    Args:
        data: Mapping with the three required fields.

    Returns:
        The same ``data`` object, with the numeric values coerced to ``int``.

    Raises:
        ValueError: If ``data`` or a field entry is not a mapping, a field or
            sub-key is missing, a numeric value is not an integer, or the
            inclusion criteria value is not a string.
    """
    from collections.abc import Mapping

    required_fields = ("publication_year", "inclusion_criteria", "total_documents")

    for field in required_fields:
        # A non-mapping payload (e.g. None) is treated as missing the field
        # instead of leaking a TypeError from the ``in`` test.
        if not isinstance(data, Mapping) or field not in data:
            raise ValueError(f"必須フィールドが不足: {field}")
        entry = data[field]
        if (
            not isinstance(entry, Mapping)
            or "value" not in entry
            or "citation" not in entry
        ):
            raise ValueError(f"{field}にvalueまたはcitationが不足")

    for numeric_field in ("publication_year", "total_documents"):
        try:
            data[numeric_field]["value"] = _coerce_to_int(data[numeric_field]["value"])
        except (ValueError, TypeError, OverflowError) as err:
            raise ValueError("数値フィールドは整数である必要があります") from err

    if not isinstance(data["inclusion_criteria"]["value"], str):
        raise ValueError("選定基準は文字列である必要があります")

    return data

def extract_json_from_text(text):
    """Extract the first JSON object embedded in *text*.

    Parsing starts at the first ``{`` and stops where that JSON value ends,
    so surrounding prose or Markdown code fences (including trailing text
    that itself contains braces) are ignored. String values inside the JSON
    are returned untouched, even if they contain backticks.

    Args:
        text: Text that contains a JSON object somewhere inside it.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If no ``{`` followed by a ``}`` is present.
        json.JSONDecodeError: If the text from the first ``{`` is not valid
            JSON (a ``ValueError`` subclass).
    """
    start = text.find("{")
    # Require a closing brace after the opening one so inputs like "} {"
    # get the clear "not found" error rather than a decoder message.
    if start == -1 or text.find("}", start) == -1:
        raise ValueError("JSONオブジェクトが見つかりません")

    # raw_decode parses one JSON value and ignores whatever follows it.
    parsed, _consumed = json.JSONDecoder().raw_decode(text[start:])
    return parsed

def _cited_field_schema(value_type, value_description):
    """Return the JSON schema of one extracted field: a value plus its citation text."""
    return {
        "type": "object",
        "properties": {
            "value": {"type": value_type, "description": value_description},
            "citation": {"type": "string", "description": "根拠テキスト"}
        },
        "required": ["value", "citation"]
    }


def extract_info(client, pdf_content, filename, max_retries=3, *,
                 model="claude-3-5-sonnet-20241022", max_tokens=1000):
    """Extract study information from a PDF, retrying on failure.

    Sends the PDF and the extraction prompt through ``client.messages.create``
    with ``tool_choice`` set to the ``record_study_info`` tool, then validates
    the first ``tool_use`` block of the response.

    Args:
        client: Object exposing ``messages.create(...)``; ``main_colab``
            passes an ``anthropic.Client``.
        pdf_content: Base64-encoded PDF text.
        filename: Used as the document title in the request.
        max_retries: Maximum number of attempts.
        model: Model identifier sent with the request.
        max_tokens: ``max_tokens`` value sent with the request.

    Returns:
        The validated tool input (see ``validate_json_structure``), or
        ``None`` if no attempt produced a ``tool_use`` block without the
        last attempt raising (including ``max_retries <= 0``).

    Raises:
        Exception: If the last attempt fails; the underlying error is
            attached as ``__cause__``. Errors from earlier attempts are
            discarded and the call is simply retried.
    """
    tools = [{
        "name": "record_study_info",
        "description": "研究情報を構造化形式で記録",
        "input_schema": {
            "type": "object",
            "properties": {
                "publication_year": _cited_field_schema("integer", "出版年（整数）"),
                "inclusion_criteria": _cited_field_schema("string", "選定基準（原文保持）"),
                "total_documents": _cited_field_schema("integer", "分析文書総数")
            },
            "required": ["publication_year", "inclusion_criteria", "total_documents"]
        }
    }]

    prompt = """PDFから以下の情報を抽出してください（図表も確認し、根拠テキストを明記）:
1. 出版年/受理年
2. 選定基準
3. 分析対象文書の総数（ClinicalTrials.govの登録数ではなく、実際に分析された研究数）

情報はrecord_study_infoツールを使用し、原文の言語を保持して記録してください。"""

    for attempt in range(max_retries):
        try:
            message = client.messages.create(
                model=model,
                max_tokens=max_tokens,
                temperature=0,
                tools=tools,
                tool_choice={"type": "tool", "name": "record_study_info"},
                messages=[
                    {
                        "role": "user",
                        "content": [
                            create_document_message(pdf_content, filename),
                            {"type": "text", "text": prompt}
                        ]
                    }
                ]
            )

            for content in message.content:
                if content.type == "tool_use":
                    return validate_json_structure(content.input)

        except Exception as e:
            # Only the final attempt surfaces its error; chain it so the
            # original traceback is not lost behind the summary message.
            if attempt == max_retries - 1:
                raise Exception(f"{max_retries}回試行後に失敗: {str(e)}") from e

    return None

def show_api_key_error():
    """Print instructions for configuring the missing Anthropic API key.

    The key is read with ``userdata.get('ANTHROPIC_API_KEY')``, so the
    instructions direct the user to the Colab Secrets panel.
    NOTE(review): the previous text suggested ``userdata.set(...)``; the
    ``google.colab.userdata`` module is understood to offer no such
    function, so that snippet was replaced. Confirm against the Colab docs.
    """
    print("""
❌ Anthropic APIキーが設定されていません

設定手順:
1. Anthropicのウェブサイト(https://www.anthropic.com)でAPIキーを取得
2. Colab左側のサイドバーにある鍵アイコン（シークレット）を開く
3. 名前に ANTHROPIC_API_KEY、値に取得したAPIキーを入力して追加
4. 「ノートブックからのアクセス」を有効にして、セルを再実行

詳細: https://qiita.com/suzuki_sh/items/4817e3423f2989bbb9ed
""")

def main_colab(pdf_path):
    """Run the full extraction for one PDF inside Colab.

    Steps, in order: read the ``ANTHROPIC_API_KEY`` secret, load and
    base64-encode the PDF, create the Anthropic client, and extract the
    study information.

    Args:
        pdf_path: Path of the PDF file to analyse.

    Returns:
        ``None`` if the secret is not found (after printing setup help) or
        the PDF could not be loaded. Otherwise a dict with ``filename``,
        ``extraction_date`` and ``status``: ``"success"`` plus ``results``,
        or ``"error"`` plus the ``error`` message for any other exception.
    """
    # Stays None in the error report if the failure happens before the
    # file name has been derived from the path.
    filename = None
    try:
        # API key check
        try:
            api_key = userdata.get('ANTHROPIC_API_KEY')
        except userdata.SecretNotFoundError:
            show_api_key_error()
            return None

        # Load the PDF
        filename = os.path.basename(pdf_path)
        encoded_pdf = load_pdf_file(pdf_path)
        if not encoded_pdf:
            return None

        # Initialise the API client and extract the information
        api_client = anthropic.Client(api_key=api_key)
        extracted = extract_info(api_client, encoded_pdf, filename)
        if not extracted:
            raise ValueError("PDFからの情報抽出に失敗")

        # Format the result
        report = {
            "filename": filename,
            "extraction_date": datetime.now().isoformat(),
            "status": "success",
            "results": extracted
        }
    except Exception as err:
        report = {
            "filename": filename,
            "extraction_date": datetime.now().isoformat(),
            "status": "error",
            "error": str(err)
        }
    return report

## メイン処理
アップロードされたPDFファイルから情報を抽出します。

In [21]:
# Analyse the first uploaded PDF and print the result as JSON.
# An empty upload (e.g. the dialog was cancelled) is reported explicitly
# instead of failing with StopIteration.
if uploaded:
    result = main_colab(next(iter(uploaded)))
    print(json.dumps(result, indent=2, ensure_ascii=False))
else:
    result = None
    print("エラー: PDFファイルがアップロードされていません")

{
  "filename": "bmj-2024-081820.full.pdf",
  "extraction_date": "2025-01-24T00:28:07.360262",
  "status": "success",
  "results": {
    "publication_year": {
      "value": 2025,
      "citation": "Cite this as: BMJ2025;388:e081820"
    },
    "inclusion_criteria": {
      "value": "Eligible participants were aged 20 to 70 years, had a body mass index greater than 25, had a diagnosis of type 2 diabetes made within the previous six years, and had HbA1c between 6.5% and 10% in patients who had no use of any antidiabetic agents or less than 10% in those taking metformin at screening.",
      "citation": "Eligible participants were aged 20 to 70 years, had a body mass index greater than 25, had a diagnosis of type 2 diabetes made within the previous six years, and had HbA1c between 6.5% and 10% in patients who had no use of any antidiabetic agents or less than 10% in those taking metformin at screening."
    },
    "total_documents": {
      "value": 328,
      "citation": "Between 12 Jun