In [2]:
!pip install openai tiktoken pymupdf pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [50]:
import fitz  # PyMuPDF
import tiktoken
from openai import OpenAI
import re
import pandas as pd

In [51]:
# クライアント初期化
client = OpenAI(api_key="apiキー")

# List of PDF files
pdf_paths = {
    "US": "Americas-AI-Action-Plan.pdf",
    "Japan": "japan_AI_stratagy2022.pdf"
}


In [52]:
# Step 1: PDF読み込み＆テキスト抽出（PyMuPDF）
def load_pdfs(paths: dict) -> dict:
    texts = {}
    for country, path in paths.items():
        print(f"[load_pdfs] Loading {country} from {path}")               # ← デバッグ出力
        doc = fitz.open(path)
        full_text = ""
        for i, page in enumerate(doc):
            page_text = page.get_text()
            print(f"[load_pdfs]  Page {i+1}: {len(page_text)} chars")    # ← 各ページ文字数
            full_text += page_text
        print(f"[load_pdfs]  Total {country}: {len(full_text)} chars\n")  # ← 全文文字数
        texts[country] = full_text
    return texts

In [54]:
# 関数呼び出しセル（load_pdfs 定義セルの直後に配置）
texts = load_pdfs(pdf_paths)

# 任意でテキスト長も確認
for country, txt in texts.items():
    print(f"{country}: {len(txt)} characters")


[load_pdfs] Loading US from Americas-AI-Action-Plan.pdf
[load_pdfs]  Page 1: 88 chars
[load_pdfs]  Page 2: 690 chars
[load_pdfs]  Page 3: 4496 chars
[load_pdfs]  Page 4: 3622 chars
[load_pdfs]  Page 5: 1385 chars
[load_pdfs]  Page 6: 3042 chars
[load_pdfs]  Page 7: 3067 chars
[load_pdfs]  Page 8: 3118 chars
[load_pdfs]  Page 9: 3527 chars
[load_pdfs]  Page 10: 3239 chars
[load_pdfs]  Page 11: 3044 chars
[load_pdfs]  Page 12: 2806 chars
[load_pdfs]  Page 13: 2512 chars
[load_pdfs]  Page 14: 3252 chars
[load_pdfs]  Page 15: 2928 chars
[load_pdfs]  Page 16: 1132 chars
[load_pdfs]  Page 17: 3357 chars
[load_pdfs]  Page 18: 3408 chars
[load_pdfs]  Page 19: 2767 chars
[load_pdfs]  Page 20: 3415 chars
[load_pdfs]  Page 21: 2877 chars
[load_pdfs]  Page 22: 1918 chars
[load_pdfs]  Page 23: 2726 chars
[load_pdfs]  Page 24: 2824 chars
[load_pdfs]  Page 25: 3100 chars
[load_pdfs]  Page 26: 1380 chars
[load_pdfs]  Page 27: 71 chars
[load_pdfs]  Page 28: 32 chars
[load_pdfs]  Total US: 69823 chars



In [55]:
# Step 2: テキストのチャンク分割（tiktoken, 4000トークン）
def chunk_texts(texts: dict, max_tokens: int = 4000) -> dict:
    enc = tiktoken.get_encoding("cl100k_base")
    chunked = {}
    for country, text in texts.items():
        tokens = enc.encode(text)
        chunks = [
            enc.decode(tokens[i : i + max_tokens])
            for i in range(0, len(tokens), max_tokens)
        ]
        chunked[country] = chunks
    return chunked


In [8]:
# Step 2 呼び出し＆デバッグ出力
text_chunks = chunk_texts(texts, max_tokens=4000)

# 各国のチャンク数と最初のチャンクの文字数を確認
for country, chunks in text_chunks.items():
    print(f"{country}: {len(chunks)} chunks")
    if chunks:
        print(f" First chunk length: {len(chunks[0])} chars\n")


US: 4 chunks
 First chunk length: 21464 chars

Japan: 5 chunks
 First chunk length: 20212 chars



In [87]:
# Step 3 カテゴリのキーワードとスコア
import json
import re

def extract_common_keywords(chunked_texts: dict, category: str, top_n: int = 10) -> list:
    """
    新SDK対応版：US/Japanのチャンクから「category」と同義のキーワード候補を抽出し、
    両国共通の上位top_nを返す関数。パースを堅牢化。
    """
    all_candidates = {"US": [], "Japan": []}

    for country, chunks in chunked_texts.items():
        for idx, chunk in enumerate(chunks, start=1):
            print(f"[extract_common_keywords] {country} chunk {idx}/{len(chunks)}")
            prompt = f"""
You are a policy consultant with deep expertise in both AI regulation and the advancement of AI-driven innovation.
You understand the balance between responsible governance and the strategic promotion of AI technologies.
You provide nuanced advice that reflects both regulatory considerations and opportunities for AI adoption.
You are analyzing a national AI policy document.
Only output a JSON object without markdown or code fences.

From the following text chunk—do not refer to or use any information outside this chunk—extract candidate keywords
that are synonymous with "{category}"
(including the term "{category}" itself if present), and return JSON:

{{
  "candidates": ["keyword1", "keyword2", ...]
}}

Text:
\"\"\"{chunk}\"\"\"
"""
            # GPT呼び出し（新SDK）
            resp = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0
            )

            content = resp.choices[0].message.content.strip()
            print(f"[extract_common_keywords] response snippet: {content[:100].replace(chr(10), ' ')}...")

            # JSON部分だけ抽出
            try:
                body = re.search(r'\{.*\}', content, flags=re.DOTALL).group()
                data = json.loads(body)
                candidates = data.get("candidates", [])
            except Exception as e:
                print(f"[extract_common_keywords] JSON parse error: {e}")
                # フォールバック：配列内要素のみ抽出
                match = re.search(r'"candidates"\s*:\s*\[([^\]]+)\]', content)
                if match:
                    items = [s.strip().strip('"') for s in match.group(1).split(',')]
                    candidates = items
                else:
                    candidates = []

            all_candidates[country].extend(candidates)

    # 両国共通のキーワードを抽出し、上位top_nを返す
    common = list(set(all_candidates["US"]) & set(all_candidates["Japan"]))
    print(f"[extract_common_keywords] common keywords: {common[:top_n]}")
    return common[:top_n]


In [96]:
# Step 3 呼び出し＆デバッグ出力
print("[Step 3] Extracting regulation keywords…")
regulation_keywords = extract_common_keywords(text_chunks, "regulation", top_n=20)
print(f"[Step 3] regulation_keywords: {regulation_keywords}\n")

print("[Step 3] Extracting promotion keywords…")
promotion_keywords = extract_common_keywords(text_chunks, "promotion", top_n=20)
print(f"[Step 3] promotion_keywords: {promotion_keywords}\n")


[Step 3] Extracting regulation keywords…
[extract_common_keywords] US chunk 1/4
[extract_common_keywords] response snippet: {   "candidates": ["regulation", "regulatory", "regulatory regime", "regulatory barriers", "regulato...
[extract_common_keywords] US chunk 2/4
[extract_common_keywords] response snippet: {   "candidates": ["guidance", "authorities", "regulations", "rules", "law", "compliance", "mandate"...
[extract_common_keywords] US chunk 3/4
[extract_common_keywords] response snippet: {   "candidates": ["regulation", "guidance", "rules", "standards", "requirements", "laws"] }...
[extract_common_keywords] US chunk 4/4
[extract_common_keywords] response snippet: {   "candidates": ["regulations", "governance", "standards", "controls", "requirements", "protection...
[extract_common_keywords] Japan chunk 1/5
[extract_common_keywords] response snippet: {   "candidates": ["governance", "legal systems", "policy obstacles", "institutional obstacles", "go...
[extract_common_keywords] Jap

In [97]:
# Step 4: スコア計算のみ行う関数（CSV保存ナシ）
def count_and_score(texts: dict, regulation_keywords: list, promotion_keywords: list) -> pd.DataFrame:
    """
    textsとキーワードリストを小文字化した上で出現回数を集計し、
    スコアを計算してDataFrameで返す関数。
    """
    results = []
    for country, text in texts.items():
        text_lower = text.lower()

        # 規制キーワードの小文字版リスト
        reg_keywords_lower = [kw.lower() for kw in regulation_keywords]
        pro_keywords_lower = [kw.lower() for kw in promotion_keywords]

        # 出現回数を .count() でカウント
        reg_count = sum(text_lower.count(kw) for kw in reg_keywords_lower)
        pro_count = sum(text_lower.count(kw) for kw in pro_keywords_lower)

        # スコア計算
        score = 0 if (reg_count + pro_count) == 0 else (pro_count - reg_count) / (pro_count + reg_count)

        results.append({
            "country": country,
            "regulation_count": reg_count,
            "promotion_count": pro_count,
            "score": score
        })

    return pd.DataFrame(results)



In [95]:
# Step 4: スコアデバック
df_scores = count_and_score(texts, regulation_keywords, promotion_keywords)
df_scores 

Unnamed: 0,country,regulation_count,promotion_count,score
0,US,103,42,-0.42069
1,Japan,29,101,0.553846


In [98]:
# Step 4: スコアダウンロード
df_scores.to_csv("sentiment_scores.csv", index=False, encoding="utf-8-sig")

In [99]:
# Step 4　全キーワードとカテゴリをまとめる
all_keywords = regulation_keywords + promotion_keywords

# 国別にキーワード出現回数を集計
rows = []
for kw in all_keywords:
    for country in texts:
        count = len(re.findall(rf"\b{kw}\b", texts[country], flags=re.IGNORECASE))
        rows.append({
            "keyword": kw,
            "category": "regulation" if kw in regulation_keywords else "promotion",
            "country": country,
            "count": count
        })

# rows を作成したあとに DataFrame を生成
df_keyword_counts = pd.DataFrame(rows)

# そして pivot_table を作成
pivot_table = pd.pivot_table(
    df_keyword_counts, 
    index=["keyword", "category"],
    columns="country",
    values="count",
    aggfunc="sum",
    fill_value=0
).reset_index()

pivot_table


country,keyword,category,Japan,US
0,accelerate,promotion,7,9
1,advancement,promotion,1,1
2,compliance,regulation,0,2
3,encourage,promotion,1,6
4,facilitate,promotion,0,2
5,foster,promotion,1,2
6,governance,regulation,2,8
7,laws,regulation,0,3
8,promote,promotion,29,10
9,promoting,promotion,12,3


In [100]:
# Step 4　キーワードCSV出力セル
pivot_table.to_csv(
    "keyword_pivot.csv",
    index=False,
    encoding="utf-8-sig"  # Excel 互換のUTF-8 BOM付き
)


In [None]:
# 3: 共通 regulation/promption キーワード抽出
regulation_keywords = extract_common_keywords(text_chunks, "regulation", top_n=10)
promotion_keywords = extract_common_keywords(text_chunks, "promotion", top_n=10)

In [69]:
# 4: カウント＆スコア算出
df_scores = count_and_score(texts, regulation_keywords, promotion_keywords)
df_score

NameError: name 'df_score' is not defined

In [37]:
import json
import re
import pandas as pd

# Step 5: 国ごとの特徴キーワード抽出（Prompt の３リストをパース）
def extract_policy_keywords(chunked_texts: dict, top_n: int = 10) -> dict:
    """
    各国のテキストチャンクから下記を GPT-4o で抽出し、
    ・regulation_targets
    ・promotion_targets
    ・unique_terms
    の３つを辞書で返す関数。
    """
    results = {}

    for country, chunks in chunked_texts.items():
        print(f"[STEP5] START {country}: {len(chunks)} chunks")
        regs_all = []
        promos_all = []
        uniques_all = []

        for idx, chunk in enumerate(chunks, start=1):
            print(f"[STEP5] {country} chunk {idx}/{len(chunks)}")
            prompt = f"""
You are an expert in policy text analysis.

Given the following policy document text chunk, extract three distinct lists of English keywords:
1. “Regulation Targets”: terms that indicate what is being regulated.
2. “Promotion Targets”: terms that indicate what is being promoted.
3. “Unique Terms”: unusually significant words computed by a TF-IDF–like approach, excluding proper nouns such as “AI”, country names, and organization names.

Return a JSON object with these three arrays:
{{
  "regulation_targets": [...],
  "promotion_targets": [...],
  "unique_terms": [...]
}}

Text:
\"\"\"{chunk}\"\"\"
"""
            resp = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role":"system","content":"You are a helpful assistant."},
                    {"role":"user",  "content":prompt}
                ],
                temperature=0
            )
            content = resp.choices[0].message.content.strip()
            print(f"[STEP5] snippet: {content[:100]}...")

            # JSON パース
            try:
                data = json.loads(content)
                regs = data.get("regulation_targets", [])
                promos = data.get("promotion_targets", [])
                uniques = data.get("unique_terms", [])
            except json.JSONDecodeError as e:
                print(f"[STEP5] JSON error: {e}")
                # フォールバック：キーごとに正規表現で抜き出す
                regs = re.findall(r'"regulation_targets"\s*:\s*\[([^\]]+)\]', content)
                promos = re.findall(r'"promotion_targets"\s*:\s*\[([^\]]+)\]', content)
                uniques = re.findall(r'"unique_terms"\s*:\s*\[([^\]]+)\]', content)
                # 上記は文字列なのでカンマで分割＆クオート除去などの後処理が必要です

            regs_all.extend(regs)
            promos_all.extend(promos)
            uniques_all.extend(uniques)

        # 重複除去しつつ先頭 top_n を切り出し
        regs_final   = list(dict.fromkeys(regs_all))[:top_n]
        promos_final = list(dict.fromkeys(promos_all))[:top_n]
        uniques_final= list(dict.fromkeys(uniques_all))[:top_n]

        print(f"[STEP5] {country} regulation_targets: {regs_final}")
        print(f"[STEP5] {country} promotion_targets: {promos_final}")
        print(f"[STEP5] {country} unique_terms: {uniques_final}\n")

        results[country] = {
            "regulation_targets": regs_final,
            "promotion_targets": promos_final,
            "unique_terms": uniques_final
        }

    return results

# 実行例
policy_keywords = extract_policy_keywords(text_chunks, top_n=10)


[STEP5] START US: 4 chunks
[STEP5] US chunk 1/4
[STEP5] snippet: ```json
{
  "regulation_targets": [
    "AI innovation",
    "AI adoption",
    "AI-related Federal ...
[STEP5] JSON error: Expecting value: line 1 column 1 (char 0)
[STEP5] US chunk 2/4
[STEP5] snippet: ```json
{
  "regulation_targets": [
    "AI-related training",
    "AI adoption",
    "AI systems",
...
[STEP5] JSON error: Expecting value: line 1 column 1 (char 0)
[STEP5] US chunk 3/4
[STEP5] snippet: ```json
{
  "regulation_targets": [
    "deepfake standard",
    "Federal Rules of Evidence",
    "e...
[STEP5] JSON error: Expecting value: line 1 column 1 (char 0)
[STEP5] US chunk 4/4
[STEP5] snippet: ```json
{
  "regulation_targets": [
    "AI governance frameworks",
    "facial recognition",
    "s...
[STEP5] JSON error: Expecting value: line 1 column 1 (char 0)
[STEP5] US regulation_targets: ['\n    "AI innovation",\n    "AI adoption",\n    "AI-related Federal funding",\n    "state AI regulations",\n    "AI developm

In [39]:
# --- Step 5 の呼び出し結果を受け取る（関数名は例です） ---
policy_keywords = extract_policy_keywords(text_chunks, top_n=10)

# 辞書からそれぞれのリストを取り出す
regulation_targets  = {c: policy_keywords[c]["regulation_targets"]  for c in policy_keywords}
promotion_targets   = {c: policy_keywords[c]["promotion_targets"]   for c in policy_keywords}
unique_terms        = {c: policy_keywords[c]["unique_terms"]        for c in policy_keywords}

# --- Step 6: CSV出力（Updated） ---
import pandas as pd

df_keywords = pd.DataFrame([
    {
        "country": country,
        "regulation_targets": ", ".join(regulation_targets[country]),
        "promotion_targets": ", ".join(promotion_targets[country]),
        "unique_terms": ", ".join(unique_terms[country])
    }
    for country in texts
])

# DataFrame 確認
display(df_keywords)

# CSV 保存
df_keywords.to_csv("ai_policy_keywords.csv", index=False)


[STEP5] START US: 4 chunks
[STEP5] US chunk 1/4
[STEP5] snippet: ```json
{
  "regulation_targets": [
    "AI innovation",
    "AI adoption",
    "AI-related Federal ...
[STEP5] JSON error: Expecting value: line 1 column 1 (char 0)
[STEP5] US chunk 2/4
[STEP5] snippet: ```json
{
  "regulation_targets": [
    "AI-related training",
    "AI adoption",
    "AI systems",
...
[STEP5] JSON error: Expecting value: line 1 column 1 (char 0)
[STEP5] US chunk 3/4
[STEP5] snippet: ```json
{
  "regulation_targets": [
    "deepfake standard",
    "Federal Rules of Evidence",
    "e...
[STEP5] JSON error: Expecting value: line 1 column 1 (char 0)
[STEP5] US chunk 4/4
[STEP5] snippet: ```json
{
  "regulation_targets": [
    "AI governance frameworks",
    "facial recognition",
    "s...
[STEP5] JSON error: Expecting value: line 1 column 1 (char 0)
[STEP5] US regulation_targets: ['\n    "AI innovation",\n    "AI adoption",\n    "AI-related Federal funding",\n    "state AI regulations",\n    "Federal pro

Unnamed: 0,country,regulation_targets,promotion_targets,unique_terms
0,US,"\n ""AI innovation"",\n ""AI adoption"",\n ...","\n ""AI innovation"",\n ""Open-Source and O...","\n ""frontier"",\n ""transformative"",\n ..."
1,Japan,"\n ""cybersecurity"",\n ""AI ethics"",\n ...","\n ""education reform"",\n ""human resource...","\n ""resilience"",\n ""sustainability"",\n ..."


In [43]:
# Step 6: CSV出力（Updated）
df_keywords = pd.DataFrame([
    {
        "country": country,
        "regulation_targets": ", ".join(regulation_targets[country]),
        "promotion_targets": ", ".join(promotion_targets[country]),
        "unique_terms": ", ".join(unique_terms[country])
    }
    for country in texts
])

# DataFrame 確認
display(df_keywords)

# CSV 保存
df_keywords.to_csv("ai_policy_keywords2.csv", index=False)

# UTF-8 (BOM付き) で書き出す
df_keywords.to_csv(
    "ai_policy_keywords3.csv",
    index=False,
    encoding="utf-8-sig",
    header=True
)


Unnamed: 0,country,regulation_targets,promotion_targets,unique_terms
0,US,"\n ""AI innovation"",\n ""AI adoption"",\n ...","\n ""AI innovation"",\n ""Open-Source and O...","\n ""frontier"",\n ""transformative"",\n ..."
1,Japan,"\n ""cybersecurity"",\n ""AI ethics"",\n ...","\n ""education reform"",\n ""human resource...","\n ""resilience"",\n ""sustainability"",\n ..."


In [45]:
# Step 6: CSV出力（Updated）
def clean_terms(terms: list[str]) -> list[str]:
    """
    各キーワードから改行コードと余分なダブルクオートを除去し、
    前後の空白をトリムするヘルパー関数
    """
    cleaned = []
    for t in terms:
        s = t.replace("\n", " ").replace('"', "").strip()
        cleaned.append(s)
    return cleaned

# Step 6: CSV出力（改良版）
rows = []
for country in texts:
    regs   = clean_terms(regulation_targets[country])
    promos = clean_terms(promotion_targets[country])
    uniques= clean_terms(unique_terms[country])
    rows.append({
        "country": country,
        "regulation_targets": ", ".join(regs),
        "promotion_targets": ", ".join(promos),
        "unique_terms": ", ".join(uniques)
    })

df_keywords = pd.DataFrame(rows)

# DataFrame 確認
display(df_keywords)

# UTF-8 BOM付きで書き出し（Excel 互換性向上）
df_keywords.to_csv(
    "ai_policy_keywords_clean.csv",
    index=False,
    encoding="utf-8-sig",
    header=True
)


Unnamed: 0,country,regulation_targets,promotion_targets,unique_terms
0,US,"AI innovation, AI adoption, AI-related...","AI innovation, Open-Source and Open-Weight...","frontier, transformative, unencumbered..."
1,Japan,"cybersecurity, AI ethics, data-related...","education reform, human resources developm...","resilience, sustainability, diversity,..."


In [None]:
# Step 6: 結果統合＆CSV出力
texts = load_pdfs(pdf_paths)
text_chunks = chunk_texts(texts)
