In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the Drive paths
# used by the nbconvert cell below resolve inside this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Export the model notebook as a Python script into the same Drive folder so that
# app.py / app.ipynb can import it.
# NOTE(review): the intended module name was documented here as
# `news_summarizer_model.py` (underscore), but `--output` below writes
# `news-summarizer_model.py` (hyphen), which a plain `import` statement cannot
# reference. Confirm which name the app actually loads before renaming either side.
NB="/content/drive/MyDrive/Colab Notebooks/news-summarizer_demo/news-summarizer_model.ipynb"
OUTDIR="/content/drive/MyDrive/Colab Notebooks/news-summarizer_demo"

!jupyter nbconvert --to python "$NB" \
  --output "news-summarizer_model" \
  --output-dir "$OUTDIR"

[NbConvertApp] Converting notebook /content/drive/MyDrive/Colab Notebooks/news-summarizer_demo/news-summarizer_model.ipynb to python
[NbConvertApp] Writing 2646 bytes to /content/drive/MyDrive/Colab Notebooks/news-summarizer_demo/news-summarizer_model.py


In [None]:
!pip install transformers langchain langchain-community langchain-huggingface

Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydant

In [1]:
from transformers import pipeline
# from langchain.llms import HuggingFacePipeline
# from langchain.prompts import PromptTemplate
# from langchain_huggingface.llms import HuggingFacePipeline
# from langchain_core.prompts import PromptTemplate

In [22]:
# 入力テキストの長さによって出力トークン数をダイナミックに設定する
from math import ceil

# 言語ごとに適切な圧縮率レンジと上限キャップを指定

# ---- Generic default profile (assumes whitespace-delimited languages such as
# ---- English / German / French / Spanish / Arabic) ----
# Keys, as consumed by _target_len_by_ratio() and dynamic_params():
#   ratio          - per mode, (low, high) fraction of the input token count;
#                    the midpoint is used as the target summary length
#   cap            - per mode, (min, max) clamp applied to that target
#   no_repeat      - forwarded as `no_repeat_ngram_size`
#   length_penalty - per mode, forwarded as `length_penalty`
DEFAULT_PROFILE = dict(
    ratio=dict(short=(0.11, 0.18), medium=(0.18, 0.28), long=(0.28, 0.43)),
    cap=dict(short=(40, 190), medium=(120, 380), long=(190, 620)),
    no_repeat=3,
    length_penalty=dict(short=0.9, medium=1.05, long=1.2),
)

# ---- Languages that need their own ratio ranges / caps ----
# Each entry has the same shape as the generic default profile.
SPECIAL_LANG_PROFILES = {
    # Japanese
    "ja": dict(
        ratio=dict(short=(0.08, 0.15), medium=(0.15, 0.25), long=(0.25, 0.40)),
        cap=dict(short=(32, 160), medium=(100, 320), long=(160, 520)),
        no_repeat=3,
        length_penalty=dict(short=0.9, medium=1.05, long=1.2),
    ),
    # Chinese (shared by Simplified and Traditional)
    "zh": dict(
        ratio=dict(short=(0.05, 0.12), medium=(0.12, 0.20), long=(0.20, 0.35)),
        cap=dict(short=(28, 130), medium=(85, 260), long=(140, 420)),
        no_repeat=4,
        length_penalty=dict(short=0.9, medium=1.05, long=1.2),
    ),
    # Korean
    "ko": dict(
        ratio=dict(short=(0.07, 0.14), medium=(0.14, 0.23), long=(0.23, 0.38)),
        cap=dict(short=(30, 145), medium=(95, 290), long=(150, 470)),
        no_repeat=3,
        length_penalty=dict(short=0.9, medium=1.05, long=1.2),
    ),
    # Scripts with little or no word spacing: Thai, Lao, Khmer, Burmese.
    # Repetition is more likely depending on the model, so no_repeat=4, and the
    # ratios are kept slightly low, close to the CJK profiles.
    # The dict literal is evaluated once per language code, so every language
    # still receives its own independent profile object.
    **{
        code: dict(
            ratio=dict(short=(0.07, 0.14), medium=(0.14, 0.22), long=(0.22, 0.36)),
            cap=dict(short=(30, 145), medium=(95, 280), long=(150, 450)),
            no_repeat=4,
            length_penalty=dict(short=0.9, medium=1.05, long=1.2),
        )
        for code in ("th", "lo", "km", "my")
    },
}

In [26]:
# 文字スクリプトからの言語推定
import re

_CJK_RE = re.compile(r"[\u4E00-\u9FFF\u3040-\u30FF\uAC00-\uD7AF]")
_ZH_RE  = re.compile(r"[\u4E00-\u9FFF]")      # 漢字のみが多ければ zh 寄せ
_JA_RE  = re.compile(r"[\u3040-\u30FF]")      # かなを含めば ja
_KO_RE  = re.compile(r"[\uAC00-\uD7AF]")      # ハングル音節
_TH_RE  = re.compile(r"[\u0E00-\u0E7F]")      # タイ
_LO_RE  = re.compile(r"[\u0E80-\u0EFF]")      # ラオ
_KM_RE  = re.compile(r"[\u1780-\u17FF]")      # クメール
_MY_RE  = re.compile(r"[\u1000-\u109F]")      # ビルマ

def infer_lang(text: str) -> str | None:
    if _JA_RE.search(text): return "ja"
    if _KO_RE.search(text): return "ko"
    if _ZH_RE.search(text): return "zh"
    if _TH_RE.search(text): return "th"
    if _LO_RE.search(text): return "lo"
    if _KM_RE.search(text): return "km"
    if _MY_RE.search(text): return "my"
    if _CJK_RE.search(text): return "zh"  # フォールバックで zh
    return None

# Profile selection logic
def pick_profile(lang_code: str | None = None, text: str | None = None):
    """Return the length/decoding profile to use for a summarisation request.

    Resolution order:
      1. ``lang_code``, matched case-insensitively against SPECIAL_LANG_PROFILES,
         first as given and then by its primary subtag, so region- or
         script-qualified tags such as "ja-JP", "zh_TW" or "zh-Hant" resolve to
         "ja" / "zh".
      2. When the code is missing or has no dedicated profile, the language
         guessed from ``text`` by ``infer_lang``.
      3. DEFAULT_PROFILE.

    Args:
        lang_code: Optional language code or tag.
        text: Optional input text, used only for script-based inference.

    Returns:
        The matching profile dict (the shared module-level object, not a copy).
    """
    code = (lang_code or "").strip().lower().replace("_", "-")
    primary = code.split("-", 1)[0]
    for candidate in (code, primary):
        if candidate in SPECIAL_LANG_PROFILES:
            return SPECIAL_LANG_PROFILES[candidate]

    # No code, or a code without a dedicated profile: infer from the script.
    if text:
        inferred = infer_lang(text)
        if inferred and inferred in SPECIAL_LANG_PROFILES:
            return SPECIAL_LANG_PROFILES[inferred]

    # Nothing matched: fall back to the generic default.
    return DEFAULT_PROFILE


# 既存のダイナミックパラメータ関数をプロファイル対応に
def _target_len_by_ratio(n_in: int, mode: str, prof: dict) -> int:
    lo, hi = prof["ratio"][mode]
    cap_lo, cap_hi = prof["cap"][mode]
    target = int(n_in * ((lo + hi) / 2))
    return max(cap_lo, min(target, cap_hi))

def dynamic_params(
    n_in_tokens: int,
    mode: str,
    lang_code: str | None = None,
    text: str | None = None
):
    """Derive generation keyword arguments from the input length.

    Args:
        n_in_tokens: Number of tokens in the input.
        mode: One of "short" / "medium" / "long".
        lang_code: Optional language code used to pick the profile.
        text: Optional input text; also used to infer the language when the
            code does not select a dedicated profile.

    Returns:
        dict with ``min_new_tokens``, ``max_new_tokens``, ``length_penalty`` and
        ``no_repeat_ngram_size``. The minimum is 60% of the target length but
        never below 10; the maximum is the target but always at least 10 tokens
        above the minimum.

    NOTE(review): the profile caps put a floor on the target, so for very short
    inputs the requested minimum can exceed the input length. Confirm whether
    that is intended.
    """
    profile = pick_profile(lang_code, text)
    goal = _target_len_by_ratio(n_in_tokens, mode, profile)

    lower = int(goal * 0.6)
    if lower < 10:
        lower = 10
    upper = goal if goal > lower + 10 else lower + 10

    return dict(
        min_new_tokens=lower,
        max_new_tokens=upper,
        length_penalty=profile["length_penalty"][mode],
        no_repeat_ngram_size=profile["no_repeat"],
    )

In [32]:
# Japanese summarisation pipeline. The name is kept so existing references
# to `_summarizer_ja` continue to work.
_summarizer_ja = pipeline(
    "summarization",
    model="tsmatz/mt5_summarize_japanese",
    tokenizer="tsmatz/mt5_summarize_japanese",
)

# Summarisation pipeline per language code.
_summarizers = {
    # Japanese: reuse the pipeline built above instead of loading the same
    # checkpoint into memory a second time.
    "ja": _summarizer_ja,
    # English (also the fallback used by run_summary for unsupported languages).
    # NOTE(review): this checkpoint is the arXiv variant, and the recorded run
    # on the English news sample in this notebook produced an off-topic summary.
    # Consider evaluating a news-domain summarisation checkpoint here.
    "en": pipeline(
        "summarization",
        model="google/bigbird-pegasus-large-arxiv",
        tokenizer="google/bigbird-pegasus-large-arxiv",
    ),
}

Device set to use cpu
Device set to use cpu


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.31G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.31G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.92M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

Device set to use cpu


In [48]:
# 出力のクリーンアップ
import re
import unicodedata

def clean_summary(text: str) -> str:
    """Normalise and tidy a raw model summary.

    Steps, in order: NFKC normalisation, ``<n>`` markers to newlines, removal
    of bracket/pipe/angle noise runs, whitespace fixes around punctuation,
    brackets and double quotes, collapsing of over-long punctuation runs, and
    whitespace compaction.

    Args:
        text: Raw summary text. Falsy values (``""`` / ``None``) are returned
            unchanged.

    Returns:
        The cleaned summary, stripped of leading/trailing whitespace.
    """
    if not text:
        return text

    # Unicode normalisation (evens out full-/half-width and composed forms).
    # Note that NFKC also expands U+2026 "…" into three ASCII dots.
    s = unicodedata.normalize("NFKC", text)

    # <n> -> newline
    s = re.sub(r"\s*<\s*n\s*>\s*", "\n", s, flags=re.IGNORECASE)
    # Drop training-noise symbol runs.
    s = re.sub(r"\s*[\[\]\{\}\|]{2,}\s*", " ", s)
    # Grouped so both "<<" and ">>" runs get the same surrounding-whitespace handling.
    s = re.sub(r"\s*(?:<{2,}|>{2,})\s*", " ", s)

    # No whitespace before punctuation.
    s = re.sub(r"\s+([,.:;!?%])", r"\1", s)

    # No whitespace just inside brackets.
    s = re.sub(r"([(\[\{])\s+", r"\1", s)
    s = re.sub(r"\s+([)\]\}])", r"\1", s)

    # A quote directly after a colon gets a separating space:
    #   says:"the -> says: "the
    s = re.sub(r":([\"'])", r": \1", s)

    # Trim whitespace just inside *paired* double quotes on one line:
    #   " hello " -> "hello"
    # Whitespace outside the quotes is left alone, so quoted text is not glued
    # to neighbouring words and the colon spacing above is preserved. Single
    # quotes are not touched because they cannot be told apart from apostrophes.
    s = re.sub(
        r'"([^"\n]*)"',
        lambda m: '"' + m.group(1).strip(" \t") + '"',
        s,
    )

    # Tame over-long punctuation runs without destroying ellipses:
    # four or more dots become "...", and runs of three or more "!"/"?" keep
    # only their first two characters.
    s = re.sub(r"\.{4,}", "...", s)
    s = re.sub(r"[!?]{3,}", lambda m: m.group(0)[:2], s)

    # Collapse repeated spaces/tabs and excessive blank lines.
    s = re.sub(r"[ \t]{2,}", " ", s)
    s = re.sub(r"\n{3,}", "\n\n", s)

    return s.strip()

In [42]:
def run_summary(text: str, mode: str = "medium", lang_code: str | None = None):
    # 言語コードが無ければ推定
    lang = lang_code or infer_lang(text)
    if lang not in _summarizers:
        lang = "en"  # 未対応言語はデフォルト英語にフォールバック
    print("Original text language: ", lang)

    summarizer = _summarizers[lang]

    # 入力トークン数の算出（tokenizer は summarizer から取得）
    n_in = len(summarizer.tokenizer.encode(text, add_special_tokens=False))

    # 言語 + モードに応じてパラメータ算出
    params = dynamic_params(n_in, mode, lang_code=lang, text=text)

    # サマリ生成
    out = summarizer(
        text,
        do_sample=False,
        num_beams=4,
        truncation=True,
        repetition_penalty=1.1,
        **params
    )
    raw =  out[0]["summary_text"]
    return clean_summary(raw)

In [37]:
# Japanese sample article (on the overseas popularity of Japanese "feel-good"
# novels) used as input when exercising the Japanese summarisation path.
news_sample_ja = """
最近、「フィールグッド・ブック」つまり「気分が良くなる小説」が欧米を中心に人気を集めており、なかでも日本人作家の作品が注目されている。スペイン「エル・パイス」紙は、このジャンルについてこう説明している。

「そこには現実に似た世界が広がっているが、他人を傷つけるような要素はない。過度な期待はなく、各人が静かに、そして謙虚に『これが幸福だ』と思えるものだけがある」

こうしたフィールグッド作品として、八木沢里志の『森崎書店の日々』や柏井壽の『鴨川食堂』シリーズが紹介されている。

このブームは、英国に起因するとエル・パイスは指摘する。同国では2024年、翻訳書のうち43％が日本作品だったのだ。スペインで柏井作品の発掘者である編集者のアニック・ラポワントは同紙にこう話す。

「日本文化への関心は長年かけて高まってきました。食、マンガ、アニメ、音楽、映画──そしていま、これらの本が新しい扉を開こうとしています。より内省的で、深く心に響く感性への扉を」

ラポワントが『鴨川食堂』に出会った頃、川口俊和の『コーヒーが冷めないうちに』がすでに道を切り開いていた。2015年に刊行され、世界で500万部以上を売り上げている。さらにラポワントは、フィールグッド作品の必要性をこう説明する。

「パンデミック後、多くの人が心の避難所になる本を求めました。単なる娯楽ではなく、心をケアしてくれる物語。世界的に広がる不安のなかで、争いも怒鳴り合いではなく、理解で解決される物語を求めたのです」

また、スペインのヒホンで18年前に創業した日本文化・文学専門の出版社「Satori Ediciones」の編集者マリアン・バンゴはこう語っている。

「日本文学は物語の途中から始まり、結末をはっきり示さない。それでも、読後に残る感覚が時間とともに深まります。日本文学は、内向的で本物志向。だからブームを超えて人を惹きつけるのです」
"""

In [29]:
# Summarise the Japanese sample in "short" mode; no lang_code is passed, so the
# language is inferred from the text.
run_summary(news_sample_ja, "short")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


'日本文化・文学専門出版社「フィールグッド・ブック」が、世界中で人気を集めている。この作品は、世界的に流行している。'

In [47]:
# English sample news article (report on the planned Gaza City ground
# offensive) used as input when exercising the English summarisation path.
news_sample_en = """
The Israeli military says it has begun the "preliminary actions" of a planned ground offensive to capture and occupy all of Gaza City and already has a hold on its outskirts.

A military spokesman said troops were already operating in the Zeitoun and Jabalia areas to lay the groundwork for the offensive, which Defence Minister Israel Katz approved on Tuesday and which will be put to the security cabinet later this week.

About 60,000 reservists are being called up for the beginning of September to free up active-duty personnel for the operation.

Hamas has accused Israel of obstructing a ceasefire deal in favour of continuing a "brutal war against innocent civilians", Reuters news agency reported.


Hundreds of thousands of Palestinians in Gaza City are expected to be ordered to evacuate and head to shelters in southern Gaza as preparations for Israel's takeover plan get under way.

Many of Israel's allies have condemned its plan, with French President Emmanuel Macron warning on Wednesday that it "can only lead to disaster for both peoples and risks plunging the entire region into a cycle of permanent war".

The International Committee of the Red Cross (ICRC) meanwhile said further displacement and an intensification of hostilities "risk worsening an already catastrophic situation" for Gaza's 2.1 million population.

Israel's government announced its intention to conquer the entire Gaza Strip after indirect talks with Hamas on a ceasefire and hostage release deal broke down last month.

Speaking at a televised briefing on Wednesday, Israel Defense Forces (IDF) spokesperson Brigadier General Effie Defrin said Hamas was "battered and bruised" after 22 months of war.

"We will deepen the damage to Hamas in Gaza City, a stronghold of governmental and military terror for the terrorist organisation," he added. "We will deepen the damage to the terror infrastructure above and below the ground and sever the population's dependence on Hamas."

But Defrin said the IDF was "not waiting" to begin the operation.

"We have begun the preliminary actions, and already now, IDF troops are holding the outskirts of Gaza City."

Two brigades were operating on the ground in the Zeitoun neighbourhood, where in recent days they had located an underground tunnel that contained weapons, and a third brigade was operating in the Jabalia area, he added.

In order to "minimise harm to civilians," he said, Gaza City's civilian population would be warned to evacuate for their safety.

A spokesman for Gaza's Hamas-run Civil Defence agency, Mahmoud Bassal, told AFP news agency on Tuesday that the situation was "very dangerous and unbearable" in the city's Zeitoun and Sabra neighbourhoods.

The agency reported that Israeli strikes and fire had killed 25 people across the territory on Wednesday. They included three children and their parents whose home in the Badr area of Shati refugee camp, west of Gaza City, was bombed, it said.

Defrin said the IDF was also doing everything possible to prevent harm to the 50 hostages still being held by Hamas in Gaza, 20 of whom are believed to be alive. Their families have expressed fears that those in Gaza City could be endangered by a ground offensive.

The ICRC warned of a catastrophic situation for both Palestinian civilians and the hostages if military activity in Gaza intensified.

"After months of relentless hostilities and repeated displacement, the people in Gaza are utterly exhausted. What they need is not more pressure, but relief. Not more fear, but a chance to breathe. They must have access to the essentials to live in dignity: food, medical and hygiene supplies, clean water, and safe shelter," a statement said.

"Any further intensification of military operations will only deepen the suffering, tear more families apart, and threaten an irreversible humanitarian crisis. The lives of hostages may also be put at risk," it added.

It called for an immediate ceasefire and the rapid and unimpeded passage of humanitarian assistance across Gaza.

Mediators Qatar and Egypt are trying to secure a ceasefire deal and have presented a new proposal for a 60-day truce and the release of around half of the hostages, which Hamas said it had accepted on Monday.

Israel has not yet submitted a formal response, but Israeli officials insisted on Tuesday that they would no longer accept a partial deal and demanded a comprehensive one that would see all the hostages released.

On Wednesday Hamas accused Israeli Prime Minister Benjamin Netanyahu of disregarding the mediators' ceasefire proposal and said he was the "real obstructionist of any agreement", according to a statement cited by Reuters.

The Israeli military launched a campaign in Gaza in response to the Hamas-led attack on southern Israel on 7 October 2023, in which about 1,200 people were killed and 251 others were taken hostage.

At least 62,122 people have been killed in Gaza since then, according to the territory's health ministry. The ministry's figures are quoted by the UN and others as the most reliable source of statistics available on casualties.
"""

In [43]:
# Summarise the English sample in "short" mode. No script-specific characters
# are detected, so run_summary falls back to the "en" pipeline.
run_summary(news_sample_en, "short")

Original text language:  en


'according to a report in the zeitschrift fr theoretische physik, on 15 may 2015, the government of argentina announced the beginning of an operation to capture and occupy the territory of iberia.\nthe statement says:"the government wishes to express its deepest regrets for the deaths that have taken place during the operation and for the injuries that have been sustained by the projectiles that have projectiled at the positions of the government and its people.'

In [49]:
# Repeat of the English "short" run (presumably re-executed after editing
# clean_summary; the recorded output is unchanged).
run_summary(news_sample_en, "short")

Original text language:  en


'according to a report in the zeitschrift fr theoretische physik, on 15 may 2015, the government of argentina announced the beginning of an operation to capture and occupy the territory of iberia.\nthe statement says:"the government wishes to express its deepest regrets for the deaths that have taken place during the operation and for the injuries that have been sustained by the projectiles that have projectiled at the positions of the government and its people.'

In [36]:
# Summarise the Japanese sample in "medium" mode for comparison with the
# "short" run.
run_summary(news_sample_ja, "medium")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Original text language:  ja


'日本文化・文学専門出版社「フィールグッド・ブック」が、世界中で人気を集めている。この作品は、世界各地で話題を集めており、多くの人が心をケアしてくれる本を求めている。こうした本は、日本文化への関心が高まるなか、多くの人は心をケアしてくれる――。'

In [14]:
news_sample_short = """
［テルアビブ／エルサレム／カイロ　２０日　ロイター］ - イスラエル軍報道官のデフリン准将は２０日、イスラエル軍がパレスチナ自治区のガザ市制圧に向け計画する作戦の第一歩を踏み出したと発表した。
デフリン准将は、ガザ南部ハンユニスで起きたイスラム組織ハマスとの衝突を受け、ハマスの拠点である「ガザ市でのハマスへの攻撃を強化する」と言明。「われわれはガザ市攻撃の予備作戦および第一段階を開始した」とし、イスラエル国防軍はすでにガザ市郊外を包囲し始め、ハマスは今や「打ちのめされ傷ついている」と述べた。
"""