In [None]:
# Ollama model tag: pulled once via ollama.pull and used for every generate call.
MODEL_NAME = "gpt-oss:20b"
# Forwarded to Ollama as the `num_ctx` option of each generate call.
NUM_CTX = 8192
# Upper bound on the size of one text chunk, counted in spaCy tokens by
# split_by_token_size (not by the model's own tokenizer).
MAX_TOKENS = 2048

# Folder scanned for "*.txt" inputs, and folder that receives the summary files.
INPUT_DIRECTORY = "/content/drive/MyDrive/in"
OUTPUT_DIRECTORY = "/content/drive/MyDrive/out"

# System prompt sent along with every chunk.
SYSTEM_PROMPT = """You are a concise Japanese summarizer.
Summarize the following passage in ≤ 30 words, using plain Japanese.
Do not add any introduction, conclusion, or extra explanation.
Return the summary as a single line.
reasoning: low
"""

In [None]:
%env OLLAMA_FLASH_ATTENTION=1
%env OLLAMA_KV_CACHE_TYPE="q4_k_m"
%env OLLAMA_NO_HISTORY=1

In [None]:
from google.colab import drive

# Mount Google Drive; INPUT_DIRECTORY and OUTPUT_DIRECTORY live under this mount point.
drive.mount("/content/drive")

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm
!python -m spacy download ja_core_news_sm

In [None]:
import spacy


# Loaded spaCy pipelines keyed by model name, so each model is loaded at most
# once per kernel session instead of once per call.
_SPACY_PIPELINES = {}


def _get_pipeline(lang):
    """Return the (cached) spaCy pipeline for ``lang``: Japanese for "ja", English otherwise."""
    model_name = "ja_core_news_sm" if lang == "ja" else "en_core_web_sm"
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def _tokens_to_text(tokens):
    """Rebuild the original text of ``tokens``, trimmed of surrounding whitespace."""
    # text_with_ws already carries each token's trailing whitespace, so the
    # pieces must be concatenated directly; joining with " " would insert an
    # extra space after every token (and between every Japanese token).
    return "".join(token.text_with_ws for token in tokens).strip()


def split_by_token_size(text, max_tokens, lang="en"):
    """Split ``text`` into chunks of whole sentences of at most ``max_tokens`` tokens.

    Sentences are never cut: a chunk is closed as soon as adding the next
    sentence would exceed ``max_tokens``. A single sentence that is longer than
    ``max_tokens`` on its own is therefore emitted as one over-sized chunk.

    Args:
        text: The text to split.
        max_tokens: Maximum number of spaCy tokens per chunk.
        lang: "ja" selects the Japanese model; any other value selects English.

    Returns:
        list[str]: The chunks in document order, each stripped of surrounding
        whitespace. Chunks that would be empty after stripping are omitted.
    """
    doc = _get_pipeline(lang)(text)

    chunks = []
    current = []
    cur_len = 0
    for sent in doc.sents:
        sent_len = len(sent)
        if current and cur_len + sent_len > max_tokens:
            chunk_text = _tokens_to_text(current)
            if chunk_text:
                chunks.append(chunk_text)
            current = []
            cur_len = 0
        current.extend(sent)
        cur_len += sent_len

    if current:
        chunk_text = _tokens_to_text(current)
        if chunk_text:
            chunks.append(chunk_text)

    return chunks

In [None]:
# Refresh the apt package index, then install pciutils.
# NOTE(review): pciutils is presumably needed so the Ollama installer can
# detect the GPU via lspci — confirm.
!sudo apt update
!sudo apt install -y pciutils
# Download the install script published at ollama.com and run it with sh.
!curl -fsSL https://ollama.com/install.sh | sh

In [None]:
import threading
import subprocess
import time


def run_ollama_serve():
    """Launch ``ollama serve`` as a background process.

    ``subprocess.Popen`` returns immediately, so no helper thread is needed.

    Returns:
        subprocess.Popen: Handle of the spawned server process, so the caller
        can inspect or terminate it later.
    """
    return subprocess.Popen(["ollama", "serve"])


def wait_for_ollama(server, url="http://127.0.0.1:11434", timeout=60.0, poll_interval=0.5):
    """Block until the Ollama HTTP endpoint answers instead of sleeping a fixed time.

    Args:
        server: The ``Popen`` handle returned by ``run_ollama_serve``.
        url: Address to probe. The default is Ollama's documented default
            listen address; adjust it if OLLAMA_HOST is set.
        timeout: Maximum number of seconds to wait.
        poll_interval: Seconds between probes (also the per-probe timeout).

    Raises:
        RuntimeError: If the server process exits and nothing answers at ``url``.
        TimeoutError: If nothing answers within ``timeout`` seconds.
    """
    import urllib.request

    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=poll_interval):
                return
        except OSError:
            # Not reachable yet (urllib's URLError is an OSError subclass).
            pass
        # Probe first, check the process second: when a server from an earlier
        # run of this cell is still alive, the new process exits because the
        # port is taken, yet the endpoint is reachable and that is fine.
        if server.poll() is not None:
            raise RuntimeError(
                f"`ollama serve` exited with code {server.returncode} before becoming ready"
            )
        time.sleep(poll_interval)
    raise TimeoutError(f"Ollama did not answer at {url} within {timeout} seconds")


ollama_server = run_ollama_serve()
wait_for_ollama(ollama_server)

In [None]:
!pip install ollama

In [None]:
import ollama
import os
import glob

# Pull MODEL_NAME through the Ollama client before the first generate call.
ollama.pull(MODEL_NAME)

In [None]:
def summarize_file(file):
    """Summarize one text file chunk by chunk, writing one summary file per chunk.

    The file is split with ``split_by_token_size`` (at most ``MAX_TOKENS``
    tokens per chunk). Each chunk is sent to the model on its own and the
    response is written to
    ``<OUTPUT_DIRECTORY>/<input stem>_summary_<NNN>.txt``.

    Args:
        file: Path of the UTF-8 text file to summarize.

    Returns:
        list[str]: Paths of the summary files written, in chunk order.
    """
    # Read as UTF-8 explicitly, matching the encoding used for the output.
    with open(file, "r", encoding="utf-8") as src:
        text = src.read()

    # The output folder may not exist yet on a fresh Drive.
    os.makedirs(OUTPUT_DIRECTORY, exist_ok=True)
    stem = os.path.splitext(os.path.basename(file))[0]

    written = []
    for i, chunk in enumerate(split_by_token_size(text, max_tokens=MAX_TOKENS)):
        result = ollama.generate(
            model=MODEL_NAME,
            # Send only the current chunk; passing the whole text here would
            # produce the same whole-file request for every chunk.
            prompt=chunk,
            system=SYSTEM_PROMPT,
            options={"num_ctx": NUM_CTX},
        )

        out_path = os.path.join(OUTPUT_DIRECTORY, f"{stem}_summary_{i:03d}.txt")
        with open(out_path, "w", encoding="utf-8") as dst:
            dst.write(str(result["response"]))
        written.append(out_path)

    return written

In [None]:
# Collect every .txt file in the input folder and summarize them in sorted path order.
file_list = glob.glob(os.path.join(INPUT_DIRECTORY, "*.txt"))
file_list.sort()
for file in file_list:
    summarize_file(file)