# Summarize large papers

In [9]:
from rolling.processing import list_processed, load_document
# Find the processed DeepSeek-V3 paper among all processed documents and load it.
processed = list_processed()
idx = [position for position, path in enumerate(processed) if "deepseek-v3" in path]
selected_path = processed[idx[0]]  # first match; IndexError if the paper is absent
print(selected_path)
doc = load_document(selected_path)
doc

./arxiv_downloads_processed\2412.19437_deepseek-v3_technical_report


Content(type='document', level=-1, data={}, children=[Content(type='chapter', level=0, data={'type': 'text', 'text': 'DeepSeek-V3 Technical Report ', 'text_level': 1, 'page_idx': 0}, children=[Content(type='text', level=1, data={'type': 'text', 'text': 'DeepSeek-AI ', 'page_idx': 0}, children=[]), Content(type='text', level=1, data={'type': 'text', 'text': 'research@deepseek.com ', 'page_idx': 0}, children=[])]), Content(type='chapter', level=0, data={'type': 'text', 'text': 'Abstract ', 'text_level': 1, 'page_idx': 0}, children=[Content(type='text', level=1, data={'type': 'text', 'text': 'We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architectures, which were thoroughly validated in DeepSeek-V2. Furthermore, DeepSeek-V3 pioneers an auxiliary-loss-free strategy for load b

In [10]:
from rolling.processing import printable_content, get_content_hierarchy, Content
# Print the document as rendered by printable_content; with indent=None the
# output is the single-line JSON shown in the cell output.
print(printable_content(doc, indent=None))

{"type": "document", "level": -1, "data": {}, "children": [{"type": "chapter", "level": 0, "data": {"text": "DeepSeek-V3 Technical Report "}, "children": [{"type": "text", "level": 1, "data": {"text": "DeepSeek-AI "}}, {"type": "text", "level": 1, "data": {"text": "research@deepseek.com "}}]}, {"type": "chapter", "level": 0, "data": {"text": "Abstract "}, "children": [{"type": "text", "level": 1, "data": {"text": "We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architectures, which were thoroughly validated in DeepSeek-V2. Furthermore, DeepSeek-V3 pioneers an auxiliary-loss-free strategy for load balancing and sets a multi-token prediction training objective for stronger performance. We pre-train DeepSeek-V3 on 14.8 trillion diverse and high-quality tokens, followed by Supe

In [16]:
import json
import pydantic
import ollama
import random

# Structured-output model for one chapter summary: its JSON schema is passed to
# ollama.generate(format=...) and the model's reply is parsed back into it.
# NOTE: documented with `#` comments on purpose -- pydantic copies a class
# docstring into the generated JSON schema as "description", which would change
# the schema sent to the LLM.
class ChapterSummary(pydantic.BaseModel):
    chapter_title: str  # title of the chapter the summary belongs to
    summaries: list[str]  # summary points; empty when nothing is worth summarizing


def summarize_doc(doc: Content, llm_model: str = "mistral-small3.1", min_tokens: int = 4096):
    """Summarize every top-level chapter of ``doc`` with an Ollama model.

    Chapters are processed in document order. Each prompt embeds the document
    hierarchy and the summaries produced so far, so the model is asked to
    report only information that is new relative to earlier chapters.

    Args:
        doc: Document whose direct children are all ``chapter`` nodes.
        llm_model: Name of the Ollama model used for generation.
        min_tokens: Lower bound for the ``num_ctx`` option sent to Ollama.

    Returns:
        A list with one ``{"chapter_title": ..., "summaries": [...]}`` dict per
        chapter, or ``None`` as soon as one chapter fails (the error is printed
        and the results gathered so far are discarded).

    Raises:
        AssertionError: If a direct child of ``doc`` is not a chapter.
    """
    # The hierarchy is the same for every chapter, so serialize it only once.
    hierarchy_json = json.dumps(get_content_hierarchy(doc))

    summaries = []
    for chapter in doc.children:
        # Raised explicitly (not via ``assert``) so the check survives ``python -O``
        # while callers still see the same exception type.
        if chapter.type != "chapter":
            raise AssertionError("Chapter type mismatch")

        title = chapter.data["text"]

        if not chapter.children:
            # Nothing to distill: record the title with no points, skip the LLM call.
            summaries.append(
                ChapterSummary(chapter_title=title, summaries=[]).model_dump()
            )
            continue

        prompt = f"""
**Role:** Expert Knowledge Distiller.

**Objective:** Analyze the 'Current Chapter Content'. Your **sole focus** is to extract and synthesize the **absolute core knowledge** (key findings, critical methods/definitions, essential results, crucial context) presented within. Disregard chapter structure and meta-commentary. The goal is a highly condensed summary enabling understanding of the chapter's *contribution* without reading it.

**CRITICAL Instructions:**
1.  **NO OUTLINES / STRUCTURE:** **Do NOT** list section titles, subsection numbers, or describe the chapter's organization (e.g., AVOID "This chapter discusses X in section Y"). Summarize the *information itself*, not the chapter's structure.
2.  **SUBSTANCE ONLY:** Generate summary points **only** if the chapter presents significant *new substantive information* (e.g., a new technique, a key result, a core definition) compared to previous summaries. Ignore introductory phrases, forward references, or purely structural sections.
3.  **EXTREME CONCISENESS:** Aim for **highly condensed** points (like bullet points). Capture the *essence* only. If a detail isn't critical for understanding the main point, omit it. Think "executive summary" level for each key concept.
4.  **Accuracy:** Base summaries strictly on the provided 'Current Chapter Content'. No external knowledge or interpretation.
5.  **Empty if Necessary:** If a chapter contains primarily structure, table of contents, people, introductions, references, or information already covered, return an empty `summaries` list (`[]`). Do not summarize just for the sake of summarizing.
6.  **Output Format:**
    * Return `[]` for `summaries` if no new *substantive* core knowledge is found.
    * Otherwise, return a list of concise strings in `summaries`.
    * The `chapter_title` field *must* exactly match the 'Current Chapter Title'.

**Context:**
* Document Hierarchy: ```{hierarchy_json}``` (Provides background context only)
* Previous Chapter Summaries: ```{json.dumps(summaries)}```
* Current Chapter Title: ``{title}`` (Use this exact title in output)

**Current Chapter Content to Distill:**
```{printable_content(chapter, indent=None)}```

Generate the JSON output conforming to the ChapterSummary structure.
"""

        # Per-chapter boundary: any failure (model call, malformed JSON, wrong
        # title) is printed and aborts the whole run with ``None``.
        try:
            print("-" * 80)
            print(prompt)
            print("-" * 40)
            # Heuristic: budget three tokens per whitespace-separated word,
            # never going below the caller-supplied minimum.
            context_length = max(min_tokens, len(prompt.split()) * 3)
            seed = random.randint(0, 2**30 - 1)
            print(f"Context length: {context_length} / Seed: {seed}")
            schema = ChapterSummary.model_json_schema()
            reply = ollama.generate(
                llm_model,
                prompt,
                format=schema,
                options={
                    "num_ctx": context_length,
                    "seed": seed,
                },
            )
            result = ChapterSummary.model_validate_json(reply["response"])
            print(result.model_dump())

            # The model must echo the title (ignoring case and outer whitespace).
            if result.chapter_title.lower().strip() != title.lower().strip():
                raise ValueError("Chapter title mismatch")

            summaries.append(result.model_dump())
        except Exception as e:
            print(e)
            return None

    return summaries

# Summarize the whole paper chapter by chapter with gemma3:12b; min_tokens
# sets a 16384-token floor for the context window requested from Ollama.
summary = summarize_doc(doc, llm_model='gemma3:12b', min_tokens=16384)
summary

--------------------------------------------------------------------------------

**Role:** Expert Knowledge Distiller.

**Objective:** Analyze the 'Current Chapter Content'. Your **sole focus** is to extract and synthesize the **absolute core knowledge** (key findings, critical methods/definitions, essential results, crucial context) presented within. Disregard chapter structure and meta-commentary. The goal is a highly condensed summary enabling understanding of the chapter's *contribution* without reading it.

**CRITICAL Instructions:**
1.  **NO OUTLINES / STRUCTURE:** **Do NOT** list section titles, subsection numbers, or describe the chapter's organization (e.g., AVOID "This chapter discusses X in section Y"). Summarize the *information itself*, not the chapter's structure.
2.  **SUBSTANCE ONLY:** Generate summary points **only** if the chapter presents significant *new substantive information* (e.g., a new technique, a key result, a core definition) compared to previous summaries

[{'chapter_title': 'DeepSeek-V3 Technical Report ', 'summaries': []},
 {'chapter_title': 'Abstract',
  'summaries': ['DeepSeek-V3 is a 671B parameter Mixture-of-Experts (MoE) language model with 37B activated per token.',
   'The architecture incorporates Multi-head Latent Attention (MLA) and DeepSeekMoE, and utilizes an auxiliary-loss-free load balancing strategy with a multi-token prediction training objective.',
   'DeepSeek-V3 was pre-trained on 14.8 trillion tokens and fine-tuned using Supervised Fine-Tuning and Reinforcement Learning.',
   'Evaluations show DeepSeek-V3 outperforms other open-source models and rivals leading closed-source models.',
   'Training required 2.788M H800 GPU hours and exhibited remarkable stability without significant loss spikes or rollbacks.',
   'Model checkpoints are available on GitHub.']},
 {'chapter_title': 'Contents ', 'summaries': []},
 {'chapter_title': '1 Introduction 4 ', 'summaries': []},
 {'chapter_title': '2 Architecture 6',
  'summaries'

In [17]:
# Print every chapter title followed by its summary points as a bullet list,
# with a horizontal rule after each chapter.
rule = "-" * 80
for entry in summary:
    title = entry["chapter_title"]
    points = entry["summaries"]
    print(title)
    for point in points:
        print(f' - {point}')
    print()
    print(rule)

DeepSeek-V3 Technical Report 

--------------------------------------------------------------------------------
Abstract
 - DeepSeek-V3 is a 671B parameter Mixture-of-Experts (MoE) language model with 37B activated per token.
 - The architecture incorporates Multi-head Latent Attention (MLA) and DeepSeekMoE, and utilizes an auxiliary-loss-free load balancing strategy with a multi-token prediction training objective.
 - DeepSeek-V3 was pre-trained on 14.8 trillion tokens and fine-tuned using Supervised Fine-Tuning and Reinforcement Learning.
 - Evaluations show DeepSeek-V3 outperforms other open-source models and rivals leading closed-source models.
 - Training required 2.788M H800 GPU hours and exhibited remarkable stability without significant loss spikes or rollbacks.
 - Model checkpoints are available on GitHub.

--------------------------------------------------------------------------------
Contents 

--------------------------------------------------------------------------------

In [19]:

def summarize_content(chapter: Content, llm_model: str = "gemma3:12b", min_tokens: int = 8192):
    """Summarize a single chapter with an Ollama model.

    Unlike a whole-document run, the prompt contains only the chapter's own
    title and content; no document hierarchy or earlier summaries are sent.

    Args:
        chapter: Chapter node to summarize; its title is read from
            ``chapter.data["text"]``.
        llm_model: Name of the Ollama model used for generation.
        min_tokens: Lower bound for the ``num_ctx`` option sent to Ollama.

    Returns:
        The parsed ``ChapterSummary``, or ``None`` if the model call, the JSON
        validation or the title check fails (the error is printed).
    """
    # The original computed get_content_hierarchy(doc) here from the notebook
    # global ``doc`` and never used the result; it was dropped so the function
    # depends only on its arguments.
    title = chapter.data["text"]

    prompt = f"""
**Role:** Expert Knowledge Distiller.

**Objective:** Analyze the 'Current Chapter Content'. Your **sole focus** is to extract and synthesize the **absolute core knowledge** (key findings, critical methods/definitions, essential results, crucial context) presented within. Disregard chapter structure and meta-commentary. The goal is a highly condensed summary enabling understanding of the chapter's *contribution* without reading it.

**CRITICAL Instructions:**
1.  **NO OUTLINES / STRUCTURE:** **Do NOT** list section titles, subsection numbers, or describe the chapter's organization (e.g., AVOID "This chapter discusses X in section Y"). Summarize the *information itself*, not the chapter's structure.
2.  **SUBSTANCE ONLY:** Generate summary points **only** if the chapter presents significant *new substantive information* (e.g., a new technique, a key result, a core definition) compared to previous summaries. Ignore introductory phrases, forward references, or purely structural sections.
3.  **EXTREME CONCISENESS:** Aim for **highly condensed** points (like bullet points). Capture the *essence* only. If a detail isn't critical for understanding the main point, omit it. Think "executive summary" level for each key concept.
4.  **Accuracy:** Base summaries strictly on the provided 'Current Chapter Content'. No external knowledge or interpretation.
5.  **Empty if Necessary:** If a chapter contains primarily structure, table of contents, people, introductions, references, or information already covered, return an empty `summaries` list (`[]`). Do not summarize just for the sake of summarizing.
6.  **Output Format:**
    * Return `[]` for `summaries` if no new *substantive* core knowledge is found.
    * Otherwise, return a list of concise strings in `summaries`.
    * The `chapter_title` field *must* exactly match the 'Current Chapter Title'.

**Context:**
* Current Chapter Title: ``{title}`` (Use this exact title in output)

**Current Chapter Content to Distill:**
```{printable_content(chapter, indent=None)}```

Generate the JSON output conforming to the ChapterSummary structure.
"""

    # Boundary for this chapter: every failure is printed and reported as None.
    try:
        # Heuristic: budget three tokens per whitespace-separated word, never
        # going below the caller-supplied minimum.
        context_length = max(min_tokens, len(prompt.split()) * 3)
        seed = random.randint(0, 2**30 - 1)
        print(f"Context length: {context_length} / Seed: {seed}")
        schema = ChapterSummary.model_json_schema()
        reply = ollama.generate(
            llm_model,
            prompt,
            format=schema,
            options={
                "num_ctx": context_length,
                "seed": seed,
            },
        )

        result = ChapterSummary.model_validate_json(reply["response"])

        # The model must echo the title (ignoring case and outer whitespace).
        if result.chapter_title.lower().strip() != title.lower().strip():
            raise ValueError("Chapter title mismatch")

        return result
    except Exception as e:
        print(e)
        return None
    

# Summarize one chapter in isolation: doc.children[1] is the "Abstract" chapter.
# NOTE: this rebinds `summary` from the earlier list of dicts to a single
# ChapterSummary (or None on failure).
summary = summarize_content(doc.children[1], llm_model='gemma3:12b', min_tokens=8192)
summary

Context length: 8192 / Seed: 738712360


ChapterSummary(chapter_title='Abstract ', summaries=['DeepSeek-V3 is a 671B Mixture-of-Experts (MoE) language model with 37B active parameters per token.', 'DeepSeek-V3 utilizes Multi-head Latent Attention (MLA) and DeepSeekMoE architectures.', 'A novel auxiliary-loss-free load balancing strategy and multi-token prediction training objective are introduced.', 'DeepSeek-V3 was pre-trained on 14.8 trillion tokens and fine-tuned with Supervised Fine-Tuning and Reinforcement Learning.', 'DeepSeek-V3 outperforms other open-source models and performs comparably to leading closed-source models.', 'Training DeepSeek-V3 required 2.788M H800 GPU hours and was remarkably stable.', 'Model checkpoints are available at https://github.com/deepseek-ai/DeepSeek-V3.'])