In [2]:
from pathlib import Path
from typing import Optional

import fenic as fc

# Model configuration registered under the alias "mini" (gpt-4o-mini with the
# rpm/tpm limits given below).
mini_model = fc.OpenAIModelConfig(
    model_name="gpt-4o-mini",
    rpm=500,
    tpm=200_000,
)

# Session-level settings: application name plus the semantic (LLM) configuration
config = fc.SessionConfig(
    app_name="markdown_processing",
    semantic=fc.SemanticConfig(language_models={"mini": mini_model}),
)

# Obtain the fenic session for this configuration
session = fc.Session.get_or_create(config)

In [None]:
# Read the paper's markdown source from disk as UTF-8 text
paper_path = Path("attention_is_all_you_need.md")
paper_content = paper_path.read_text(encoding="utf-8")

# One-row DataFrame: the paper title alongside its raw text
raw_paper_df = session.create_dataframe(
    {
        "paper_title": ["Attention Is All You Need"],
        "content": [paper_content],
    }
)

# Cast the text column to MarkdownType (exposed as "markdown") so the
# markdown-specific functions can be applied to it
markdown_col = fc.col("content").cast(fc.MarkdownType).alias("markdown")
df = raw_paper_df.select(fc.col("paper_title"), markdown_col)

# Confirm the load by pulling the title back out of the DataFrame
print("=== PAPER LOADED ===")
result = df.select(fc.col("paper_title")).to_polars()
print(f"Paper: {result['paper_title'][0]}")
print()

In [None]:
# 1. Table of contents: apply markdown.generate_toc() to the markdown column
toc_col = fc.markdown.generate_toc(fc.col("markdown")).alias("toc")
toc_df = df.select(fc.col("paper_title"), toc_col)

toc_df.show()


In [None]:
# 2. Break the document into sections and lay them out as a structured DataFrame
section_toc_col = fc.markdown.generate_toc(fc.col("markdown")).alias("toc")

# Chunk the document by headers up to level 2; yields an array of section objects
section_chunks_col = fc.markdown.extract_header_chunks(
    fc.col("markdown"),
    header_level=2,
).alias("sections")

sections_array_df = df.select(
    fc.col("paper_title"),
    section_toc_col,
    section_chunks_col,
)

# explode: array elements become rows; unnest: flatten each section struct
sections_df = sections_array_df.explode("sections").unnest("sections")

sections_df.show()

In [None]:
# 3. Keep only the section(s) whose heading contains "References", then parse the content
is_references_heading = fc.col("heading").contains("References")
references_df = sections_df.filter(is_references_heading)

# Citations are delimited by bracketed numbers ([1], [2], ...); split on that
# pattern and show one row per resulting piece
citation_marker = r"\[\d+\]"
split_references_col = fc.text.split(fc.col("content"), citation_marker).alias("references")
references_df.select(split_references_col).explode("references").show()
print()

In [None]:
# 4. Alternative reference extraction: JSON + jq
# First convert the original markdown document into its JSON structure
document_json_col = fc.markdown.to_json(fc.col("markdown")).alias("document_json")
document_json_df = df.select(fc.col("paper_title"), document_json_col)

# jq program, step by step:
#   - take the last top-level child and iterate over its children
#     (References sits under "7 Conclusion" -> "References" in this document)
#   - keep the heading node whose first content text is "References"
#   - read the text of that heading's first child, split it on newlines,
#     and emit each line as a separate result
references_jq_query = '.children[-1].children[] | select(.type == "heading" and (.content[0].text == "References")) | .children[0].content[0].text | split("\\n") | .[]'

# Run the query and turn the resulting array into one row per citation line
reference_lines_df = document_json_df.select(
    fc.col("paper_title"),
    fc.json.jq(fc.col("document_json"), references_jq_query).alias("reference_text"),
).explode("reference_text")

# Cast each jq result to a plain string and drop empty lines
reference_text_col = fc.col("reference_text").cast(fc.StringType).alias("reference_text")
individual_refs_df = reference_lines_df.select(
    fc.col("paper_title"),
    reference_text_col,
).filter(fc.col("reference_text") != "")

individual_refs_df.show()

In [None]:
# Split each reference line into its number and citation text via a
# text.extract() template with two fields: ref_number and content
print("Extracting reference numbers and content using text.extract():")
reference_template = "[${ref_number:none}] ${content:none}"
parsed_ref_col = fc.text.extract(
    fc.col("reference_text"),
    reference_template,
).alias("parsed_ref")
extracted_refs_df = individual_refs_df.select(fc.col("paper_title"), parsed_ref_col)

# Promote the two extracted fields to their own named columns
parsed_refs_df = extracted_refs_df.select(
    fc.col("paper_title"),
    fc.col("parsed_ref").get_item("ref_number").alias("reference_number"),
    fc.col("parsed_ref").get_item("content").alias("citation_content"),
)

print("References with separated numbers and content:")
parsed_refs_df.show()
print()

# Release the session's resources now that the walkthrough is finished
session.stop()