In [1]:
import json
import os
import re
import time
from urllib.parse import unquote

import wikipediaapi

In [2]:
# Configuration
# Directory that receives one Markdown file per downloaded article.
OUTPUT_DIR = "wiki_data_api_math"
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Identifies this client to the API; passed as `user_agent` below.
USER_AGENT_STRING = 'MyStudentProject/1.0 (tehzhijian0513@gmail.com)'

# Initialize the API: English Wikipedia, extracts requested in WIKI format.
wiki_wiki = wikipediaapi.Wikipedia(
    user_agent=USER_AGENT_STRING,
    language='en',
    extract_format=wikipediaapi.ExtractFormat.WIKI
)


In [3]:
# Provided links, stored as article slugs grouped by discipline.
# The full article URLs are assembled once, below, from the shared prefix.
_WIKI_URL_PREFIX = "https://en.wikipedia.org/wiki/"

_ARTICLE_SLUGS_BY_DISCIPLINE = {
    "Geometry & Shapes": (
        "Triangle",
        "Angle",
        "Square",
        "Rectangle",
        "Trapezoid",
        "Isosceles_trapezoid",
        "Parallelogram",
        "Rhombus",
        "Quadrilateral",
        "Isosceles_triangle",
        "Special_right_triangle",
        "Ailles_rectangle",
        "Golden_triangle_(mathematics)",
    ),
    "Measure Theory & Probability": (
        "Measure_(mathematics)",
        "Probability_theory",
        "Brownian_motion",
        "Harmonic_measure",
        "Geometric_measure_theory",
    ),
    "Surfaces & Physics": (
        "Soap_film",
        "Minimal_surface",
    ),
    "Graph Theory": (
        "Spectral_graph_theory",
        "Graph_isomorphism",
        "Glossary_of_graph_theory",
        "Distance-regular_graph",
        "Strongly_regular_graph",
        "Petersen_graph",
        "Alfred_Kempe",
        "Snark_(graph_theory)",
        "Graph_property",
        "Hypergraph",
    ),
}

# Maps each discipline name to the list of full article URLs to download.
URL_DISCIPLINE_MAP = {
    discipline_name: [_WIKI_URL_PREFIX + slug for slug in slugs]
    for discipline_name, slugs in _ARTICLE_SLUGS_BY_DISCIPLINE.items()
}

In [4]:
# Main Functions 
def get_page_via_api(
    url,
    wiki=None,
    min_length=50,
    stop_markers=("See also", "References", "Further reading", "External links"),
):
    """
    Extract the article title from a Wikipedia URL and fetch the page via the API.

    Args:
        url: Article URL such as ".../wiki/Levee". Any "#fragment" or "?query"
            suffix is ignored when deriving the title.
        wiki: Object exposing ``page(title)``; defaults to the notebook-level
            ``wiki_wiki`` client. Passing one explicitly allows a different
            language edition or a stub to be used.
        min_length: A line is kept only if its stripped length is strictly
            greater than this value (filters out headings and blank lines).
        stop_markers: Section headings that end processing of the page; a line
            must equal one of them exactly (after stripping) to trigger a stop.

    Returns:
        ``(page.title, paragraphs)`` where ``paragraphs`` is a list of stripped
        text lines, or ``(None, None)`` if the page does not exist.
    """
    # 1. Extract title from URL (e.g., "Levee" from ".../wiki/Levee").
    slug = url.split("/wiki/")[-1]
    # Drop "#fragment" / "?query" BEFORE decoding, so that an encoded "%23" or
    # "%3F" that is genuinely part of a title survives.
    slug = slug.split("#", 1)[0].split("?", 1)[0]
    # unquote handles %20 or %E2-style escapes in URLs.
    title_from_url = unquote(slug)

    # 2. Fetch page (the global client is only looked up when none is given).
    api = wiki_wiki if wiki is None else wiki
    page = api.page(title_from_url)

    if not page.exists():
        print(f"  > Page '{title_from_url}' does not exist.")
        return None, None

    # 3. Extract text.
    # The text blob is split on single newlines, so each line is treated as a
    # paragraph candidate. This relies on section headings appearing on their
    # own line in the text returned by the API.
    stop_headings = frozenset(stop_markers)
    valid_paragraphs = []

    for line in page.text.split('\n'):
        p_clean = line.strip()

        # A trailing-section heading means the article body is over.
        if p_clean in stop_headings:
            break

        # Filter out short headings or empty lines.
        if len(p_clean) > min_length:
            valid_paragraphs.append(p_clean)

    return page.title, valid_paragraphs


In [5]:

# --- Main Pipeline ---
# For every URL: fetch the article, write it to a Markdown file in OUTPUT_DIR,
# and add one knowledge-base record per paragraph. All records are finally
# dumped to a single JSON file.

knowledge_base = []
failed_urls = []  # URLs that raised or did not resolve to a page; reported at the end

print("Starting API Extraction...")

for discipline, urls in URL_DISCIPLINE_MAP.items():
    for url in urls:
        print(f"Processing: {url}")

        # Be polite to the API (even though it's permissive)
        time.sleep(1)

        try:
            title, paragraphs = get_page_via_api(url)
        except Exception as e:
            # Deliberately best-effort: one bad page must not abort the run,
            # but the URL is remembered so the summary does not hide it.
            print(f"  > API Error: {e}")
            failed_urls.append(url)
            continue

        if not title:
            # get_page_via_api returned (None, None): the page does not exist.
            failed_urls.append(url)
            continue

        # --- Step 1: Save MD File ---
        # Strip characters that are invalid in file names, then use underscores for spaces.
        safe_title = re.sub(r'[\\/*?:"<>|]', "", title).replace(" ", "_")
        md_filename = os.path.join(OUTPUT_DIR, f"{safe_title}.md")

        with open(md_filename, 'w', encoding='utf-8') as f:
            f.write(f"# {title}\n\n")
            f.write("\n\n".join(paragraphs))

        # --- Step 2: Chunk & Metadata ---
        # One record per paragraph; chunk_id is the sanitized title plus the paragraph index.
        for i, chunk_text in enumerate(paragraphs):
            chunk_data = {
                "chunk_id": f"{safe_title}_{i}",
                "metadata": {
                    "source": url,
                    "discipline": discipline,
                    "page_title": title
                },
                "raw_text": chunk_text
            }
            knowledge_base.append(chunk_data)

# --- Step 3: Save JSON ---
json_filename = "wiki_math_knowledge_base_api.json"
with open(json_filename, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the output file.
    json.dump(knowledge_base, f, indent=2, ensure_ascii=False)

print("-" * 30)
if failed_urls:
    # Do not claim success when some pages were silently dropped.
    print(f"Completed with {len(failed_urls)} page(s) skipped:")
    for skipped_url in failed_urls:
        print(f"  - {skipped_url}")
else:
    print("Success!")
print(f"Total Chunks: {len(knowledge_base)}")
print(f"JSON saved to: {json_filename}")

Starting API Extraction...
Processing: https://en.wikipedia.org/wiki/Triangle
Processing: https://en.wikipedia.org/wiki/Angle
Processing: https://en.wikipedia.org/wiki/Square
Processing: https://en.wikipedia.org/wiki/Rectangle
Processing: https://en.wikipedia.org/wiki/Trapezoid
Processing: https://en.wikipedia.org/wiki/Isosceles_trapezoid
Processing: https://en.wikipedia.org/wiki/Parallelogram
Processing: https://en.wikipedia.org/wiki/Rhombus
Processing: https://en.wikipedia.org/wiki/Quadrilateral
Processing: https://en.wikipedia.org/wiki/Isosceles_triangle
Processing: https://en.wikipedia.org/wiki/Special_right_triangle
Processing: https://en.wikipedia.org/wiki/Ailles_rectangle
Processing: https://en.wikipedia.org/wiki/Golden_triangle_(mathematics)
Processing: https://en.wikipedia.org/wiki/Measure_(mathematics)
Processing: https://en.wikipedia.org/wiki/Probability_theory
Processing: https://en.wikipedia.org/wiki/Brownian_motion
Processing: https://en.wikipedia.org/wiki/Harmonic_measur