In [10]:
import json
import os
import re
import time
from urllib.parse import unquote, urlsplit

import wikipediaapi

In [None]:
# Configuration
# Directory that receives one Markdown file per fetched article.
OUTPUT_DIR = "wiki_data_api"
# exist_ok=True keeps this cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test. It also raises
# immediately if OUTPUT_DIR exists but is not a directory, instead of failing
# later when the first article is written.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Sent with every request so the API operator can identify and contact the
# client.
USER_AGENT_STRING = 'MyStudentProject/1.0 (tehzhijian0513@gmail.com)'

# Initialize the API
# Shared client used by get_page_via_api(); WIKI extract format requests
# plain-text extracts rather than HTML.
wiki_wiki = wikipediaapi.Wikipedia(
    user_agent=USER_AGENT_STRING,
    language='en',
    extract_format=wikipediaapi.ExtractFormat.WIKI
)


In [9]:
# Provided Links
# Maps each discipline label to the list of Wikipedia article URLs that should
# be fetched for it. The discipline key is copied into every chunk's metadata
# by the main pipeline, and dict order determines processing order.
URL_DISCIPLINE_MAP = {
    "Core flood protection": [
        "https://en.wikipedia.org/wiki/Levee",
        "https://en.wikipedia.org/wiki/Flood_control"
    ],
    "Hydraulics & coastal engineering": [
        "https://en.wikipedia.org/wiki/Hydraulic_engineering",
        "https://en.wikipedia.org/wiki/Storm_surge",
        "https://en.wikipedia.org/wiki/Wave_run-up",
        "https://en.wikipedia.org/wiki/Return_period"
    ],
    "Geotechnical engineering": [
        "https://en.wikipedia.org/wiki/Geotechnical_engineering",
        "https://en.wikipedia.org/wiki/Soil_mechanics",
        "https://en.wikipedia.org/wiki/Slope_stability",
        # NOTE(review): the recorded run reported this page as non-existent,
        # so it currently contributes no chunks — confirm the intended
        # article title and update the link.
        "https://en.wikipedia.org/wiki/Piping_(geotechnical_engineering)",
        "https://en.wikipedia.org/wiki/Embankment_dam"
    ],
    "Structural & materials engineering": [
        "https://en.wikipedia.org/wiki/Revetment",
        "https://en.wikipedia.org/wiki/Geotextile",
        "https://en.wikipedia.org/wiki/Concrete",
        "https://en.wikipedia.org/wiki/Sheet_pile"
    ],
    "Spatial design & GIS": [
        "https://en.wikipedia.org/wiki/Geographic_information_system",
        "https://en.wikipedia.org/wiki/Digital_elevation_model",
        "https://en.wikipedia.org/wiki/Land-use_planning",
        "https://en.wikipedia.org/wiki/Alignment_(engineering)"
    ],
    "Ecology & environment": [
        "https://en.wikipedia.org/wiki/Environmental_impact_assessment",
        "https://en.wikipedia.org/wiki/River_ecology",
        "https://en.wikipedia.org/wiki/Wetland",
        "https://en.wikipedia.org/wiki/Nature-based_solutions"
    ],
    "Requirements & standards": [
        "https://en.wikipedia.org/wiki/Engineering_standard",
        "https://en.wikipedia.org/wiki/Factor_of_safety",
        # Percent-encoded en dash in the title; decoded before the API call.
        "https://en.wikipedia.org/wiki/Cost%E2%80%93benefit_analysis"
    ]
}

Starting API Extraction...
Processing: https://en.wikipedia.org/wiki/Levee
Processing: https://en.wikipedia.org/wiki/Flood_control
Processing: https://en.wikipedia.org/wiki/Hydraulic_engineering
Processing: https://en.wikipedia.org/wiki/Storm_surge
Processing: https://en.wikipedia.org/wiki/Wave_run-up
Processing: https://en.wikipedia.org/wiki/Return_period
Processing: https://en.wikipedia.org/wiki/Geotechnical_engineering
Processing: https://en.wikipedia.org/wiki/Soil_mechanics
Processing: https://en.wikipedia.org/wiki/Slope_stability
Processing: https://en.wikipedia.org/wiki/Piping_(geotechnical_engineering)
  > Page 'Piping_(geotechnical_engineering)' does not exist.
Processing: https://en.wikipedia.org/wiki/Embankment_dam
Processing: https://en.wikipedia.org/wiki/Revetment
Processing: https://en.wikipedia.org/wiki/Geotextile
Processing: https://en.wikipedia.org/wiki/Concrete
Processing: https://en.wikipedia.org/wiki/Sheet_pile
Processing: https://en.wikipedia.org/wiki/Geographic_inf

In [None]:
# Main Functions
def _title_from_url(url):
    """Return the decoded article title embedded in a Wikipedia URL."""
    # Split before decoding: a percent-encoded '?' or '#' inside a title must
    # not be mistaken for the start of a query string or fragment.
    path = urlsplit(url).path
    if "/wiki/" in path:
        raw_title = path.split("/wiki/", 1)[1]
    else:
        # Not an article-style URL: keep the historical behaviour of passing
        # the input through so plain titles still work.
        raw_title = url.split("/wiki/")[-1]
    return unquote(raw_title).strip()


def _extract_paragraphs(raw_text, stop_markers, min_paragraph_chars):
    """Return the body lines of raw_text, cut off at the first stop heading."""
    stop_headings = {
        marker.strip().casefold() for marker in stop_markers if marker.strip()
    }

    paragraphs = []
    # Each paragraph and each heading is treated as one line of the text blob
    # (assumes headings arrive as bare lines — TODO confirm against the
    # wikipediaapi extract format in use).
    for line in raw_text.split('\n'):
        candidate = line.strip()

        # A trailing-section heading ends the useful part of the article.
        if candidate.casefold() in stop_headings:
            break

        # The length threshold drops blank lines and short section headings.
        if len(candidate) > min_paragraph_chars:
            paragraphs.append(candidate)
    return paragraphs


def get_page_via_api(
    url,
    stop_markers=("See also", "References", "Further reading", "External links"),
    min_paragraph_chars=50,
):
    """
    Fetch a Wikipedia article through the shared API client and split it
    into paragraphs.

    Args:
        url: Article URL such as ".../wiki/Levee". Percent-encoding is
            decoded, and any "?query" or "#fragment" suffix is ignored.
        stop_markers: Section headings (matched case-insensitively against a
            whole stripped line) at which extraction stops.
        min_paragraph_chars: A line is kept only if its stripped length is
            strictly greater than this value.

    Returns:
        (page_title, paragraphs) on success, where paragraphs is a list of
        stripped strings in document order; (None, None) if no title could be
        extracted from the URL or the page does not exist.
    """
    # 1. Extract title from URL (e.g., "Levee" from ".../wiki/Levee")
    title_from_url = _title_from_url(url)
    if not title_from_url:
        print(f"  > Could not extract a page title from '{url}'.")
        return None, None

    # 2. Fetch Page
    page = wiki_wiki.page(title_from_url)

    if not page.exists():
        print(f"  > Page '{title_from_url}' does not exist.")
        return None, None

    # 3. Extract Text
    # The API returns the article as one text blob; keep only body paragraphs
    # that appear before the trailing "References"-style sections.
    valid_paragraphs = _extract_paragraphs(
        page.text or "", stop_markers, min_paragraph_chars
    )

    return page.title, valid_paragraphs


In [None]:

# --- Main Pipeline ---
# Walks every (discipline, url) pair, fetches the article, writes it to a
# Markdown file in OUTPUT_DIR, and collects one JSON record per paragraph.

knowledge_base = []

print("Starting API Extraction...")

# Flatten the discipline -> urls mapping into a single stream of jobs,
# preserving the mapping's order.
fetch_jobs = (
    (discipline, url)
    for discipline, urls in URL_DISCIPLINE_MAP.items()
    for url in urls
)

for discipline, url in fetch_jobs:
    print(f"Processing: {url}")

    # Throttle: wait one second before every request.
    time.sleep(1)

    try:
        title, paragraphs = get_page_via_api(url)
    except Exception as exc:
        # Best effort: report the failure and move on to the next URL.
        print(f"  > API Error: {exc}")
        continue

    # A falsy title means the page could not be fetched; nothing to store.
    if not title:
        continue

    # --- Save the article as Markdown ---
    # Drop characters that are invalid in file names, then use underscores
    # in place of spaces.
    safe_title = re.sub(r'[\\/*?:"<>|]', "", title).replace(" ", "_")
    md_filename = os.path.join(OUTPUT_DIR, f"{safe_title}.md")
    markdown_document = f"# {title}\n\n" + "\n\n".join(paragraphs)

    with open(md_filename, 'w', encoding='utf-8') as md_file:
        md_file.write(markdown_document)

    # --- Chunk the article and attach metadata ---
    # One record per paragraph; chunk_id is the file-safe title plus the
    # paragraph's index within the page.
    knowledge_base.extend(
        {
            "chunk_id": f"{safe_title}_{index}",
            "metadata": {
                "source": url,
                "discipline": discipline,
                "page_title": title
            },
            "raw_text": paragraph
        }
        for index, paragraph in enumerate(paragraphs)
    )

# --- Save the collected chunks as JSON ---
json_filename = "wiki_knowledge_base_api.json"
with open(json_filename, 'w', encoding='utf-8') as json_file:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(knowledge_base, json_file, indent=2, ensure_ascii=False)

print("-" * 30)
print("Success!")
print(f"Total Chunks: {len(knowledge_base)}")
print(f"JSON saved to: {json_filename}")