# Create Dataset from Wikipedia

The logic is:
1. Get data from Wikipedia for the titles below:
    1. Cricket_World_Cup
    2. FIFA_World_Cup
    3. Olympic_Games
    4. 2027_Cricket_World_Cup
2. Clean up the data into a readable format: remove unwanted newlines (\n), leftover markup characters (===), etc.
3. Create a pandas DataFrame with multiple rows for each title, where each row is a section or subsection. The columns are: Title, Section, Subsection, Content.
4. Save it to csv.

**NOTE: Code generated using Gemini**

In [6]:
!pip install wikipedia
import wikipedia
import pandas as pd
import re



In [18]:
# Wikipedia page titles to fetch, written with underscores in place of spaces.
# main() iterates this list and passes each entry to parse_page_content().
TITLES = [
    "Cricket_World_Cup",
    "FIFA_World_Cup",
    "Olympic_Games",
    "2027_Cricket_World_Cup"
]

In [19]:
def clean_wikitext(text):
    """
    Cleans up raw wikitext content by removing common markup and citations.

    Args:
        text: Raw page or section text. May be None or empty.

    Returns:
        The cleaned text, or "" when ``text`` is falsy.
    """
    if not text:
        return ""

    # 1. Unwrap internal links BEFORE stripping reference markers. If the
    #    reference pattern ran first, a single-word link such as [[Cricket]]
    #    would have its inner "[Cricket]" removed, leaving "[]" behind and
    #    losing the link text.
    text = re.sub(r'\[\[[^|\]]+\|([^\]]+)\]\]', r'\1', text)  # [[Article|Link Text]] -> Link Text
    text = re.sub(r'\[\[([^\]]+)\]\]', r'\1', text)           # [[Article]] -> Article

    # 2. Remove reference markers (e.g., [1], [a]). \w already covers digits,
    #    so one pattern handles both numeric and lettered markers.
    text = re.sub(r'\[\w+\]', '', text)

    # 3. Remove wikitext bold/italic markers (''' first, then '').
    text = re.sub(r"'''", '', text)
    text = re.sub(r"''", '', text)

    # 4. Remove templates and embedded tables/metadata ({{...}}).
    text = re.sub(r'{{.*?}}', '', text, flags=re.DOTALL)

    # 5. Collapse runs of blank lines and trim surrounding whitespace.
    text = re.sub(r'\n{2,}', '\n', text)
    return text.strip()

In [20]:
def parse_page_content(title):
    """
    Fetches the content of a Wikipedia page and parses it into structured
    sections and subsections.

    Args:
        title: Page title passed to ``wikipedia.page`` with
            ``auto_suggest=False``.

    Returns:
        A list of dicts with the keys 'Title', 'Section', 'Subsection' and
        'Content'. One dict is produced for the introduction (text before the
        first header) and one per ``==``/``===`` header whose cleaned content
        is non-empty. Returns an empty list when the page is not found or the
        title needs disambiguation (a message is printed in both cases).
    """
    try:
        page = wikipedia.page(title, auto_suggest=False)
        content = page.content
    except wikipedia.exceptions.PageError:
        print(f"Error: Page '{title}' not found.")
        return []
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Error: Disambiguation needed for '{title}'. Options: {e.options}")
        return []

    data = []

    def add_row(section, subsection, raw_text):
        # Headers deeper than level 3 are not split into their own rows; keep
        # their title as a plain line but drop the '=' markers so they do not
        # leak into the Content column.
        raw_text = re.sub(
            r'^={4,}[ \t]*(.+?)[ \t]*={4,}[ \t]*$', r'\1', raw_text,
            flags=re.MULTILINE,
        )
        cleaned = clean_wikitext(raw_text)
        if cleaned:
            data.append({
                'Title': title,
                'Section': section,
                'Subsection': subsection,
                'Content': cleaned
            })

    # Because the pattern has a capturing group, re.split returns
    # [intro, header_1, body_1, header_2, body_2, ...]. The first element is
    # everything before the first header (the whole page if it has none), so
    # the introduction is kept even for pages without sections.
    # '\n' is excluded from the header text so a header never spans lines.
    parts = re.split(r'^(={2,3}[^=\n]+={2,3})', content, flags=re.MULTILINE)

    add_row('Introduction', '', parts[0])

    current_section = ''
    current_subsection = ''

    for header, body in zip(parts[1::2], parts[2::2]):
        # '== X ==' has four '=' -> level 2; '=== X ===' has six -> level 3.
        level = header.count('=') // 2
        header_text = header.strip('=').strip()

        if level == 2:
            # A new main section resets the subsection.
            current_section = header_text
            current_subsection = ''
        else:
            # Subsection: attach it to the most recent main section.
            current_subsection = header_text

        add_row(current_section, current_subsection, body)

    return data

In [21]:
def main():
    """
    Runs the whole pipeline: fetch and parse every page in TITLES, build a
    DataFrame from the collected rows, print a sample, and write a CSV file.

    Prints a message and returns early when no rows were extracted.
    """
    print("Starting Wikipedia data extraction...")

    collected_rows = []
    for title in TITLES:
        print(f"Processing page: {title}...")
        collected_rows += parse_page_content(title)

    # Nothing to tabulate or save if every page failed or was empty.
    if not collected_rows:
        print("No data was successfully extracted.")
        return

    # Build the table: one row per introduction/section/subsection.
    df = pd.DataFrame(collected_rows)

    # Show a short preview so the result can be checked by eye.
    sample_table = df.head(10).to_markdown(index=False)
    print("\n--- Extracted Data Sample ---")
    print(sample_table)
    print(f"\nTotal rows collected: {len(df)}")

    # Persist the full table as UTF-8 CSV without the index column.
    output_filename = 'wikipedia_sports_data.csv'
    df.to_csv(output_filename, index=False, encoding='utf-8')

    print(f"\nSuccessfully saved data to '{output_filename}'")
    print("Columns: Title, Section, Subsection, Content")

In [22]:
# Run the extraction pipeline defined in main() (fetch, parse, save to CSV).
main()

Starting Wikipedia data extraction...
Processing page: Cricket_World_Cup...
Processing page: FIFA_World_Cup...
Processing page: Olympic_Games...
Processing page: 2027_Cricket_World_Cup...

--- Extracted Data Sample ---
| Title             | Section      | Subsection                        | Content                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             