In [1]:
import requests
import json
import re
import pprint
pp = pprint.PrettyPrinter(indent=4)


title, page_id, content = fetch_article_content('Normal_distribution')
cleaned_text = tokenize_and_clean(content)
split_secs = split_and_filter_sections(cleaned_text)
print(cleaned_text)


def fetch_article_content(article_title):
    URL = "https://en.wikipedia.org/w/api.php"
    PARAMS = {
        "action": "query",
        "prop": "extracts",
        "titles": article_title,
        "explaintext": True,
        "format": "json"
    }
    
    url_title = article_title
    response = requests.get(url=URL, params=PARAMS)
    data = response.json()
    pages = data["query"]["pages"]
    
    # Since there's typically one page per title, you can get the page ID like this
    page_id = next(iter(pages))
    content = pages[page_id].get("extract", "")
    title = pages[page_id].get("title", "")
    wiki_url = f"https://en.wikipedia.org/wiki/{url_title}"
    return title, page_id, content


def tokenize_and_clean(text):
    # Tokenize by whitespace
    tokens = text.split()
    
    # Remove unwanted LaTeX commands or mathematical expressions from tokens
    cleaned_tokens = [token for token in tokens if not re.match(r'\\[a-zA-Z]+', token)]
    #cleaned_tokens = [token for token in tokens if not re.match(r'({.*?})|(\\[a-zA-Z]+)', token)]

    
    # Rejoin the cleaned tokens
    cleaned_text = ' '.join(cleaned_tokens)
    #cleaned_text = re.sub(r"({.*?})","", clean_text)
    return cleaned_text

def remove_nested_curly_braces(text):
    stack = []
    to_remove = []
    text_list = list(text)

    for i, char in enumerate(text_list):
        if char == '{':
            stack.append(i - 1) 
        elif char == '}':
            if stack:
                start = stack.pop()
                if not stack:
                    to_remove.append((start, i))

    for start, end in reversed(to_remove):
        del text_list[start:end + 1]

    return ''.join(text_list)

def remove_whitespace(text):
    # Replace \n and \t with an empty string
    cleaned_text = re.sub(r'[\n\t]+', '', text)
    return cleaned_text


import re

SECTIONS_TO_IGNORE = [
    "See also",
    "References",
    "External links",
    "Further reading",
    "Footnotes",
    "Bibliography",
    "Sources",
    "Citations",
    "Literature",
    "Footnotes",
    "Notes and references",
    "Photo gallery",
    "Works cited",
    "Photos",
    "Gallery",
    "Notes",
    "References and sources",
    "References and notes",
]

def split_and_filter_sections(article_text):
    # Split the article into sections using regex pattern
    sections = re.split(r'(==\s*[^=]+\s*==)', article_text)
    # Pair up the section titles with their content
    sections = list(zip(sections[1::2], sections[2::2]))
    
    # Filter out the sections to ignore
    filtered_sections = {title.strip('= \n'): content.strip()
                         for title, content in sections
                         if title.strip('= \n') not in SECTIONS_TO_IGNORE}
    
    return filtered_sections


In [4]:
from pydantic import BaseModel
import requests

class HuggingFaceAPI(BaseModel):
    """
    Llama model deployed to HF for text cleaning and summarization

    """
    token: str
    endpoint: str

    def fetch_summary(self, text_chunk: str) -> str:
        """
        :param text_chunk: Chunk of tokenized text that is going to get summarized
        :return: summary str
        """
        headers = {"Authorization": f"Bearer {hf_token}", "Content-Type": "application/json"}
        payload = {"inputs": f"<s>[INST] <<SYS>> Remove all latex and equations. Do Not summarize<</SYS>> {text_chunk}[/INST]</s>",
                  "parameters": {"max_new_tokens": 500, "top_k": 40}}
    
        response = requests.post(self.endpoint, headers=headers, data=json.dumps(payload))
        response_json = response.json()
        print
        return response_json[0]["generated_text"]

def all_subsections_from_title(
    title: str,
    sections_to_ignore: set[str] = SECTIONS_TO_IGNORE,
    site_name: str = WIKI_SITE,
) -> list[tuple[list[str], str]]:
    """From a Wikipedia page title, return a flattened list of all nested subsections.
    Each subsection is a tuple, where:
        - the first element is a list of parent subtitles, starting with the page title
        - the second element is the text of the subsection (but not any children)
    """
    site = mwclient.Site(site_name)
    page = site.pages[title]
    text = page.text()
    parsed_text = mwparserfromhell.parse(text)
    headings = [str(h) for h in parsed_text.filter_headings()]
    if headings:
        summary_text = str(parsed_text).split(headings[0])[0]
    else:
        summary_text = str(parsed_text)
    results = [([title], summary_text)]
    for subsection in parsed_text.get_sections(levels=[2]):
        results.extend(all_subsections_from_section(subsection, [title], sections_to_ignore))
    return results

In [33]:
subsection_list = all_subsections_from_title('Normal_distribution')
subsections = [i[1] for i in subsection_list]
pp.pprint(subsections)

[   '{{Short description|Probability distribution}}\n'
    '{{Redirect|Bell curve}}\n'
    '{{Use mdy dates|date=August 2012}}\n'
    '{{Infobox probability distribution\n'
    '  | name       = Normal distribution\n'
    '  | type       = density\n'
    '  | pdf_image  = Normal Distribution PDF.svg\n'
    "  | pdf_caption = The red curve is the ''standard normal distribution''\n"
    '  | cdf_image  = Normal Distribution CDF.svg\n'
    '  | cdf_caption = \n'
    '  | notation   = <math>\\mathcal{N}(\\mu,\\sigma^2)</math>\n'
    '  | parameters = <math>\\mu\\in\\R</math> = mean ([[location '
    'parameter|location]])<br /><math>\\sigma^2\\in\\R_{>0}</math> = variance '
    '(squared [[scale parameter|scale]])<br />\n'
    '  | support    = <math>x\\in\\R</math>\n'
    '  | pdf        = <math>\\frac{1}{\\sigma\\sqrt{2\\pi}} '
    'e^{-\\frac{1}{2}\\left(\\frac{x - \\mu}{\\sigma}\\right)^2}</math>\n'
    '  | cdf        = <math>\\Phi\\left(\\frac{x-\\mu}{\\sigma}\\right) = '
    '\\frac

In statistics, a normal distribution or Gaussian distribution is a type of continuous probability distribution for a real-valued random variable. The general form of its probability density function is f ( x ) = 1 σ 2 π e − 1 2 ( x − μ σ ) 2 {\displaystyle f(x)={\frac {1}{\sigma {\sqrt {2\pi }}}}e^{-{\frac {1}{2}}\left({\frac {x-\mu }{\sigma }}\right)^{2}}} The parameter μ {\displaystyle } is the mean or expectation of the distribution (and also its median and mode), while the parameter σ {\displaystyle } is its standard deviation. The variance of the distribution is σ 2 {\displaystyle ^{2}} . A random variable with a Gaussian distribution is said to be normally distributed, and is called a normal deviate. Normal distributions are important in statistics and are often used in the natural and social sciences to represent real-valued random variables whose distributions are not known. Their importance is partly due to the central limit theorem. It states that, under some conditions, the 

In [6]:
hf_token, hf_endpoint = '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', 'https://rni9jdayjgc6ea8b.us-east-1.aws.endpoints.huggingface.cloud' 
hf_api = HuggingFaceAPI(token=hf_token, endpoint=hf_endpoint)
text = hf_api.fetch_summary(hf_text)
print(text)

KeyError: 0

In [8]:
# imports
import mwclient
import mwparserfromhell
import openai
import pandas as pd
import re
import tiktoken

In [34]:
# get Wikipedia pages about the 2022 Winter Olympics

CATEGORY_TITLE = "Category:Statistics"
WIKI_SITE = "en.wikipedia.org"


def titles_from_category(
    category: mwclient.listing.Category, max_depth: int
) -> set[str]:
    """Return a set of page titles in a given Wiki category and its subcategories."""
    titles = set()
    for cm in category.members():
        if type(cm) == mwclient.page.Page:
            # ^type() used instead of isinstance() to catch match w/ no inheritance
            titles.add(cm.name)
        elif isinstance(cm, mwclient.listing.Category) and max_depth > 0:
            deeper_titles = titles_from_category(cm, max_depth=max_depth - 1)
            titles.update(deeper_titles)
    return titles


site = mwclient.Site(WIKI_SITE)
category_page = site.pages[CATEGORY_TITLE]
titles = titles_from_category(category_page, max_depth=1)
# ^note: max_depth=1 means we go one level deep in the category tree
print(f"Found {len(titles)} article titles in {CATEGORY_TITLE}.")


Found 426 article titles in Category:Statistics.


In [35]:
# define functions to split Wikipedia pages into sections

SECTIONS_TO_IGNORE = [
    "See also",
    "References",
    "External links",
    "Further reading",
    "Footnotes",
    "Bibliography",
    "Sources",
    "Citations",
    "Literature",
    "Footnotes",
    "Notes and references",
    "Photo gallery",
    "Works cited",
    "Photos",
    "Gallery",
    "Notes",
    "References and sources",
    "References and notes",
]


def all_subsections_from_section(
    section: mwparserfromhell.wikicode.Wikicode,
    parent_titles: list[str],
    sections_to_ignore: set[str],
) -> list[tuple[list[str], str]]:
    """
    From a Wikipedia section, return a flattened list of all nested subsections.
    Each subsection is a tuple, where:
        - the first element is a list of parent subtitles, starting with the page title
        - the second element is the text of the subsection (but not any children)
    """
    headings = [str(h) for h in section.filter_headings()]
    title = headings[0]
    if title.strip("=" + " ") in sections_to_ignore:
        # ^wiki headings are wrapped like "== Heading =="
        return []
    titles = parent_titles + [title]
    full_text = str(section)
    section_text = full_text.split(title)[1]
    if len(headings) == 1:
        return [(titles, section_text)]
    else:
        first_subtitle = headings[1]
        section_text = section_text.split(first_subtitle)[0]
        results = [(titles, section_text)]
        for subsection in section.get_sections(levels=[len(titles) + 1]):
            results.extend(all_subsections_from_section(subsection, titles, sections_to_ignore))
        return results


def all_subsections_from_title(
    title: str,
    sections_to_ignore: set[str] = SECTIONS_TO_IGNORE,
    site_name: str = WIKI_SITE,
) -> list[tuple[list[str], str]]:
    """From a Wikipedia page title, return a flattened list of all nested subsections.
    Each subsection is a tuple, where:
        - the first element is a list of parent subtitles, starting with the page title
        - the second element is the text of the subsection (but not any children)
    """
    site = mwclient.Site(site_name)
    page = site.pages[title]
    text = page.text()
    parsed_text = mwparserfromhell.parse(text)
    headings = [str(h) for h in parsed_text.filter_headings()]
    if headings:
        summary_text = str(parsed_text).split(headings[0])[0]
    else:
        summary_text = str(parsed_text)
    results = [([title], summary_text)]
    for subsection in parsed_text.get_sections(levels=[2]):
        results.extend(all_subsections_from_section(subsection, [title], sections_to_ignore))
    return results

In [None]:
# split pages into sections
# may take ~1 minute per 100 articles
wikipedia_sections = []
for title in titles:
    wikipedia_sections.extend(all_subsections_from_title(title))
print(f"Found {len(wikipedia_sections)} sections in {len(titles)} pages.")

In [None]:
# clean text
def clean_section(section: tuple[list[str], str]) -> tuple[list[str], str]:
    """
    Return a cleaned up section with:
        - <ref>xyz</ref> patterns removed
        - leading/trailing whitespace removed
    """
    titles, text = section
    text = re.sub(r"<ref.*?</ref>", "", text)
    text = text.strip()
    return (titles, text)


wikipedia_sections = [clean_section(ws) for ws in wikipedia_sections]

# filter out short/blank sections
def keep_section(section: tuple[list[str], str]) -> bool:
    """Return True if the section should be kept, False otherwise."""
    titles, text = section
    if len(text) < 16:
        return False
    else:
        return True


original_num_sections = len(wikipedia_sections)
wikipedia_sections = [ws for ws in wikipedia_sections if keep_section(ws)]
print(f"Filtered out {original_num_sections-len(wikipedia_sections)} sections, leaving {len(wikipedia_sections)} sections.")

In [None]:
# print example data
for ws in wikipedia_sections[:5]:
    print(ws[0])
    display(ws[1][:77] + "...")
    print()

In [None]:
GPT_MODEL = "gpt-3.5-turbo"  # only matters insofar as it selects which tokenizer to use


def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens in a string."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))


def halved_by_delimiter(string: str, delimiter: str = "\n") -> list[str, str]:
    """Split a string in two, on a delimiter, trying to balance tokens on each side."""
    chunks = string.split(delimiter)
    if len(chunks) == 1:
        return [string, ""]  # no delimiter found
    elif len(chunks) == 2:
        return chunks  # no need to search for halfway point
    else:
        total_tokens = num_tokens(string)
        halfway = total_tokens // 2
        best_diff = halfway
        for i, chunk in enumerate(chunks):
            left = delimiter.join(chunks[: i + 1])
            left_tokens = num_tokens(left)
            diff = abs(halfway - left_tokens)
            if diff >= best_diff:
                break
            else:
                best_diff = diff
        left = delimiter.join(chunks[:i])
        right = delimiter.join(chunks[i:])
        return [left, right]


def truncated_string(
    string: str,
    model: str,
    max_tokens: int,
    print_warning: bool = True,
) -> str:
    """Truncate a string to a maximum number of tokens."""
    encoding = tiktoken.encoding_for_model(model)
    encoded_string = encoding.encode(string)
    truncated_string = encoding.decode(encoded_string[:max_tokens])
    if print_warning and len(encoded_string) > max_tokens:
        print(f"Warning: Truncated string from {len(encoded_string)} tokens to {max_tokens} tokens.")
    return truncated_string


def split_strings_from_subsection(
    subsection: tuple[list[str], str],
    max_tokens: int = 1000,
    model: str = GPT_MODEL,
    max_recursion: int = 5,
) -> list[str]:
    """
    Split a subsection into a list of subsections, each with no more than max_tokens.
    Each subsection is a tuple of parent titles [H1, H2, ...] and text (str).
    """
    titles, text = subsection
    string = "\n\n".join(titles + [text])
    num_tokens_in_string = num_tokens(string)
    # if length is fine, return string
    if num_tokens_in_string <= max_tokens:
        return [string]
    # if recursion hasn't found a split after X iterations, just truncate
    elif max_recursion == 0:
        return [truncated_string(string, model=model, max_tokens=max_tokens)]
    # otherwise, split in half and recurse
    else:
        titles, text = subsection
        for delimiter in ["\n\n", "\n", ". "]:
            left, right = halved_by_delimiter(text, delimiter=delimiter)
            if left == "" or right == "":
                # if either half is empty, retry with a more fine-grained delimiter
                continue
            else:
                # recurse on each half
                results = []
                for half in [left, right]:
                    half_subsection = (titles, half)
                    half_strings = split_strings_from_subsection(
                        half_subsection,
                        max_tokens=max_tokens,
                        model=model,
                        max_recursion=max_recursion - 1,
                    )
                    results.extend(half_strings)
                return results
    # otherwise no split was found, so just truncate (should be very rare)
    return [truncated_string(string, model=model, max_tokens=max_tokens)]

In [None]:
# split sections into chunks
MAX_TOKENS = 1600
wikipedia_strings = []
for section in wikipedia_sections:
    wikipedia_strings.extend(split_strings_from_subsection(section, max_tokens=MAX_TOKENS))

print(f"{len(wikipedia_sections)} Wikipedia sections split into {len(wikipedia_strings)} strings.")


In [None]:
# print example data
print(wikipedia_strings[1])