# Generating `NaturalProofs` from ProofWiki

In [2]:
%load_ext autoreload
%autoreload 2

from tqdm import tqdm
from bs4 import BeautifulSoup as BS
from nltk import ngrams
import jsonlines
import pandas as pd
import os
import pickle
import torch
import wikitextparser as wtp
import os
import pandas as pd
import glob
import re
import xml.etree.ElementTree as ET
pd.set_option('display.max_colwidth', -1)
%pylab inline

  pd.set_option('display.max_colwidth', -1)


Populating the interactive namespace from numpy and matplotlib


#### Load and parse

In [4]:
filepath = '/path/to/proof_wiki_nov_12_2020.xml'

soup = BS(open(filepath, 'r').read())

#### Parse redirects

We do this first so that we can store links using their redirected names.

In [5]:
redirects = {}

pages = soup.find_all('page')
pages = [page for page in pages if (
    (page.redirect is not None) and
    (not page.title.text.startswith('Talk:')) and
    (not page.title.text.startswith('User:')) and
    (not page.title.text.startswith('User talk:')) and
    (not page.title.text.startswith('Help:'))
)]

for page in tqdm(pages, total=len(pages)):
    redirects[page.title.text] = page.redirect['title']

print("%d redirects" % len(redirects))

100%|██████████| 7617/7617 [00:00<00:00, 20597.66it/s]

7617 redirects





#### Parse theorem title and theorem statement

In [6]:
from tqdm import tqdm

parsed = []
name_to_parsed = {}


pages = soup.find_all('page')
item_pages = [page for page in pages if (
    ("== Theorem ==" in page.text) and
    (not page.title.text.startswith('Talk:')) and
    (not page.title.text.startswith('User:')) and
    (not page.title.text.startswith('User talk:')) and
    (not page.title.text.startswith('Help:')) and
    (not page.redirect)
)]
exceptions = []
discarded = []
n = 0
for page in tqdm(item_pages, total=len(item_pages)):
    # title of the theorem
    theorem_title = page.title.text.replace('\u200e', '')

    # parse WikiMedia format
    text = page.text.replace('\u200e', '').replace('\u2062', '')
    wnode = wtp.parse(text)
    
    # get the WikiMedia Section that has "== Theorem ==" as its title
    theorem_sections = [s for s in wnode.sections if s.title is not None and s.title.strip() == 'Theorem']
    
    if len(theorem_sections) == 0:
        exceptions.append(page)
        continue
    else:
        theorem_section = theorem_sections[0]
    
    # get the content inside the <onlyinclude> tag, if there is one.
    tags = [t for t in theorem_section.tags() if t.name == 'onlyinclude']
    if len(tags) == 0:
        # remove subsections 
        contents = theorem_section.contents.strip().split('\n\n===')[0].strip()
        plain_text = theorem_section.plain_text().split("\n\n===")[0].strip()
        links = [l for l in theorem_section.wikilinks if l.title in contents]
    else:
        n += 1
        contents = tags[0].contents.strip()
        plain_text = tags[0].plain_text().strip()
        links = tags[0].wikilinks
        
    if plain_text == '== Theorem ==':
        contents = ""
        links = []

    links = [l.title for l in links]
    links = [redirects.get(l, l) for l in links]
    
    categories = [node.title.split('Category:')[1] for node in wnode.wikilinks if node.title.startswith('Category:')]
    
    data = {
        'type': 'theorem',
        'title': theorem_title,
        'contents': contents,
        'full_contents': text,
        'links': links,
        'has_contents': contents != '',
        'categories': categories,
    }
    parsed.append(data)
    name_to_parsed[theorem_title] = data
    
print("%d parsed, %d exceptions." % (len(parsed), len(exceptions)))
print("has theorem content: %d" % (len([x for x in parsed if x['has_contents']])))

100%|██████████| 20203/20203 [00:24<00:00, 828.52it/s] 

19734 parsed, 469 exceptions.
has theorem content: 16473





#### Definitions

In [8]:
parsed_def = []
name_to_parsed_def = {}


pages = soup.find_all('page')
item_pages = [page for page in pages if (
    ("Definition:" in page.title.text) and
    (not page.title.text.startswith('Talk:')) and
    (not page.title.text.startswith('User:')) and
    (not page.title.text.startswith('User talk:')) and
    (not page.title.text.startswith('Help:')) and 
    (not page.redirect) and
    (not page.title.text in name_to_parsed)  # some 'definitions' are actually theorem pages, e.g. Definition:Stabilizer
)]
discarded = []
for page in tqdm(item_pages, total=len(item_pages)):
    title = page.title.text.replace('\u200e', '').replace('\u2062', '')
    
    # parse WikiMedia format
    text = page.text.replace('\u200e', '').replace('\u2062', '')
    wnode = wtp.parse(text)
    
    sections = [s for s in wnode.sections if s.title is not None and s.title.strip() == 'Definition']

    if len(sections) == 0:
        contents = ""
        links = []
    else:
        section = sections[0]

        # get the content inside the <onlyinclude> tag, if there is one.
        tags = [t for t in section.tags() if t.name == 'onlyinclude']
        if len(tags) == 0:
            contents = section.contents.strip()
            links = section.wikilinks
            links = [l.title for l in links]
        else:
            contents = tags[0].contents.strip()
            links = tags[0].wikilinks
            links = [l.title for l in links]
    
    links = [redirects.get(l, l) for l in links]
    categories = [node.title.split('Category:')[1] for node in wnode.wikilinks if node.title.startswith('Category:')]
   
    data = {
        'type': 'definition',
        'title': title,
        'contents': contents,
        'full_contents': text,
        'links': links,
        'has_contents': contents != "",
        'categories': categories
    }
    parsed_def.append(data)
    name_to_parsed_def[title] = data
    

print("%d parsed" % (len(parsed_def)))
print("has definition content: %d" % (len([x for x in parsed_def if x['has_contents']])))

100%|██████████| 12420/12420 [00:07<00:00, 1576.04it/s]

12420 parsed
has definition content: 9982





#### Proofs

In [10]:
parsed_proof = []
name_to_parsed_proof = {}

pages = soup.find_all('page')
item_pages = [page for page in pages if (
    (not page.title.text.startswith('Talk:')) and
    (not page.title.text.startswith('User:')) and
    (not page.title.text.startswith('User talk:')) and
    (not page.title.text.startswith('Help:'))
)]
exceptions = []
discarded = []
missing_links = []

for page in tqdm(item_pages, total=len(item_pages)):
    title = page.title.text.strip('\u200e')
        
    # parse WikiMedia format
    text = page.text.replace('\u200e', '').replace('\u2062', '')
    wnode = wtp.parse(text)  
    
    # get the WikiMedia Section that has "== Proof ==" as its title
    sections = [s for s in wnode.sections if s.title is not None and s.title.strip() == 'Proof']

    if len(sections) == 0:
        exceptions.append(page)
        continue
    else:
        section = sections[0]

    # get the content inside the <onlyinclude> tag, if there is one.
    tags = [t for t in section.tags() if t.name == 'onlyinclude']
    if len(tags) == 0:
        contents = section.contents.strip()
        links = section.wikilinks
    else:
        contents = tags[0].contents.strip()
        links = tags[0].wikilinks

    links = [l.title.strip('\u200e') for l in links]
    links = [redirects.get(l, l) for l in links]
     
    for ltitle in links:
        if ltitle not in name_to_parsed and ltitle not in name_to_parsed_def:
            if ltitle.startswith('Category:') or ltitle.startswith('File:'):
                continue
            missing_links.append(ltitle)
    
    # skip proofs without a proof
    if contents == '{{ProofWanted}}' or contents == '{{proof wanted}}' or contents == '{{finish}}' or contents == '{{Finish}}':
        continue
        
    categories = [node.title.split('Category:')[1] for node in wnode.wikilinks if node.title.startswith('Category:')]

    data = {
        'type': 'proof',
        'title': title,
        'contents': contents,
        'links': links,
        'categories': categories
    }
    parsed_proof.append(data)
    name_to_parsed_proof[title] = data
    

print(len(missing_links), len(set(missing_links)))
print("%d parsed, %d exceptions." % (len(parsed_proof), len(exceptions)))

100%|██████████| 67077/67077 [00:33<00:00, 1973.51it/s]

4988 2019
19956 parsed, 46454 exceptions.





#### Parse pages that are linked to that we missed

- Axioms
- Proof techniques
- Corollaries
- ...

In [11]:
missing_links_set = set(missing_links)
pages = soup.find_all('page')
pages = [page for page in pages if page.title.text in missing_links_set]

parsed_extra = []
name_to_parsed_extra = {}

for page in tqdm(pages, total=len(pages)):
    title = page.title.text.strip('\u200e')
    
    # parse WikiMedia format
    text = page.text.replace('\u200e', '').replace('\u2062', '')
    wnode = wtp.parse(text)  
    
    links = wnode.wikilinks if wnode.wikilinks is not None else []
    links = [l.title.strip('\u200e') for l in links]
    links = [redirects.get(l, l) for l in links]

    categories = [node.title.split('Category:')[1] for node in wnode.wikilinks if node.title.startswith('Category:')]

    data = {
        'type': 'extra',
        'title': title,
        'contents': text,
        'full_contents': text,
        'has_contents': True,
        'links': links,
        'categories': categories
    }
    parsed_extra.append(data)
    name_to_parsed_extra[title] = data
    

print("%d parsed." % (len(parsed_extra)))

100%|██████████| 1006/1006 [00:00<00:00, 2127.86it/s]

1006 parsed.





#### Next, we remove the remaining missing links

In [12]:
link_set = set()
for name in name_to_parsed:
    link_set.add(name)

for name in name_to_parsed_def:
    link_set.add(name)
    
for name in name_to_parsed_extra:
    link_set.add(name)
    
print(len(link_set))

33160


In [13]:
for name, item in name_to_parsed.items():
    links = item['links']
    links = [l for l in links if l in link_set]
    item['links'] = links

for name, item in name_to_parsed_def.items():
    links = item['links']
    links = [l for l in links if l in link_set]
    item['links'] = links
    
for name, item in name_to_parsed_proof.items():
    links = item['links']
    links = [l for l in links if l in link_set]
    item['links'] = links
    
for name, item in name_to_parsed_extra.items():
    links = item['links']
    links = [l for l in links if l in link_set]
    item['links'] = links


#### Add a flag to denote whether a theorem has a proof

In [16]:
theorem_no_proof = set()

for name in tqdm(name_to_parsed, total=len(name_to_parsed)):
    theorem = name_to_parsed[name]
    proof_names = []
    for proof_name in name_to_parsed_proof:
        if name == proof_name:
            proof_names.append(proof_name)
        elif name in proof_name and '/' in proof_name and len(proof_name.split('/')) == 2:
            suffix = proof_name.split('/')[-1]
            if 'proof' in suffix.lower():
                proof_names.append(proof_name)
        else:  # NOTE: we discard some proofs this way
            pass
    theorem['proof_names'] = proof_names
    theorem['has_proof'] = len(proof_names) > 0


100%|██████████| 19734/19734 [01:22<00:00, 239.83it/s]


### Produce dataset

#### Utilities

In [18]:
def replace_links(lines):
    def __replace(line):
        matches = re.findall(r'(\[\[([^]]*)\]\])', line)
        for match in matches:
            full, inner = match
            splt = inner.split('|')
            if len(splt) == 1:
                txt = splt[0]
            elif len(splt) == 2:
                txt = splt[1]
            else:
                txt = ''.join(splt[1:])
            if full in line:
                line = line.replace(full, txt)
        return line
    lines_ = [
        __replace(line) for line in lines
    ]
    return lines_

Theorems with proofs.

In [19]:
examples_json = {
    'examples': [],
    'theorems': [],
    'definitions': [],
    'other': [],
    'proofs': []
}

# `examples`: contains theorems that have contents and at least one proof with at least one reference 
for theorem_name in name_to_parsed:
    theorem = name_to_parsed[theorem_name]
    
    if not theorem['has_proof'] or not theorem['has_contents']:
        continue
    
    example = {}
    example['type'] = 'theorem'
    example['has_proof'] = theorem['has_proof']
    example['title'] = theorem['title']
    example['proof_titles'] = theorem['proof_names']
    example['categories'] = theorem['categories']
    example['statement'] = {
        'contents': [line for line in theorem['contents'].split('\n') if line != ''],
        'refs': theorem['links'],
    }
    example['statement']['read_contents'] = replace_links(example['statement']['contents'])

    nrefs = 0
    example['proofs'] = []    
    for proof_name in theorem['proof_names']:
        proof_ = name_to_parsed_proof[proof_name]
        proof = {
            'title': proof_name,
            'refs': name_to_parsed_proof[proof_name]['links']
        }
        example['proofs'].append(proof)
        
        nrefs += len(proof['refs'])
    
    # only keep if there is at least one reference
    if nrefs == 0:
        continue
    
    if len(example['proofs']) == 0:
        print(theorem_name)   
    examples_json['examples'].append(example)
    
# store _all_ theorems (including `full_contents` as well) separately
for name in name_to_parsed:
    item = name_to_parsed[name]
    example = {}
    example['type'] = 'theorem'
    example['has_proof'] = item['has_proof']
    example['has_contents'] = item['has_contents']
    example['title'] = item['title']
    example['proof_titles'] = item['proof_names']
    example['contents'] = [line for line in item['contents'].split('\n') if line != '']
    example['read_contents'] = replace_links(example['contents'])
    example['refs'] = item['links']
    example['categories'] = item['categories']
    examples_json['theorems'].append(example)
    
# store all proofs separately
for name in name_to_parsed_proof:
    item = name_to_parsed_proof[name]
    example = {
        'type': 'proof',
        'title': item['title'],
        'contents': [line for line in item['contents'].split('\n') if line != ''],
        'refs': item['links'],
        'categories': item['categories']
    }
    example['read_contents'] = replace_links(example['contents'])
    examples_json['proofs'].append(example)

# store all definitions separately
for name in name_to_parsed_def:
    item = name_to_parsed_def[name]
    example = {
        'type': 'definition',
        'title': item['title'],
        'has_contents': item['has_contents'],
        'contents': [line for line in item['contents'].split('\n') if line != ''],
        'refs': item['links'],
        'categories': item['categories']
    }
    example['read_contents'] = replace_links(example['contents'])
    examples_json['definitions'].append(example)

# store all additional pages that are linked to
for name in name_to_parsed_extra:
    item = name_to_parsed_extra[name]
    example = {
        'type': 'other',
        'title': item['title'],
        'has_contents': item['has_contents'],
        'contents': [line for line in item['contents'].split('\n') if line != ''],
        'refs': item['links'],
        'categories': item['categories']
    }
    example['read_contents'] = replace_links(example['contents'])
    examples_json['other'].append(example)


#### Remove duplicate examples
- $(x_1,y_1),(x_2,y_2)$ such that the contents of $y_1$ exactly matches the contents of $y_2$.

In [20]:
from collections import defaultdict
duplicates = defaultdict(list)
for example in tqdm(examples_json['examples']):
    for example2 in examples_json['examples']:
        if example['title'] == example2['title']:
            continue
        
        for proof1_title in example['proof_titles']:
            for proof2_title in example2['proof_titles']:
                proof1 = name_to_parsed_proof[proof1_title]
                proof2 = name_to_parsed_proof[proof2_title]
                if proof1['contents'] == proof2['contents']:
                    duplicates[example['title']].append(example2['title'])
                    
print(len(duplicates))

100%|██████████| 13859/13859 [05:17<00:00, 43.59it/s]

328





In [21]:
removed = set()
to_remove = []
for title in sorted(duplicates.keys()):
    if title in removed:
        continue
    
    for dup_title in duplicates[title]:
        if dup_title not in removed:
            to_remove.append(dup_title)
            removed.add(dup_title)

len(to_remove)

199

In [22]:
examples = [x for x in examples_json['examples'] if x['title'] not in to_remove]
examples_json['examples'] = examples

theorem statement is the same:

In [23]:
from collections import defaultdict
duplicates = defaultdict(list)
for example in tqdm(examples_json['examples']):
    for example2 in examples_json['examples']:
        if example['title'] == example2['title']:
            continue
        
        if ''.join(example['statement']['contents']) == ''.join(example2['statement']['contents']):
            duplicates[example['title']].append(example2['title'])
                    
print(len(duplicates))

100%|██████████| 13660/13660 [02:57<00:00, 76.92it/s]

107





In [24]:
removed = set()
to_remove = []
for title in sorted(duplicates.keys()):
    if title in removed:
        continue
    
    for dup_title in duplicates[title]:
        if dup_title not in removed:
            to_remove.append(dup_title)
            removed.add(dup_title)

len(to_remove)

63

In [25]:
examples = [x for x in examples_json['examples'] if x['title'] not in to_remove]
examples_json['examples'] = examples

Finally, assign each reference (theorem/definition/other) a unique id, and include the reference ids in the examples.

In [26]:
name_to_id = {}
proof_name_to_id = {}
retrieval_set = []
for item in examples_json['theorems']:
    if item['title'] not in name_to_id:
        name_to_id[item['title']] = len(name_to_id)
        item['id'] = name_to_id[item['title']]
    else:
        print(item['title'])
        
for item in examples_json['definitions']:
    if item['title'] not in name_to_id:
        name_to_id[item['title']] = len(name_to_id)
        item['id'] = name_to_id[item['title']]
    else:
        print(item['title'])
        
for item in examples_json['other']:
    if item['title'] not in name_to_id:
        name_to_id[item['title']] = len(name_to_id)
        item['id'] = name_to_id[item['title']]
    else:
        print(item['title'])

for i, item in enumerate(examples_json['proofs']):
    name = 'proof_'+item['title']
    if name not in name_to_id:
        proof_name_to_id[name] = i
        item['proof_id'] = proof_name_to_id[name]
    else:
        print(name)
        
for i, example in enumerate(examples_json['examples']):
    example['example_id'] = i
    example['theorem_id'] = name_to_id[example['title']]
    # references in statement
    ref_ids = [name_to_id[ref] for ref in example['statement']['refs']]
    example['statement']['ref_ids'] = ref_ids
    
    # references in proofs
    for proof in example['proofs']:
        ref_ids = [name_to_id[ref] for ref in proof['refs']]
        proof['ref_ids'] = ref_ids
        proof['proof_id'] = proof_name_to_id['proof_'+proof['title']]

Rename `examples` as `retrieval_examples`

In [27]:
examples_json['retrieval_examples'] = examples_json.pop('examples')

In [29]:
dataset = {
    'dataset': examples_json,
}


import json
with open('dataset.json', 'w') as f:
    json.dump(dataset, f)