### This implements a lot of the example here:
https://github.com/openai/openai-cookbook/blob/57024c70cff473fb520105e9aea3ab4e514be0df/examples/fine-tuned_qa/olympics-1-collect-data.ipynb


### The purpose of this notebook is to build a dataset of text that we can use elsewhere to demonstrate question answering using embeddings

In [5]:
import pandas as pd
import wikipedia

# For wikipedia API documentation see here: https://wikipedia.readthedocs.io/en/latest/code.html



In [6]:
!pip install transformers



In [7]:
import re
from typing import Set
from transformers import GPT2TokenizerFast

import numpy as np
from nltk.tokenize import sent_tokenize

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/tim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [23]:

def get_wiki_page(title):
    """
    Get the wikipedia page given a title
    """
    try:
        print(f"Fetching page '{title}'")
        return wikipedia.page(title)
    except wikipedia.exceptions.DisambiguationError as e:
        return wikipedia.page(e.options[0])
    except wikipedia.exceptions.PageError as e:
        return None

def recursively_find_all_pages(titles, titles_so_far=set(), title_filter=None):
    """
    Recursively find all the pages that are linked to the Wikipedia titles in the list
    """
    all_pages = []
    
    titles = list(set(titles) - titles_so_far)
    if title_filter:
        titles = title_filter(titles)
    titles_so_far.update(titles)
    for title in titles:
        page = get_wiki_page(title)
        if page is None:
            continue
        all_pages.append(page)

        new_pages = recursively_find_all_pages(page.links, titles_so_far, title_filter)
        for pg in new_pages:
            if pg.title not in [p.title for p in all_pages]:
                all_pages.append(pg)
        titles_so_far.update(page.links)
    return all_pages




In [9]:
# Define a filter to limit pages to genomics topics,
# and find all matching wikipedia pages--

def filter_genomics_titles(titles):
    """
    Get the titles which are related to genomics, given a list of titles
    """
    titles = [title for title in titles if 'genom' in title.lower()]
    
    return titles

In [10]:
# Get all the pages for a topic
# THIS TAKES A WHILE
pages = recursively_find_all_pages(["Genomics"], title_filter=filter_genomics_titles)
len(pages)


Fetching page 'Genomics'
Fetching page 'Pharmacogenomics'
Fetching page 'Cancer pharmacogenomics'
Fetching page 'Pharmacogenomics (journal)'
Fetching page 'Personal genomics'
Fetching page 'New York Genome Center'
Fetching page 'The Cancer Genome Atlas'
Fetching page 'International Cancer Genome Consortium'
Fetching page 'Chinese Cancer Genome Consortium'
Fetching page 'National Center for Cancer Genomics, National Project for Personalized Genomic Medicine, South Korean Ministry of Health and Welfare'
Fetching page 'Genomics England'
Fetching page 'Genomes'
Fetching page 'The Institute for Genomic Research'
Fetching page 'Genome Research'
Fetching page 'Bacterial genome size'
Fetching page 'Metagenome'
Fetching page 'Clinical metagenomic sequencing'
Fetching page 'Ecogenomics'
Fetching page 'Genomic library'
Fetching page 'Human genome project'
Fetching page 'Genome Reference Consortium'
Fetching page 'Wellcome Genome Campus'
Fetching page 'Chimpanzee genome project'
Fetching page 'Gen



  lis = BeautifulSoup(html).find_all('li')


Fetching page 'Genomic organization'
Fetching page 'Genome comparison'
Fetching page 'Genome projects'
Fetching page 'International Grape Genome Program'
Fetching page 'Neanderthal Genome Project'
Fetching page 'Bovine Genome Project'
Fetching page 'Bovine genome database'
Fetching page 'Bovine Genome Sequencing and Analysis Consortium'
Fetching page 'Bovine genome'
Fetching page '100K Pathogen Genome Project'
Fetching page 'Honey Bee Genome Sequencing Consortium'
Fetching page 'Honey bee genome'
Fetching page 'Sociogenomics'
Fetching page 'List of sequenced prokaryotic genomes'
Fetching page 'Horse genome'
Fetching page 'Dog genome'
Fetching page 'History of genomics'
Fetching page 'VISTA (comparative genomics)'
Fetching page 'Genome evolution'
Fetching page 'Genome wide association study'
Fetching page 'Genome-wide significance'
Fetching page 'Universal Declaration on the Human Genome and Human Rights'
Fetching page 'Chimpanzee Genome Project'
Fetching page 'Human Genome Project - Wr

168

In [13]:
# what is a page?
pages[0].revision_id
# create a link to the wikipedia page with 
# https://en.wikipedia.org/?oldid={revision_id}
# for example https://en.wikipedia.org/?oldid=1137582695
# or url = f"https://en.wikipedia.org/?oldid={pages[0].revision_id}"

url = f"https://en.wikipedia.org/?oldid={pages[0].revision_id}"
url

'https://en.wikipedia.org/?oldid=1137582695'

In [None]:
https://en.wikipedia.org/?oldid=1136190969

In [16]:
# This could be a class that defines which tokenizer to use when an instance is created.

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def count_tokens(text: str) -> int:
    """count the number of tokens in a string"""
    return len(tokenizer.encode(text))

def reduce_long(
    long_text: str, long_text_tokens: bool = False, max_len: int = 590
) -> str:
    """
    Reduce a long text to a maximum of `max_len` tokens by potentially cutting at a sentence end
    """
    if not long_text_tokens:
        long_text_tokens = count_tokens(long_text)
    if long_text_tokens > max_len:
        sentences = sent_tokenize(long_text.replace("\n", " "))
        ntokens = 0
        for i, sentence in enumerate(sentences):
            ntokens += 1 + count_tokens(sentence)
            if ntokens > max_len:
                return ". ".join(sentences[:i][:-1]) + "."

    return long_text

def extract_sections(
    wiki_text: str,
    title: str,
    page_id: int, 
    max_len: int = 1500,
    discard_categories: Set[str] = None,
) -> str:
    """
    Extract the sections of a Wikipedia page, discarding the references and other low information sections
    """
    if len(wiki_text) == 0:
        return []

    # find all headings and the coresponding contents
    headings = re.findall("==+ .* ==+", wiki_text)
    for heading in headings:
        wiki_text = wiki_text.replace(heading, "==+ !! ==+")
    contents = wiki_text.split("==+ !! ==+")
    contents = [c.strip() for c in contents]
    assert len(headings) == len(contents) - 1

    cont = contents.pop(0).strip()
    outputs = [(title, "Summary", cont, count_tokens(cont)+4)]
    
    # discard the discard categories, accounting for a tree structure
    max_level = 100
    keep_group_level = max_level
    remove_group_level = max_level
    nheadings, ncontents = [], []
    for heading, content in zip(headings, contents):
        plain_heading = " ".join(heading.split(" ")[1:-1])
        num_equals = len(heading.split(" ")[0])
        if num_equals <= keep_group_level:
            keep_group_level = max_level

        if num_equals > remove_group_level:
            if (
                num_equals <= keep_group_level
            ):
                continue
        keep_group_level = max_level
        if plain_heading in discard_categories:
            remove_group_level = num_equals
            keep_group_level = max_level
            continue
        nheadings.append(heading.replace("=", "").strip())
        ncontents.append(content)
        remove_group_level = max_level

    # count the tokens of each section
    ncontent_ntokens = [
        count_tokens(c)
        + 3
        + count_tokens(" ".join(h.split(" ")[1:-1]))
        - (1 if len(c) == 0 else 0)
        for h, c in zip(nheadings, ncontents)
    ]

    # Create a tuple of (title, section_name, content, number of tokens)
    outputs += [(title, page_id, h, c, t) if t<max_len 
                else (title, page_id, h, reduce_long(c, max_len), count_tokens(reduce_long(c,max_len))) 
                    for h, c, t in zip(nheadings, ncontents, ncontent_ntokens)]
    
    return outputs



In [17]:
# Example page being processed into sections

discard_categories = ['See also', 'References', 'External links', 'Further reading', "Footnotes",
    "Bibliography", "Sources", "Citations", "Literature", "Footnotes", "Notes and references",
    "Photo gallery", "Works cited", "Photos", "Gallery", "Notes", "References and sources",
    "References and notes"]

example_title = 'Epigenomics'
example_page = get_wiki_page(example_title)
sections = extract_sections(example_page.content, example_page.revision_id, example_page.title, discard_categories=discard_categories)

print(f"There are {len(sections)} sections in the page {example_title}")
for section in sections:
    print('\nSection:\n')
    print(section)

Fetching page 'Epigenomics'


Token indices sequence length is longer than the specified maximum sequence length for this model (2522 > 1024). Running this sequence through the model will result in indexing errors


There are 48 sections in the page Epigenomics

Section:

(1136190969, 'Summary', 'In biology, epigenetics is the study of stable phenotypic changes (known as marks) that do not involve alterations in the DNA sequence. The Greek prefix epi- (ἐπι- "over, outside of, around") in epigenetics implies features that are "on top of" or "in addition to" the traditional genetic basis for inheritance. Epigenetics most often involves changes that affect the regulation of gene expression, but the term can also be used to describe any heritable phenotypic change. Such effects on cellular and physiological phenotypic traits may result from external or environmental factors, or be part of normal development.\nThe term also refers to the mechanism of changes: functionally relevant alterations to the genome that do not involve mutation of the nucleotide sequence. Examples of mechanisms that produce such changes are DNA methylation and histone modification, each of which alters how genes are expressed wi

In [48]:
# Build a data set from the wikipedia page sections

res = []
page_ids = []
for page in pages:
    res += extract_sections(page.content, page.revision_id, page.title, discard_categories=discard_categories)
    page_ids.append(page.revision_id)
df = pd.DataFrame(res, columns=["page_id", "title", "heading", "content", "tokens"])
df['url'] = df['page_id'].apply(lambda v: f"https://en.wikipedia.org/?oldid={v}")
df = df[df.tokens>40]
df = df.drop_duplicates(['title','heading'])
df = df.reset_index().drop('index',axis=1) # reset index
df.head()

df.to_csv('genomics-data/genomics_sections.csv', index=False)
df

Unnamed: 0,page_id,title,heading,content,tokens,url
0,1137582695,Genomics,Etymology,"From the Greek ΓΕΝ gen, ""gene"" (gamma, epsilon...",166.0,https://en.wikipedia.org/?oldid=1137582695
1,1137582695,Genomics,Early sequencing efforts,Following Rosalind Franklin's confirmation of ...,248.0,https://en.wikipedia.org/?oldid=1137582695
2,1137582695,Genomics,DNA-sequencing technology developed,In addition to his seminal work on the amino a...,355.0,https://en.wikipedia.org/?oldid=1137582695
3,1137582695,Genomics,Complete genomes,The advent of these technologies resulted in a...,691.0,https://en.wikipedia.org/?oldid=1137582695
4,1137582695,Genomics,"The ""omics"" revolution",The English-language neologism omics informall...,198.0,https://en.wikipedia.org/?oldid=1137582695
...,...,...,...,...,...,...
1308,1060657820,Toxicogenomics,Public projects,Chemical Effects in Biological Systems is a pr...,307.0,https://en.wikipedia.org/?oldid=1060657820
1309,1061709342,Comparative Toxicogenomics Database,Background,The Comparative Toxicogenomics Database (CTD) ...,79.0,https://en.wikipedia.org/?oldid=1061709342
1310,1061709342,Comparative Toxicogenomics Database,Goals and objectives,One of the primary goals of CTD is to advance ...,144.0,https://en.wikipedia.org/?oldid=1061709342
1311,1061709342,Comparative Toxicogenomics Database,Core data,CTD is a unique resource where biocurators rea...,58.0,https://en.wikipedia.org/?oldid=1061709342


In [41]:
a = df['url'].unique()
a[0]

'https://en.wikipedia.org/?oldid=0       1137582695\n1       1137582695\n2       1137582695\n3       1137582695\n4       1137582695\n           ...    \n1743    1061709342\n1744    1061709342\n1745    1061709342\n1746    1061709342\n1747    1061709342\nName: page_id, Length: 1748, dtype: int64'

In [47]:
df['url'] = df['page_id'].apply(lambda v: f"https://en.wikipedia.org/?oldid={v}")
df

Unnamed: 0,page_id,title,heading,content,tokens,url
0,1137582695,Genomics,Etymology,"From the Greek ΓΕΝ gen, ""gene"" (gamma, epsilon...",166.0,https://en.wikipedia.org/?oldid=1137582695
1,1137582695,Genomics,Early sequencing efforts,Following Rosalind Franklin's confirmation of ...,248.0,https://en.wikipedia.org/?oldid=1137582695
2,1137582695,Genomics,DNA-sequencing technology developed,In addition to his seminal work on the amino a...,355.0,https://en.wikipedia.org/?oldid=1137582695
3,1137582695,Genomics,Complete genomes,The advent of these technologies resulted in a...,691.0,https://en.wikipedia.org/?oldid=1137582695
4,1137582695,Genomics,"The ""omics"" revolution",The English-language neologism omics informall...,198.0,https://en.wikipedia.org/?oldid=1137582695
...,...,...,...,...,...,...
1308,1060657820,Toxicogenomics,Public projects,Chemical Effects in Biological Systems is a pr...,307.0,https://en.wikipedia.org/?oldid=1060657820
1309,1061709342,Comparative Toxicogenomics Database,Background,The Comparative Toxicogenomics Database (CTD) ...,79.0,https://en.wikipedia.org/?oldid=1061709342
1310,1061709342,Comparative Toxicogenomics Database,Goals and objectives,One of the primary goals of CTD is to advance ...,144.0,https://en.wikipedia.org/?oldid=1061709342
1311,1061709342,Comparative Toxicogenomics Database,Core data,CTD is a unique resource where biocurators rea...,58.0,https://en.wikipedia.org/?oldid=1061709342


In [34]:
df.apply(lambda s: f"https://en.wikipedia.org/?oldid={s['url']}" if s.name == 'url' else s, axis=1)

#df.apply(lambda x: np.square(x) if x.name == 'd' else x, axis=1)
#df

Unnamed: 0,url,title,heading,content,tokens
0,1137582695,Genomics,Etymology,"From the Greek ΓΕΝ gen, ""gene"" (gamma, epsilon...",166.0
1,1137582695,Genomics,Early sequencing efforts,Following Rosalind Franklin's confirmation of ...,248.0
2,1137582695,Genomics,DNA-sequencing technology developed,In addition to his seminal work on the amino a...,355.0
3,1137582695,Genomics,Complete genomes,The advent of these technologies resulted in a...,691.0
4,1137582695,Genomics,"The ""omics"" revolution",The English-language neologism omics informall...,198.0
...,...,...,...,...,...
1308,1060657820,Toxicogenomics,Public projects,Chemical Effects in Biological Systems is a pr...,307.0
1309,1061709342,Comparative Toxicogenomics Database,Background,The Comparative Toxicogenomics Database (CTD) ...,79.0
1310,1061709342,Comparative Toxicogenomics Database,Goals and objectives,One of the primary goals of CTD is to advance ...,144.0
1311,1061709342,Comparative Toxicogenomics Database,Core data,CTD is a unique resource where biocurators rea...,58.0


In [20]:
# The number of tokens processable by gpt2 is 1024
# I'm not sure what the downstream effects of these might be 
df[df['tokens'] > 1024].sort_values('tokens', ascending=False)

Unnamed: 0,url,title,heading,content,tokens
695,https://en.wikipedia.org/?oldid=0 113758...,List of sequenced animal genomes,Bony fish,"Order Anabantiformes\nBetta splendens, Siamese...",1496.0
226,https://en.wikipedia.org/?oldid=0 113758...,Neanderthal genome project,Findings,The researchers recovered ancient DNA of Neand...,1481.0
1210,https://en.wikipedia.org/?oldid=0 113758...,Biological effects of radiation on the epigenome,The lethal-potentially lethal model,This equation explores the hypothesis of a les...,1464.0
1227,https://en.wikipedia.org/?oldid=0 113758...,Gene,Conflicting definitions of 'gene',There are lots of different ways to use the te...,1294.0
96,https://en.wikipedia.org/?oldid=0 113758...,DeCODE genetics,Discoveries and scientific contributions,"Genome research in general, and deCODE's globa...",1277.0
161,https://en.wikipedia.org/?oldid=0 113758...,Clinical metagenomic sequencing,Laboratory workflow,A typical mNGS workflow consists of the follow...,1275.0
95,https://en.wikipedia.org/?oldid=0 113758...,DeCODE genetics,Genome of a Nation,By the time Bill Clinton and Tony Blair announ...,1268.0
449,https://en.wikipedia.org/?oldid=0 113758...,Rat Genome Database,Data,RGD’s data consists of manual annotations from...,1239.0
230,https://en.wikipedia.org/?oldid=0 113758...,Neanderthal genetics,Interbreeding with anatomically modern humans,Researchers addressed the question of possible...,1237.0
173,https://en.wikipedia.org/?oldid=0 113758...,Economics,Definitions of economics over time,The earlier term for the discipline was 'polit...,1231.0


In [None]:
# It seems like all of this could be consolidated into a pretty versitile package for 
# extracting data from wikipedia on a given topic.