### This notebook implements much of the example from the OpenAI Cookbook here:
https://github.com/openai/openai-cookbook/blob/57024c70cff473fb520105e9aea3ab4e514be0df/examples/fine-tuned_qa/olympics-1-collect-data.ipynb


### The purpose of this notebook is to build a dataset of text that we can use elsewhere to demonstrate question answering using embeddings

In [41]:
import pandas as pd
import wikipedia

# For wikipedia API documentation see here: https://wikipedia.readthedocs.io/en/latest/code.html



In [42]:
!pip install transformers



In [43]:
import re
from typing import Set
from transformers import GPT2TokenizerFast

import numpy as np
from nltk.tokenize import sent_tokenize

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/tim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [44]:

def get_wiki_page(title):
    """
    Get the wikipedia page given a title

    Returns the page object from `wikipedia.page`, or None when the title
    cannot be resolved. If the title is ambiguous, the first disambiguation
    option is tried once; if that lookup is itself ambiguous or missing (or
    there are no options at all), None is returned instead of raising, so a
    single bad title does not abort a long crawl.
    """
    print(f"Fetching page '{title}'")
    try:
        return wikipedia.page(title)
    except wikipedia.exceptions.DisambiguationError as e:
        if not e.options:
            return None
        # The fallback lookup runs inside an exception handler, so it needs
        # its own guard: errors raised here would otherwise propagate.
        try:
            return wikipedia.page(e.options[0])
        except (
            wikipedia.exceptions.DisambiguationError,
            wikipedia.exceptions.PageError,
        ):
            return None
    except wikipedia.exceptions.PageError:
        return None

def recursively_find_all_pages(titles, titles_so_far=None, title_filter=None):
    """
    Recursively find all the pages that are linked to the Wikipedia titles in the list

    titles: iterable of page titles to start from.
    titles_so_far: set of titles already handled; it is updated in place so that
        recursive calls share it. When omitted, a fresh set is created for this
        call (a mutable default would leak state between independent crawls).
    title_filter: optional callable that takes a list of titles and returns the
        titles that should be followed.

    Returns a list of page objects (as returned by get_wiki_page). Pages found
    through links are de-duplicated by their `title` attribute.
    """
    if titles_so_far is None:
        titles_so_far = set()

    titles = list(set(titles) - titles_so_far)
    if title_filter:
        titles = title_filter(titles)
    titles_so_far.update(titles)

    all_pages = []
    # Titles of the pages already in all_pages, kept as a set so the
    # duplicate check below is a constant-time lookup.
    collected_titles = set()
    for title in titles:
        page = get_wiki_page(title)
        if page is None:
            continue
        all_pages.append(page)
        collected_titles.add(page.title)

        new_pages = recursively_find_all_pages(page.links, titles_so_far, title_filter)
        for pg in new_pages:
            if pg.title not in collected_titles:
                all_pages.append(pg)
                collected_titles.add(pg.title)
        # Mark every link of this page as seen, including those the filter rejected.
        titles_so_far.update(page.links)
    return all_pages




In [46]:
# Define a filter to limit pages to the 2020 summer olympics,
# and find all matching wikipedia pages--
# THIS TAKES A WHILE

def filter_olympic_2020_titles(titles):
    """
    Select the titles that contain '2020' and, ignoring case, both 'olympi' and 'summer'
    """
    kept = []
    for candidate in titles:
        if '2020' not in candidate:
            continue
        lowered = candidate.lower()
        if 'olympi' in lowered and 'summer' in lowered:
            kept.append(candidate)
    return kept

# Crawl outward from the main 2020 Summer Olympics article, following only
# linked titles that pass filter_olympic_2020_titles; every followed title is
# fetched through get_wiki_page.
pages = recursively_find_all_pages(["2020 Summer Olympics"], title_filter=filter_olympic_2020_titles)
# Number of pages collected by the crawl
len(pages)

Fetching page '2020 Summer Olympics'
Fetching page 'Romania at the 2020 Summer Olympics'
Fetching page 'Archery at the 2020 Summer Olympics - Qualification'
Fetching page 'List of archers at the 2020 Summer Olympics'
Fetching page 'List of taekwondo practitioners at the 2020 Summer Olympics'
Fetching page 'Taekwondo at the 2020 Summer Olympics – Men's 80 kg'
Fetching page 'Taekwondo at the 2020 Summer Olympics – Women's 67 kg'
Fetching page 'Taekwondo at the 2020 Summer Olympics – Women's 49 kg'
Fetching page 'Taekwondo at the 2020 Summer Olympics – Women's 57 kg'
Fetching page 'Taekwondo at the 2020 Summer Olympics – Men's 58 kg'
Fetching page 'Taekwondo at the 2020 Summer Olympics – Men's 68 kg'
Fetching page 'Taekwondo at the 2020 Summer Olympics – Men's +80 kg'
Fetching page 'Taekwondo at the 2020 Summer Olympics – Women's +67 kg'
Fetching page 'List of weightlifters at the 2020 Summer Olympics'
Fetching page 'Weightlifting at the 2020 Summer Olympics – Men's 96 kg'
Fetching page '

Fetching page 'Volleyball at the 2020 Summer Olympics – Women's tournament'
Fetching page 'Beach volleyball at the 2020 Summer Olympics – Men's tournament'
Fetching page 'Beach volleyball at the 2020 Summer Olympics'
Fetching page 'List of sailors at the 2020 Summer Olympics'
Fetching page 'List of triathletes at the 2020 Summer Olympics'
Fetching page 'Triathlon at the 2020 Summer Olympics – Mixed relay'
Fetching page 'Triathlon at the 2020 Summer Olympics – Women's'
Fetching page 'List of canoeists at the 2020 Summer Olympics'
Fetching page 'Canoeing at the 2020 Summer Olympics – Women's K-2 500 metres'
Fetching page 'Canoeing at the 2020 Summer Olympics – Women's K-1 500 metres'
Fetching page 'Canoeing at the 2020 Summer Olympics – Women's slalom C-1'
Fetching page 'Canoeing at the 2020 Summer Olympics – Men's K-2 1000 metres'
Fetching page 'Canoeing at the 2020 Summer Olympics – Women's slalom K-1'
Fetching page 'Canoeing at the 2020 Summer Olympics – Women's C-1 200 metres'
Fetchi

Fetching page 'Football at the 2020 Summer Olympics – Women's tournament – Final'
Fetching page 'Football at the 2020 Summer Olympics – Men's tournament – Group D'
Fetching page 'Football at the 2020 Summer Olympics – Men's tournament – Group A'
Fetching page 'Football at the 2020 Summer Olympics – Women's tournament – Group G'
Fetching page 'Football at the 2020 Summer Olympics – Women's tournament – Group F'
Fetching page 'Football at the 2020 Summer Olympics – Women's qualification (CAF–CONMEBOL play-off)'
Fetching page 'Football at the 2020 Summer Olympics – Men's tournament – Group C'
Fetching page 'Football at the 2020 Summer Olympics – Men's tournament – Final'
Fetching page 'Swimming at the 2020 Summer Olympics – Men's 200 metre individual medley'
Fetching page 'Gymnastics at the 2020 Summer Olympics – Men's artistic individual all-around'
Fetching page 'Gymnastics at the 2020 Summer Olympics – Men's artistic qualification'
Fetching page 'Gymnastics at the 2020 Summer Olympics 

Fetching page 'List of divers at the 2020 Summer Olympics'
Fetching page 'Field hockey at the 2020 Summer Olympics – Men's team squads'
Fetching page 'List of judoka at the 2020 Summer Olympics'
Fetching page 'List of modern pentathletes at the 2020 Summer Olympics'
Fetching page 'Handball at the 2020 Summer Olympics – Men's team rosters'
Fetching page 'Handball at the 2020 Summer Olympics – Women's tournament'
Fetching page 'Handball at the 2020 Summer Olympics – Men's tournament'
Fetching page 'Football at the 2020 Summer Olympics – Men's team squads'
Fetching page 'List of gymnasts at the 2020 Summer Olympics'
Fetching page 'List of cyclists at the 2020 Summer Olympics'
Fetching page 'Cycling at the 2020 Summer Olympics – Men's road race'
Fetching page 'Cycling at the 2020 Summer Olympics – Women's road race'
Fetching page 'List of synchronized swimmers at the 2020 Summer Olympics'
Fetching page 'Basketball at the 2020 Summer Olympics – Women's team rosters'
Fetching page 'Water pol

Fetching page 'Field hockey at the 2020 Summer Olympics – Women's qualification'
Fetching page 'Estonia at the 2020 Summer Olympics'
Fetching page 'Tennis at the 2020 Summer Olympics - Qualification'
Fetching page 'Suriname at the 2020 Summer Olympics'
Fetching page 'Ireland at the 2020 Summer Olympics'
Fetching page 'Trinidad and Tobago at the 2020 Summer Olympics'
Fetching page 'Equestrian at the 2020 Summer Olympics'
Fetching page 'Guinea-Bissau at the 2020 Summer Olympics'
Fetching page 'Burkina Faso at the 2020 Summer Olympics'
Fetching page 'Athletics at the 2020 Summer Olympics - Men's triple jump'
Fetching page 'Mongolia at the 2020 Summer Olympics'
Fetching page 'Mauritania at the 2020 Summer Olympics'
Fetching page 'Tonga at the 2020 Summer Olympics'
Fetching page 'Chile at the 2020 Summer Olympics'
Fetching page 'Rowing at the 2020 Summer Olympics – Qualification'
Fetching page 'Uzbekistan at the 2020 Summer Olympics'
Fetching page 'Tunisia at the 2020 Summer Olympics'
Fetch

Fetching page 'Athletics at the 2020 Summer Olympics'
Fetching page 'Guatemala at the 2020 Summer Olympics'
Fetching page 'Weightlifting at the 2020 Summer Olympics - Qualification'
Fetching page 'Field hockey at the 2020 Summer Olympics'
Fetching page 'Myanmar at the 2020 Summer Olympics'
Fetching page 'Taekwondo at the 2020 Summer Olympics – Qualification'
Fetching page 'China at the 2020 Summer Olympics'
Fetching page 'Cook Islands at the 2020 Summer Olympics'
Fetching page 'Benin at the 2020 Summer Olympics'
Fetching page 'Uganda at the 2020 Summer Olympics'
Fetching page 'Tennis at the 2020 Summer Olympics – Qualification'
Fetching page 'Paraguay at the 2020 Summer Olympics'
Fetching page 'Weightlifting at the 2020 Summer Olympics'
Fetching page 'Rwanda at the 2020 Summer Olympics'
Fetching page 'Beach volleyball at the 2020 Summer Olympics – Women's qualification'
Fetching page 'Marshall Islands at the 2020 Summer Olympics'
Fetching page 'Cyprus at the 2020 Summer Olympics'
Fetch

Fetching page '2020 Summer Olympics torch relay'
Fetching page 'Belgium at the 2020 Summer Olympics'
Fetching page 'Skateboarding at the 2020 Summer Olympics – Nen’s street'
Fetching page 'Skateboarding at the 2020 Summer Olympics – Women’s street'
Fetching page 'Archery at the 2020 Summer Olympics – Qualification'
Fetching page 'Monaco at the 2020 Summer Olympics'
Fetching page 'Basketball at the 2020 Summer Olympics – Men's 3x3 qualification'
Fetching page 'Nicaragua at the 2020 Summer Olympics'
Fetching page 'Mauritius at the 2020 Summer Olympics'
Fetching page 'Angola at the 2020 Summer Olympics'
Fetching page 'Togo at the 2020 Summer Olympics'
Fetching page 'Turkmenistan at the 2020 Summer Olympics'
Fetching page 'Chronological summary of the 2020 Summer Olympics'
Fetching page 'Moldova at the 2020 Summer Olympics'
Fetching page 'Refugee Olympic Team at the 2020 Summer Olympics'
Fetching page 'Sri Lanka at the 2020 Summer Olympics'
Fetching page 'Sailing at the 2020 Summer Olympic

691

In [48]:
# This could be a class that defines which tokenizer to use when an instance is created.

# Module-level GPT-2 tokenizer; count_tokens reads this global.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def count_tokens(text: str) -> int:
    """Return how many token ids the module-level `tokenizer` produces for `text`"""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def reduce_long(
    long_text: str, long_text_tokens: int = 0, max_len: int = 590
) -> str:
    """
    Reduce a long text to a maximum of `max_len` tokens by potentially cutting at a sentence end

    long_text: the text to shorten.
    long_text_tokens: precomputed token count of `long_text`; when falsy (the
        default) the count is computed with count_tokens.
    max_len: token budget for the returned text.

    Returns `long_text` unchanged when it fits the budget, or when the
    per-sentence running total never exceeds `max_len`. Otherwise returns the
    leading sentences (newlines replaced by spaces) joined by single spaces.
    """
    if not long_text_tokens:
        long_text_tokens = count_tokens(long_text)
    if long_text_tokens <= max_len:
        return long_text

    sentences = sent_tokenize(long_text.replace("\n", " "))
    ntokens = 0
    for i, sentence in enumerate(sentences):
        # +1 per sentence allows for the separator between sentences
        ntokens += 1 + count_tokens(sentence)
        if ntokens > max_len:
            # sent_tokenize keeps each sentence's closing punctuation, so join
            # with a plain space; adding ". " would produce doubled periods.
            # NOTE(review): `[:-1]` also drops the last sentence that still
            # fit, as in the original -- presumably a safety margin; confirm.
            return " ".join(sentences[:i][:-1])

    return long_text

def extract_sections(
    wiki_text: str,
    title: str,
    max_len: int = 1500,
    discard_categories: Set[str] = discard_categories,
) -> list:
    """
    Extract the sections of a Wikipedia page, discarding the references and other low information sections

    wiki_text: page text in which headings look like "== Heading ==".
    title: page title, copied into every output tuple.
    max_len: sections whose token estimate reaches this value are shortened
        with reduce_long, using this same budget.
    discard_categories: heading names to drop together with their
        sub-sections. The default is read from a module-level
        `discard_categories` when this function is defined, so that name
        must already exist at that point.

    Returns a list of (title, heading, content, number of tokens) tuples. The
    first one is the text before any heading, labelled "Summary". An empty
    `wiki_text` yields an empty list.
    """
    if len(wiki_text) == 0:
        return []

    # find all headings and the corresponding contents; splitting on the same
    # pattern keeps the two lists aligned even when one heading string is
    # contained in another (e.g. "== Results ==" inside "=== Results ===")
    heading_pattern = re.compile(r"==+ .* ==+")
    headings = heading_pattern.findall(wiki_text)
    contents = [c.strip() for c in heading_pattern.split(wiki_text)]

    cont = contents.pop(0)
    outputs = [(title, "Summary", cont, count_tokens(cont) + 4)]

    # discard the discard categories, accounting for a tree structure:
    # after a discarded heading, every deeper heading is skipped until a
    # heading is kept again
    max_level = 100
    remove_group_level = max_level
    nheadings, ncontents = [], []
    for heading, content in zip(headings, contents):
        heading_parts = heading.split(" ")
        plain_heading = " ".join(heading_parts[1:-1])
        num_equals = len(heading_parts[0])

        if remove_group_level < num_equals <= max_level:
            # nested under a discarded heading
            continue
        if plain_heading in discard_categories:
            remove_group_level = num_equals
            continue
        nheadings.append(heading.replace("=", "").strip())
        ncontents.append(content)
        remove_group_level = max_level

    # count the tokens of each section
    # NOTE(review): `h` already has its "=" markers stripped, so `[1:-1]` drops
    # the first and last words of the heading itself; kept as in the original
    # so that existing token counts do not change -- confirm intent.
    ncontent_ntokens = [
        count_tokens(c)
        + 3
        + count_tokens(" ".join(h.split(" ")[1:-1]))
        - (1 if len(c) == 0 else 0)
        for h, c in zip(nheadings, ncontents)
    ]

    # Create a tuple of (title, section_name, content, number of tokens)
    for h, c, t in zip(nheadings, ncontents, ncontent_ntokens):
        if t < max_len:
            outputs.append((title, h, c, t))
        else:
            # max_len must be passed by keyword: positionally it would land in
            # reduce_long's `long_text_tokens` parameter instead.
            shortened = reduce_long(c, max_len=max_len)
            outputs.append((title, h, shortened, count_tokens(shortened)))

    return outputs



In [49]:
# Example page being processed into sections

# Headings of low-information sections; passed to extract_sections as
# `discard_categories`, which drops these headings and their sub-sections.
# NOTE: "Footnotes" is listed twice; the duplicate does not affect the
# membership test extract_sections performs.
discard_oly_categories = ['See also', 'References', 'External links', 'Further reading', "Footnotes",
    "Bibliography", "Sources", "Citations", "Literature", "Footnotes", "Notes and references",
    "Photo gallery", "Works cited", "Photos", "Gallery", "Notes", "References and sources",
    "References and notes",]

# Fetch a single page and split it into (title, heading, content, tokens) tuples.
# NOTE: get_wiki_page can return None, in which case `.content` below would fail.
bermuda_page = get_wiki_page('Bermuda at the 2020 Summer Olympics')
ber = extract_sections(bermuda_page.content, bermuda_page.title, discard_categories=discard_oly_categories)

# Print every extracted section tuple
for section in ber:
    print('\nSection:\n')
    print(section)

Fetching page 'Bermuda at the 2020 Summer Olympics'

Section:

('Bermuda at the 2020 Summer Olympics', 'Summary', "Bermuda competed at the 2020 Summer Olympics in Tokyo. Originally scheduled to take place from 24 July to 9 August 2020, the Games had been postponed to 23 July to 8 August 2021, because of the COVID-19 pandemic. Since the nation's official debut in 1936, Bermudian athletes have appeared in every edition of the Summer Olympic Games, but did not attend the 1980 Summer Olympics in Moscow because of the nation's support for the US-led boycott.\nThe first ever gold medal for Bermuda was won by triathlete Flora Duffy.", 117)

Section:

('Bermuda at the 2020 Summer Olympics', 'Medalists', '', 2)

Section:

('Bermuda at the 2020 Summer Olympics', 'Competitors', 'The following is the list of number of competitors in the Games.', 16)

Section:

('Bermuda at the 2020 Summer Olympics', 'Rowing', "Bermuda qualified one boat in the men's single sculls for the Games by finishing third i

In [50]:
# Build a data set from the wikipedia page sections

from pathlib import Path

# Collect the section tuples of every crawled page
res = []
for page in pages:
    res.extend(extract_sections(page.content, page.title, discard_categories=discard_oly_categories))

df = pd.DataFrame(res, columns=["title", "heading", "content", "tokens"])
df = df[df.tokens > 40]  # keep only sections with more than 40 tokens
df = df.drop_duplicates(['title', 'heading'])
df = df.reset_index(drop=True)  # renumber rows after filtering

# to_csv does not create missing directories, so make sure the target exists
output_path = Path('olympics-data/summer_olympics_sections.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
df

Token indices sequence length is longer than the specified maximum sequence length for this model (1073 > 1024). Running this sequence through the model will result in indexing errors


Unnamed: 0,title,heading,content,tokens
0,2020 Summer Olympics,Summary,The 2020 Summer Olympics (Japanese: 2020年夏季オリン...,753
1,2020 Summer Olympics,Host city selection,The International Olympic Committee (IOC) vote...,126
2,2020 Summer Olympics,Impact of the COVID-19 pandemic,"In January 2020, concerns were raised about th...",374
3,2020 Summer Olympics,Qualifying event cancellation and postponement,Concerns about the pandemic began to affect qu...,298
4,2020 Summer Olympics,Effect on doping tests,Mandatory doping tests were being severely res...,163
...,...,...,...,...
3689,Serbia at the 2020 Summer Olympics,Table tennis,Serbia entered three athletes into the table t...,62
3690,Serbia at the 2020 Summer Olympics,Taekwondo,Serbia entered two athletes into the taekwondo...,80
3691,Serbia at the 2020 Summer Olympics,Tennis,Serbia entered five tennis players into the Ol...,106
3692,Serbia at the 2020 Summer Olympics,Wrestling,Serbia qualified four wrestlers for each of th...,110


In [51]:
# GPT-2's maximum sequence length is 1024 tokens (see the tokenizer warning above),
# while extract_sections only shortens sections at its max_len (default 1500).
# List the sections longer than 1024 tokens, longest first; the downstream
# effect of these over-long sections has not been checked yet.
df[df['tokens'] > 1024].sort_values('tokens', ascending=False)

Unnamed: 0,title,heading,content,tokens
1647,Venezuela at the 2020 Summer Olympics,Men's tournament,Venezuela men's volleyball team qualified for ...,1457
28,2020 Summer Olympics,Concerns and controversies,Several controversial issues occurred during t...,1398
2071,Concerns and controversies at the 2020 Summer ...,COVID-19 pandemic and other contagion risks,The COVID-19 (coronavirus disease 2019) pandem...,1339
3153,Athletics at the 2020 Summer Olympics – Qualif...,Qualifying standards,A National Olympic Committee (NOC) may enter u...,1260
1039,Cycling at the 2020 Summer Olympics – Men's in...,Race overview,"As soon as the race started, a group of eight ...",1226
15,2020 Summer Olympics,Biosecurity protocols,"In February 2021, the IOC began releasing ""pla...",1076


In [None]:
# It seems like all of this could be consolidated into a pretty versatile package for
# extracting data from Wikipedia on a given topic.