In [7]:
# get all paintings and painters that have an English article on Wikipedia
import json

loaded = True

if not loaded:
    # request Wikidata API
    url = 'https://query.wikidata.org/sparql'
    query = """
    SELECT ?painting ?article1 ?paintor ?article2
    WHERE {
        ?painting wdt:P31 wd:Q3305213 .
        ?article1 schema:about ?painting .
        ?article1 schema:isPartOf <https://en.wikipedia.org/>.
        ?painting wdt:P170 ?paintor .
        ?article2 schema:about ?paintor .
        ?article2 schema:isPartOf <https://en.wikipedia.org/>.
    }
    """
    r = requests.get(url, params = {'format': 'json', 'query': query})
    data = r.json()['results']['bindings']

    # store data into a json file
    with open('../data/wikidata.json', 'w') as outfile:
        json.dump(data, outfile, indent=4)
else:
    # load data from file
    with open('../data/wikidata.json', 'r') as infile:
        data = json.load(infile)

print("%d pairs loaded"%len(data))

6024 pairs loaded


In [8]:
artists_urls = []
objects_urls = []

for item in data:
    artists_urls.append(item['article2']['value'])
    objects_urls.append(item['article1']['value'])

# remove duplicates
unique_artists = set(artists_urls)
unique_objects = set(objects_urls)
print("%d paintings by %d artists"%(len(unique_objects), len(unique_artists)))
print("%d Wiki pages"%(len(unique_objects) + len(unique_artists)))

# show samples
print(artists_urls[0])
print(objects_urls[0])

5953 paintings by 1292 artists
7245 Wiki pages
https://en.wikipedia.org/wiki/Diego_Vel%C3%A1zquez
https://en.wikipedia.org/wiki/Las_Meninas


In [10]:
# use Wikimedia API to get section headings
import json
from collections import Counter

WIKI_API = "https://en.wikipedia.org/w/api.php?format=json&"
GET_SECTIONS = WIKI_API + "action=parse&page=%s&prop=sections"
general_sections = ['External links', 'Bibliography', 'Gallery', 'Notes and references', 'Citations', 'References and sources', 'References', 'Further reading', 'See also', 'Footnotes', 'Sources', 'Notes', 'General']


def collect_headings(urls, filename):
    '''
    Store titles and section headers of the Wikipedia pages
    '''
    pages = []
    with open('../data/%s.jsonl'%filename, 'w') as outfile:
        for url in urls:
            page = url.split('/')[-1]
            # skip duplicates
            if page not in pages:
                pages.append(page)
                entry = {'page': page}
                result = json.loads(requests.get(GET_SECTIONS%page).text)
                if 'parse' in result:
                    parse = result['parse']
                    title = parse['title']
                    entry['title'] = title
                    sections = [section['line'] for section in parse['sections'] if section['line'] not in general_sections]
                    entry['sections'] = sections
                json.dump(entry, outfile)
                outfile.write('\n')
    print("Retrieved %d Wikipedia pages"%len(pages))


# collect_headings(artists_urls, 'artists')
# collect_headings(objects_urls, 'paintings')

In [13]:
# count headings
from collections import Counter


def count_headers(filename):
    # load data from file
    titles, pages = [], []
    headers = Counter()
    with open('../data/%s.jsonl'%filename, 'r') as infile:
        lines = infile.readlines()
        for line in lines:
            entry = json.loads(line)
            if 'sections' in entry:
                headers.update([section for section in entry['sections'] if section not in general_sections])
            titles.append(entry['title'])
            pages.append(entry['page'])

    # show top most headers            
    for header, count in headers.most_common(10):
        print(header, count)
    
    return titles, pages, headers


artists, artists_pages, artists_section_counter = count_headers('artists')
# print('\n')
# paintings_section_counter = count_headers('paintings')

Biography 482
Life 220
Works 216
Early life 215
Legacy 180
Career 156
Selected works 127
Work 105
Personal life 85
Death 78


In [15]:
for artist in artists:
    for header, count in artists_section_counter.most_common(10):
        if count > 1:
            query = ' '.join([artist, header])
            print(query)
    break

Ángel Zárraga Biography
Ángel Zárraga Life
Ángel Zárraga Works
Ángel Zárraga Early life
Ángel Zárraga Legacy
Ángel Zárraga Career
Ángel Zárraga Selected works
Ángel Zárraga Work
Ángel Zárraga Personal life
Ángel Zárraga Death


In [19]:
# retrieve relevant pages using a web search engine
import requests
import urllib.parse
from lxml import html
from newspaper import Article


endpoint = "https://duckduckgo.com/html/"
skip_domains = ["wikipedia", "instagram", "facebook", "goodreads", "youtube", "wikiart", "wikimedia",
                "pinterest", "amazon", "linkedin", "jstor", "kinopoisk", "tumblr"]

artists = [artists[0]]


def get_pages(artist, header, pool):
    '''
    Search the web and crawl result pages
    '''
    query = ' '.join([artist, header])
    print(query)
    # Construct a request
    params = { 'q': query, 'mkt': 'en-US' }

    # get and parse URLs to web pages from the search results
    try:
        response = requests.get(endpoint, params=params, headers={'user-agent': 'my-app/0.0.1'})
        response.raise_for_status()
        doc = html.fromstring(response.text)
        urls = []
        for a in doc.cssselect('#links .links_main a'):
            url = a.get('href')
            if url not in urls:
                urls.append(url)
        print("found %d unique urls"%len(urls))
        
        # retrieve and parse web pages from the search results
        texts = []
        for j, url in enumerate(urls):
            url = url[len("/l/?kh=-1&uddg="):]
            url = urllib.parse.unquote(url)
            domain = url.split('//')[-1].split('/')[0].split('.')[1]
            
            if url not in pool and domain not in skip_domains:
                print(url)
                # parse page
                text = ""
                a = Article(url, language='en')
                try:
                    a.download()
                    a.parse()
                    title = a.title
                    text = a.text
                    texts.append(text)
                except:
                    pass
                pool[url] = text

    except Exception as ex:
        raise ex

    print("loaded %d texts from %d web results"%(len(texts), len(urls)))
    return pool


for i, artist in enumerate(artists):
    wiki_title = artists_pages[i]
    # start crawling pages by querying only by the page title
    pool = get_pages(artist, "", {})
    
    for header, count in artists_section_counter.most_common(10):
        if count > 1:
            pool = get_pages(artist, header, pool)
    print('\n')

    # store data into a json file
    with open('../data/%s.json'%wiki_title, 'w') as outfile:
        json.dump({wiki_title: pool}, outfile, indent=4)

Ángel Zárraga 
found 28 unique urls
http://www.artistsandart.org/2010/04/angel-zarraga-1886-1946-mexican-artist.html
https://www.tumblr.com/search/%C3%81ngel%20z%C3%A1rraga
https://www.biografiasyvidas.com/biografia/z/zarraga.htm
https://curiator.com/art/angel-zarraga
https://wiki2.org/en/%C3%81ngel_Z%C3%A1rraga
https://twitter.com/angel_zarraga
https://wikivisually.com/wiki/%C3%81ngel_Z%C3%A1rraga
https://www.wikidata.org/wiki/Q251967
https://www.widewalls.ch/artist/angel-zarraga/
https://www.artsy.net/artist/angel-zarraga
https://tvorchestvof.blogspot.com/2017/07/angel-zarraga.html
https://vimeo.com/64126468
https://artsandculture.google.com/entity/m06415y3
https://rkd.nl/en/explore/artists/86180
https://artclasscurator.com/tag/angel-zarraga/
https://www.invaluable.com/artist/zarraga-angel-cxbqpby90k/
https://www.mexicodesconocido.com.mx/angel-zarraga-pintor-duranguense-que-traspaso-las-fronteras.html
https://www.proceso.com.mx/376835/angel-zarraga-en-el-palacio-de-bellas-artes
https

ConnectionError: HTTPSConnectionPool(host='duckduckgo.com', port=443): Max retries exceeded with url: /html/?q=%C3%81ngel+Z%C3%A1rraga+Death&mkt=en-US (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x10726ec18>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',))

In [None]:
# TODO evaluate baseline

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
scores = scorer.score('The quick brown fox jumps over the lazy dog',
                      'The quick brown dog jumps on the log.')
print(scores)