In [2]:
!pip install SPARQLWrapper

Collecting SPARQLWrapper
  Downloading https://files.pythonhosted.org/packages/7f/c3/72d2d0a2ad86bd8d2bc762406c838f479c421678161cf78585d085436919/SPARQLWrapper-1.8.2-py3-none-any.whl
Collecting rdflib>=4.0 (from SPARQLWrapper)
[?25l  Downloading https://files.pythonhosted.org/packages/3c/fe/630bacb652680f6d481b9febbb3e2c3869194a1a5fc3401a4a41195a2f8f/rdflib-4.2.2-py3-none-any.whl (344kB)
[K    100% |████████████████████████████████| 348kB 6.2MB/s ta 0:00:01
Installing collected packages: rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-1.8.2 rdflib-4.2.2


In [19]:
from SPARQLWrapper import SPARQLWrapper, JSON
import re

# Query the public DBpedia endpoint for Queen's albums: English label, release
# date, and a `type` property that is filtered down to studio albums.
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
# The `dbp:` prefix is declared explicitly instead of relying on the endpoint's
# predefined namespaces, and the projection list is space-separated as the
# SPARQL grammar requires (a comma is only a server-specific extension).
sparql.setQuery("""
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX dbp: <http://dbpedia.org/property/>
    PREFIX res:  <http://dbpedia.org/resource/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT ?albumName ?releaseDate WHERE
    {
        ?album dbo:artist <http://dbpedia.org/resource/Queen_(band)> .
        ?album dbp:type ?type .
        ?album rdfs:label ?albumName .
        ?album dbo:releaseDate ?releaseDate .
        FILTER (lang(?albumName) = 'en') .
        FILTER (regex(?type,'(Studio album)|(studio)'))
    }
""")
sparql.setReturnFormat(JSON)
# `results` is the converted JSON response; later code reads
# results['results']['bindings'].
results = sparql.query().convert()

def cleanup(item):
    """Return ``item`` with every parenthesised qualifier removed.

    A qualifier is a non-empty ``(...)`` group together with any whitespace
    directly in front of it, e.g. ``'Queen (band)'`` -> ``'Queen'``.
    """
    qualifier = re.compile(r'\s*\([^)]+\)')
    return qualifier.sub('', item)

# Build (title, year) pairs from the query bindings. The regex expects a
# digits-digits-digits date and keeps only the first group (the year, judging
# by the displayed result). The replacement is a raw string so that `\g<1>`
# is not parsed as an (invalid) string escape.
ALBUMS = [
    (
        cleanup(result['albumName']['value']),
        int(re.sub(r'(\d+)-\d+-\d+', r'\g<1>', result['releaseDate']['value'])),
    )
    for result in results['results']['bindings']
]

# Not marked as studio albums but they are
ALBUMS.append(('A Kind of Magic', 1986))
ALBUMS.append(('Flash Gordon', 1980))

# Chronological order; the sort is stable, so same-year albums keep their
# relative order.
ALBUMS.sort(key=lambda item: item[1])

ALBUMS

[('Queen', 1973),
 ('Queen II', 1974),
 ('Sheer Heart Attack', 1974),
 ('A Night at the Opera', 1975),
 ('A Day at the Races', 1976),
 ('News of the World', 1977),
 ('Jazz', 1978),
 ('The Game', 1980),
 ('Flash Gordon', 1980),
 ('Hot Space', 1982),
 ('The Works', 1984),
 ('A Kind of Magic', 1986),
 ('The Miracle', 1989),
 ('Innuendo', 1991)]

In [21]:
# Ask DBpedia for the English labels of Queen's band members: the union of
# the dbo:bandMember and dbo:formerBandMember relations.
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setQuery("""
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX res:  <http://dbpedia.org/resource/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT * {{
        SELECT ?name WHERE 
        {
            <http://dbpedia.org/resource/Queen_(band)> dbo:bandMember ?member.
            ?member rdfs:label ?name .
            FILTER (lang(?name) = 'en') . 
        }
    } UNION {
        SELECT ?name WHERE
        {
            <http://dbpedia.org/resource/Queen_(band)> dbo:formerBandMember ?member.
            ?member rdfs:label ?name .
            FILTER (lang(?name) = 'en') . 
        }
    }} 
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# One cleaned-up label per binding, in the order the endpoint returned them.
MEMBERS = list(
    map(
        lambda binding: cleanup(binding["name"]["value"]),
        results["results"]["bindings"],
    )
)

MEMBERS

['Brian May', 'Roger Taylor', 'John Deacon', 'Freddie Mercury']

In [129]:
import wikipediaapi

# Wikipedia client; 'en' is presumably the language edition.
# NOTE(review): newer wikipedia-api releases reportedly require a user_agent
# argument in this constructor — confirm against the installed version.
wiki = wikipediaapi.Wikipedia('en')

# Page handle for the band's article; later cells read its `title` and `text`.
page = wiki.page('Queen_(band)')

In [130]:

# Display the title reported by the page object (the recorded output echoes
# the requested name, underscores included).
page.title

'Queen_(band)'

In [127]:
def dump_page(page):
    """Write a page's text to ``<title>.md`` in the current working directory.

    Underscores in ``page.title`` become spaces in the file name. Forward
    slashes are replaced with ``-`` so that a title such as ``AC/DC`` does not
    point into a (non-existent) sub-directory.

    Args:
        page: any object exposing ``title`` and ``text`` string attributes.
    """
    filename = page.title.replace('_', ' ').replace('/', '-') + '.md'
    # Explicit UTF-8: article text routinely contains non-ASCII characters
    # that a platform default encoding may be unable to represent.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(page.text)

In [163]:
import spacy

# Load the spaCy pipeline named 'en_core_web_lg'; `nlp` is used by the cells
# below and by show_deps.
nlp = spacy.load('en_core_web_lg')

In [164]:
# Run the pipeline over the whole article text; `doc.sents` and `doc.ents`
# are read by the following cells.
doc = nlp(page.text)

In [169]:
# Sentences whose text contains at least one album title as a substring.
# Note that the title 'Queen' is also the band's name.
{
    sentence
    for sentence in doc.sents
    if any(title in sentence.text for title, _year in ALBUMS)
}

{Queen are a British rock band formed in London in 1970.,
 Before forming Queen, May and Taylor had played together in the band Smile.,
 He joined in 1970 and suggested the name "Queen".,
 Queen first charted in the UK with their second album, Queen II, in 1974.,
 Sheer Heart Attack later that year and A Night at the Opera in 1975 brought them international success.,
 The band’s 1977 album News of the World contained "We Will Rock You" and "We Are the Champions", which have become anthems at sporting events.,
 By the early 1980s, Queen were one of the biggest stadium rock bands in the world. ",
 In August 1986, Mercury gave his last performance with Queen at Knebworth, England.,
 Since 2004, May and Taylor have toured under the "Queen +" name with vocalists Paul Rodgers and Adam Lambert.,
 Estimates of Queen's record sales range from 170 million to 300 million records, making them one of the world's best-selling music artists.,
 Queen received the Outstanding Contribution to British Mu

In [175]:
# Albums whose title occurs (as a substring) in at least one sentence of the
# article, then listed by release year.
by_album_deps = {
    (title, year)
    for sentence in doc.sents
    for title, year in ALBUMS
    if title in sentence.text
}
l = sorted(by_album_deps, key=lambda entry: entry[1])
l

[('Queen', 1973),
 ('Queen II', 1974),
 ('Sheer Heart Attack', 1974),
 ('A Night at the Opera', 1975),
 ('A Day at the Races', 1976),
 ('News of the World', 1977),
 ('Jazz', 1978),
 ('The Game', 1980),
 ('Flash Gordon', 1980),
 ('Hot Space', 1982),
 ('The Works', 1984),
 ('A Kind of Magic', 1986),
 ('The Miracle', 1989),
 ('Innuendo', 1991)]

In [142]:
def show_deps(text):
    """Print one line per token of ``text`` describing its dependency arc.

    Each line holds the token text, its dependency label, the text and
    part-of-speech tag of its head, and the list of its direct children.
    Uses the module-level ``nlp`` pipeline to parse ``text``.
    """
    for tok in nlp(text):
        head = tok.head
        print(tok.text, tok.dep_, head.text, head.pos_, list(tok.children))

In [144]:
# Inspect the parse of one sentence that mentions an album; in the recorded
# output 'II' is an appositive of 'album', with 'Queen' as its compound.
show_deps('Queen first charted in the UK with their second album, Queen II, in 1974.')

Queen nsubj charted VERB []
first advmod charted VERB []
charted ROOT charted VERB [Queen, first, in, with, in, .]
in prep charted VERB [UK]
the det UK PROPN []
UK pobj in ADP [the]
with prep charted VERB [album]
their poss album NOUN []
second amod album NOUN []
album pobj with ADP [their, second, ,, II, ,]
, punct album NOUN []
Queen compound II PROPN []
II appos album NOUN [Queen]
, punct album NOUN []
in prep charted VERB [1974]
1974 pobj in ADP []
. punct charted VERB []


In [176]:
# Album titles that appear verbatim as the text of a recognised entity.
{album[0] for album in ALBUMS} & {entity.text for entity in doc.ents}

{'A Kind of Magic',
 'A Night at the Opera',
 'Hot Space',
 'Jazz',
 'News of the World',
 'Queen',
 'Queen II',
 'Sheer Heart Attack',
 'The Miracle',
 'The Works'}

In [96]:
# Scratch check: the 'Studio albums' pattern matches a '=== Studio albums ==='
# heading line (the recorded match spans characters 0-17).
print(re.match(r'.*Studio albums',"=== Studio albums ===\n"))

<_sre.SRE_Match object; span=(0, 17), match='=== Studio albums'>


In [123]:

def parse_studio_albums(page, verbose=False):
    """Collect the lines listed under "Studio albums" in a page's Discography.

    The text is expected to carry headings delimited by ``=`` characters
    (``== Discography ==``, ``=== Studio albums ===``). Scanning waits for
    the Discography heading, starts collecting after the first following line
    that contains "Studio albums", and stops at the next ``=``-delimited
    heading.

    Args:
        page: object exposing the page content as a ``text`` string.
        verbose: when true, print the trace markers ("disco", "album") and
            every collected line. Off by default so the cell output shows
            only the returned list.

    Returns:
        list[str]: collected lines in document order; empty when either
        heading is missing or nothing follows the "Studio albums" heading.
    """
    albums = []
    in_discography = False
    in_albums = False
    for line in page.text.split("\n"):
        # Blank and whitespace-only lines never hold an album entry.
        if not line.strip():
            continue
        if not in_discography and re.match(r'\=+\s*Discography\s*\=+', line):
            if verbose:
                print("disco")
            in_discography = True
            continue
        # Any heading after collection has started ends the section.
        if in_albums and re.match(r'\=+\s*.+\s*\=+', line):
            break
        if in_discography and not in_albums and re.match(r'.*Studio albums', line):
            if verbose:
                print('album')
            in_albums = True
            continue
        if in_albums:
            if verbose:
                print(line)
            albums.append(line)

    return albums
    
# Try the parser on the Queen page; the recorded result is an empty list.
# NOTE(review): presumably this page's text has no `=`-delimited headings —
# confirm by inspecting page.text.
parse_studio_albums(page)

[]

In [56]:
# NOTE(review): `wikipedia` is not bound in any visible cell — presumably it
# was created in a deleted or unsaved cell; define it explicitly before
# relying on Restart & Run All. Also confirm that `sections(...)` is callable
# on the page object of the library actually in use.
wikipedia.page('The_Prodigy').sections('Studio albums')

'Studio albums\n\nExperience (1992)\nMusic for the Jilted Generation (1994)\nThe Fat of the Land (1997)\nAlways Outnumbered, Never Outgunned (2004)\nInvaders Must Die (2009)\nThe Day Is My Enemy (2015)\nNo Tourists (2018)'

In [100]:
# Parser run on The Prodigy's page; the recorded result lists seven albums.
# NOTE(review): depends on `wikipedia`, which no visible cell defines.
parse_studio_albums(wikipedia.page('The_Prodigy'))

['Experience (1992)',
 'Music for the Jilted Generation (1994)',
 'The Fat of the Land (1997)',
 'Always Outnumbered, Never Outgunned (2004)',
 'Invaders Must Die (2009)',
 'The Day Is My Enemy (2015)',
 'No Tourists (2018)']

In [126]:
# Save the Deep Purple article text to disk for manual inspection.
dump_page(wiki.page('Deep_Purple'))

In [109]:
# Parser run on Deep Purple's page; in the recorded run it returned an empty
# list.
# NOTE(review): depends on `wikipedia`, which no visible cell defines.
parse_studio_albums(wikipedia.page('Deep_Purple'))

disco
album


[]

In [128]:
# NOTE(review): the recorded output is an AttributeError about a missing
# `wikitext` attribute, which the visible dump_page never accesses —
# presumably left over from an earlier revision; re-run to confirm. The call
# also depends on `wikipedia`, which no visible cell defines.
dump_page(wikipedia.page('The_Rolling_Stones'))

AttributeError: 'WikipediaPage' object has no attribute 'wikitext'