In [2]:
!pip install SPARQLWrapper

Collecting SPARQLWrapper
  Downloading https://files.pythonhosted.org/packages/7f/c3/72d2d0a2ad86bd8d2bc762406c838f479c421678161cf78585d085436919/SPARQLWrapper-1.8.2-py3-none-any.whl
Collecting rdflib>=4.0 (from SPARQLWrapper)
[?25l  Downloading https://files.pythonhosted.org/packages/3c/fe/630bacb652680f6d481b9febbb3e2c3869194a1a5fc3401a4a41195a2f8f/rdflib-4.2.2-py3-none-any.whl (344kB)
[K    100% |████████████████████████████████| 348kB 6.2MB/s ta 0:00:01
Installing collected packages: rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-1.8.2 rdflib-4.2.2


In [12]:
!pip install Wikipedia-API

Collecting Wikipedia-API
  Downloading https://files.pythonhosted.org/packages/3e/75/972785bf5a21b15c2f59ca46a30559c2558793cb850349dc085934860600/Wikipedia-API-0.5.1.tar.gz
Building wheels for collected packages: Wikipedia-API
  Building wheel for Wikipedia-API (setup.py) ... [?25ldone
[?25h  Stored in directory: /Users/serg/Library/Caches/pip/wheels/fb/78/59/e0533f31f4e51d495f8d21301d54d2c47d46b63cb74e11b89c
Successfully built Wikipedia-API
Installing collected packages: Wikipedia-API
Successfully installed Wikipedia-API-0.5.1


In [129]:
from SPARQLWrapper import SPARQLWrapper, JSON
import re

sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setQuery("""
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX res:  <http://dbpedia.org/resource/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT ?albumName, ?releaseDate WHERE
    {
        ?album dbo:artist <http://dbpedia.org/resource/Queen_(band)> .
        ?album dbp:type ?type .
        ?album rdfs:label ?albumName .
        ?album dbo:releaseDate ?releaseDate
        FILTER (lang(?albumName) = 'en') . 
        FILTER (regex(?type,'(Studio album)|(studio)'))
    } 
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

def cleanup(item):
    return re.sub(r'\s*\([^)]+\)', '', item)

ALBUMS = [(cleanup(result['albumName']['value']), \
           int(re.sub(r'(\d+)-\d+-\d+', '\g<1>',result['releaseDate']['value']))) \
          for result in results['results']['bindings']]

# Not marked as studio albums but they are
ALBUMS.append(('A Kind of Magic', 1986))
ALBUMS.append(('Flash Gordon', 1980))

ALBUMS.sort(key = lambda item: item[1])

ALBUMS

[('Queen', 1973),
 ('Queen II', 1974),
 ('Sheer Heart Attack', 1974),
 ('A Night at the Opera', 1975),
 ('A Day at the Races', 1976),
 ('News of the World', 1977),
 ('Jazz', 1978),
 ('The Game', 1980),
 ('Flash Gordon', 1980),
 ('Hot Space', 1982),
 ('The Works', 1984),
 ('A Kind of Magic', 1986),
 ('The Miracle', 1989),
 ('Innuendo', 1991)]

In [2]:
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setQuery("""
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX res:  <http://dbpedia.org/resource/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT * {{
        SELECT ?name WHERE 
        {
            <http://dbpedia.org/resource/Queen_(band)> dbo:bandMember ?member.
            ?member rdfs:label ?name .
            FILTER (lang(?name) = 'en') . 
        }
    } UNION {
        SELECT ?name WHERE
        {
            <http://dbpedia.org/resource/Queen_(band)> dbo:formerBandMember ?member.
            ?member rdfs:label ?name .
            FILTER (lang(?name) = 'en') . 
        }
    }} 
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

MEMBERS = [cleanup(result["name"]["value"]) for result in results["results"]["bindings"]]

MEMBERS

['Brian May', 'Roger Taylor', 'John Deacon', 'Freddie Mercury']

In [93]:
import wikipediaapi

wiki = wikipediaapi.Wikipedia('en')

page = wiki.page('Queen_(band)')

In [4]:
page.title

'Queen_(band)'

In [5]:
def dump_page(page):
    with open(re.sub(r'\_', ' ', page.title) + '.md','w') as f:
        f.write(page.text)

In [6]:
import spacy

nlp = spacy.load('en_core_web_lg')

In [19]:
doc = nlp(page.text[:68202])

In [20]:
{sent for sent in doc.sents for album in ALBUMS if album[0] in sent.text}

{Queen are a British rock band formed in London in 1970.,
 Before forming Queen, May and Taylor had played together in the band Smile.,
 He joined in 1970 and suggested the name "Queen".,
 Queen first charted in the UK with their second album, Queen II, in 1974.,
 Sheer Heart Attack later that year and A Night at the Opera in 1975 brought them international success.,
 The band’s 1977 album News of the World contained "We Will Rock You" and "We Are the Champions", which have become anthems at sporting events.,
 By the early 1980s, Queen were one of the biggest stadium rock bands in the world. ",
 In August 1986, Mercury gave his last performance with Queen at Knebworth, England.,
 Since 2004, May and Taylor have toured under the "Queen +" name with vocalists Paul Rodgers and Adam Lambert.,
 Estimates of Queen's record sales range from 170 million to 300 million records, making them one of the world's best-selling music artists.,
 Queen received the Outstanding Contribution to British Mu

In [25]:
sorted({album for sent in doc.sents for album in ALBUMS if album[0] in sent.text}, key = lambda a: a[1])

[('Queen', 1973),
 ('Sheer Heart Attack', 1974),
 ('Queen II', 1974),
 ('A Night at the Opera', 1975),
 ('A Day at the Races', 1976),
 ('News of the World', 1977),
 ('Jazz', 1978),
 ('Flash Gordon', 1980),
 ('The Game', 1980),
 ('Hot Space', 1982),
 ('The Works', 1984),
 ('A Kind of Magic', 1986),
 ('The Miracle', 1989),
 ('Innuendo', 1991)]

In [30]:
{ (ent.text) for ent in doc.ents for album in ALBUMS if ent.text == album[0]}

{'A Kind of Magic',
 'A Night at the Opera',
 'Hot Space',
 'Jazz',
 'News of the World',
 'Queen',
 'Queen II',
 'Sheer Heart Attack',
 'The Miracle',
 'The Works'}

In [32]:
{ (ent.text) for ent in doc.ents for album in ALBUMS if ent.text == album[0] and ent.label_ == 'WORK_OF_ART'}

{'A Kind of Magic',
 'A Night at the Opera',
 'News of the World',
 'Queen',
 'Queen II',
 'The Miracle',
 'The Works'}

In [112]:
def show_deps(text):
    doc = nlp(text)
    for token in doc:
        print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])
        
    for ent in doc.ents:
        print (ent.text, ent.label_)

In [18]:
show_deps('Queen first charted in the UK with their second album, Queen II, in 1974.')

Queen nsubj charted VERB []
first advmod charted VERB []
charted ROOT charted VERB [Queen, first, in, with, in, .]
in prep charted VERB [UK]
the det UK PROPN []
UK pobj in ADP [the]
with prep charted VERB [album]
their poss album NOUN []
second amod album NOUN []
album pobj with ADP [their, second, ,, II, ,]
, punct album NOUN []
Queen compound II PROPN []
II appos album NOUN [Queen]
, punct album NOUN []
in prep charted VERB [1974]
1974 pobj in ADP []
. punct charted VERB []
first ORDINAL
UK GPE
second ORDINAL
Queen II WORK_OF_ART
1974 DATE


In [19]:
show_deps('The band’s 1977 album News of the World contained "We Will Rock You" and "We Are the Champions", which have become anthems at sporting events.')

The det band NOUN []
band poss News PROPN [The, ’s]
’s case band NOUN []
1977 nummod News PROPN []
album compound News PROPN []
News nsubj contained VERB [band, 1977, album, of]
of prep News PROPN [World]
the det World PROPN []
World pobj of ADP [the]
contained ROOT contained VERB [News, ", Rock, .]
" punct contained VERB []
We nsubj Rock PROPN []
Will aux Rock PROPN []
Rock ccomp contained VERB [We, Will, You, ", and, Are, become]
You dobj Rock PROPN []
" punct Rock PROPN []
and cc Rock PROPN []
" punct Are VERB []
We nsubj Are VERB []
Are conj Rock PROPN [", We, Champions, ", ,]
the det Champions PROPN []
Champions attr Are VERB [the]
" punct Are VERB []
, punct Are VERB []
which nsubj become VERB []
have aux become VERB []
become relcl Rock PROPN [which, have, anthems]
anthems attr become VERB [at]
at prep anthems NOUN [sporting]
sporting pcomp at ADP [events]
events dobj sporting NOUN []
. punct contained VERB []
1977 DATE
News of the World WORK_OF_ART
Will Rock PERSON
We Are the C

In [134]:
def is_reachable(token, word):
    if token.head.lemma_ == word:
        return True
    elif token.dep_ == 'ROOT':
        return False
    
    return is_reachable(token.head, word)

def depends_on_album(token):
    return (is_reachable(token, 'album') or is_reachable(token, 'release')) and not is_reachable(token, 'song') and not is_reachable(token, 'single')

def depends_on_ent(token, ent):
    for e_token in ent:
        if is_reachable(token, e_token.text):
            return True
    return False    

def is_album(ent, sent):
    if ent.label_ == "WORK_OF_ART" or ent.label_ == "PRODUCT":
        for token in ent:
            if depends_on_album(token):
                return True
        for token in sent:
            if token.lemma_ == 'album':
                return depends_on_ent(token, ent)
            if token.lemma_ == 'single' and depends_on_ent(token, ent):
                return False
            if token.lemma_ == 'song' and depends_on_ent(token, ent):
                return False
            
    return False    

def extract_albums(doc):
    albums = set([])
    prev_sent = None
    for sent in doc.sents:        
        for ent in sent.ents:
            if is_album(ent, sent):
                print(sent)
                albums.add(ent.text)                            
    return albums

In [135]:
#extract_albums(nlp(doc.text.replace("=",'')))
extract_albums(doc)

Queen first charted in the UK with their second album, Queen II, in 1974.
Another One Bites the Dust" (1980) became their best-selling single, while their 1981 compilation album Greatest Hits is the best-selling album in the UK and is certified eight times platinum in the US.
Retrospectively, it is cited as the highlight of the album, and in 2008 Rolling Stone ranked it 31st in the "100 Greatest Guitar Songs of All Time", describing it as "an entire album's worth of riffs crammed into a single song".
The group's second LP, Queen II, was released in 1974, and features rock photographer Mick Rock's iconic image of the band on the cover.
The album reached number five on the British album chart and became the first Queen album to chart in the UK.
In late 1975, Queen recorded and released A Night at the Opera, taking its name from the popular Marx Brothers movie.
The 50 Best British Albums Ever" in 2004, and number 11 in Rolling Stone's "The 100 Greatest Albums of All Time" as featured in t

{'A Night at the',
 'A Night at the Opera',
 'Another One Bites the Dust',
 'Bohemian Rhapsody',
 'Classic Queen',
 'Crazy Little Thing Called Love',
 'Greatest Hits',
 'Greatest Hits III',
 'Guitar Songs of All Time',
 'Hammer to Fall',
 'I Want to Break Free',
 'Live Killers',
 'Live at Wembley Stadium',
 'Opera',
 'Queen',
 'Queen Forever',
 'Queen II',
 'Queen on Fire – Live at the Bowl',
 'Queen:',
 'SingStar',
 'The 100 Greatest Albums of All Time',
 'The 50 Best British Albums Ever',
 'The Cosmos Rocks',
 'The Graduation Album',
 'The Show Must Go',
 'The Works',
 'as Queen: Live in Rio'}

In [133]:
show_deps("In the United States, \"Bohemian Rhapsody\" was re-released as a single in 1992 after appearing in the comedy film Wayne's World.")

In prep was VERB [States]
the det States PROPN []
United compound States PROPN []
States pobj In ADP [the, United]
, punct was VERB []
" punct was VERB []
Bohemian compound Rhapsody PROPN []
Rhapsody nsubj was VERB [Bohemian]
" punct was VERB []
was auxpass released VERB [In, ,, ", Rhapsody, "]
re punct - PUNCT []
- punct released VERB [re]
released punct as ADP [was, -]
as punct . PUNCT [released, single]
a det single ADJ []
single pobj as ADP [a, in, after]
in prep single ADJ [1992]
1992 pobj in ADP []
after prep single ADJ [appearing]
appearing pcomp after ADP [in]
in prep appearing VERB [film]
the det film NOUN []
comedy compound film NOUN []
film pobj in ADP [the, comedy, World]
Wayne poss World PROPN ['s]
's case Wayne PROPN []
World appos film NOUN [Wayne]
. ROOT . PUNCT [as]
the United States GPE
Bohemian Rhapsody WORK_OF_ART
1992 DATE
Wayne PERSON


In [136]:
extract_albums(nlp(wiki.page('The_Prodigy').text))

In 1993, Howlett released an anonymous white label, bearing only the title "Earthbound I".
It was officially released as "One Love" later that year, and went on to chart at number 8 in the UK.
The following year, the Prodigy's second album, Music for the Jilted Generation, debuted in the UK Albums Chart at number one, and jettisoned into positive reactions from album critics.
The release of "Firestarter" in 1996, featuring vocals for the first time courtesy of a new-look Keith Flint, helped the band break into the United States and other overseas markets, and reached number one on the UK Singles Chart.
The long-awaited third Prodigy album, The Fat of the Land, was released in 1997, just as the band headlined the Glastonbury Festival on its opening night.
A precursory and experimental single, "Memphis Bells", was released in very limited numbers, followed by the traditional release of the single "Girls".
The Prodigy's first two albums, 1992's Experience and 1994's Music for the Jilted G

{'Earthbound I',
 'Firestarter',
 'Invaders Must Die',
 'Invaders Must Die (',
 'Memphis Bells',
 'Music for the Jilted Generation',
 'Nasty',
 'One Love',
 'Prodigy',
 'The Fat of the Land',
 'Wild Frontier',
 'the Special Edition of the'}

In [137]:
extract_albums(nlp(wiki.page('Deep_Purple').text))

The band's second album, The Book of Taliesyn, was quickly recorded, then released in North America in October 1968 to coincide with the tour.
The Book of Taliesyn would not be released in the band's home country until the following year and, like its predecessor, it failed to have much impact in the UK charts.

The non-album single "Black Night", released around the same time, finally put Deep Purple into the UK Top Ten.
Fireball" was released as a single, as was "Strange Kind of Woman", not from the album but recorded during the same sessions (although it replaced "Demon's Eye" on the US version of the album). "
Fireball" was released as a single, as was "Strange Kind of Woman", not from the album but recorded during the same sessions (although it replaced "Demon's Eye" on the US version of the album). "
The resulting album, Come Taste the Band, was released in October 1975, one month before Bolin's Teaser album.
Later in the year, Bolin had just finished recording his second solo al

{'"Time for Bedlam"',
 'Black Night',
 'Bolin',
 'Deep',
 'Discography\nStudio',
 'Fireball',
 'Hush',
 'Perfect Strangers',
 'Private Eyes',
 'Purpendicular',
 'Strange Kind of Woman',
 'The Book of Taliesyn',
 'The Soundboard Series'}

In [102]:
extract_albums(nlp(wiki.page('The_Rolling_Stones').text))

{'"Brown Sugar"',
 'Air',
 'Beggars Banquet',
 'Dead Flowers',
 'Emotional Rescue',
 'Goats Head Soup',
 'Goddess in the Doorway',
 'Love in Vain',
 'Midnight Rambler',
 'Plundered My Soul',
 "You Can't Always Get What You Want",
 'You Live',
 'the Glimmer Twins'}

In [100]:
extract_albums(nlp(wiki.page('The_Chemical_Brothers').text))

{'Chemical 6',
 'Galvanize',
 'It Began in Afrika',
 'Out of Control',
 'Surface to Air',
 'Xtrmntr'}

In [103]:
extract_albums(nlp(wiki.page('Led_Zeppelin').text))

{'"Whole Lotta Love"', 'No Quarter', 'Stairway to Heaven', 'The Song Remains'}

In [113]:
extract_albums(nlp("The resulting album, In Through the Out Door, featured sonic experimentation that again drew mixed reactions from critics."))

show_deps("The resulting album, In Through the Out Door, featured sonic experimentation that again drew mixed reactions from critics.")

The det album NOUN []
resulting amod album NOUN []
album nsubj featured ADJ [The, resulting]
, punct featured ADJ []
In prep featured ADJ [Through]
Through prep In ADP [Door]
the det Door PROPN []
Out compound Door PROPN []
Door pobj Through ADP [the, Out]
, punct featured ADJ []
featured ROOT featured ADJ [album, ,, In, ,, experimentation, .]
sonic amod experimentation NOUN []
experimentation dobj featured ADJ [sonic, drew]
that nsubj drew VERB []
again advmod drew VERB []
drew relcl experimentation NOUN [that, again, reactions]
mixed amod reactions NOUN []
reactions dobj drew VERB [mixed, from]
from prep reactions NOUN [critics]
critics pobj from ADP []
. punct featured ADJ []
