In [59]:
# Specific packages used in this notebook
#!pip install SPARQLWrapper
#!pip install Wikipedia-API

### Еталонна база даних

Будемо знаходити альбоми відомої рок гурту Queen в тексті відповідної строрінки в Wikipedia. Для цього формуємо базу даних, яку будемо використовувати оцінки якості.

In [131]:
from SPARQLWrapper import SPARQLWrapper, JSON
import re

sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setQuery("""
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX res:  <http://dbpedia.org/resource/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT ?albumName, ?releaseDate WHERE
    {
        ?album dbo:artist <http://dbpedia.org/resource/Queen_(band)> .
        ?album dbp:type ?type .
        ?album rdfs:label ?albumName .
        ?album dbo:releaseDate ?releaseDate
        FILTER (lang(?albumName) = 'en') . 
        #FILTER (regex(?type,'[Aa]lbum'))
    } 
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

def cleanup(item):
    return re.sub(r'\s*\([^)]+\)', '', item)

ALBUMS = sorted({(cleanup(result['albumName']['value']), \
           int(re.sub(r'(\d+)-\d+-\d+', '\g<1>',result['releaseDate']['value']))) \
          for result in results['results']['bindings']}.\
                union({('Live Killers', 1979),\ # albums that are not contained in dbpedia
                       ('Live at Wembley Stadium', 1982),\
                       ('The Cosmos Rocks', 2008)}), key = lambda item: item[1])

ALBUMS

[('Queen', 1973),
 ('Queen II', 1974),
 ('Sheer Heart Attack', 1974),
 ('A Night at the Opera', 1975),
 ('A Day at the Races', 1976),
 ('News of the World', 1977),
 ('Jazz', 1978),
 ('Live Killers', 1979),
 ('Flash Gordon', 1980),
 ('The Game', 1980),
 ('Flash Gordon', 1981),
 ('Greatest Hits', 1981),
 ('Live at Wembley Stadium', 1982),
 ('Hot Space', 1982),
 ('The Works', 1984),
 ('A Kind of Magic', 1986),
 ('Live Magic', 1986),
 ('At the Beeb', 1989),
 ('The Miracle', 1989),
 ('Innuendo', 1991),
 ('Greatest Hits II', 1991),
 ('Classic Queen', 1992),
 ('Five Live', 1993),
 ('Ultimate Queen', 1995),
 ('Queen Rocks', 1997),
 ('The Crown Jewels', 1998),
 ('Greatest Hits III', 1999),
 ('The Platinum Collection', 2000),
 ('Jewels', 2004),
 ('Queen on Fire – Live at the Bowl', 2004),
 ('Stone Cold Classics', 2006),
 ('The A–Z of Queen, Volume 1', 2007),
 ('The Singles Collection Volume 1', 2008),
 ('The Cosmos Rocks', 2008),
 ('Absolute Greatest', 2009),
 ('Greatest Hits II', 2011),
 ('Flas

### Данні

Завантажуємо сторінку і беремо чистий текст без списку дискографії

In [62]:
import wikipediaapi

wiki = wikipediaapi.Wikipedia('en')

page = wiki.page('Queen_(band)')

In [64]:
import spacy

nlp = spacy.load('en_core_web_lg')

In [117]:
#exclude discography list
def page_text(page):
    index = page.text.find('Discography') - 1
    return page.text[:index]

Алгоритм по екстракту необхідних данних базуємо на іменованих сутностях типу WORK_OF_ART та PRODUCT. Працюємо в контексті одного реченя, для розрізнення назв альбомів використовуемо залежності.

Оскільки текст статті не обов'язково включає іноформацію по всім альбомам (для цього є окрема таблиця, а ми працюємо тільки з текстом) то для оцінки якості нам треба зкрозуміти які альбомі є в тексті.

In [118]:
doc = nlp(page_text(page))

In [66]:
sorted({album for sent in doc.sents for album in ALBUMS if album[0] in sent.text}, key = lambda a: a[1])

[('Queen', 1973),
 ('Queen II', 1974),
 ('Sheer Heart Attack', 1974),
 ('A Night at the Opera', 1975),
 ('A Day at the Races', 1976),
 ('News of the World', 1977),
 ('Jazz', 1978),
 ('Live Killers', 1979),
 ('Flash Gordon', 1980),
 ('The Game', 1980),
 ('Flash Gordon', 1981),
 ('Greatest Hits', 1981),
 ('Live at Wembley Stadium', 1982),
 ('Hot Space', 1982),
 ('The Works', 1984),
 ('A Kind of Magic', 1986),
 ('The Miracle', 1989),
 ('Innuendo', 1991),
 ('Greatest Hits II', 1991),
 ('Classic Queen', 1992),
 ('Queen Rocks', 1997),
 ('Greatest Hits III', 1999),
 ('Queen on Fire – Live at the Bowl', 2004),
 ('The Cosmos Rocks', 2008),
 ('Absolute Greatest', 2009),
 ('Greatest Hits II', 2011),
 ('Flash Gordon', 2011),
 ('Icon', 2013),
 ('Queen Forever', 2014)]

In [67]:
{ album[0] for ent in doc.ents for album in ALBUMS if album[0].lower() in ent.text.lower()}

{'A Kind of Magic',
 'A Night at the Opera',
 'Absolute Greatest',
 'Classic Queen',
 'Greatest Hits',
 'Greatest Hits II',
 'Greatest Hits III',
 'Hot Space',
 'Icon',
 'Jazz',
 'Live Killers',
 'Live at Wembley Stadium',
 'News of the World',
 'Queen',
 'Queen Forever',
 'Queen II',
 'Queen on Fire – Live at the Bowl',
 'Sheer Heart Attack',
 'The Cosmos Rocks',
 'The Game',
 'The Miracle',
 'The Works'}

In [68]:
ent_albums = { album[0] for ent in doc.ents for album in ALBUMS if album[0].lower() in ent.text.lower() and \
 ent.label_ == 'WORK_OF_ART' or ent.label_ == 'PRODUCT'}

ent_albums

{'A Day at the Races',
 'A Kind of Magic',
 'A Night at the Opera',
 'Absolute Greatest',
 'At the Beeb',
 'Classic Queen',
 'Deep Cuts, Volume 3',
 'Five Live',
 'Flash Gordon',
 'Greatest Hits',
 'Greatest Hits II',
 'Greatest Hits III',
 'Hot Space',
 'Hungarian Rhapsody: Queen Live in Budapest',
 'Icon',
 'Innuendo',
 'Jazz',
 'Jewels',
 'Live Killers',
 'Live Magic',
 'Live at Wembley Stadium',
 'News of the World',
 'On Air',
 'Queen',
 'Queen Forever',
 'Queen II',
 'Queen Rocks',
 'Queen on Fire – Live at the Bowl',
 'Sheer Heart Attack',
 'Stone Cold Classics',
 'The A–Z of Queen, Volume 1',
 'The Cosmos Rocks',
 'The Crown Jewels',
 'The Game',
 'The Miracle',
 'The Platinum Collection',
 'The Singles Collection Volume 1',
 'The Works',
 'Ultimate Queen'}

In [69]:
from spacy import displacy

def show_deps(text):
    doc = nlp(text)
    for token in doc:
        print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])
        
    print('================================')    
        
    for ent in doc.ents:
        print (ent.text, ent.label_)
    
    displacy.render(doc, style="dep", jupyter = True, options={'distance':100})    

In [119]:
def reach(token, word):
    if token.head.lemma_ == word:
        return token.head
    elif token.dep_ == 'ROOT':
        return None
    
    return reach(token.head, word)        

def is_reachable(token, word):    
    return not (reach(token, word) is None)

def depends_on_album(token):
    if is_reachable(token, 'album'):
        return True
    release = reach(token, 'release')
    if release:
        return not(is_reachable(release, 'single') or is_reachable(release, 'song') or is_reachable(release, 'game') or is_reachable(release, 'soundtrack') or is_reachable(release, 'track'))
    record = reach(token, 'record')
    if release:
        return not(is_reachable(record, 'single') or is_reachable(record, 'song') or is_reachable(record, 'game') or is_reachable(record, 'soundtrack') or is_reachable(record, 'track'))
    
    return False
        
def depends_on_ent(token, ent):
    for e_token in ent:
        if is_reachable(token, e_token.lemma_) or is_reachable(e_token ,token.lemma_):
            return True
    return False    

def is_album(ent, sent):
    proof = False
    if ent.label_ == "WORK_OF_ART" or ent.label_ == "PRODUCT":
        for token in ent:
            if depends_on_album(token):
                proof = True
                break
        for token in sent:           
            if token.lemma_ == 'single' and (depends_on_ent(token, ent) or is_reachable(token, 'release') or is_reachable(token, 'record')):
                proof = False
                break
            if token.lemma_ == 'song' and (depends_on_ent(token, ent) or is_reachable(token, 'release') or is_reachable(token, 'record')):
                proof = False
                break
            if token.lemma_ == 'soundtrack' and (depends_on_ent(token, ent) or is_reachable(token, 'release') or is_reachable(token, 'record')):
                proof = False
                break
            if token.lemma_ == 'track' and (depends_on_ent(token, ent) or is_reachable(token, 'release') or is_reachable(token, 'record')):
                proof = False
                break    
                
            if token.lemma_ == 'game' and (depends_on_ent(token, ent) or is_reachable(token, 'release')):
                proof = False
                break
                
            if token.lemma_ == 'album' and depends_on_ent(token, ent):
                proof = True
                break
                            
    return proof

def extract_albums(text):
    doc = nlp(text)
    albums = set([])
    for sent in doc.sents:        
        for ent in sent.ents:
            if 'albums' in ent.text.lower() or 'award' in ent.text.lower() or 'of all time' in ent.text.lower(): #exclude charts aka Greatest Albums of All Time Ever
                continue
            if is_album(ent, sent):
                albums.add(ent.text.strip())                            
    return albums

In [120]:
### True positives

def singleton(text):
    return set([text])

assert extract_albums('Queen first charted in the UK with their second album, Queen II, in 1974.') ==\
     singleton('Queen II')

assert extract_albums('Another One Bites the Dust" (1980) became their best-selling single, while their 1981 compilation album Greatest Hits is the best-selling album in the UK and is certified eight times platinum in the US.')\
    == singleton('Greatest Hits')

assert extract_albums('In late 1975, Queen recorded and released A Night at the Opera, taking its name from the popular Marx Brothers movie.')\
    == singleton('A Night at the Opera')

#assert extract_albums('By 1976, Queen were back in the studio recording A Day at the Races, which is often regarded as a sequel album to A Night at the Opera.')\
#    == singleton('A Night at the') ##  'A Day at the Races' is not recornized as NER

#Released in 1974, Sheer Heart Attack reached number two in the UK, sold well throughout Europe, and went gold in the US.
# 'Sheer Heart Attack' is not recornized as NER

#false negative
assert extract_albums('They released their first live album, Live Killers, in 1979; it went platinum twice in the US.')\
    == singleton('Live Killers')

assert extract_albums("In October that year, Queen released their first compilation album, titled Greatest Hits, which showcased the group's highlights from 1974 to 1981.")\
    == singleton('Greatest Hits')

#assert extract_albums('In February 1984, Queen released their eleventh studio album, The Works, which included the successful singles "Radio Ga Ga", "Hammer to Fall" and "I Want to Break Free".')\
#    == singleton('The Works') cannot distinguish between singles and album in one sentence

assert extract_albums('The first Queen + Paul Rodgers album, titled The Cosmos Rocks, was released in Europe on 12 September 2008 and in the United States on 28 October 2008.')\
    == singleton('The Cosmos Rocks')

assert extract_albums("A distinctive characteristic of Queen's music are the vocal harmonies which are usually composed of the voices of May, Mercury, and Taylor best heard on the studio albums A Night at the Opera and A Day at the Races.")\
    == singleton('A Night at the')

assert extract_albums('In May 2012, the choir performed "We Are the Champions" in the episode "Nationals", and the song features in The Graduation Album.')\
    == singleton('The Graduation Album')

assert extract_albums("DVD releases of their 1986 Wembley concert (titled Live at Wembley Stadium), 1982 Milton Keynes concert (Queen on Fire – Live at the Bowl), and two Greatest Video Hits (Volumes 1 and 2, spanning the 1970s and 1980s) have seen the band's music remixed into 5.1 and DTS surround sound")\
    == {'Live at Wembley Stadium', 'Queen on Fire – Live at the Bowl'}

assert extract_albums("The band’s 1977 album News of the World contained \"We Will Rock You\" and \"We Are the Champions\", which have become anthems at sporting events.")\
    == singleton('News of the World')

assert extract_albums("In October that year, Queen released their first compilation album, titled Greatest Hits, which showcased the group's highlights from 1974 to 1981.")\
    == singleton('Greatest Hits')

In [121]:
#True negatives
assert not extract_albums("Retrospectively, it is cited as the highlight of the album, and in 2008 Rolling Stone ranked it 31st in the \"100 Greatest Guitar Songs of All Time\", describing it as \"an entire album's worth of riffs crammed into a single song\"")

assert not extract_albums("\"The 50 Best British Albums Ever\" in 2004, and number 11 in Rolling Stone's \"The 100 Greatest Albums of All Time\" as featured in their Mexican edition in 2004.")

assert not extract_albums('Queen also released the very successful single "Crazy Little Thing Called Love", a rockabilly inspired song done in the style of Elvis Presley.')

assert not extract_albums('After attending a Queen concert in Los Angeles, Michael Jackson suggested to Mercury backstage that "Another One Bites the Dust" be released as a single, and in October 1980 it spent three weeks at number one.')

assert not extract_albums('In conjunction with Electronic Arts, Queen released the computer game Queen: The eYe in 1998')

assert not extract_albums("Several of the guest singers recorded new versions of Queen's hits under the Queen + name, such as Robbie Williams providing vocals for \"We Are the Champions\" for the soundtrack of A Knight's Tale (2001)")

# fasle positive
#extract_albums("In the United States, \"Bohemian Rhapsody\" was re-released as a single in 1992 after appearing in the comedy film Wayne's World.")

In [125]:
extracted_albums = extract_albums(doc.text)

extracted_albums

{'A Kind of Magic',
 'A Night at the',
 'A Night at the Opera',
 'Bohemian Rhapsody',
 'Classic Queen',
 'Greatest Hits',
 'Greatest Hits III',
 'Hammer to Fall',
 'Live Killers',
 'Live at Wembley Stadium',
 'News of the World',
 'Opera',
 'Queen',
 'Queen Forever',
 'Queen II',
 'Queen on Fire – Live at the Bowl',
 'SingStar',
 'The Cosmos Rocks',
 'The Graduation Album',
 'The Works',
 'as Queen: Live in Rio',
 'the Races to Live Killers\nBy 1976'}

In [126]:
true_positives = ent_albums.intersection(extracted_albums)

true_positives

{'A Kind of Magic',
 'A Night at the Opera',
 'Classic Queen',
 'Greatest Hits',
 'Greatest Hits III',
 'Live Killers',
 'Live at Wembley Stadium',
 'News of the World',
 'Queen',
 'Queen Forever',
 'Queen II',
 'Queen on Fire – Live at the Bowl',
 'The Cosmos Rocks',
 'The Works'}

In [127]:
false_negatives = ent_albums.difference(extracted_albums)

false_negatives

{'A Day at the Races',
 'Absolute Greatest',
 'At the Beeb',
 'Deep Cuts, Volume 3',
 'Five Live',
 'Flash Gordon',
 'Greatest Hits II',
 'Hot Space',
 'Hungarian Rhapsody: Queen Live in Budapest',
 'Icon',
 'Innuendo',
 'Jazz',
 'Jewels',
 'Live Magic',
 'On Air',
 'Queen Rocks',
 'Sheer Heart Attack',
 'Stone Cold Classics',
 'The A–Z of Queen, Volume 1',
 'The Crown Jewels',
 'The Game',
 'The Miracle',
 'The Platinum Collection',
 'The Singles Collection Volume 1',
 'Ultimate Queen'}

In [128]:
false_positives = extracted_albums.difference(ent_albums)

false_positives

{'A Night at the',
 'Bohemian Rhapsody',
 'Hammer to Fall',
 'Opera',
 'SingStar',
 'The Graduation Album',
 'as Queen: Live in Rio',
 'the Races to Live Killers\nBy 1976'}

In [129]:
true_negatives = {ent.text.strip() for ent in doc.ents if ent.label_ == 'PRODUCT' or ent.label_ == "WORK_OF_ART"}\
    .difference(ent_albums)

true_negatives

{'"Bohemian Rhapsody"',
 '"Love of My Life"',
 '"Queen at the Ballet"',
 '"The Show Must Go',
 '"The Show Must Go On',
 '"The Show Must Go On"',
 "A Knight's Tale",
 'A Night',
 'A Night at the',
 'A Night at the Opera Tour',
 'A Night at the Opera and The Game',
 'A Trip',
 'American Idol',
 'Another One Bites the Dust',
 'Battleship Potemkin',
 'Bicycle Race',
 'Birthday Tribute',
 'Bohemian Rhapsody',
 'Breakthru',
 'Candle in the Wind 1997',
 'Composers',
 'Crazy Little Thing Called Love',
 'Death on Two Legs',
 'Disco',
 "Do They Know It's Christmas",
 "Don't Stop Me Now",
 'Ericsson',
 'Essential Hard Rock and Heavy Metal',
 'Fat Bottomed Girls',
 'Friends Will Be Friends',
 'Fun in Space',
 'Game',
 'Gigwise',
 'God Save the Queen',
 'Grammy Award for Best Metal Performance',
 'Grammy Lifetime Achievement Award',
 'Grand Theft Auto IV',
 'Guitar Hero',
 'Guitar Hero 5',
 'Guitar Hero World Tour',
 'Guitar Hero:',
 'Guitar Hero: Warriors of Rock',
 'Guitar Songs of All Time',
 'H

In [130]:
tp = len(true_positives)
fp = len(false_positives)
fn = len(false_negatives)
tn = len(true_negatives)

pr = tp/(tp + fp)
rc = tp/(tp + fn)
f1 = 2 * (pr * rc)/(pr + rc)    

print("Accuracy: {}".format((tp + tn)/(tp+fp+fn+tn)))      
print("Precicison: {}".format(pr))
print("Recall: {}".format(rc))
print("F1: {}".format(f1))

Accuracy: 0.8081395348837209
Precicison: 0.6363636363636364
Recall: 0.358974358974359
F1: 0.459016393442623
