### Sparql query

In [1]:
import pandas as pd
from urllib.request import urlopen
import urllib.request
import pickle
import numpy as np
from bs4 import BeautifulSoup
import requests
import spacy
from pandas import DataFrame
from collections import defaultdict
import re

In [2]:
nlp = spacy.load("en_core_web_lg")

In [117]:
def normalize_name (name):
    name = name.split('(')[0]
    return name.strip()

df_dbp = pd.read_csv('dbp_data.csv', usecols = ['band_name', 'member_name', 'former_member_name', 'hometown_name', 'year' ])

df_dbp['member_name'] = df_dbp['member_name'].apply(lambda x: normalize_name(x))
df_dbp['former_member_name'] = df_dbp['former_member_name'].apply(lambda x: normalize_name(x))
df_dbp['hometown_name'] = df_dbp['hometown_name'].fillna('').apply(lambda x: x.split(',')[0])

df_dbp.set_index('band_name', inplace= True)
df_dbp.head()

Unnamed: 0_level_0,member_name,former_member_name,hometown_name,year
band_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The Monkees,Micky Dolenz,Davy Jones,Los Angeles,1965.0
The Monkees,Micky Dolenz,Michael Nesmith,Los Angeles,1965.0
The Monkees,Peter Tork,Davy Jones,Los Angeles,1965.0
The Monkees,Peter Tork,Michael Nesmith,Los Angeles,1965.0
Television (band),Fred Smith,Richard Lloyd,New York City,1973.0


### Merge members names into 1 column

In [118]:
df_dbp['members'] =  df_dbp.apply(lambda row: {row['member_name'], row['former_member_name']}, axis=1)
df_dbp.head()

Unnamed: 0_level_0,member_name,former_member_name,hometown_name,year,members
band_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
The Monkees,Micky Dolenz,Davy Jones,Los Angeles,1965.0,"{Micky Dolenz, Davy Jones}"
The Monkees,Micky Dolenz,Michael Nesmith,Los Angeles,1965.0,"{Michael Nesmith, Micky Dolenz}"
The Monkees,Peter Tork,Davy Jones,Los Angeles,1965.0,"{Davy Jones, Peter Tork}"
The Monkees,Peter Tork,Michael Nesmith,Los Angeles,1965.0,"{Michael Nesmith, Peter Tork}"
Television (band),Fred Smith,Richard Lloyd,New York City,1973.0,"{Richard Lloyd, Fred Smith}"


### Create two separate Series with info, that will be used for comparison

In [119]:
dbp_names = df_dbp.groupby('band_name').apply(lambda x: set.union(*x.members))
dbp_names.head()

band_name
'68 Comeback         {Jack Yarber, Jack Taylor, Walter Daniels, Jef...
14 Bis (band)                    {Cláudio Venturini, Flávio Venturini}
1974 AD              {Rohit John Chhetri, Manoj Kumar KC, Sanjay Sh...
3 (American band)    {Josh Eppard, Billy Riker, Chris Gartmann, Joe...
54-40                        {Darryl Neudorf, Neil Osborne, Dave Genn}
dtype: object

In [120]:
dbp_dates = df_dbp.groupby('band_name')['year'].apply(lambda x: list(x)[0])
dbp_dates = dbp_dates.dropna().astype(int)
dbp_dates = dbp_dates.apply(lambda x: {x})
dbp_dates.head()

band_name
'68 Comeback    {1992}
1974 AD         {1994}
54-40           {1981}
A-Studio        {1982}
AC/DC           {1973}
Name: year, dtype: object

In [121]:
band_names = df_dbp.index.unique().tolist()
band_names[:5]

band_texts = defaultdict(list)
for link in band_names:
    link_underscore = link.replace(' ', "_")
    website_url = requests.get(f"https://en.wikipedia.org/wiki/{link_underscore}").text
    soup = BeautifulSoup(website_url,'lxml')
    # take only first 3 paragraphs of text
    text = [i.text.strip() for i in soup.select('p') if len(i) > 1][:3]
    band_texts[link] = text

In [122]:
def jaccard (truth, prediction):
    return len(truth.intersection(prediction)) / len(truth.union(prediction))

def calculate_precision (truth, prediction):
    tp = len(truth.intersection(prediction))
    if not len(prediction):
        return 0
    return  tp / len(prediction)

def calculate_recall (truth, prediction):
    tp = len(truth.intersection(prediction))
    fn = len(truth) - tp
    if tp + fn == 0:
        return 0
    return tp / (tp + fn)
    
def f1 (truth, prediction):
    precision = calculate_precision(truth, prediction)
    recall = calculate_recall(truth, prediction)
    if precision + recall == 0 :
        return 0
    return 2 * (precision * recall / (precision + recall))

In [123]:
music_related_words = ['founder', 'member','frontman', 'bassist', 'drummer', 'vocal', 'vocalist', 'sax', 'keyboard', \
                        'singer', 'guitarist', 'bass', 'keyboardists', 'drum', 'guitar', 'piano', 'harmonica', 'saxofonist', 'player']   

band_and_membres = defaultdict(list)
    
    
def is_correct_parent (token):
    other = ['consist', 'find', 'comprise', 'form', 'join', 'by', 'of', 'featuring']
    lemma = token.lemma_
    linking_verb = ['stay', 'remain', 'be', 'include', 'have', 'feature']
    #line for line up
    member_noun = ['member', 'founder', 'line', 'lineup' , 'line-up', 'up']
    if lemma in other or lemma in music_related_words:
        return True
    if lemma in linking_verb:
        for child in token.children:
            if child.lemma_ in member_noun:
                return True
            

def postprocess_name (name):
    # delete ner mistakes - "Owen McIntyr[4]"
    name = re.sub(r'[\[\d\]]', '', name)
     # delete nicknames - Owen "Onnie" McIntyr
    name = re.sub(r'"[\w\s.:-]+"\s', '', name)
    if name.endswith('\'s'):
        name = name[0:-2]
    if name.endswith('.') or name.endswith(','):
        name = name[0:-1]
    return name
    
person_names = set()        
for k,v in band_texts.items():
    set_of_members = set()
    for paragraph in v:
        parsed_sent = nlp(paragraph)
        for ent in parsed_sent.ents:
             if ent.label_ == "PERSON":
                person_names.add(ent.text)
            
        for i, token in list(enumerate(parsed_sent))[:-1]:
            # named entities usually consist of 2 tokens
            possible_entity = token.text + ' ' + parsed_sent[i+1].text
            if possible_entity in person_names:
                
                for parent in parsed_sent[i+1].ancestors:
                    if is_correct_parent(parent):
                        set_of_members.add(postprocess_name(possible_entity))  
                        
                for child in parsed_sent[i+1].children:
                    if child.lemma_ in music_related_words:
                        set_of_members.add(postprocess_name(possible_entity))                        
                        
    band_and_membres[k] = set_of_members

In [125]:
names_compared = pd.DataFrame(columns=['truth', 'prediction'])
names_compared['truth'] = dbp_names
names_compared['prediction'] = pd.Series(band_and_membres)
names_compared.sample(n = 10)

Unnamed: 0_level_0,truth,prediction
band_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Evil Beaver,"{Claude Coleman, Jr., Gene Trautmann}","{David J, Gene Trautmann, Betty Blowtorch, Sam..."
AnnenMayKantereit,"{Drums, Guitar, Double Bass, Piano, Vocals, Me...","{Lars Lötgering, Henning May, du gehst}"
Neighborhood Texture Jam,"{Guitar, Lead vocals, Paul Buchignani, Bass gu...","{Ed Scott, John Whittemore, Joe Lapsley}"
Jello Biafra and the Guantanamo School of Medicine,"{Billy Gould, Jello Biafra, Andrew Weiss}","{Jon Weiss, Kimo Ball, Ralph Spight, Tupac Sha..."
The Reels,"{Karen Ansel, Paul Abrahams, Dave Mason, Craig...","{John Bliss, Dave Mason, Craig Hooper}"
The 77s,"{Michael Roe, Aaron Smith, Mark Harmon, Bruce ...","{Michael Roe, Aaron Smith, Bruce Spencer, Stev..."
The Saw Doctors,"{Anthony Thistlethwaite, Davy Carton, Eímhín C...","{Kieran Duddy, Blaze X, Mary O'Connor, Davy Ca..."
False Alarm (band),"{Mike Burkett, Paul Kostabi}","{Dylan Maunder, Paul Aragon, Art Chianello, Br..."
Fiel a la Vega,"{Tito Auger, Pedro Arraiza, Jorge Arraiza, Pap...","{Los Arraiza, Papo Román, Tito Auger, Jorge Ar..."
Gary Lewis & the Playboys,"{Carl Radle, Gary Lewis}","{Jerry Lewis, Gary Lewis}"


In [126]:
names_compared['similarity'] = names_compared.apply(lambda x: jaccard(x['truth'], x['prediction']), axis=1)
names_compared['f1'] = names_compared.apply(lambda x: f1(x['truth'], x['prediction']), axis=1)
names_compared.sample(n = 10)

Unnamed: 0_level_0,truth,prediction,similarity,f1
band_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Cold Chisel,"{Don Walker, Swanee, Phil Small, Ray Arnott, I...","{Ian Moss, Les Kaczmarek, Jimmy Barnes}",0.222222,0.363636
The Real Nasty,"{Drums, Electric Guitar, Upright Bass, Vox}","{Brian Huston, bass),Sean Williams, Ryan Lukas}",0.0,0.0
NRBQ,"{Pete Donnelly, Terry Adams, Tom Staley, Joey ...","{John Perrin, Steve Ferguson, Terry Adams, Sco...",0.384615,0.555556
Hoodoo Gurus,"{Roddy Radalj, James Baker, Brad Shepherd, Ric...","{Richard Grossman, Mark Kingsmill, Dave Faulkn...",0.428571,0.6
Bigbang (Norwegian band),"{Chris Summers, Øystein Greni, Lasse Weeden, M...","{Øystein Greni, Dinosaur Jr}",0.142857,0.25
De Press,"{Andrzej Dziubek, Ola Snortheim, Jørn Christen...",{},0.0,0.0
Affinity (band),"{Grant Serpell, Mo Foster, Linda Hoyle}","{Grant Serpell, Nick Nicholas}",0.25,0.4
3 (American band),"{Josh Eppard, Billy Riker, Chris Gartmann, Joe...","{Joey Eppard, Josh Eppard}",0.333333,0.5
The Blockheads,"{Wilko Johnson, Ian Dury, Mick Gallagher, Gila...","{Ian Dury, Dave Lewis, Terry Edwards, Mick Gal...",0.5,0.666667
MiRthkon,"{Aram Shelton, Matt Lebofsky, Wally Scharold}","{Peter Evans, Wally Scharold, Brian Chase}",0.2,0.333333


In [127]:
names_compared.similarity.mean()

0.2595749631809406

In [128]:
names_compared.f1.mean()

0.3677478433774634

In [135]:
print(band_texts['Flying Other Brothers'])

['The Flying Other Brothers were an American rock band active from 1997–2006 in San Francisco.[1] The band played original tunes and covers.', 'The band started at an Electronic Frontier Foundation (EFF) benefit at The Fillmore Auditorium in 1997.[2] At this time the band included Roger McNamee, a venture capitalist; Giles McNamee, investment banker, board member of EFF and Roger\'s brother; guitarist Bert Keely of Microsoft; Bill Bennett, a marketing and communications strategist; and Tony Bove (rechristened "TBone"), author of books about computers and the Internet. The band had its roots in acoustic performances Roger and Giles McNamee gave as the Other Brothers in lounges around New England during the early 1980s.[citation needed]', 'The band backed Grateful Dead members Bob Weir and Mickey Hart in a series of fund-raisers[3] for Al Gore and Bill Clinton (Tipper Gore played congas during one Silicon Valley soiree[4]).']


### Fact checking for dates (start year)

In [None]:
band_and_year = defaultdict(list)

def process_date (date):
    if re.search(r'\d', date):
        date_extracted = re.findall(r'\d{4}', date)
        if date_extracted:
            return int(date_extracted[0])
    
for k,v in band_texts.items():
    date = None
    for paragraph in v:
        dates = set()
        parsed_sent = nlp(paragraph)
        
        for ent in parsed_sent.ents:
            if ent.label_ == 'DATE' and not date:
                date = {process_date(ent.text)}
                break
    if date:
        band_and_year[k] = date
    else:
        band_and_year[k] = {0}

In [164]:
dates_compared = pd.DataFrame(columns=['truth', 'prediction'])

dates_compared['truth'] = dbp_dates
dates_compared['prediction'] = pd.Series(band_and_year)

In [165]:
dates_compared = pd.DataFrame(columns=['truth', 'prediction'])

dates_compared['truth'] = dbp_dates
dates_compared['prediction'] = pd.Series(band_and_year)
dates_compared.sample(n = 10)

Unnamed: 0_level_0,truth,prediction
band_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Roger Clyne and the Peacemakers,{1998},{1990}
Cheap Trick,{1973},{1973}
Ultralyd,{2004},{2004}
The Cowsills,{1965},{None}
Lir (band),{90},{1980}
The Connection (band),{2011},{1980}
Col. Bruce Hampton and the Aquarium Rescue Unit,{1988},{1990}
Transsylvania Phoenix,{1962},{1962}
NRBQ,{1966},{1965}
The Feelers,{1993},{1990}


In [166]:
dates_compared['similarity'] = dates_compared.apply(lambda x: jaccard(x['truth'], x['prediction']), axis=1)

dates_compared['f1'] = dates_compared.apply(lambda x: f1(x['truth'], x['prediction']), axis=1)
dates_compared.sample(n = 10)

Unnamed: 0_level_0,truth,prediction,similarity,f1
band_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DumDum Boys,{1985},{1980},0.0,0.0
The Feelers,{1993},{1990},0.0,0.0
The Spencer Davis Group,{1963},{1963},1.0,1.0
Deacon Blue,{1985},{1985},1.0,1.0
Kanjani Eight,{2002},{2002},1.0,1.0
Brutus (Canadian band),{1969},{1969},1.0,1.0
Icehouse (band),{1977},{1977},1.0,1.0
Light FM (band),{1999},{2011},0.0,0.0
My Jerusalem,{2010},{0},0.0,0.0
Brainbox,{1968},{1960},0.0,0.0


In [167]:
dates_compared.similarity.mean()

0.5071428571428571

In [168]:
dates_compared.f1.mean()

0.5071428571428571