In [1]:
import requests, time, re, en_core_web_sm, urllib.parse, musicbrainzngs as mbz, pandas as pd
from lyricsgenius import Genius
from qwikidata.sparql import return_sparql_query_results
from sentence_transformers import SentenceTransformer, util
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup
from nltk.corpus import wordnet as wn
from bardapi import Bard

from SPARQLWrapper import SPARQLWrapper, JSON

model = SentenceTransformer('all-mpnet-base-v2')  # pretrained model for sentence similarity

# Load the spacy model that you have installed
nlp = en_core_web_sm.load()

2023-07-22 17:04:06.700024: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 00. Loading keys 

I import libraries that I will use for retrieving api keys, tokens, etc. from a file that is not in the repository.

In [2]:
from dotenv import load_dotenv
import os

load_dotenv()

True

## 01. Loading data

### Get data from Spotify API

First we set up the API credentials, headers and parameters for the API call.

In [3]:
data = {
    'grant_type': 'client_credentials',
    'client_id': os.getenv('SPOTIFY_CLIENT_ID'),
    'client_secret': os.getenv('SPOTIFY_CLIENT_SECRET'),
}

response = requests.post('https://accounts.spotify.com/api/token', data=data).json()

token_type = response['token_type']
access_token = response['access_token']

headers = {
    'Authorization': str(token_type) + ' ' + str(access_token),
}

**Making requests to the Spotify API: seraching for "Resistance" by Muse**

Thus, I create a dictionary for importing relevant information about the song.

In [4]:
track_id = '1C2QJNTmsTxCDBuIgai8QV'
get_song = requests.get('https://api.spotify.com/v1/tracks/'+track_id, headers=headers).json()

song = {}

song['spotify_id'] = get_song['id']
song['spotify_href'] = get_song['href']
song['name'] = get_song['name']
song['artists'] = []
for artist in get_song['artists']:
    artist_dict = {}
    artist_dict['spotify_id'] = artist['id']
    artist_dict['spotify_href'] = artist['href']
    artist_dict['name'] = artist['name']
    get_art = requests.get('https://api.spotify.com/v1/artists/'+artist['id'], headers=headers).json()
    artist_dict['genres'] = get_art['genres']
    song['artists'].append(artist_dict)
song['isrc'] = get_song['external_ids']['isrc']



song

{'spotify_id': '1C2QJNTmsTxCDBuIgai8QV',
 'spotify_href': 'https://api.spotify.com/v1/tracks/1C2QJNTmsTxCDBuIgai8QV',
 'name': 'Resistance',
 'artists': [{'spotify_id': '12Chz98pHFMPJEknJQMWvI',
   'spotify_href': 'https://api.spotify.com/v1/artists/12Chz98pHFMPJEknJQMWvI',
   'name': 'Muse',
   'genres': ['modern rock', 'permanent wave', 'rock']}],
 'isrc': 'GBAHT0900321'}

### Import data from Genius

Now we are ready for retrieving information about lyrics and annotations/interpretations from Genius. I will use the Genius API for this purpose. I will create an instance of Genius class where I specify my access token and then I will make requests to the API to retrieve the information I need.

Firstly I set the needed information for the API call. Then, I can make the request to the API and retrieve the information I need, firstly adding the genius id to the song dictionary and then adding the lyrics and annotations.

In [5]:
genius = Genius(os.getenv('GENIUS_ACCESS_TOKEN'))

genius_track_info = genius.search_song(song['name'], song['artists'][0]['name']) 
song['genius_id'] = genius_track_info.id
song['genius_url'] = genius_track_info.url

Searching for "Resistance" by Muse...
Done.


I make a list for comments and a list for annotations. I also extract the lyrics.

In [6]:
comment_list = []
annotation_list = []

for comment in genius.song_comments(song['genius_id'])['comments']:
    comment_list.append(comment['body']['plain'])
song['comments'] = comment_list

to_process = genius.song_annotations(song['genius_id'])
for tup in to_process:
    new_tup = (tup[0], tup[1][0][0])
    annotation_list.append(new_tup)
song['annotations'] = annotation_list
song['lyrics'] = re.sub(r'[0-9]+\sContributors.+\sLyrics\n', '', genius.lyrics(song['genius_id'], remove_section_headers=True)) # remove the header that says how many contributors there are

Thus I search for additional song's info and add to my song dictionary the Genius description.

Now the dictionary starts to be quite complete. The next step is to see if MusicBrainz has some additional information about the song that we can potentially use.

In [7]:
si = genius.song(song['genius_id'])
song['genius_description'] = si['song']['description']['plain'].replace('\n\n', ' ').replace('\n', ' ')

song

{'spotify_id': '1C2QJNTmsTxCDBuIgai8QV',
 'spotify_href': 'https://api.spotify.com/v1/tracks/1C2QJNTmsTxCDBuIgai8QV',
 'name': 'Resistance',
 'artists': [{'spotify_id': '12Chz98pHFMPJEknJQMWvI',
   'spotify_href': 'https://api.spotify.com/v1/artists/12Chz98pHFMPJEknJQMWvI',
   'name': 'Muse',
   'genres': ['modern rock', 'permanent wave', 'rock']}],
 'isrc': 'GBAHT0900321',
 'genius_id': 54137,
 'genius_url': 'https://genius.com/Muse-resistance-lyrics',
 'comments': ['You can’t forget to mention that the ENTIRE Resistance album is inspired by George Orwell’s novel, Nineteen Eighty-Four. The band’s talked about that a few times, and the idea is essential to understanding the lyrics in full.',
  'favorite muse song',
  'I know what this song is about but still, it’s such a lgbtq+ anthem!!',
  'Love Rock On The Way\nI Ago To You GO DUCK',
  'GAY ANTHEM FR'],
 'annotations': [('Hold me\n Our lips must always be sealed',
   'He wants from the significant other not to let him go.\n\nLips bei

### Import data from MusicBrainz

We use the MusicBrainz API to check if we have interesting information to include in our song dictionary.

Firstly, I set the usual information including username and API key. i also set the useragent adding an optional contance mail.

In [8]:
mbz.set_useragent("topic-box", 0.1)
mbz.auth(os.getenv('MUSICBRAINZ_USER'), os.getenv('MUSICBRAINZ_TOKEN'))

I can now search for data related to my song's recording on the basis of the ISRC (International Standard Recording Code) code previously retrieved from Spotify.

In [9]:
song_rec = mbz.get_recordings_by_isrc(song['isrc'])
song_rec

{'isrc': {'id': 'GBAHT0900321',
  'recording-list': [{'id': '09595161-717b-4ec5-94d1-9040aab8aae4',
    'title': 'Resistance',
    'length': '346693'}],
  'recording-count': 1}}

From the many possible values I can specify for retrieving information, I chose to retrieve only the specificed ones using the function `get_recording_by_id`. Further specification can be found on the [official documentation](https://python-musicbrainzngs.readthedocs.io/en/v0.7.1/api/#getting-data) of the Python library for querying the MusicBrainz API.

Non-included values for retieving additional information are: 'aliases', 'instrument-rels', 'area-rels', 'discids', 'ratings', 'user-ratings', 'releases', 'media'.

In [10]:
sr = mbz.get_recording_by_id(song_rec['isrc']['recording-list'][0]['id'], includes=['artists', 'artist-credits', 'isrcs','work-level-rels', 'annotation', 'tags', 'user-tags', 'artist-rels', 'label-rels', 'place-rels', 'event-rels', 'recording-rels', 'release-rels', 'release-group-rels', 'series-rels', 'url-rels', 'work-rels'])
song['musicbrainz_id'] = sr['recording']['id']
sr 

{'recording': {'id': '09595161-717b-4ec5-94d1-9040aab8aae4',
  'title': 'Resistance',
  'length': '346693',
  'artist-credit': [{'artist': {'id': '9c9f1380-2516-4fc9-a3e6-f9f61941d090',
     'type': 'Group',
     'name': 'Muse',
     'sort-name': 'Muse',
     'disambiguation': 'UK rock band',
     'tag-list': [{'count': '1', 'name': 'alternative dance'},
      {'count': '17', 'name': 'alternative rock'},
      {'count': '1', 'name': 'anthemic'},
      {'count': '1', 'name': 'anxious'},
      {'count': '7', 'name': 'british'},
      {'count': '1', 'name': 'eclectic'},
      {'count': '1', 'name': 'electronic'},
      {'count': '1', 'name': 'england'},
      {'count': '2', 'name': 'english'},
      {'count': '1', 'name': 'funk rock'},
      {'count': '1', 'name': 'funky'},
      {'count': '1', 'name': 'groovy'},
      {'count': '1', 'name': 'indie rock'},
      {'count': '1', 'name': 'melodic'},
      {'count': '1', 'name': 'muse'},
      {'count': '1', 'name': 'new prog'},
      {'count

In this way I cann add additional keys to my song dictionary, especially about the artists for disambiguation.

I also add relavant tags related to the artists in order to gat a better understanding of the song mood and genre.

In [11]:
#for every artist in both dictionaries add dismabiguation and mbz ids
for art in song['artists']: #for every artist in the song_ditionary
    for mbz_art in sr['recording']['artist-credit']:   #for every artist in the mbz dictionary
        if art['name'] == mbz_art['artist']['name']:   #if the name of the artist in the song dictionary is the same as the name of the artist in the mbz dictionary
            art['disambiguation'] = mbz_art['artist']['disambiguation'] #add the disambiguation to the song dictionary
            art['mbz_id'] = mbz_art['artist']['id'] #add the mbz id to the song dictionary
            art['mbz_tags'] = {}
            for tag in mbz_art['artist']['tag-list']: #for every tag in the mbz dictionary
                art['mbz_tags'][tag['name']] = tag['count'] #add the tag name and the tag count to the song dictionary

song

{'spotify_id': '1C2QJNTmsTxCDBuIgai8QV',
 'spotify_href': 'https://api.spotify.com/v1/tracks/1C2QJNTmsTxCDBuIgai8QV',
 'name': 'Resistance',
 'artists': [{'spotify_id': '12Chz98pHFMPJEknJQMWvI',
   'spotify_href': 'https://api.spotify.com/v1/artists/12Chz98pHFMPJEknJQMWvI',
   'name': 'Muse',
   'genres': ['modern rock', 'permanent wave', 'rock'],
   'disambiguation': 'UK rock band',
   'mbz_id': '9c9f1380-2516-4fc9-a3e6-f9f61941d090',
   'mbz_tags': {'alternative dance': '1',
    'alternative rock': '17',
    'anthemic': '1',
    'anxious': '1',
    'british': '7',
    'eclectic': '1',
    'electronic': '1',
    'england': '1',
    'english': '2',
    'funk rock': '1',
    'funky': '1',
    'groovy': '1',
    'indie rock': '1',
    'melodic': '1',
    'muse': '1',
    'new prog': '1',
    'political': '1',
    'pop rock': '2',
    'prog related': '1',
    'progressive rock': '5',
    'rock': '13',
    'science fiction': '1',
    'space': '1',
    'symphonic rock': '4',
    'synthpop':

I also want to add the link to wikidata (if existent) for the artists and some other infromation relevant to populate the KG in the future.

In [12]:
for art in song['artists']:
    wts = mbz.get_artist_by_id(art['mbz_id'], includes=["url-rels", "tags"])
    art['type'] = wts['artist']['type']
    for url in wts['artist']['url-relation-list']:
        if url['type'] == 'wikidata':
            art['wikidata_url'] = url['target']
wts

{'artist': {'id': '9c9f1380-2516-4fc9-a3e6-f9f61941d090',
  'type': 'Group',
  'name': 'Muse',
  'sort-name': 'Muse',
  'disambiguation': 'UK rock band',
  'isni-list': ['0000000122708200'],
  'country': 'GB',
  'area': {'id': '8a754a16-0027-3a29-b6d7-2b40ea0481ed',
   'name': 'United Kingdom',
   'sort-name': 'United Kingdom',
   'iso-3166-1-code-list': ['GB']},
  'begin-area': {'id': '86d2134f-d94d-4bf4-b89d-133280df1d30',
   'name': 'Teignmouth',
   'sort-name': 'Teignmouth'},
  'life-span': {'begin': '1994'},
  'url-relation-list': [{'type': 'allmusic',
    'type-id': '6b3e3c85-0002-4f34-aca6-80ace0d7e846',
    'target': 'https://www.allmusic.com/artist/mn0000514563',
    'direction': 'forward'},
   {'type': 'bandsintown',
    'type-id': 'ea45ed3d-2d5e-456e-8c32-94b6f51426e2',
    'target': 'https://www.bandsintown.com/a/143',
    'direction': 'forward'},
   {'type': 'BBC Music page',
    'type-id': 'd028a975-000c-4525-9333-d3c8425e4b54',
    'target': 'https://www.bbc.co.uk/music/

## 02. Information extraction

Note that in this notebook we only extract information for what concern the annotations since they are directly bonded to the lyrics and many times verified.

### References extraction with SpaCy
Firstly, I store all the annotations in a list. Then, I join all the entries into a single string, which will be my text to analyse.

In [13]:
ann_list = []

for ann in song['annotations']:
    ann_list.append(ann[1])
    
anns_txt = " ".join(ann_list).replace('\n\n', ' ').replace('\n', ' ')

Now I can perform **Named Entity Recognition (NER)** with SpaCy. I will use the pre-trained model `en_core_web_sm` for this purpose (already loaded at the beginning of the notebook).

For this aim I define two dictionaries: one for the Works of Art (WOA) and one for the Persons (PER). I will use these dictionaries to extract the references from the annotations related to these two different type of entities.

Within the dictionaries I will set the keys as the entities and the values as another dictionary with keys `count`and `annotation_number` so to track both the number of times the entity is mentioned and the number of annotations in which it is mentioned. in this way I can easily track the co-occurrence between the different kinds of entities.

By now, count and annotation_number are set to 0 for all the entities. Another iteration will compile these values.

In [14]:
ann_woa = {}
ann_per = {}

for ann_num, ann in enumerate(song['annotations']):
    doc = nlp(ann[1])
    
    for ent in doc.ents:
        if ent.label_ == 'WORK_OF_ART':
            woa_sub_str = re.sub(r'[’\']s$', '', ent.text)
            if ent.text not in ann_woa:
                ann_woa[woa_sub_str] = {'count': 0, 'annotation_number': {}}

        elif ent.label_ == 'PERSON':
            per_sub_str = re.sub(r'[’\']s$', '', ent.text)
            if ent.text not in ann_per:
                ann_per[per_sub_str] = {'count': 0, 'annotation_number': {}}


ann_woa, ann_per

({'1984': {'count': 0, 'annotation_number': {}},
  'Love': {'count': 0, 'annotation_number': {}}},
 {'Julia': {'count': 0, 'annotation_number': {}},
  'George Orwell': {'count': 0, 'annotation_number': {}},
  'Winston': {'count': 0, 'annotation_number': {}}})

#### Storing occurrences of entities in a dictionary

The named entities we extracted have not been recognized for all their occurences as Works of Art or People. Thus, we need to iterate over the annotations once again to find all the occurrences and compile in a better way the dictionary with the relevant data.

In [15]:
for ann_num, ann in enumerate(song['annotations']):
    clean_txt = re.sub(r'\W', ' ', ann[1]) #remove all the non alphanumeric characters
    words = clean_txt.split(' ')

    for woa in ann_woa: #for every work of art in the dictionary
        if len(woa.split(' ')) == 1: #if the work of art is one word
            ann_occurrences = words.count(woa) #count the number of occurrences of the word in the annotation
            if ann_occurrences > 0: #if the word is in the annotation
                ann_woa[woa]['annotation_number'][ann_num] = ann_occurrences #add the number of occurrences to the dictionary
                ann_woa[woa]['count'] += ann_occurrences #add the number of occurrences to the total count
        else: #if the work of art is more than one word
            ann_occurrences = clean_txt.count(woa) #here changes the variable on which to count
            if ann_occurrences > 0: #if the word is in the annotation
                    ann_woa[woa]['annotation_number'][ann_num] = ann_occurrences #add the number of occurrences to the dictionary
                    ann_woa[woa]['count'] += 1 #add the number of occurrences to the total count


    for per in ann_per: #for every person in the dictionary
        if len(per.split(' ')) == 1: #if the person is one word
            ann_occurrences = words.count(per) #count the number of occurrences of the word in the annotation
            if ann_occurrences > 0: #if the word is in the annotation
                ann_per[per]['annotation_number'][ann_num] = ann_occurrences #add the number of occurrences to the dictionary
                ann_per[per]['count'] += ann_occurrences #add the number of occurrences to the total count
        else: #if the person is more than one word
            ann_occurrences = clean_txt.count(per) #here changes the variable on which to count
            if ann_occurrences > 0: #if the word is in the annotation
                ann_per[per]['annotation_number'][ann_num] = ann_occurrences #add the number of occurrences to the dictionary
                ann_per[per]['count'] += 1 #add the number of occurrences to the total count
            
            remove_occurrences = clean_txt.replace(per, ' ').split(' ') #remove the occurrences of the person from the annotation
            ps = per.split(' ')[-1] #get the last word of the person which is more likely to be the surname
            new_ann_occurrences = remove_occurrences.count(ps) #count the number of occurrences of the last word of the person in the annotation
            if new_ann_occurrences > 0: #if the word is in the annotation
                if ann_num in ann_per[per]['annotation_number']: #if the annotation number is already in the dictionary
                    ann_per[per]['annotation_number'][ann_num] += new_ann_occurrences #add the number of occurrences to the dictionary
                else: #if the annotation number is not in the dictionary
                    ann_per[per]['annotation_number'][ann_num] = new_ann_occurrences #add the number of occurrences to the dictionary
                ann_per[per]['count'] += new_ann_occurrences #add the number of occurrences to the total count


ann_woa, ann_per

({'1984': {'count': 8,
   'annotation_number': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 7: 1, 8: 1, 9: 1}},
  'Love': {'count': 4, 'annotation_number': {3: 1, 4: 1, 5: 1, 6: 1}}},
 {'Julia': {'count': 8, 'annotation_number': {2: 2, 3: 2, 5: 2, 6: 1, 9: 1}},
  'George Orwell': {'count': 4, 'annotation_number': {2: 1, 4: 3}},
  'Winston': {'count': 7,
   'annotation_number': {2: 1, 3: 2, 5: 2, 6: 1, 9: 1}}})

## 03. Wikidata reconciliation

Here the aim is to connect with the Wikidata API and retrieve information about the entities we found in the annotations.

### Querying Wikidata API
I define a function for retrieving and store in json format the different resluts of the API call.

The aim is to use the namedentiti extracted from the previous step as the labels to serach for on Wikidata in order to retrieve the possible entities the names refer to. Howver, I do that only for the Work of Arts since they are the focus of the project.

For more information about Wikidata API and parameters settings see the [official documentation](https://www.mediawiki.org/wiki/Wikibase/API/en) and the `wbsearchentities` [action page](https://www.wikidata.org/w/api.php?action=help&modules=wbsearchentities).

In [16]:
def wikidata_reconciliation(query):
    API_WD = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbsearchentities',
        'format': 'json',
        'language': 'en',
        'type': 'item',
        'search': query # the query string
    }
    
    # query wd API    
    return requests.get(API_WD, params = params).json() 

I set a dictionary for storing information about the candidate entities for each label searched. For each candidate entity I retrieve, I store in the dictionary only the ones that are instances of a subclass of the class `creative work`.

For each named entity as key I store a list of dictionaries with the information about the candidate entities. At the first iteration, each dictionary has the following keys: `id` and `description`.

In [17]:
candidates = {}

for woa in ann_woa:
    candidates[woa] = []

    r = wikidata_reconciliation(woa)

    if 'search' in r and len(r['search']) >= 1: # if there is at least one result
        for result in r['search']:
            qid = result['title']
            query_string = """ASK { wd:"""+qid+""" wdt:P31 ?type. ?type wdt:P279* wd:Q17537576 }""" # query string to check if the entity is ANYTHING BEING SUBCLASS OF CREATIVE WORK
        
            # this time I query the wikidata endpoint directly
            res = return_sparql_query_results(query_string) 
            
            if res["boolean"] == True: #If the answer is true
                entity = {}
                #entity['class'] = 'creative work' 
                entity['id'] = result['title']
                entity['description'] = result['description']
                candidates[woa].append(entity)
            
    else:
        print('no results matching the entity name')

candidates

{'1984': [{'id': 'Q208460',
   'description': 'dystopian novel written by George Orwell'},
  {'id': 'Q208424', 'description': '1984 film by Michael Radford'},
  {'id': 'Q664787', 'description': '1956 film by Michael Anderson'},
  {'id': 'Q203383',
   'description': '1984 American television commercial directed by Ridley Scott'},
  {'id': 'Q208429', 'description': '1984 studio album by Van Halen'}],
 'Love': [{'id': 'Q18643718',
   'description': 'American romantic comedy web television series'},
  {'id': 'Q19829024', 'description': '2015 film by Gaspar Noé'}]}

## 04. Entity Disambiguation

Candidates entities retrieved by the queries will be disambiguated on the basis of the information retrieved from Wikidata and sentence similarity between the annotations and the description of the entity.

#### Wikidata information retrieval
With an iteration over the candidate entities for each named entity, I set a deendency score of 0 and a relevance score which is inverse to the position of the candidate entity in the query results. In this way, first candidate retrieved will have an higher relevance score than the second one, and so on.
In this way, the dependency score will be higher for the candidate entities that are not derivative works of the other candidate entities.

Thus, for each candidate I make some SPARQL queries. The first aims at understanding if the people mentioned in the annotations are somehow related to the candidate entity by means of first-level entity relationships in Wikidata. If so, I increase the score of people found in Wikidata. For the second, I compile a SPARQL query aiming to ask whether the candidte entity is a derivative work of at least one of the other candidate instances. If so, I decrese the dependency score by 1. 

I also make use of the `time.sleep()` function to avoid making too many requests to the API in a short time and thus being blocked.

In [18]:
req = 0 # this is the number of requests made to the WD endpoint

people = []
for per in ann_per:
    people.append((per, ann_per[per]['count']))


for woa in candidates:
    relevance = len(candidates[woa])

    for idx, cand in enumerate(candidates[woa]):

        cand['relevance'] = relevance 
        relevance -= 1 #decrease for the next iteration

        cand['dependency_score'] = 0  #this is the score that will be used to compute the dependency between candidates
        cand['person_in_description'] = 0 #this is the number of times the person is mentioned in the description of the candidate
        cand['person_in_wikidata'] = 0 #this is the number of times a person is mentioned in the wikidata page of the candidate
        
        for per in people:
            per_split = per[0].split(' ')
            if len(per_split) > 1:
                per_ts = per_split[-1]  #take the last token of the person name [SI PUO' FARE MEGLIO INCLODENDO TUTTI I TOKEN DELLA PERSONA IN UN CICLO FOR E VEDERE SE SONO TUTTI IN DESCRIPTION ECC.]
            else:
                per_ts = per[0]

            if per_ts in cand['description']:
                    cand['person_in_description'] += 1 #if the person is mentioned in the description of the candidate, increase the score

            query_people = """ASK { wd:"""+cand['id']+""" ?p ?o.    
                                ?o rdfs:label ?label.
                                FILTER (CONTAINS((?label), '"""+per_ts+"""'))
                                }"""

            if req % 5 == 0:
                time.sleep(5) # to avoid to be blocked by WD endpoint for too many requests in a short time
                
            res_people = return_sparql_query_results(query_people)
            req += 1

            if res_people["boolean"] == True:
                cand['person_in_wikidata'] += 1

        #____________Second query____________

        qs = "VALUES ?items { "      # this is the SPARQL query string to complete
        for i in candidates[woa]:    # i repeat the loop since for every candidate i want to ask if it is dependent on at least one of the other candidates (meaning it's a derivative wor of another candidate)
            if candidates[woa].index(i) == idx:
                continue
            else:
                wd_id = "wd:"+i['id']
                qs += wd_id+" "
        qs += "}"

        query_dependency =  """ASK {
                                    VALUES ?d { wdt:P144 wdt:P737 wdt:P941 wdt:P8371 }
                                    """+qs+"""
                                    {
                                        wd:"""+cand['id']+""" ?d ?items.
                                    }UNION{
                                        ?items wdt:P4969 wd:"""+cand['id']+""".
                                    }
                            }"""
        
        if req % 5 == 0:
            time.sleep(5) # to avoid to be blocked by WD endpoint for too many requests in a short time

        res_dependency = return_sparql_query_results(query_dependency)
        req += 1

        if res_dependency["boolean"] == True:
            candidates[woa][idx]['dependency_score'] -= 1 # if the answer is true, it means that the candidate is dependent on at least one of the other candidates, so I decrease its dependency score


candidates

{'1984': [{'id': 'Q208460',
   'description': 'dystopian novel written by George Orwell',
   'relevance': 5,
   'dependency_score': 0,
   'person_in_description': 1,
   'person_in_wikidata': 3},
  {'id': 'Q208424',
   'description': '1984 film by Michael Radford',
   'relevance': 4,
   'dependency_score': -1,
   'person_in_description': 0,
   'person_in_wikidata': 1},
  {'id': 'Q664787',
   'description': '1956 film by Michael Anderson',
   'relevance': 3,
   'dependency_score': -1,
   'person_in_description': 0,
   'person_in_wikidata': 1},
  {'id': 'Q203383',
   'description': '1984 American television commercial directed by Ridley Scott',
   'relevance': 2,
   'dependency_score': -1,
   'person_in_description': 0,
   'person_in_wikidata': 0},
  {'id': 'Q208429',
   'description': '1984 studio album by Van Halen',
   'relevance': 1,
   'dependency_score': 0,
   'person_in_description': 0,
   'person_in_wikidata': 0}],
 'Love': [{'id': 'Q18643718',
   'description': 'American romantic

#### Sentence similarity with BERT
For comparing the annotations with the description of the candidate entities, I use the BERT model for sentence similarity. 

First, I take the annotations one by one. I clean them from new line characters and other special characters and then I split the annotations into sentences and store them into a list in order to be processed.

In [19]:
sents = []

for ann in song['annotations']:
    clean_txt = ann[1].replace('\n\n', ' ').replace('\n', ' ').replace('   ', ' ').replace('  ', ' ')
    sts = sent_tokenize(clean_txt)
    for s in sts:
        sents.append(s)

sents

['He wants from the significant other not to let him go.',
 'Lips being sealed can be interpreted in two ways: one, they should be always ‘that’ close and two, they should never utter a word to anybody about each other.',
 'In 1984, they have to ‘keep their lips sealed’ i.e.',
 'keep their romance a secret, as love is considered a crime in the novel’s world.',
 'If the government found out about them, they would be punished, as specified in the verses.',
 'Metaphorically, it’s at night when we are safe, when we can do secret things.',
 'When the night ends, everybody can see us, so “we can’t pretend”.',
 'The only thing we can do is to run, and think that someone or something will save us.',
 'As this song and the whole album are influenced by 1984, we all know (spoiler alert!)',
 'that although they run and ask to be saved, they won’t be protected from harm.',
 'In Orwell’s 1984, Winston and Julia cannot show any public signs of their relationship lest it be discovered by the Party; e

Thus, I define a dictionary. Here I store the BERT embeddings of the sentences in which the named entity is mentioned.

Meanwhile, I also embed the description of the candidate entities and store them in the candidates dictionary. Here, I remove the entity name from the description to avoid that the model learns to recognize the entity from the description itself.

Finally, I compute the cosine similarity between the embeddings of the sentences in which the entity is mentioned and the embeddings of the description of the candidate entities. I take the mean of these values and I store it in the dictionary.

The last step consists in computing the final score using the other partial scores computed until now. I also store the final score in the dictionary.

In [20]:
to_embed = {}

for woa in candidates:
    to_embed[woa] = []

    for s in sents:
        if woa in s:
            # ----- You can insert here code for defining a context window around the entity -----
            to_embed[woa].append(model.encode(s, convert_to_tensor=True))
            
    for ent in candidates[woa]:
       #________Sentence embedding________ 
        ent['description_embedding'] = model.encode(re.sub(woa, ' ', ent['description']), convert_to_tensor=True) # I remove the entity name from the description to avoid that the model learns to recognize the entity from the description itself

        #________Averaging cosine similarities________
        to_avg = [] # this is the list of cosine similarities between the entity description and the sentences containing the entity name
        for emb in to_embed[woa]:
            cos_sim = util.cos_sim(ent['description_embedding'], emb)
            to_avg.append(cos_sim[0][0])

        ent['similarity_score'] = round(float(sum(to_avg)/len(to_avg)), 4)
        del ent['description_embedding']
        
        #________Final score________
        ent['final_score'] = round(float(ent['dependency_score'] + ent['person_in_description'] + ent['person_in_wikidata'] + ent['similarity_score']), 4)

The algorithm works fine this way, however, you can add a context window and embed the contex window only instead of the full sentence in which the entity is mentioned.

Code:

````
li = word_tokenize(s)  #word_tokenize(re.sub(r'\W', ' ', s))
idx = li.index(woa)
if idx < 4:
    start = 0
    if len(li) < 9:
        end = len(li)
    else:
        end = 9
else:
    if len(li) < 9:
        start = 0
        end = len(li)
    else:
        if idx + 4 > len(li):
            future_tokens = len(li)-1-idx
            past_tokens = 9 - future_tokens
            if idx - past_tokens < 0:
                start = 0
            else:
                start = idx - past_tokens
            end = len(li)
        
context = ' '.join(li[start:end])
``````

Before going on with the other sections, I choose the entity with the highest score for each named entity and I store it in a dictionary.

I consider relevance of output only in the case in which the final score of two or more candidates is the same.

In [21]:
chosen = {}
for woa in candidates:
    start = 0
    best = ''
    for ent in candidates[woa]:
        if ent['final_score'] > start:
            start = ent['final_score']
            best = ent
        elif ent['final_score'] == start:
            if best['relevance'] < ent['relevance']:
                best = ent
        if candidates[woa].index(ent) == len(candidates[woa]) -1 and (best['final_score'] >= 1 or best['similarity_score'] >= 0.4):
            chosen[woa] = best
            
chosen

{'1984': {'id': 'Q208460',
  'description': 'dystopian novel written by George Orwell',
  'relevance': 5,
  'dependency_score': 0,
  'person_in_description': 1,
  'person_in_wikidata': 3,
  'similarity_score': 0.3814,
  'final_score': 4.3814}}

## 05. Wikipedia scraping and DBpedia links retrieval with BeautifulSoup and SPARQLWrapper

Before start scrping the wikipedia page, we need its URL. Thus, I ask Wikidata for the URL of the page of the entity I am interested in. 

In [22]:
for woa in chosen:
    query_string = """SELECT ?URL
                        WHERE {
                            ?URL schema:about wd:"""+chosen[woa]['id']+""".
                            ?URL schema:isPartOf <https://en.wikipedia.org/>.
                    }"""
    
    res = return_sparql_query_results(query_string)
    value = res['results']['bindings'][0]['URL']['value']
    if value:
        chosen[woa]['wikipedia_url'] = value

Once added the URLs to the dictionary, I can start scraping the page of the different entities.

For the project's aim, I am interested only in the main work of art referenced by the initial song. Therefore, I select the one with the highest score and I scrape its page.

In [23]:
start = 0
selected = ''
for woa in chosen:
    chosen[woa]['name'] = woa
    if chosen[woa]['final_score'] > start:
        start = chosen[woa]['final_score']
        selected = chosen[woa]
    elif chosen[woa]['final_score'] == start:
        if selected['relevance'] < chosen[woa]['relevance']:
            selected = chosen[woa]

selected

{'id': 'Q208460',
 'description': 'dystopian novel written by George Orwell',
 'relevance': 5,
 'dependency_score': 0,
 'person_in_description': 1,
 'person_in_wikidata': 3,
 'similarity_score': 0.3814,
 'final_score': 4.3814,
 'wikipedia_url': 'https://en.wikipedia.org/wiki/Nineteen_Eighty-Four',
 'name': '1984'}

### Scraping the page with bs4 library

I start by getting the URL from the song dictionary. Then, I make the request to the page and I parse the HTML with BeautifulSoup.

I also cut the link of the page taking only the last part of the path and I store it in a variable for later use.

In [24]:
url = selected['wikipedia_url']
cut_url = url.replace('https://en.wikipedia.org/wiki/', '')

page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser").select('body')[0]

Inside the HTML page, i want to focus on specific section I know containing relavant information about the connections betweeen the searched entity and other relavant entities.

Therefore I serach for all the `h2` tags containing a `span`tag whose text matches specific patterns. In particular I want to focs on the sections dealing with adaptations and cultural impact of the entity.

Starting from the sections' titles, I serach for all the siblings being `ul` tags. In this way I can retrieve all the lists of references and links to other Wikipedia pages.

In [25]:
spans = soup.find_all('span', string=[re.compile('^(Cultural)? [Ii]mpact$'), re.compile('^[Aa]daptations?')])

h2_tags = [span.parent for span in spans if span.parent.name == 'h2']

li_tags = []
for h2 in h2_tags:
    for el in h2.next_siblings:
        if el.name == 'h2':
            break
        if el.name == 'ul':
            li_tags.extend(el.find_all('li'))

li_tags

[<li>In 1955, an episode of BBC's <i><a href="/wiki/The_Goon_Show" title="The Goon Show">The Goon Show</a></i>, <i>1985</i>, was broadcast, written by <a href="/wiki/Spike_Milligan" title="Spike Milligan">Spike Milligan</a> and <a href="/wiki/Eric_Sykes" title="Eric Sykes">Eric Sykes</a> and based on <a href="/wiki/Nigel_Kneale" title="Nigel Kneale">Nigel Kneale</a>'s <a class="mw-redirect" href="/wiki/Nineteen_Eighty-Four_(UK_TV_programme)" title="Nineteen Eighty-Four (UK TV programme)">television adaptation</a>. It was re-recorded about a month later with the same script but a slightly different cast.<sup class="reference" id="cite_ref-116"><a href="#cite_note-116">[116]</a></sup> <i>1985</i> parodies many of the main scenes in Orwell's novel.</li>,
 <li>In 1970, the American rock group <a href="/wiki/Spirit_(band)" title="Spirit (band)">Spirit</a> released the song "1984" based on Orwell's novel.</li>,
 <li>In 1973, ex-<a href="/wiki/Soft_Machine" title="Soft Machine">Soft Machine</

### Querying DBpedia for wikipedia links

For the wikipedia page I found, I want to retrieve the DBpedia page. I do that by means of a SPARQL query.

Once I get the DBpedia page, I ask for all the links to other DBpedia pages of entities whose class is relevant for my search.

In [26]:
db_query = """
        PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX wikipedia-en: <http://en.wikipedia.org/wiki/>
        PREFIX schema: <http://schema.org/>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX xml: <http://www.w3.org/XML/1998/namespace>

        SELECT DISTINCT ?wikiLinks ?type ?dbEntities ?label
        WHERE { 
                VALUES ?type { dbo:Artist dbo:MusicalWork dbo:Artwork dbo:Film dbo:TelevisionShow dbo:TelevisionSeason dbo:TelevisionEpisode dbo:Poem dbo:Book dbo:Comic dbo:Play dbo:Group} #dbo:Person dbo:WrittenWork  dbo:RadioProgram 
                ?entity foaf:isPrimaryTopicOf wikipedia-en:"""+cut_url+""";
                        dbo:wikiPageWikiLink ?dbEntities.
                ?dbEntities rdf:type ?type;
                        foaf:isPrimaryTopicOf ?wikiLinks;
                        rdfs:label ?label.
                FILTER(langMatches(lang(?label),"EN"))
                }
        """

sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setReturnFormat(JSON)
sparql.setQuery(db_query)

I perform the query and analyse the resuts storing in different lists teh links to the different entities and their class. I use lists for maintaining the order of the results.

In [27]:
db_res = sparql.query().convert()['results']['bindings']

links = [i['wikiLinks']['value'].replace('http://en.wikipedia.org', '') for i in db_res]
classes = [i['type']['value'].replace('http://dbpedia.org/ontology/', '') for i in db_res]
db_uri = [i['dbEntities']['value'] for i in db_res]
label = [i['label']['value'] for i in db_res]

print(links, classes, db_uri, label)

['/wiki/The_Children_of_Men', '/wiki/Under_the_Spreading_Chestnut_Tree', '/wiki/Bastille_(band)', '/wiki/Hugh_Hopper', '/wiki/Betrayal_of_the_Left', '/wiki/Paul_Weller', '/wiki/V_for_Vendetta', '/wiki/V_for_Vendetta_(film)', '/wiki/The_Jam', '/wiki/1984_(1956_film)', '/wiki/1984_(Westinghouse_Studio_One)', '/wiki/1984_(advertisement)', '/wiki/1984_(opera)', '/wiki/1984_(play)', '/wiki/Give_Me_the_Future', '/wiki/Muse_(band)', '/wiki/NBC_University_Theatre', '/wiki/The_Resistance_(album)', '/wiki/The_Theory_and_Practice_of_Oligarchical_Collectivism', '/wiki/This_Is_the_Modern_World', '/wiki/Animal_Farm', '/wiki/Linguistic_relativity', '/wiki/Star_Trek:_The_Next_Generation', '/wiki/Studio_One_(American_TV_series)', '/wiki/Coming_Up_for_Air', '/wiki/Pastiche', '/wiki/Spirit_(band)', '/wiki/Darkness_at_Noon', '/wiki/Virtual_Light', '/wiki/The_Captive_Mind', '/wiki/2_+_2_=_5_(song)', '/wiki/Eric_Sykes', '/wiki/Brave_New_World', '/wiki/Nineteen_Eighty-Four_(1984_film)', '/wiki/Nineteen_Eight

### Using DBpedia links and Wikipedia page for understanding more abot entities connections

First, I define an empty dictionary that I compile with the information I retrieve from the DBpedia page. For each link to an entity I retrieve in the wikipedia lists, if it is one of the links I retrieved from DBpedia, i store it in the dictionary along with the text contained in the `a` tag of the Wikipedia link, the class of the entity, and the internal id i set for the entity.

In [28]:
wld = {}

ent_id = 0
for link in links:
    for li in li_tags:
        a_tags = li.find_all('a')

        for a in a_tags:
            href = a.get('href')
            if href == link or urllib.parse.unquote(href) == link or re.sub(r'#.+', '', href) == link: #I use the unquote function to convert the %20 in the link to spaces
                ls = str(a)
                if ls not in wld:
                    ent_id += 1
                    wld[ls] = {"wikipedia link": link, "text": [], "entity id": "Entity-"+str(ent_id), 'class': classes[links.index(link)], 'label': label[links.index(link)],'db_uri': db_uri[links.index(link)]}
                if a.text not in wld[ls]["text"]:
                    wld[ls]['text'].append(a.text)


wld

{'<a href="/wiki/Bastille_(band)" title="Bastille (band)">Bastille</a>': {'wikipedia link': '/wiki/Bastille_(band)',
  'text': ['Bastille'],
  'entity id': 'Entity-1',
  'class': 'Group',
  'label': 'Bastille (band)',
  'db_uri': 'http://dbpedia.org/resource/Bastille_(band)'},
 '<a href="/wiki/Hugh_Hopper" title="Hugh Hopper">Hugh Hopper</a>': {'wikipedia link': '/wiki/Hugh_Hopper',
  'text': ['Hugh Hopper'],
  'entity id': 'Entity-2',
  'class': 'Artist',
  'label': 'Hugh Hopper',
  'db_uri': 'http://dbpedia.org/resource/Hugh_Hopper'},
 '<a href="/wiki/Paul_Weller" title="Paul Weller">Paul Weller</a>': {'wikipedia link': '/wiki/Paul_Weller',
  'text': ['Paul Weller'],
  'entity id': 'Entity-3',
  'class': 'Artist',
  'label': 'Paul Weller',
  'db_uri': 'http://dbpedia.org/resource/Paul_Weller'},
 '<a href="/wiki/The_Jam" title="The Jam">The Jam</a>': {'wikipedia link': '/wiki/The_Jam',
  'text': ['The Jam'],
  'entity id': 'Entity-4',
  'class': 'Group',
  'label': 'The Jam',
  'db_ur

Now I build another dictionary that I will use in the next section where i use internal ids as keys and other information as values.

In [29]:
fast_check = {}
for k in wld:
    fast_check[wld[k]['entity id']] = (wld[k]['wikipedia link'] , wld[k]['class'], wld[k]['db_uri'], wld[k]['label'])
fast_check

{'Entity-1': ('/wiki/Bastille_(band)',
  'Group',
  'http://dbpedia.org/resource/Bastille_(band)',
  'Bastille (band)'),
 'Entity-2': ('/wiki/Hugh_Hopper',
  'Artist',
  'http://dbpedia.org/resource/Hugh_Hopper',
  'Hugh Hopper'),
 'Entity-3': ('/wiki/Paul_Weller',
  'Artist',
  'http://dbpedia.org/resource/Paul_Weller',
  'Paul Weller'),
 'Entity-4': ('/wiki/The_Jam',
  'Group',
  'http://dbpedia.org/resource/The_Jam',
  'The Jam'),
 'Entity-5': ('/wiki/1984_(advertisement)',
  'Film',
  'http://dbpedia.org/resource/1984_(advertisement)',
  '1984 (advertisement)'),
 'Entity-6': ('/wiki/Give_Me_the_Future',
  'MusicalWork',
  'http://dbpedia.org/resource/Give_Me_the_Future',
  'Give Me the Future'),
 'Entity-7': ('/wiki/Muse_(band)',
  'Group',
  'http://dbpedia.org/resource/Muse_(band)',
  'Muse (band)'),
 'Entity-8': ('/wiki/The_Resistance_(album)',
  'MusicalWork',
  'http://dbpedia.org/resource/The_Resistance_(album)',
  'The Resistance (album)'),
 'Entity-9': ('/wiki/This_Is_the_M

I take the text in the `li` tags I found and put it in a list, which becomes my text to analyse. For eeach key in the dictionary of links, I check if the key is in the text. If so, I replace the `li` tag with the id of the link to facilitate the relation extraction process.

I also convertthe list in a single text string.

In [30]:
to_parse = []
src_to_parse = [] #will be reused to store the source string in the final graph

for li in li_tags:
    str_li = str(li)
    src_li = re.sub(r'<.+?>', '', str_li)
    src_li = re.sub(r'\[[0-9]+\]', ' ', src_li)
    src_to_parse.append(src_li)

    for k in wld:
        if k in str_li:
            str_li  = str_li.replace(k, wld[k]['entity id'])
            
    str_li = re.sub(r'<.+?>', '', str_li)
    str_li = re.sub(r'\[[0-9]+\]', ' ', str_li) #remove the numbers in square brackets (wikidata references)
    to_parse.append(str_li)


to_parse_txt = ' '.join(to_parse)

to_parse_txt

'In 1955, an episode of BBC\'s Entity-16, 1985, was broadcast, written by Spike Milligan and Entity-13 and based on Nigel Kneale\'s television adaptation. It was re-recorded about a month later with the same script but a slightly different cast.  1985 parodies many of the main scenes in Orwell\'s novel. In 1970, the American rock group Entity-11 released the song "1984" based on Orwell\'s novel. In 1973, ex-Entity-21 bassist Entity-2 released an album called 1984 on the Columbia label (UK), consisting of instrumentals with Orwellian titles such as "Miniluv", "Minipax", "Minitrue", and so forth. In 1974, David Bowie released the album Entity-19, which is thought to be loosely based on the novel Nineteen Eighty-Four. It includes the tracks "We Are The Dead", "1984" and "Big Brother". Before the album was made, Bowie\'s management (MainMan) had planned for Bowie and Tony Ingrassia (MainMan\'s creative consultant) to co-write and direct a musical production of Orwell\'s Nineteen Eighty-Fou

## 06. Relation extraction

### A. Brute force approach

For each string contained in the list of content `to_parse`, I iterate over the sentences extracting the entities and the verbs.

At the end of the string I have a dictionary where for each sentence I have information about the verbs and the entities contained. Therefore, for each first-level iteration string, i apply the following concepts and assumptions:
- I assume that for each sentence there is at maximum one significant verb that allows me to infer which kind of relation exists between the main entityand other entities mentioned in the sentence keeping in mind that the main entity is almost never cited since it is the implicit subject of the sentence.
- If in the same sentence there are both entities and a significant verb, I assume that the verb is related in some way to all the entities.
- If in the sentence there are only entities, it is possible that the following sentence has a significant verb that is related to the entities in the previous sentence by meas of pronouns.
- If in the sentence there is only the verb, it is possible that it is referred to the entities in the previous sentence by means of pronouns.

In [31]:
rel_sent = []
relations = {}

for s in to_parse:
    if len(rel_sent) == 1:
        for k in rel_sent:
            if len(k['entities']) > 0:
                if len(k['verbs']) == 1:
                    for ent in k['entities']:
                        relations[ent] = k['verbs'][0]
                elif len(k['verbs']) == 0:
                    for ent in k['entities']:
                        relations[ent] = 'general'
    elif len(rel_sent) > 1:
        for k in rel_sent:
            idx = rel_sent.index(k)
            if len(k['entities']) > 0:
                if len(k['verbs']) == 1:
                    for ent in k['entities']:
                        relations[ent] = k['verbs'][0]
                elif len(k['verbs']) == 0 and idx+1 < len(rel_sent):
                    if (len(rel_sent[idx+1]['entities']) > 0 and rel_sent[idx]['entities'] == rel_sent[idx+1]['entities'] and len(rel_sent[idx+1]['verbs']) == 1) or (len(rel_sent[idx+1]['entities']) == 0 and len(rel_sent[idx+1]['verbs'] == 1)):
                        for ent in k['entities']:
                            relations[ent] = rel_sent[idx+1]['verbs'][0]
                    else:
                        for ent in k['entities']:
                            relations[ent] = 'general'
                else:
                    for ent in k['entities']:
                        relations[ent] = k['verbs']
            else:
                continue #the previous step should be able to handle this case too
                
    doc = nlp(s)
    rel_sent = []

    for sent in doc.sents:
        cand_ent = set()
        cand_verbs = set()
        current_verb = ''
        lemma = ''
        next_tok = False
        for token in sent:
            #print(token.text, token.pos_, token.dep_)
            if token.text in fast_check:
                cand_ent.add(token.text)
            elif token.pos_ == 'VERB':
                current_verb = token.text
                lemma = token.lemma_
                next_tok = True
            else:
                if next_tok:
                    if token.dep_ == 'prep' or token.dep_ == 'agent':
                        next_tok = False
                        current_verb += ' ' + token.text
                        cand_verbs.add((current_verb, lemma))
                        current_verb = ''
                    else:
                        next_tok = False
                        cand_verbs.add((current_verb, lemma))
                        current_verb = ''

        if cand_verbs:
            sel_ve = []
            for tup in cand_verbs:
                if re.match(r'based\son|(inspired|influenced)(\sby)?|(derive[ds]?|adapted)(\sfrom)?|(reference|mention)(s|ed\s(by|in)?)?|alludes?\sto|quote[sd]?', tup[0]):
                    sel_ve.append(tup)

        if cand_ent or sel_ve:
            ev_dict = {'entities': [], 'verbs': []}

            for ent in cand_ent:
                if ent not in ev_dict['entities']:
                    ev_dict['entities'].append(ent)

            for verb in sel_ve:
                if verb[1] not in ev_dict['verbs']:
                    ev_dict['verbs'].append(verb[1])
        
            rel_sent.append(ev_dict)
        

relations

{'Entity-16': 'base',
 'Entity-13': 'base',
 'Entity-11': 'base',
 'Entity-21': 'general',
 'Entity-2': 'general',
 'Entity-19': 'base',
 'Entity-9': 'general',
 'Entity-4': 'general',
 'Entity-3': 'general',
 'Entity-5': 'general',
 'Entity-20': 'quote',
 'Entity-15': 'quote',
 'Entity-22': 'quote',
 'Entity-17': 'general',
 'Entity-10': 'general',
 'Entity-12': 'general',
 'Entity-14': 'general',
 'Entity-23': 'general',
 'Entity-7': 'influence',
 'Entity-8': 'influence',
 'Entity-1': 'general',
 'Entity-6': 'general',
 'Entity-18': 'reference'}

### B. Google Bard and WordNet approach

I set the Google Bard API key and I define a function for querying the API and retrieving the results in json format.

In [32]:
token = os.getenv('GOOGLEBARD_TOKEN')
bard = Bard(token=token) 

Thus, I ask directly the model to extract the relations between entities by means of a prompt where I explain how to extract the relations, how to identify the entities and how to return the results.

As format I choose to ask for a pandas dataframe, which I will directly reuse in my code.

In [33]:
bard_res = bard.get_answer("You are an expert in relation extraction from plain text and your aim is to identify the existing relations between the creative work '1984', which is the implicit subject of the text you will be fed with, and other ENTITIES that you will find in the text. ### Return the relationships between '1984' and the entities marked as 'Entity- ' in the text in the following format: subject | relation | 1984 as a pandas dataframe following this example: ```df = pd.DataFrame({'Entity': ['Entity-', 'Entity-', 'Entity-'], 'Relation': ['based on', 'derived from', 'inspired by'], '1984': '1984' })```. ### Text: "+to_parse_txt)

print(bard_res['content'])

Sure, here is the DataFrame of the relationships between '1984' and the entities marked as 'Entity- ' in the text:

```python
import pandas as pd

df = pd.DataFrame({
    'Entity': ['Entity-16', 'Entity-11', 'Entity-2', 'Entity-19', 'Entity-4', 'Entity-5', 'Entity-20', 'Entity-17', 'Entity-23', 'Entity-7', 'Entity-1', 'Entity-18'],
    'Relation': ['based on', 'based on', 'inspired by', 'loosely based on', 'references', 'references', 'references', 'bears some resemblances to', 'Orwellian by title and content', 'influenced by', 'references', 'directly references'],
    '1984': '1984'
})

print(df.to_string())
```

```
    Entity  Relation  1984
0  Entity-16  based on  1984
1  Entity-11  based on  1984
2  Entity-2   inspired by  1984
3  Entity-19  loosely based on  1984
4  Entity-4   references  1984
5  Entity-5   references  1984
6  Entity-20  references  1984
7  Entity-17  bears some resemblances to  1984
8  Entity-23  Orwellian by title and content  1984
9  Entity-7   influenced by  1

Therefore, I am now able to get the result and divide it on the basis of the output format I chose, which contains "```" as separator for the python code.

I find which is the relevant section of the oputput and i clean it to maintain only the definition of the dataframe.

In [34]:
out_list = bard_res['content'].replace('\n\n', ' ').replace('\n', ' ').replace('    ', ' ').replace('   ', ' ').replace('  ', ' ').split('```')
print(out_list)

for el in out_list:
    if 'df = pd.DataFrame' in el:
        df_idx = out_list.index(el)

df_string = re.sub(r'(python)\s?import pandas as pd (.+)\s?\=', '', out_list[df_idx])
df_string = re.sub(r'print\(.+\)', '', df_string).strip()

response_df = eval(df_string)
response_df

["Sure, here is the DataFrame of the relationships between '1984' and the entities marked as 'Entity- ' in the text: ", "python import pandas as pd df = pd.DataFrame({ 'Entity': ['Entity-16', 'Entity-11', 'Entity-2', 'Entity-19', 'Entity-4', 'Entity-5', 'Entity-20', 'Entity-17', 'Entity-23', 'Entity-7', 'Entity-1', 'Entity-18'], 'Relation': ['based on', 'based on', 'inspired by', 'loosely based on', 'references', 'references', 'references', 'bears some resemblances to', 'Orwellian by title and content', 'influenced by', 'references', 'directly references'], '1984': '1984' }) print(df.to_string()) ", ' ', ' Entity Relation 1984 0 Entity-16 based on 1984 1 Entity-11 based on 1984 2 Entity-2 inspired by 1984 3 Entity-19 loosely based on 1984 4 Entity-4 references 1984 5 Entity-5 references 1984 6 Entity-20 references 1984 7 Entity-17 bears some resemblances to 1984 8 Entity-23 Orwellian by title and content 1984 9 Entity-7 influenced by 1984 10 Entity-1 references 1984 11 Entity-18 direct

Unnamed: 0,Entity,Relation,1984
0,Entity-16,based on,1984
1,Entity-11,based on,1984
2,Entity-2,inspired by,1984
3,Entity-19,loosely based on,1984
4,Entity-4,references,1984
5,Entity-5,references,1984
6,Entity-20,references,1984
7,Entity-17,bears some resemblances to,1984
8,Entity-23,Orwellian by title and content,1984
9,Entity-7,influenced by,1984


Retrieved infromation seems to be ok, but there are still some issues to be solved. For example, the model does not recognize all the entities in the sentence and it does not use only the specified labels to describe the relations.

So, the first step is to build a dictionary with the `li`tags index and the group of entities contained for each tag.

In [35]:
groups = {}
count = 0
for sent in to_parse:
    g = re.findall(r'Entity-[0-9]+', sent)
    if g:
        groups[count] = g
    count += 1

groups

{0: ['Entity-16', 'Entity-13'],
 1: ['Entity-11'],
 2: ['Entity-21', 'Entity-2'],
 3: ['Entity-19'],
 4: ['Entity-4', 'Entity-9', 'Entity-3'],
 5: ['Entity-5'],
 6: ['Entity-20', 'Entity-15', 'Entity-22'],
 7: ['Entity-17', 'Entity-10'],
 8: ['Entity-23', 'Entity-12', 'Entity-14'],
 9: ['Entity-7', 'Entity-8'],
 11: ['Entity-1', 'Entity-6'],
 12: ['Entity-18']}

Thus, I make the following assumption to assign a relation to the missing entities:
- If the missing entity is in the same sentence of a found entity, I assume that the relation is the same as the one found for the other entity since it should be more probable to have only one relation for `li` tag.

Therefore I check for the relation found for the other entity in the same sentence and I assign it to the missing entity.

In [36]:
entity_relation = {}

for ent in response_df['Entity'].values:
    relation = response_df[response_df['Entity'] == ent]['Relation'].values[0]
    entity_relation[ent] = relation
    for group in groups:
        if ent in groups[group]:
            for e in groups[group]:
                if e not in response_df['Entity'].values:
                    entity_relation[e] = relation
        

print(entity_relation)

{'Entity-16': 'based on', 'Entity-13': 'based on', 'Entity-11': 'based on', 'Entity-2': 'inspired by', 'Entity-21': 'inspired by', 'Entity-19': 'loosely based on', 'Entity-4': 'references', 'Entity-9': 'references', 'Entity-3': 'references', 'Entity-5': 'references', 'Entity-20': 'references', 'Entity-15': 'references', 'Entity-22': 'references', 'Entity-17': 'bears some resemblances to', 'Entity-10': 'bears some resemblances to', 'Entity-23': 'Orwellian by title and content', 'Entity-12': 'Orwellian by title and content', 'Entity-14': 'Orwellian by title and content', 'Entity-7': 'influenced by', 'Entity-8': 'influenced by', 'Entity-1': 'references', 'Entity-6': 'references', 'Entity-18': 'directly references'}


Still, the relations are not omogeneous and labels could be potentially uncountable. I need to find a way to group the relations in a smaller number of categories indeed.

I start by defining a list of tuples that will be used as a reference vocabulary fro relation definition. In particular, each tuple will contain the lemma of the verb that expresses the relation, the WordNet Synset that represents the meaning of the verb in this context of use (human based choice of the synset), and the word form with which the verb appears in the majority of times in the text.

In [37]:
lemma_synset = [('base', 'establish.v.08', 'based on'), ('inspire', 'inspire.v.02', 'inspired by'), ('influence', 'determine.v.02', 'influenced by'), ('derive', 'derive.v.04', 'derived from'), ('adapt', 'adapt.v.01', 'adapted from'), ('reference', 'reference.v.01', 'references'), ('allude', 'allude.v.01', 'alludes to'), ('mention', 'mention.v.01', 'mentions'), ('quote', 'quote.v.01', 'quotes')]

Using regular expressions, I check if in the relation found by Bard there is a verb that matches the lemma of one of the verbs in the reference vocabulary. If so, I replace the relation with the verb form in the reference vocabulary.

If not, i iterate over the tokesmaking up the relation expression and find the verb. I serch for every possible synset of the verb and for each synset I compute the similarity with the reference synsets. I take the synset with the highest similarity and I replace the relation with the verb form in the reference vocabulary.

if also this method does not work, I assume that the relation is a relation of "general influence".

I also apply another check. Since I deliberately chose the relationships valid for entities not extracted by Bard, I check if the relation is valid for the entities in the sentence. i assume that Artists and Groups, for example, cannot linguistically bear some kind of relationship (e.g., an artist cannot be "based on" a work of art). In these cases, I assume that the relation is a relation of "general influence".

In [38]:
for ent in entity_relation:
    if not re.match(r'based\son|(inspired|influenced)(\sby)?|(derive[sd]|adapted)(\sfrom)?|(reference|mention)(s|ed\s(by|in)?)?|alludes?\sto', entity_relation[ent]):
        if re.search('based\son', entity_relation[ent]):
            entity_relation[ent] = 'based on'
        elif re.search('inspired\sby', entity_relation[ent]):
            entity_relation[ent] = 'inspired by'
        elif re.search('influenced\sby', entity_relation[ent]):
            entity_relation[ent] = 'influenced by'
        elif re.search('derive[ds](\sfrom)?', entity_relation[ent]):
            entity_relation[ent] = 'derived from'
        elif re.search('adapted(\sfrom)?', entity_relation[ent]):
            entity_relation[ent] = 'adapted from'
        elif re.search('reference(s|ed\s(by|in)?)?', entity_relation[ent]):
            entity_relation[ent] = 'references'
        elif re.search('mention(s|ed\s(by|in)?)?', entity_relation[ent]):
            entity_relation[ent] = 'mentions'
        elif re.search('alludes?(\sto)?', entity_relation[ent]):
            entity_relation[ent] = 'alludes to'
        elif re.search('quotes?(\sto)?', entity_relation[ent]):
            entity_relation[ent] = 'quotes'
        else:
            pos = nlp(entity_relation[ent])
            for tok in pos:
                if tok.pos_ == 'VERB':
                    lemma = tok.lemma_
                    best_syn = ['general', 0]
                    for syn_a in lemma_synset:
                        for syn_b in wn.synsets(lemma, pos=wn.VERB):
                            similarity = wn.lch_similarity(wn.synset(syn_a[1]), wn.synset(syn_b.name()))
                            if similarity >= 2.5 and similarity >= best_syn[1]:
                                best_syn = [syn_a[2], similarity]
                    entity_relation[ent] = best_syn[0]
                else:
                    entity_relation[ent] = 'general influence'

    if fast_check[ent][1] == 'Artist' or fast_check[ent][1] == 'Group': #Here we say that if the entity is an artist or a group, we can assume that the relation is 'general influence' rather than 'based on' or 'alludes to' or 'mentions' or 'references
        if entity_relation[ent] == 'based on' or entity_relation[ent] == 'references' or entity_relation[ent] == 'mentions' or entity_relation[ent] == 'alludes to' or entity_relation[ent] == 'derived from':
            entity_relation[ent] = 'general influence'
                            
                    
entity_relation    

{'Entity-16': 'based on',
 'Entity-13': 'general influence',
 'Entity-11': 'general influence',
 'Entity-2': 'inspired by',
 'Entity-21': 'inspired by',
 'Entity-19': 'based on',
 'Entity-4': 'general influence',
 'Entity-9': 'references',
 'Entity-3': 'general influence',
 'Entity-5': 'references',
 'Entity-20': 'references',
 'Entity-15': 'references',
 'Entity-22': 'references',
 'Entity-17': 'general influence',
 'Entity-10': 'general influence',
 'Entity-23': 'general influence',
 'Entity-12': 'general influence',
 'Entity-14': 'general influence',
 'Entity-7': 'influenced by',
 'Entity-8': 'influenced by',
 'Entity-1': 'general influence',
 'Entity-6': 'references',
 'Entity-18': 'general influence'}

In [39]:
adjusted_df = pd.DataFrame({'Entity': list(entity_relation.keys()), 'Relation': list(entity_relation.values()), '1984': '1984'})
adjusted_df

Unnamed: 0,Entity,Relation,1984
0,Entity-16,based on,1984
1,Entity-13,general influence,1984
2,Entity-11,general influence,1984
3,Entity-2,inspired by,1984
4,Entity-21,inspired by,1984
5,Entity-19,based on,1984
6,Entity-4,general influence,1984
7,Entity-9,references,1984
8,Entity-3,general influence,1984
9,Entity-5,references,1984


## 07. Knowledge Graph creation

I will use `rdflib` library for creating the Knowledge Graph usin the `Graph` class. The KG creation will be based on the classes and the proerties defined by the [MuCH-Ontology](../ontology_and_kg/mucho.owl) that has been created for this purpose. Classes and properties' URIs are saved in the [URIs.py file](./URIs.py) that has been created for saving space and it's imported in one of the following notebook's cells.

By now, I will make a simple query to DBpedia in order to retrieve the `rdf:type` of the main entity found in the song's annotations. I will use this information for assigning the entity to its class in my KG.

In [40]:
db_query_ent_type = """
        PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX wikipedia-en: <http://en.wikipedia.org/wiki/>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

        SELECT DISTINCT ?wikiLinks ?type
        WHERE { 
                VALUES ?type { dbo:MusicalWork dbo:Artwork dbo:Film dbo:TelevisionShow dbo:TelevisionSeason dbo:TelevisionEpisode dbo:Poem dbo:Book dbo:Comic dbo:Play}
                ?entity foaf:isPrimaryTopicOf wikipedia-en:"""+cut_url+""";
                        rdf:type ?type.
                }
        """

sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setReturnFormat(JSON)
sparql.setQuery(db_query_ent_type)

In [41]:
woa_type = sparql.query().convert()['results']['bindings'][0]['type']['value'].replace('http://dbpedia.org/ontology/', '')

if woa_type == 'MusicalWork':
    db_query_ent_type = """
        PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX wikipedia-en: <http://en.wikipedia.org/wiki/>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

        SELECT DISTINCT ?wikiLinks ?type
        WHERE { 
                VALUES ?type { dbo:Album dbo:Song dbo:Single}
                ?entity foaf:isPrimaryTopicOf wikipedia-en:"""+cut_url+""";
                        rdf:type ?type.
                }
        """

    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setReturnFormat(JSON)
    sparql.setQuery(db_query_ent_type)

    woa_type = sparql.query().convert()['results']['bindings'][0]['type']['value'].replace('http://dbpedia.org/ontology/', '')
else:
    pass

woa_type

'Book'

### Graph creation and population
I will now import the needed URIs as well as the `rdflib` library and create my local graph.

In [51]:
from URIs import *
from rdflib import URIRef, Namespace, Graph, Literal, RDF, RDFS, BNode
from rdflib.namespace import  XSD


mucho_gustore = Graph()

Since many songs are not available in DBpedia or Wikidata, I will use the Muscbrainz ID for retrieving creating the song URI pointing to the musicbrainz identifying the specifc recording.

I also set a base URI path with the same base namespace used for my ontology in oder to define new individuals. Once done, I add to the graph the information about the song, its lyrics, the annotations, the artists and their genres. For what concerns the lyrics, I connect them with the specific text fragments that have an annotataion linked.

In [52]:
song_uri = URIRef('https://musicbrainz.org/recording/'+song['musicbrainz_id'])
base_indv = 'https://raw.githubusercontent.com/tommasobattisti/MuCH-O/main/ontology/mucho.owl#'

# Add song
mucho_gustore.add((song_uri, RDF.type, Song))
mucho_gustore.add((song_uri, title, Literal(song['name'], datatype=XSD.string, normalize=False)))
mucho_gustore.add((song_uri, RDFS.label, Literal(song['name'], datatype=XSD.string, normalize=False)))

# Add song lyrics
lyrics_bn = BNode()
mucho_gustore.add((song_uri, hasPart, lyrics_bn))
mucho_gustore.add((lyrics_bn, RDF.type, Lyrics))
mucho_gustore.add((lyrics_bn, text, Literal(song['lyrics'], datatype=XSD.string, normalize=False)))

# Add text fragments and annotations to the lyrics
for ann_num, ann in enumerate(song['annotations']):
    fr_name = base_indv+'fragment_'+str(ann_num)
    ann_name = base_indv+'annotation_'+str(ann_num)
    mucho_gustore.add((lyrics_bn, hasTextFragment, URIRef(fr_name)))
    mucho_gustore.add((URIRef(fr_name), RDF.type, TextFragment))
    mucho_gustore.add((URIRef(fr_name), text, Literal(ann[0], datatype=XSD.string, normalize=False)))
    mucho_gustore.add((URIRef(fr_name), hasAnnotation, URIRef(ann_name)))
    mucho_gustore.add((URIRef(ann_name), RDF.type, Annotation))
    mucho_gustore.add((URIRef(ann_name), text, Literal(ann[1], datatype=XSD.string, normalize=False)))


# Add song artists
for art in song['artists']:
    if 'wikidata_url' in art:
        art_uri = URIRef(art['wikidata_url'])
    else:
        art_uri = URIRef('https://musicbrainz.org/artist/'+art['mbz_id'])

    mucho_gustore.add((song_uri, hasAuthor, art_uri))
    mucho_gustore.add((art_uri, name, Literal(art['name'], datatype=XSD.string, normalize=False)))
    mucho_gustore.add((art_uri, RDFS.label, Literal(art['name'], datatype=XSD.string, normalize=False)))

    if art['type'] == 'Group':
        mucho_gustore.add((art_uri, RDF.type, MusicEnsemble))
    elif art['type'] == 'Person':
        mucho_gustore.add((art_uri, RDF.type, Musician))

    # Add artists genres
    for genre in art['genres']:
        genre = genre.replace(' ', '_')
        genre_uri = URIRef(base_indv+genre)
        mucho_gustore.add((art_uri, hasGenre, genre_uri))
        mucho_gustore.add((genre_uri, RDF.type, MusicGenre))
        mucho_gustore.add((genre_uri, RDFS.label, Literal(genre, datatype=XSD.string, normalize=False)))

Once my song and the related information is in the graph, I can start adding the information about the entities found in the annotations, defining the first relations. I reuse the query done on DBpedia in the first cell of this section to assign to the related entity also its type.

In addition, for each entity I add the information about the annotations in which it is mentioned and the source of this information (the URL of the song's lyrics on Genius).

In [54]:
# Add relation with referenced creative work
woa_uri = URIRef('https://www.wikidata.org/wiki/'+selected['id'])
mucho_gustore.add((song_uri, references, woa_uri))
mucho_gustore.add((woa_uri, title, Literal(selected['name'], datatype=XSD.string, normalize=False)))
mucho_gustore.add((woa_uri, RDFS.label, Literal(selected['name'], datatype=XSD.string, normalize=False)))
# Add reference information
reference_bn = BNode()
mucho_gustore.add((song_uri, qualifiedReference, reference_bn))
mucho_gustore.add((reference_bn, RDF.type, Reference))
mucho_gustore.add((reference_bn, entity, woa_uri))

if woa_type == 'MusicalWork':
    mucho_gustore.add((woa_uri, RDF.type, MusicEntity))
elif woa_type == 'Album':
    mucho_gustore.add((woa_uri, RDF.type, MusicAlbum))
elif woa_type == 'Single' or woa_type == 'Song':
    mucho_gustore.add((woa_uri, RDF.type, Song))
elif woa_type == 'Artwork':
    mucho_gustore.add((woa_uri, RDF.type, VisualArtEntity))
elif woa_type == 'Film' or woa_type == 'TelevisionShow' or woa_type == "TelevisionSeason" or woa_type == 'TelevisionEpisode':
    mucho_gustore.add((woa_uri, RDF.type, AudiovisualEntity))
elif woa_type == 'Poem' or woa_type == 'Book' or woa_type == 'Comic' or woa_type == 'Play':
    mucho_gustore.add((woa_uri, RDF.type, LiteraryEntity))
else:
    mucho_gustore.add((woa_uri, RDF.type, InformationObject))


# Add source url and information text about relation
for woa in ann_woa:
    if woa == selected['name']:
        for a in ann_woa[woa]["annotation_number"]:
            string = song["annotations"][int(a)][1].replace('\n\n', ' ').replace('\n', ' ').replace('   ', ' ').replace('  ', ' ')
            mucho_gustore.add((reference_bn, influenceSourceText, Literal(string, datatype=XSD.string, normalize=False)))
            mucho_gustore.add((reference_bn, hasInformationSource, URIRef(base_indv+'annotation_'+str(a)))) #add the annotation as information source for the relation
            print(reference_bn, hasInformationSource, URIRef(base_indv+'annotation_'+str(a)))
mucho_gustore.add((reference_bn, influenceInformationSource, Literal(song['genius_url'], datatype=URIRef("http://www.w3.org/2001/XMLSchema#anyURI"))))

N6a96825e0ae34e99931011b65d023726 https://raw.githubusercontent.com/tommasobattisti/MuCH-O/main/ontology/mucho.owl#hasinformationSource https://raw.githubusercontent.com/tommasobattisti/MuCH-O/main/ontology/mucho.owl#annotation_0
N6a96825e0ae34e99931011b65d023726 https://raw.githubusercontent.com/tommasobattisti/MuCH-O/main/ontology/mucho.owl#hasinformationSource https://raw.githubusercontent.com/tommasobattisti/MuCH-O/main/ontology/mucho.owl#annotation_1
N6a96825e0ae34e99931011b65d023726 https://raw.githubusercontent.com/tommasobattisti/MuCH-O/main/ontology/mucho.owl#hasinformationSource https://raw.githubusercontent.com/tommasobattisti/MuCH-O/main/ontology/mucho.owl#annotation_2
N6a96825e0ae34e99931011b65d023726 https://raw.githubusercontent.com/tommasobattisti/MuCH-O/main/ontology/mucho.owl#hasinformationSource https://raw.githubusercontent.com/tommasobattisti/MuCH-O/main/ontology/mucho.owl#annotation_3
N6a96825e0ae34e99931011b65d023726 https://raw.githubusercontent.com/tommasobatti

<Graph identifier=N41b8c5b96aa14b4caa2630c735cb08a1 (<class 'rdflib.graph.Graph'>)>

Once done, I am able to add to the referenced entity the connections I found on its Wikipedia page specifying the relation type, the source of the information and the text from which it is possible to identify the relation (the same from which the relation is been automatically exracted).

In assigning the type of the connected entities, I also make another query to DBpedia to understand if it is possible to assign a more specific type to the entity (only in the case in which the type is `MusicalWork`). If so, I assign it.

In [45]:
# Add relations with other creative works
for ent in entity_relation:
    ent_uri = URIRef(fast_check[ent][2])
    ent_type = fast_check[ent][1]
    #Add entity type
    if ent_type == 'MusicalWork':
        db_et_query = """
        PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            
        SELECT ?type
        WHERE { 
                VALUES ?type { dbo:Album dbo:Song dbo:Single}
                <"""+fast_check[ent][2]+"""> rdf:type ?type.
                }
        """

        sparql = SPARQLWrapper("http://dbpedia.org/sparql")
        sparql.setReturnFormat(JSON)
        sparql.setQuery(db_et_query)
        ent_type = sparql.query().convert()['results']['bindings'][0]['type']['value'].replace('http://dbpedia.org/ontology/', '')
        if ent_type == 'Album':
            mucho_gustore.add((ent_uri, RDF.type, MusicAlbum))
        elif ent_type == 'Single' or ent_type == 'Song':
            mucho_gustore.add((ent_uri, RDF.type, Song))
        else:
            mucho_gustore.add((ent_uri, RDF.type, MusicEntity))
        mucho_gustore.add((ent_uri, title, Literal(fast_check[ent][3], datatype=XSD.string, normalize=False))) #I repeat this line for every condiftional block because in some cases the property changes 
    elif ent_type == 'Artwork':
        mucho_gustore.add((ent_uri, RDF.type, VisualArtEntity))
        mucho_gustore.add((ent_uri, title, Literal(fast_check[ent][3], datatype=XSD.string, normalize=False)))
    elif ent_type == 'Film' or ent_type == 'TelevisionShow' or ent_type == "TelevisionSeason" or ent_type == 'TelevisionEpisode':
        mucho_gustore.add((ent_uri, RDF.type, AudiovisualEntity))
        mucho_gustore.add((ent_uri, title, Literal(fast_check[ent][3], datatype=XSD.string, normalize=False)))
    elif ent_type == 'Poem' or ent_type == 'Book' or ent_type == 'Comic' or ent_type == 'Play':
        mucho_gustore.add((ent_uri, RDF.type, LiteraryEntity))
        mucho_gustore.add((ent_uri, title, Literal(fast_check[ent][3], datatype=XSD.string, normalize=False)))
    elif ent_type == 'Person':
        mucho_gustore.add((ent_uri, RDF.type, Person))
        mucho_gustore.add((ent_uri, name, Literal(fast_check[ent][3], datatype=XSD.string, normalize=False)))
    elif ent_type == 'Group':
        mucho_gustore.add((ent_uri, RDF.type, Group))
        mucho_gustore.add((ent_uri, name, Literal(fast_check[ent][3], datatype=XSD.string, normalize=False)))
    
    mucho_gustore.add((ent_uri, RDFS.label, Literal(fast_check[ent][3], datatype=XSD.string, normalize=False))) # In any case I add the label at the end

    
    # Add entity relation
    if entity_relation[ent] == 'based on':
        mucho_gustore.add((ent_uri, isBasedOn, woa_uri))
        based_on_bn = BNode()
        mucho_gustore.add((ent_uri, qualifiedBasis, based_on_bn))
        mucho_gustore.add((based_on_bn, RDF.type, Basis))
        mucho_gustore.add((based_on_bn, entity, woa_uri))
        mucho_gustore.add((based_on_bn, influenceInformationSource, Literal(selected["wikipedia_url"], datatype=URIRef("http://www.w3.org/2001/XMLSchema#anyURI"))))
        for gr in groups:
            if ent in groups[gr]:
                mucho_gustore.add((based_on_bn, influenceSourceText, Literal(src_to_parse[gr], datatype=XSD.string, normalize=False)))

    elif entity_relation[ent] == 'inspired by':
        mucho_gustore.add((ent_uri, wasInspiredBy, woa_uri))
        insp_by_bn = BNode()
        mucho_gustore.add((ent_uri, qualifiedInspiration, insp_by_bn))
        mucho_gustore.add((insp_by_bn, RDF.type, Inspiration))
        mucho_gustore.add((insp_by_bn, entity, woa_uri))
        mucho_gustore.add((insp_by_bn, influenceInformationSource, Literal(selected["wikipedia_url"], datatype=URIRef("http://www.w3.org/2001/XMLSchema#anyURI"))))
        for gr in groups:
            if ent in groups[gr]:
                mucho_gustore.add((insp_by_bn, influenceSourceText, Literal(src_to_parse[gr], datatype=XSD.string, normalize=False)))

    elif entity_relation[ent] == 'derived from':
        mucho_gustore.add((ent_uri, wasDerivedFrom, woa_uri))
        der_from_bn = BNode()
        mucho_gustore.add((ent_uri, qualifiedDerivation, der_from_bn))
        mucho_gustore.add((der_from_bn, RDF.type, Derivation))
        mucho_gustore.add((der_from_bn, entity, woa_uri))
        mucho_gustore.add((der_from_bn, influenceInformationSource, Literal(selected["wikipedia_url"], datatype=URIRef("http://www.w3.org/2001/XMLSchema#anyURI"))))
        for gr in groups:
            if ent in groups[gr]:
                mucho_gustore.add((der_from_bn, influenceSourceText, Literal(src_to_parse[gr], datatype=XSD.string, normalize=False)))

    elif entity_relation[ent] == 'adapted from':
        mucho_gustore.add((ent_uri, isAdaptationOf, woa_uri))
        ad_of_bn = BNode()
        mucho_gustore.add((ent_uri, qualifiedAdaptation, ad_of_bn))
        mucho_gustore.add((ad_of_bn, RDF.type, Adaptation))
        mucho_gustore.add((ad_of_bn, entity, woa_uri))
        mucho_gustore.add((ad_of_bn, influenceInformationSource, Literal(selected["wikipedia_url"], datatype=URIRef("http://www.w3.org/2001/XMLSchema#anyURI"))))
        for gr in groups:
            if ent in groups[gr]:
                mucho_gustore.add((ad_of_bn, influenceSourceText, Literal(src_to_parse[gr], datatype=XSD.string, normalize=False)))

    elif entity_relation[ent] == 'references':
        mucho_gustore.add((ent_uri, references, woa_uri))
        ref_bn = BNode()
        mucho_gustore.add((ent_uri, qualifiedReference, ref_bn))
        mucho_gustore.add((ref_bn, RDF.type, Reference))
        mucho_gustore.add((ref_bn, entity, woa_uri))
        mucho_gustore.add((ref_bn, influenceInformationSource, Literal(selected["wikipedia_url"], datatype=URIRef("http://www.w3.org/2001/XMLSchema#anyURI"))))
        for gr in groups:
            if ent in groups[gr]:
                mucho_gustore.add((ref_bn, influenceSourceText, Literal(src_to_parse[gr], datatype=XSD.string, normalize=False)))

    elif entity_relation[ent] == 'mentions':
        mucho_gustore.add((ent_uri, mentions, woa_uri))
        men_bn = BNode()
        mucho_gustore.add((ent_uri, qualifiedMention, men_bn))
        mucho_gustore.add((men_bn, RDF.type, Mention))
        mucho_gustore.add((men_bn, entity, woa_uri))
        mucho_gustore.add((men_bn, influenceInformationSource, Literal(selected["wikipedia_url"], datatype=URIRef("http://www.w3.org/2001/XMLSchema#anyURI"))))
        for gr in groups:
            if ent in groups[gr]:
                mucho_gustore.add((men_bn, influenceSourceText, Literal(src_to_parse[gr], datatype=XSD.string, normalize=False)))

    elif entity_relation[ent] == 'alludes to':
        mucho_gustore.add((ent_uri, alludesTo, woa_uri))
        all_bn = BNode()
        mucho_gustore.add((ent_uri, qualifiedAllusion, all_bn))
        mucho_gustore.add((all_bn, RDF.type, Allusion))
        mucho_gustore.add((all_bn, entity, woa_uri))
        mucho_gustore.add((all_bn, influenceInformationSource, Literal(selected["wikipedia_url"], datatype=URIRef("http://www.w3.org/2001/XMLSchema#anyURI"))))
        for gr in groups:
            if ent in groups[gr]:
                mucho_gustore.add((all_bn, influenceSourceText, Literal(src_to_parse[gr], datatype=XSD.string, normalize=False)))

    elif entity_relation[ent] == 'quotes':
        mucho_gustore.add((ent_uri, cites, woa_uri))
        cit_bn = BNode()
        mucho_gustore.add((ent_uri, qualifiedCitation, cit_bn))
        mucho_gustore.add((cit_bn, RDF.type, Citation))
        mucho_gustore.add((cit_bn, entity, woa_uri))
        mucho_gustore.add((cit_bn, influenceInformationSource, Literal(selected["wikipedia_url"], datatype=URIRef("http://www.w3.org/2001/XMLSchema#anyURI"))))
        for gr in groups:
            if ent in groups[gr]:
                mucho_gustore.add((cit_bn, influenceSourceText, Literal(src_to_parse[gr], datatype=XSD.string, normalize=False)))

    else: # entity_relation[ent] == 'influenced by' or entity_relation[ent] == 'general influence'
        mucho_gustore.add((ent_uri, wasInfluencedBy, woa_uri))
        inf_by_bn = BNode()
        mucho_gustore.add((ent_uri, qualifiedInfluence, inf_by_bn))
        mucho_gustore.add((inf_by_bn, RDF.type, EntityInfluence))
        mucho_gustore.add((inf_by_bn, entity, woa_uri))
        mucho_gustore.add((inf_by_bn, influenceInformationSource, Literal(selected["wikipedia_url"], datatype=URIRef("http://www.w3.org/2001/XMLSchema#anyURI"))))
        for gr in groups:
            if ent in groups[gr]:
                mucho_gustore.add((inf_by_bn, influenceSourceText, Literal(src_to_parse[gr], datatype=XSD.string, normalize=False)))


With my graph now populated, the last thing to do is to export it in a file. I choose to export it in turtle format.

In [46]:
mucho_gustore.serialize(destination="./kg.ttl", format="turtle")

<Graph identifier=N0872dabfdf1645d293665a7f911a1b96 (<class 'rdflib.graph.Graph'>)>