# <span style="color:brown">**Wikidata Entity Linking**</span> and **Graph Construction**

Test of Wikidata entity linking on Seattle library checkout data

*`Python < 3.10` needed for kglab to work (tested on `Python 3.9.18`)*

In [27]:
import requests
import pandas as pd
import re
import kglab

from pathlib import Path
from rdflib import Graph
from tqdm.auto import tqdm

### **Load pickle**

In [28]:
# Load the 2021 checkout records from a local pickle file.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
checkouts2021_df = pd.read_pickle('./checkouts2021.pkl')

In [29]:
checkouts2021_df.head()

Unnamed: 0,usageclass,checkouttype,materialtype,checkoutyear,checkoutmonth,checkouts,title,creator,subjects,publisher,publicationyear
0,Digital,OverDrive,EBOOK,2021,1,1,California: A History,Kevin Starr,"History, Nonfiction, Reference","Random House, Inc.",2010
1,Digital,OverDrive,EBOOK,2021,1,2,Agnes Grey,Anne Brontë,"Classic Literature, Fiction",Duke Classics,2020
2,Digital,OverDrive,EBOOK,2021,1,12,Tailspin,Sandra Brown,"Fiction, Romance, Suspense, Thriller","Hachette Digital, Inc.",2018
3,Digital,OverDrive,EBOOK,2021,1,1,Unleashed,Laurien Berenson,"Fiction, Mystery",Lightning Source Inc,2016
4,Physical,Horizon,BOOK,2021,1,1,The stoic / Theodore Dreiser.,"Dreiser, Theodore, 1871-1945",,"World Pub. Co.,",1947


### Build and check the top-10 checkouts DataFrame

In [30]:
# Sum the checkouts of every distinct item (same material type, creator,
# publisher, publication year, subjects and title) and keep the ten items
# with the highest totals.
group_columns = [
    'materialtype',
    'creator',
    'publisher',
    'publicationyear',
    'subjects',
    'title',
]
checkouts_per_item = checkouts2021_df.groupby(group_columns)['checkouts'].sum()
top10_series = checkouts_per_item.sort_values(ascending=False).head(10)

# Turn the grouping keys back into ordinary columns next to 'checkouts'.
top10_df = top10_series.reset_index()
top10_df.head()

Unnamed: 0,materialtype,creator,publisher,publicationyear,subjects,title,checkouts
0,EBOOK,Barack Obama,"Random House, Inc.",2020,"Biography & Autobiography, History, Politics, ...",A Promised Land,2041
1,AUDIOBOOK,Brit Bennett,Books on Tape,2020,"Fiction, Literature",The Vanishing Half: A Novel (unabridged),1629
2,EBOOK,Matt Haig,"Penguin Group (USA), Inc.",2020,"Fiction, Literature, Science Fiction",The Midnight Library: A Novel,1505
3,AUDIOBOOK,Malcolm Gladwell,"Blackstone Audio, Inc.",2019,"Psychology, Sociology, Nonfiction",Talking to Strangers: What We Should Know abou...,1435
4,EBOOK,Jessica Bruder,W. W. Norton & Company,2017,"Business, Sociology, Nonfiction",Nomadland: Surviving America in the Twenty-Fir...,1434


### **Clean data**

In [31]:
def reverse_name(name):
    """Turn a catalogue-style name ("Last, First") into "First Last".

    The name is split on ", ", the pieces are put in reverse order and joined
    with single spaces. Names without ", " are returned unchanged apart from
    stripping surrounding whitespace.
    """
    pieces_last_to_first = name.split(', ')[::-1]
    return ' '.join(pieces_last_to_first).strip()

def clean_title(title):
    """Return the main title: everything before the first colon, trimmed.

    Used to drop subtitles such as ": A Novel (unabridged)" before searching
    for the title on Wikidata.
    """
    return re.sub(r':.*', '', title).strip()

def clean_publisher(publisher):
    """Reduce a publisher string to its core name for entity search.

    Removes, in one pass:
      * everything from the first comma onwards (", Inc.", trailing commas),
      * the standalone word "Inc" with an optional trailing dot,
      * everything from a parenthesis starting with "(U" onwards, e.g. "(USA)".

    The result is stripped of surrounding whitespace.

    Args:
        publisher: Raw publisher string, e.g. "Penguin Group (USA), Inc.".

    Returns:
        The cleaned publisher name, e.g. "Penguin Group".
    """
    # "Inc" is matched as a whole word with a literal, optional dot. An
    # unescaped "Inc." would swallow the following character ("Inca" -> "")
    # and would miss a trailing "Inc" that has nothing after it.
    cleaned = re.sub(r',.*|\bInc\b\.?|\(U.*', '', publisher)
    return cleaned.strip()

# Strip birth/death years such as "1871-1945", "1950-" or "-1945" from the
# creator column. Only hyphens attached to digits are removed, so hyphenated
# names (e.g. "Jean-Paul Sartre") keep their hyphen; the ", " left behind by
# a removed date is trimmed as well.
top10_df['creator'] = (
    top10_df['creator']
    .str.replace(r'-?\d+(?:-\d*)?', '', regex=True)
    .str.rstrip(' ,')
)

# If creator column contains name in wrong order (last_name, first_name)
top10_df['creator'] = top10_df['creator'].apply(reverse_name)

# Clean publicationyear column: drop every non-word character (brackets, dots, ...)
top10_df['publicationyear'] = top10_df['publicationyear'].str.replace(r'\W+', '', regex=True)

# Clean title column
top10_df['title_clean'] = top10_df['title'].apply(clean_title)

# Clean publisher column
top10_df['publisher_clean'] = top10_df['publisher'].apply(clean_publisher)

# Clean subjects column: split the comma-separated string into a list and
# strip each entry, so "History, Nonfiction" yields ['History', 'Nonfiction']
# rather than ['History', ' Nonfiction']. Missing values become an empty list.
top10_df['subjects'] = (
    top10_df['subjects']
    .str.split(',')
    .apply(lambda parts: [part.strip() for part in parts if part.strip()]
           if isinstance(parts, list) else [])
)

# Check cleaned DataFrame
top10_df.head()

Unnamed: 0,materialtype,creator,publisher,publicationyear,subjects,title,checkouts,title_clean,publisher_clean
0,EBOOK,Barack Obama,"Random House, Inc.",2020,"[Biography & Autobiography, History, Politic...",A Promised Land,2041,A Promised Land,Random House
1,AUDIOBOOK,Brit Bennett,Books on Tape,2020,"[Fiction, Literature]",The Vanishing Half: A Novel (unabridged),1629,The Vanishing Half,Books on Tape
2,EBOOK,Matt Haig,"Penguin Group (USA), Inc.",2020,"[Fiction, Literature, Science Fiction]",The Midnight Library: A Novel,1505,The Midnight Library,Penguin Group
3,AUDIOBOOK,Malcolm Gladwell,"Blackstone Audio, Inc.",2019,"[Psychology, Sociology, Nonfiction]",Talking to Strangers: What We Should Know abou...,1435,Talking to Strangers,Blackstone Audio
4,EBOOK,Jessica Bruder,W. W. Norton & Company,2017,"[Business, Sociology, Nonfiction]",Nomadland: Surviving America in the Twenty-Fir...,1434,Nomadland,W. W. Norton & Company


### **Rough** Wikidata Entity Linking

*Rough* = the first entity in the search results is mapped to each value (creator, title, material type, publisher, subject), so mismatches are possible

In [32]:
def get_wikidata_entity(name):
    """Return the Wikidata ID of the first search hit for ``name``.

    Sends one ``wbsearchentities`` request to the Wikidata API and returns
    the ID of the first result. No disambiguation is attempted, so the hit
    may be a different entity that merely shares the label.

    Args:
        name: Search string (creator, title, publisher, subject, ...).

    Returns:
        The Wikidata ID (e.g. "Q76") as a string, or ``None`` when ``name``
        is empty or not a string, when nothing was found, or when the API
        could not be reached or returned a non-200 / non-JSON response.
    """
    # Nothing sensible to search for (None, NaN, empty string).
    if not isinstance(name, str) or not name.strip():
        return None

    url = "https://www.wikidata.org/w/api.php"
    params = {
        "search": name,
        "action": "wbsearchentities",
        "format": "json",
        "language": "en",
        "type": "item",
        "continue": "0",
        "limit": "10"
    }
    try:
        # One request only: the status check and the payload must come from
        # the same response. The timeout keeps a stalled connection from
        # blocking the whole run.
        response = requests.get(url, params=params, timeout=30)
        if response.status_code != 200:
            print(f'{url} not available.')
            return None
        json_data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or a body that is not valid JSON.
        print(f'{url} not available.')
        return None

    # API error responses carry no 'search' key; treat them as "no hit".
    hits = json_data.get('search') or []
    if hits:
        return hits[0]['id']  # Returns Wikidata ID
    return None
        
def construct_wikidata_links(wd_id):
    """Build Wikidata page URL(s) from one ID or a list of IDs.

    Returns a URL string for a string ID, a list of URL strings for a list
    of IDs, and ``None`` for anything else (including ``None``).
    """
    url_base = 'https://www.wikidata.org/wiki/'
    if isinstance(wd_id, str):
        return url_base + wd_id
    if isinstance(wd_id, list):
        return [url_base + entity_id for entity_id in wd_id]
    return None
     
def print_wikidata_urls(dataframe):
    """Print the Wikidata URLs linked to every row of ``dataframe``.

    For each row, prints one "<label>: <url>" line for creator, cleaned
    title, material type and cleaned publisher, then the list of subject
    URLs, then a blank line.

    Args:
        dataframe: DataFrame with the columns 'creator', 'title_clean',
            'materialtype', 'publisher_clean' and their ID columns 'c_wd',
            't_wd', 'm_wd', 'p_wd', plus 's_wd' (list of subject IDs).
    """
    label_and_id_columns = (
        ('creator', 'c_wd'),
        ('title_clean', 't_wd'),
        ('materialtype', 'm_wd'),
        ('publisher_clean', 'p_wd'),
    )
    # Iterate over the rows themselves rather than range(len(...)):
    # a filtered frame (e.g. the result of .query()) keeps its original index
    # labels, so positional counters used as labels raise KeyError.
    for _, row in dataframe.iterrows():
        for label_column, id_column in label_and_id_columns:
            print(f"{row[label_column]}: {construct_wikidata_links(row[id_column])}")
        print(f"Subjects: {construct_wikidata_links(row['s_wd'])}")
        print()

In [33]:
# Get Wikidata IDs for cleaned columns
# Successful lookups are remembered so that a value occurring in several rows
# (e.g. "EBOOK" or "Fiction") is requested from the Wikidata API only once.
# Misses are not cached, so a transient failure is retried on the next use.
wd_lookup_cache = {}


def lookup_wikidata_id(term):
    """Return the Wikidata ID for ``term``, reusing earlier successful lookups."""
    if term in wd_lookup_cache:
        return wd_lookup_cache[term]
    wd_id = get_wikidata_entity(term)
    if wd_id is not None:
        wd_lookup_cache[term] = wd_id
    return wd_id


for column in tqdm(['creator', 'materialtype', 'publisher_clean', 'title_clean', 'subjects']):
    # Target column is named after the first letter: c_wd, m_wd, p_wd, t_wd, s_wd
    wd_column = f'{column[0]}_wd'
    if column == 'subjects':
        # Each cell holds a list of subjects; keep only those that resolved.
        subject_wds = []
        for subjects in top10_df[column]:
            resolved = (lookup_wikidata_id(subject.strip()) for subject in subjects)
            subject_wds.append([wd_id for wd_id in resolved if wd_id is not None])
        top10_df[wd_column] = subject_wds
    else:
        top10_df[wd_column] = top10_df[column].apply(lookup_wikidata_id)

100%|██████████| 5/5 [01:36<00:00, 19.21s/it]


In [34]:
# Create new column layout: each label column is followed by its Wikidata ID
# column. filter() keeps only the listed columns, in this order.
column_layout = [
    'creator', 'c_wd',
    'title', 'title_clean', 't_wd',
    'materialtype', 'm_wd',
    'publisher', 'publisher_clean', 'p_wd',
    'subjects', 's_wd',
    'checkouts',
]
top10_df = top10_df.filter(items=column_layout)
top10_df.head()

Unnamed: 0,creator,c_wd,title,title_clean,t_wd,materialtype,m_wd,publisher,publisher_clean,p_wd,subjects,s_wd,checkouts
0,Barack Obama,Q76,A Promised Land,A Promised Land,Q101438737,EBOOK,Q128093,"Random House, Inc.",Random House,Q744182,"[Biography & Autobiography, History, Politic...","[Q309, Q7163, Q213051]",2041
1,Brit Bennett,Q27449519,The Vanishing Half: A Novel (unabridged),The Vanishing Half,Q98476957,AUDIOBOOK,Q106833,Books on Tape,Books on Tape,Q4943292,"[Fiction, Literature]","[Q8253, Q8242]",1629
2,Matt Haig,Q926682,The Midnight Library: A Novel,The Midnight Library,Q106814534,EBOOK,Q128093,"Penguin Group (USA), Inc.",Penguin Group,Q3374730,"[Fiction, Literature, Science Fiction]","[Q8253, Q8242, Q24925]",1505
3,Malcolm Gladwell,Q318429,Talking to Strangers: What We Should Know abou...,Talking to Strangers,Q69523853,AUDIOBOOK,Q106833,"Blackstone Audio, Inc.",Blackstone Audio,Q4923391,"[Psychology, Sociology, Nonfiction]","[Q9418, Q21201, Q213051]",1435
4,Jessica Bruder,Q63098703,Nomadland: Surviving America in the Twenty-Fir...,Nomadland,Q61740820,EBOOK,Q128093,W. W. Norton & Company,W. W. Norton & Company,Q1282208,"[Business, Sociology, Nonfiction]","[Q4830453, Q21201, Q213051]",1434


In [36]:
# Print all wikidata links (based on the retrieved wd_entities) for one specific creator
print_wikidata_urls(top10_df.query('creator == "Barack Obama"'))

Barack Obama: https://www.wikidata.org/wiki/Q76
A Promised Land: https://www.wikidata.org/wiki/Q101438737
EBOOK: https://www.wikidata.org/wiki/Q128093
Random House: https://www.wikidata.org/wiki/Q744182
Subjects: ['https://www.wikidata.org/wiki/Q309', 'https://www.wikidata.org/wiki/Q7163', 'https://www.wikidata.org/wiki/Q213051']



In [37]:
# Save the entity-linked DataFrame to a pickle file in the working directory,
# so the graph-construction cells can reload it without repeating the API lookups.
top10_df.to_pickle('top10_wikidata.pkl')

 ### Create an RDF graph with turtle serialization

In [38]:
# Load pickled dataframe (the file written by the to_pickle cell).
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
top10_df = pd.read_pickle('./top10_wikidata.pkl')

In [39]:
top10_df.head()

Unnamed: 0,creator,c_wd,title,title_clean,t_wd,materialtype,m_wd,publisher,publisher_clean,p_wd,subjects,s_wd,checkouts
0,Barack Obama,Q76,A Promised Land,A Promised Land,Q101438737,EBOOK,Q128093,"Random House, Inc.",Random House,Q744182,"[Biography & Autobiography, History, Politic...","[Q309, Q7163, Q213051]",2041
1,Brit Bennett,Q27449519,The Vanishing Half: A Novel (unabridged),The Vanishing Half,Q98476957,AUDIOBOOK,Q106833,Books on Tape,Books on Tape,Q4943292,"[Fiction, Literature]","[Q8253, Q8242]",1629
2,Matt Haig,Q926682,The Midnight Library: A Novel,The Midnight Library,Q106814534,EBOOK,Q128093,"Penguin Group (USA), Inc.",Penguin Group,Q3374730,"[Fiction, Literature, Science Fiction]","[Q8253, Q8242, Q24925]",1505
3,Malcolm Gladwell,Q318429,Talking to Strangers: What We Should Know abou...,Talking to Strangers,Q69523853,AUDIOBOOK,Q106833,"Blackstone Audio, Inc.",Blackstone Audio,Q4923391,"[Psychology, Sociology, Nonfiction]","[Q9418, Q21201, Q213051]",1435
4,Jessica Bruder,Q63098703,Nomadland: Surviving America in the Twenty-Fir...,Nomadland,Q61740820,EBOOK,Q128093,W. W. Norton & Company,W. W. Norton & Company,Q1282208,"[Business, Sociology, Nonfiction]","[Q4830453, Q21201, Q213051]",1434


In [40]:
# Empty rdflib graph that will receive the parsed Turtle data.
g = Graph()

In [41]:
def _turtle_literal(value):
    """Escape ``value`` for use inside a double-quoted Turtle string literal."""
    text = str(value)
    # The backslash must be escaped first so later escapes are not doubled.
    for raw, escaped in (('\\', '\\\\'), ('"', '\\"'), ('\n', '\\n'), ('\r', '\\r')):
        text = text.replace(raw, escaped)
    return text


def create_triples(df: pd.DataFrame):
    """Build a Turtle document from the entity-linked checkout DataFrame.

    For every row whose creator, title, material type and publisher all have
    a Wikidata ID, the following statements are emitted:

      * creator: ``rdf:type dbo:Author``, ``rdfs:label``, ``wdt:P800`` (title)
      * title: ``rdfs:label``, ``rdf:type`` (material type), ``ex:hasCheckouts``
        (``xsd:integer``), ``wdt:P123`` (publisher), one ``wdt:P921`` per subject
      * material type and publisher: ``rdfs:label``

    Rows with a missing ID in 'c_wd', 't_wd', 'm_wd' or 'p_wd' are skipped.

    Args:
        df: DataFrame with the columns 'creator', 'c_wd', 'title', 't_wd',
            'materialtype', 'm_wd', 'publisher', 'p_wd', 's_wd' (list of
            subject IDs) and 'checkouts'.

    Returns:
        A string holding the prefix declarations followed by one triple per
        line, ready for ``Graph.parse(data=..., format="turtle")``.
    """
    turtle_namespaces = '\n'.join((
        'PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>',
        'PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>',
        'PREFIX wd: <http://www.wikidata.org/entity/>',
        'PREFIX wdt: <http://www.wikidata.org/prop/direct/>',
        'PREFIX dbo: <http://dbpedia.org/ontology/>',
        'PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>',
        'PREFIX ex: <http://example.org/ontology/>',
        '',
        '',
    ))
    required_ids = ('c_wd', 't_wd', 'm_wd', 'p_wd')
    triples = []

    for record in df.to_dict(orient='records'):
        # A usable ID is a non-empty string; None and NaN both fail this test.
        if not all(isinstance(record[col], str) and record[col] for col in required_ids):
            continue

        creator, title, material, publisher = (f"wd:{record[col]}" for col in required_ids)

        # Triples for name and person
        triples.append(f'{creator} rdf:type dbo:Author .')
        triples.append(f'{creator} rdfs:label "{_turtle_literal(record["creator"])}"@en .')

        # Triples for title, materialtype, publisher (t_wd, p_wd, m_wd)
        triples.append(f'{creator} wdt:P800 {title} .')  # has notable work (P800)
        triples.append(f'{title} rdfs:label "{_turtle_literal(record["title"])}"@en .')
        triples.append(f'{title} rdf:type {material} .')
        # Typed literals need the double caret "^^"; a single "^" is not the
        # Turtle datatype marker.
        triples.append(f'{title} ex:hasCheckouts "{record["checkouts"]}"^^xsd:integer .')
        triples.append(f'{material} rdfs:label "{_turtle_literal(record["materialtype"])}"@en .')
        triples.append(f'{title} wdt:P123 {publisher} .')  # has publisher
        triples.append(f'{publisher} rdfs:label "{_turtle_literal(record["publisher"])}"@en .')

        # Triples for subject (s_wd); a missing or non-list value means "no subjects"
        subject_ids = record.get('s_wd')
        if isinstance(subject_ids, (list, tuple)):
            for subject_id in subject_ids:
                triples.append(f'{title} wdt:P921 wd:{subject_id} .')  # has main subject

    return turtle_namespaces + '\n'.join(triples)

# Build the Turtle document (prefix declarations + triples) for the top-10 items.
graph_data = create_triples(top10_df)

In [42]:
# Parse turtle data: load the generated Turtle string into the rdflib graph `g`.
g.parse(data=graph_data, format="turtle")

<Graph identifier=Nd1f56a13de84430da32d0b51fb7bcab3 (<class 'rdflib.graph.Graph'>)>

In [43]:
g.print()

@prefix dbo: <http://dbpedia.org/ontology/> .
@prefix ex: <http://example.org/ontology/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix wd: <http://www.wikidata.org/entity/> .
@prefix wdt: <http://www.wikidata.org/prop/direct/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

wd:Q19325957 a dbo:Author ;
    rdfs:label "Lucy Foley"@en ;
    wdt:P800 wd:Q122363681 .

wd:Q27449519 a dbo:Author ;
    rdfs:label "Brit Bennett"@en ;
    wdt:P800 wd:Q98476957 .

wd:Q318429 a dbo:Author ;
    rdfs:label "Malcolm Gladwell"@en ;
    wdt:P800 wd:Q69523853 .

wd:Q510034 a dbo:Author ;
    rdfs:label "Sanjay Gupta"@en ;
    wdt:P800 wd:Q115164824 .

wd:Q63098703 a dbo:Author ;
    rdfs:label "Jessica Bruder"@en ;
    wdt:P800 wd:Q61740820 .

wd:Q76 a dbo:Author ;
    rdfs:label "Barack Obama"@en ;
    wdt:P800 wd:Q101438737 .

wd:Q926682 a dbo:Author ;
    rdfs:label "Matt Haig"@en ;
    wdt:P800 wd:Q106814534 .

wd:Q101438737 a wd:Q128093 ;
    rdfs:label "A Promised Land"@

In [44]:
# Set turtle filename
ttl_file = Path('seattleLib_2021.ttl')

# Save the graph. The format is passed explicitly so the file content always
# matches its .ttl extension instead of depending on rdflib's default format.
g.serialize(destination=ttl_file, format='turtle')

<Graph identifier=Nd1f56a13de84430da32d0b51fb7bcab3 (<class 'rdflib.graph.Graph'>)>

### **Visualize** the Graph

In [45]:
# Load the serialized graph file into a kglab KnowledgeGraph for visualization.
kg = kglab.KnowledgeGraph().load_rdf(Path(ttl_file))

In [46]:
# Node styling passed to build_pyvis_graph below.
# NOTE(review): the keys look like the namespace prefixes used in the graph
# ("wd" = Wikidata entities, "dbo" = DBpedia ontology) — confirm against the
# kglab documentation how style keys are matched to nodes.
VIS_STYLE = {
    "wd": {
        "color": "orange",
        "size": 40,
    },
    "dbo":{
        "color": "blue",
        "size": 50,
    },
}

# Build a pyvis network from the knowledge graph for display inside the notebook.
subgraph = kglab.SubgraphTensor(kg)
pyvis_graph = subgraph.build_pyvis_graph(notebook=True, style=VIS_STYLE)



In [47]:
# Apply pyvis' force_atlas_2based layout, then write the interactive HTML file
# next to the Turtle file. with_suffix() swaps only the final extension,
# whereas a plain string replace would also rewrite ".ttl" elsewhere in the path.
pyvis_graph.force_atlas_2based()
pyvis_graph.show(str(ttl_file.with_suffix('.html')))

seattleLib_2021.html
