# <span style="color:brown">**Wikidata Entity Linking**</span> and **Graph Construction**

Test for Wikidata Entity Linking on Seattle Lib Data

*`Python < 3.10` needed for kglab to work (tested on `Python 3.9.18`)*

In [1]:
import pandas as pd
import re
import kglab

from pathlib import Path
from rdflib import Graph
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### Read **.feather**

In [41]:
filename = Path('./2021_seattle_lib_checkouts.feather')

In [42]:
checkouts2021_df = pd.read_feather(filename)

In [43]:
checkouts2021_df.head()

Unnamed: 0,usageclass,checkouttype,materialtype,checkoutyear,checkoutmonth,checkouts,title,creator,subjects,publisher,publicationyear
0,Digital,OverDrive,EBOOK,2021,1,1,California: A History,Kevin Starr,"History, Nonfiction, Reference","Random House, Inc.",2010
1,Digital,OverDrive,EBOOK,2021,1,2,Agnes Grey,Anne Brontë,"Classic Literature, Fiction",Duke Classics,2020
2,Digital,OverDrive,EBOOK,2021,1,12,Tailspin,Sandra Brown,"Fiction, Romance, Suspense, Thriller","Hachette Digital, Inc.",2018
3,Digital,OverDrive,EBOOK,2021,1,1,Unleashed,Laurien Berenson,"Fiction, Mystery",Lightning Source Inc,2016
4,Physical,Horizon,BOOK,2021,1,1,The stoic / Theodore Dreiser.,"Dreiser, Theodore, 1871-1945",,"World Pub. Co.,",[1947]


### **Create a new DataFrame**

In [2]:
def create_dataframe(dataframe: pd.DataFrame, rows: int):
    """Aggregate checkouts per item and keep the `rows` highest totals.

    Rows are grouped by material type, creator, publisher, publication year,
    subjects and title; the `checkouts` of each group are summed, the sums are
    sorted in descending order and the first `rows` entries are returned as a
    flat DataFrame (group keys become regular columns again).
    """
    group_keys = ['materialtype',
                  'creator',
                  'publisher',
                  'publicationyear',
                  'subjects',
                  'title']
    totals = dataframe.groupby(group_keys)['checkouts'].sum()
    ranked = totals.sort_values(ascending=False)
    return ranked.iloc[:rows].reset_index()

In [45]:
# Sum the 2021 checkouts per item and keep the 100 items with the highest totals
df = create_dataframe(checkouts2021_df, rows=100)
df.head()

Unnamed: 0,materialtype,creator,publisher,publicationyear,subjects,title,checkouts
0,EBOOK,Barack Obama,"Random House, Inc.",2020,"Biography & Autobiography, History, Politics, ...",A Promised Land,2041
1,AUDIOBOOK,Brit Bennett,Books on Tape,2020,"Fiction, Literature",The Vanishing Half: A Novel (unabridged),1629
2,EBOOK,Matt Haig,"Penguin Group (USA), Inc.",2020,"Fiction, Literature, Science Fiction",The Midnight Library: A Novel,1505
3,AUDIOBOOK,Malcolm Gladwell,"Blackstone Audio, Inc.",2019,"Psychology, Sociology, Nonfiction",Talking to Strangers: What We Should Know abou...,1435
4,EBOOK,Jessica Bruder,W. W. Norton & Company,2017,"Business, Sociology, Nonfiction",Nomadland: Surviving America in the Twenty-Fir...,1434


### **Clean data**

In [3]:
def reverse_name(name):
    """Reverse the ', '-separated parts of `name` and join them with spaces.

    'Last, First' becomes 'First Last'; a name without ', ' is returned
    unchanged apart from surrounding whitespace being stripped.
    """
    return ' '.join(name.split(', ')[::-1]).strip()

def clean_title(title):
    """Return `title` without its subtitle.

    Everything from a colon up to the end of that line is removed and the
    remainder is stripped of surrounding whitespace.
    """
    main_title = re.sub(r':.*', '', title)
    return main_title.strip()

def clean_publisher(publisher):
    """Reduce a publisher string to its bare name.

    Removes everything from the first comma onwards, a literal ``Inc.`` and
    everything from an opening ``(U`` (e.g. ``(USA)``) onwards, then strips
    surrounding whitespace.

    Args:
        publisher (str): Raw publisher string, e.g. ``'Penguin Group (USA), Inc.'``.

    Returns:
        str: Cleaned publisher name, e.g. ``'Penguin Group'``.
    """
    # The dot is escaped so that only the literal "Inc." is removed; an
    # unescaped dot would also eat e.g. the "Incl" of "Incline Press".
    cleaned = re.sub(r',.*|Inc\.|\(U.*', '', publisher)
    # Cutting at "(" or "Inc." leaves the blank that preceded it behind.
    return cleaned.strip()

def clean_dataframe(df: pd.DataFrame):
    """Normalise the text columns of `df` in place and return the same frame.

    * `creator`: a trailing "(...)" part and every digit and hyphen are removed,
      then the ', '-separated name parts are reversed.
    * `publicationyear`: all non-word characters are removed.
    * `title_clean` / `publisher_clean`: new columns derived from `title` and
      `publisher`.
    * `subjects`: split on ', ' into a list per row.
    """
    # Strip life dates such as "1871-1945" or "(1871-1945)" from the creator.
    # NOTE(review): the pattern removes *every* hyphen, so hyphenated names
    # lose their hyphen as well.
    without_dates = df['creator'].str.replace(r'\(.*|[\d-]*', '', regex=True)
    # Turn "Last, First" into "First Last"
    df['creator'] = without_dates.apply(reverse_name)

    # Drop brackets, dots etc. around the year
    df['publicationyear'] = df['publicationyear'].str.replace(r'\W*', '', regex=True)

    # Derived columns used for entity linking; the raw columns are kept
    for source, cleaner in (('title', clean_title), ('publisher', clean_publisher)):
        df[f'{source}_clean'] = df[source].apply(cleaner)

    # One list of subject strings per row
    df['subjects'] = df['subjects'].str.split(', ')
    return df

In [47]:
# Clean the text columns (df is modified in place and returned)
df = clean_dataframe(df)
df.head()

Unnamed: 0,materialtype,creator,publisher,publicationyear,subjects,title,checkouts,title_clean,publisher_clean
0,EBOOK,Barack Obama,"Random House, Inc.",2020,"[Biography & Autobiography, History, Politics,...",A Promised Land,2041,A Promised Land,Random House
1,AUDIOBOOK,Brit Bennett,Books on Tape,2020,"[Fiction, Literature]",The Vanishing Half: A Novel (unabridged),1629,The Vanishing Half,Books on Tape
2,EBOOK,Matt Haig,"Penguin Group (USA), Inc.",2020,"[Fiction, Literature, Science Fiction]",The Midnight Library: A Novel,1505,The Midnight Library,Penguin Group
3,AUDIOBOOK,Malcolm Gladwell,"Blackstone Audio, Inc.",2019,"[Psychology, Sociology, Nonfiction]",Talking to Strangers: What We Should Know abou...,1435,Talking to Strangers,Blackstone Audio
4,EBOOK,Jessica Bruder,W. W. Norton & Company,2017,"[Business, Sociology, Nonfiction]",Nomadland: Surviving America in the Twenty-Fir...,1434,Nomadland,W. W. Norton & Company


### **Rough** Wikidata Entity Linking

*Rough* = the first entity in the list of search results is mapped to the queried value (e.g. the creator), i.e. mismatches are possible

In [77]:
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_url(query, ents, timeout=10):
    """Look up the first Wikidata item matching `query`, using `ents` as cache.

    Args:
        query (str): Search string sent to the `wbsearchentities` API action.
        ents (dict): Cache mapping query strings to Wikidata IDs (or None when
            the search returned no hit). It is updated in place.
        timeout (float, optional): Seconds to wait for the HTTP response.
            Defaults to 10.

    Returns:
        str | None: ID of the first search hit, or None when there was no hit
        or the request failed. Failed requests are not cached, so a later call
        with the same query tries again.
    """
    # Already looked up: return the stored ID without touching the network
    if query in ents:
        return ents[query]

    url = "https://www.wikidata.org/w/api.php"
    params = {
        "search": query,
        "action": "wbsearchentities",
        "format": "json",
        "language": "en",
        "type": "item",
        "continue": "0",
        "limit": "10",
    }
    try:
        # Without a timeout a stalled connection would block the worker
        # thread indefinitely.
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        hits = response.json()["search"]
        # "Rough" linking: only the first search hit is used
        wikidata_id = hits[0]["id"] if hits else None
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # Network problems, non-JSON bodies and API error payloads (which have
        # no "search" key) all end up here.
        print(f"Error fetching {query}: {str(e)}")
        return None

    ents[query] = wikidata_id
    return wikidata_id

def get_wikidata_entity(queries, ents, workers=4):
    """Resolve several query strings to Wikidata IDs with a thread pool.

    Args:
        queries: Iterable of hashable query strings (e.g. a list or a
            pandas Series).
        ents (dict): Cache handed to `fetch_url`; updated in place.
        workers (int, optional): Size of the thread pool. Defaults to 4.

    Returns:
        tuple: (pd.Series with one ID or None per entry of `queries`, in the
        same order; the cache `ents`).
    """
    queries = list(queries)
    # Each distinct string is fetched once. Submitting duplicates in parallel
    # would bypass the cache, because none of them is stored yet when the
    # others start.
    unique_queries = list(dict.fromkeys(queries))
    with ThreadPoolExecutor(max_workers=workers) as executor:
        fetched = list(executor.map(lambda q: fetch_url(q, ents), unique_queries))
    # Use the returned values instead of reading `ents` back: a failed lookup
    # is not cached, so `ents[query]` would raise KeyError for it.
    resolved = dict(zip(unique_queries, fetched))
    return pd.Series([resolved.get(query) for query in queries], dtype=object), ents

# if __name__ == "__main__":
#     # Example usage
#     queries_list = ["Barack Obama", "Stephen King", "Gunther Wald"]
#     entities_dict = {}
#     ser, ents = get_wikidata_entity(queries_list, entities_dict)
#     print(ser, ents)

In [61]:
def link_data_multithread(df: pd.DataFrame, 
                          ents: dict, 
                          columns=['creator', 'materialtype', 'publisher_clean', 'title_clean', 'subjects'],
                          worker=4):
    """Add Wikidata ID columns to `df` using the threaded lookup.

    For every entry of `columns` a new column named after its first letter
    plus `_wd` is written (e.g. `creator` -> `c_wd`). The `subjects` column is
    expected to hold a list of strings per row and yields a list of IDs per
    row; unresolved subjects are dropped from that list.

    Args:
        df (pd.DataFrame): Cleaned DataFrame; modified in place.
        ents (dict): Cache of already resolved query strings.
        columns (list, optional): Columns to link. The default list is only
            read, never modified.
        worker (int, optional): Thread pool size passed to
            `get_wikidata_entity`. Defaults to 4.

    Returns:
        tuple: (df with the added columns, updated cache).
    """
    for column in tqdm(columns):
        target = f'{column[0]}_wd'
        if column == 'subjects':
            ids_per_row = []
            for subjects in df[column]:
                subject_ids, ents = get_wikidata_entity(subjects, ents, worker)
                # pd.notna filters None as well as NaN, whichever the Series
                # uses to represent a missing ID
                ids_per_row.append([s_id for s_id in subject_ids if pd.notna(s_id)])
            df[target] = ids_per_row
        else:
            wikidata_ids, ents = get_wikidata_entity(df[column], ents, worker)
            # The result is numbered 0..n-1; re-label it with df's own index
            # so the values land in the right rows even when df was filtered
            # or re-indexed before.
            wikidata_ids.index = df.index[:len(wikidata_ids)]
            df[target] = wikidata_ids
    return df, ents

# ents = {}
# df, ents = link_data_multithread(df, ents)

In [37]:
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   materialtype     20 non-null     object
 1   creator          20 non-null     object
 2   publisher        20 non-null     object
 3   publicationyear  20 non-null     object
 4   subjects         20 non-null     object
 5   title            20 non-null     object
 6   checkouts        20 non-null     int16 
 7   title_clean      20 non-null     object
 8   publisher_clean  20 non-null     object
 9   c_wd             18 non-null     object
 10  m_wd             20 non-null     object
 11  p_wd             18 non-null     object
 12  t_wd             18 non-null     object
 13  s_wd             20 non-null     object
dtypes: int16(1), object(13)
memory usage: 2.2+ KB


In [10]:
def fetch_wikidata_entity(queries: list, ents: dict):
    """Function to fetch wikidata entity IDs by querying the wikidata API.

    Only the first search hit is used for each query. Query strings that are
    already in `ents` are answered from the cache without any request.

    Args:
        queries (list): list of query strings; each list item is queried 
        ents (dict): dict to store already queried entities in, to reduce API 
        calls and speed up the entity linking

    Returns:
        pd.Series: pandas Series with one entry per query (same order); the 
        entry is the fetched wikidata ID, or None if nothing was found or the 
        lookup failed
        ents: updated dict mapping every successfully queried string to its 
        wikidata ID (None if the search had no hit)
    """
    # Wikidata API URL
    url = "https://www.wikidata.org/w/api.php"
    wikidata_ids = []
    # Set to False after the first transport failure so that an unreachable
    # service is not hit once per remaining query. This replaces a separate
    # probe request in front of every batch.
    service_available = True
    for query in queries:
        if query not in ents and service_available:
            try:
                response = requests.get(url,
                                        params={"search": query,
                                                "action": "wbsearchentities",
                                                "format": "json",
                                                "language": "en",
                                                "type": "item",
                                                "continue": "0",
                                                "limit": "10"},
                                        timeout=10)
                response.raise_for_status()
                hits = response.json()['search']
                # Cache the first hit, or None when the search was empty
                ents[query] = hits[0]['id'] if hits else None
            except requests.RequestException as err:
                print(f'{url} not available: {err}')
                service_available = False
            except (ValueError, KeyError) as err:
                # Body was not JSON or had no 'search' key (API error payload)
                print(f'Unexpected response for {query}: {err}')
        # Failed lookups are not cached and therefore come out as None here,
        # which keeps the result aligned with `queries`.
        wikidata_ids.append(ents.get(query))
    return pd.Series(wikidata_ids, dtype=object), ents

In [11]:
def link_data(df: pd.DataFrame, ents: dict, columns=['creator', 'materialtype', 'publisher_clean', 'title_clean', 'subjects']):
    """Add Wikidata ID columns to `df` using the sequential lookup.

    For every entry of `columns` a new column named after its first letter
    plus `_wd` is written (e.g. `creator` -> `c_wd`). The `subjects` column is
    expected to hold a list of strings per row and yields a list of IDs per
    row; unresolved subjects are dropped from that list.

    Args:
        df (pd.DataFrame): Cleaned DataFrame; modified in place.
        ents (dict): Cache of already resolved query strings.
        columns (list, optional): Columns to link. The default list is only
            read, never modified.

    Returns:
        tuple: (df with the added columns, updated cache).
    """
    for column in tqdm(columns):
        target = f'{column[0]}_wd'
        if column == 'subjects':
            ids_per_row = []
            for subjects in df[column]:
                subject_ids, ents = fetch_wikidata_entity(subjects, ents)
                # pd.notna filters None as well as NaN, whichever the Series
                # uses to represent a missing ID
                ids_per_row.append([s_id for s_id in subject_ids if pd.notna(s_id)])
            df[target] = ids_per_row
        else:
            wikidata_ids, ents = fetch_wikidata_entity(df[column], ents)
            # The result is numbered 0..n-1; re-label it with df's own index
            # so the values land in the right rows even when df was filtered
            # or re-indexed before. A shorter result leaves the remaining
            # rows empty.
            wikidata_ids.index = df.index[:len(wikidata_ids)]
            df[target] = wikidata_ids
    return df, ents

In [12]:
# Start with an empty cache; link_data returns it together with the linked DataFrame
ents = {}
df, ents = link_data(df, ents)

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [01:09<00:00, 14.00s/it]


In [13]:
def construct_wikidata_links(wd_id):
    """Turn a Wikidata ID (or a list of IDs) into wiki page URL(s).

    A string yields one URL, a list yields a list of URLs, and any other
    value (including None) yields None.
    """
    url_base = 'https://www.wikidata.org/wiki/'
    if isinstance(wd_id, str):
        return url_base + wd_id
    if isinstance(wd_id, list):
        return [url_base + entity_id for entity_id in wd_id]
    return None
     
def print_wikidata_urls(dataframe):
    """Print label and Wikidata URL(s) of every linked value, row by row.

    Args:
        dataframe (pd.DataFrame): Linked DataFrame with the columns `creator`,
            `title_clean`, `materialtype`, `publisher_clean` and their ID
            columns `c_wd`, `t_wd`, `m_wd`, `p_wd`, plus `s_wd`.
    """
    label_and_id_columns = (
        ('creator', 'c_wd'),
        ('title_clean', 't_wd'),
        ('materialtype', 'm_wd'),
        ('publisher_clean', 'p_wd'),
    )
    # Iterate over the rows themselves instead of looking up labels 0..n-1:
    # a filtered frame (e.g. the result of df.query) keeps its original index
    # labels, so those lookups would raise KeyError.
    for _, row in dataframe.iterrows():
        for label_column, id_column in label_and_id_columns:
            print(f"{row[label_column]}: {construct_wikidata_links(row[id_column])}")
        print(f"Subjects: {construct_wikidata_links(row['s_wd'])}")
        print()

# Print all wikidata links (based on the retrieved wd_entities) for one specific creator
print_wikidata_urls(df.query('creator == "Barack Obama"'))

Barack Obama: https://www.wikidata.org/wiki/Q76
A Promised Land: https://www.wikidata.org/wiki/Q101438737
EBOOK: https://www.wikidata.org/wiki/Q128093
Random House: https://www.wikidata.org/wiki/Q744182
Subjects: ['https://www.wikidata.org/wiki/Q309', 'https://www.wikidata.org/wiki/Q7163', 'https://www.wikidata.org/wiki/Q213051']



In [14]:
filename = Path('./seattleLib_top10_wikidata.feather')

# Save the linked dataframe in Feather format
df.to_feather(filename)

 ### Create an RDF graph with turtle serialization

In [15]:
# Load the linked dataframe back from the Feather file
df = pd.read_feather(filename)

In [16]:
df.head()

Unnamed: 0,materialtype,creator,publisher,publicationyear,subjects,title,checkouts,title_clean,publisher_clean,c_wd,m_wd,p_wd,t_wd,s_wd
0,EBOOK,Barack Obama,"Random House, Inc.",2020,"[Biography & Autobiography, History, Politics,...",A Promised Land,2041,A Promised Land,Random House,Q76,Q128093,Q744182,Q101438737,"[Q309, Q7163, Q213051]"
1,AUDIOBOOK,Brit Bennett,Books on Tape,2020,"[Fiction, Literature]",The Vanishing Half: A Novel (unabridged),1629,The Vanishing Half,Books on Tape,Q27449519,Q106833,Q4943292,Q98476957,"[Q8253, Q8242]"
2,EBOOK,Matt Haig,"Penguin Group (USA), Inc.",2020,"[Fiction, Literature, Science Fiction]",The Midnight Library: A Novel,1505,The Midnight Library,Penguin Group,Q926682,Q128093,Q3374730,Q106814534,"[Q8253, Q8242, Q24925]"
3,AUDIOBOOK,Malcolm Gladwell,"Blackstone Audio, Inc.",2019,"[Psychology, Sociology, Nonfiction]",Talking to Strangers: What We Should Know abou...,1435,Talking to Strangers,Blackstone Audio,Q318429,Q106833,Q4923391,Q69523853,"[Q9418, Q21201, Q213051]"
4,EBOOK,Jessica Bruder,W. W. Norton & Company,2017,"[Business, Sociology, Nonfiction]",Nomadland: Surviving America in the Twenty-Fir...,1434,Nomadland,W. W. Norton & Company,Q63098703,Q128093,Q1282208,Q61740820,"[Q4830453, Q21201, Q213051]"


In [17]:
g = Graph()

In [18]:
def create_triples(df: pd.DataFrame):
    """Serialise the linked rows of `df` as a Turtle document.

    A row is exported only if creator (`c_wd`), title (`t_wd`), material type
    (`m_wd`) and publisher (`p_wd`) all hold a Wikidata ID string; rows with a
    missing ID (None or NaN) are skipped. If `df` has an `s_wd` column, one
    "main subject" triple is written per subject ID of the row.

    Args:
        df (pd.DataFrame): Linked DataFrame with the columns `creator`,
            `title`, `materialtype`, `publisher`, `checkouts` and the ID
            columns named above.

    Returns:
        str: Prefix declarations followed by all triples.
    """
    turtle_namespaces = """\
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX ex: <http://example.org/ontology/>

    """

    def has_id(value):
        # NaN (a float) is not None, so an explicit string check is needed
        return isinstance(value, str) and value != ''

    def escape(value):
        # Quotes, backslashes and line breaks would otherwise terminate or
        # corrupt the quoted Turtle literal
        text = str(value)
        for raw, escaped in (('\\', '\\\\'), ('"', '\\"'), ('\n', '\\n'), ('\r', '\\r')):
            text = text.replace(raw, escaped)
        return text

    has_subjects = 's_wd' in df.columns
    collection = []

    for _, row in df.iterrows():
        creator_id = row['c_wd']
        title_id = row['t_wd']
        material_id = row['m_wd']
        publisher_id = row['p_wd']
        if not all(has_id(v) for v in (creator_id, title_id, material_id, publisher_id)):
            continue

        creator_label = escape(row['creator'])
        title_label = escape(row['title'])
        material_label = escape(row['materialtype'])
        publisher_label = escape(row['publisher'])

        triples = [
            # Triples for name and person
            f'wd:{creator_id} rdf:type dbo:Author .',
            f'wd:{creator_id} rdfs:label "{creator_label}"@en .',
            # Triples for title, materialtype, publisher (t_wd, p_wd, m_wd)
            f'wd:{creator_id} wdt:P800 wd:{title_id} .',  # has notable work (P800)
            f'wd:{title_id} rdfs:label "{title_label}"@en .',
            f'wd:{title_id} rdf:type wd:{material_id} .',
            # Typed literals need "^^" in Turtle
            f'wd:{title_id} ex:hasCheckouts "{row["checkouts"]}"^^xsd:integer .',
            f'wd:{material_id} rdfs:label "{material_label}"@en .',
            f'wd:{title_id} wdt:P123 wd:{publisher_id} .',  # has publisher
            f'wd:{publisher_id} rdfs:label "{publisher_label}"@en .',
        ]

        # Triples for subject (s_wd)
        if has_subjects:
            subject_ids = row['s_wd']
            # A missing cell (None / NaN) has no subjects to iterate over
            if subject_ids is None or isinstance(subject_ids, float):
                subject_ids = ()
            for subject_id in subject_ids:
                if has_id(subject_id):
                    triples.append(f'wd:{title_id} wdt:P921 wd:{subject_id} .')  # has main subject

        collection.append(triples)

    triples_string = ' '.join(['\n'.join(row_triples) for row_triples in collection])
    return turtle_namespaces + triples_string

In [19]:
# Create triples
graph_data = create_triples(df)

In [20]:
# Parse turtle data
g.parse(data=graph_data, format="turtle")

<Graph identifier=Nc40ae0b079a64410947ba4116b9b808a (<class 'rdflib.graph.Graph'>)>

In [21]:
g.print()

@prefix dbo: <http://dbpedia.org/ontology/> .
@prefix ex: <http://example.org/ontology/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix wd: <http://www.wikidata.org/entity/> .
@prefix wdt: <http://www.wikidata.org/prop/direct/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

wd:Q18763622 a dbo:Author ;
    rdfs:label "Celeste Ng"@en ;
    wdt:P800 wd:Q55012468 .

wd:Q19325957 a dbo:Author ;
    rdfs:label "Lucy Foley"@en ;
    wdt:P800 wd:Q122363681 .

wd:Q27449519 a dbo:Author ;
    rdfs:label "Brit Bennett"@en ;
    wdt:P800 wd:Q98476957 .

wd:Q318429 a dbo:Author ;
    rdfs:label "Malcolm Gladwell"@en ;
    wdt:P800 wd:Q69523853 .

wd:Q510034 a dbo:Author ;
    rdfs:label "Sanjay Gupta"@en ;
    wdt:P800 wd:Q115164824 .

wd:Q533400 a dbo:Author ;
    rdfs:label "Kristin Hannah"@en ;
    wdt:P800 wd:Q112119139 .

wd:Q6077806 a dbo:Author ;
    rdfs:label "Isabel Wilkerson"@en ;
    wdt:P800 wd:Q484416 .

wd:Q63098703 a dbo:Author ;
    rdfs:label "Jessica Bru

In [22]:
# Set turtle filename
ttl_file = Path('seattleLib_2021.ttl')

# Save the graph
g.serialize(destination=ttl_file)

<Graph identifier=Nc40ae0b079a64410947ba4116b9b808a (<class 'rdflib.graph.Graph'>)>

### **Visualize** the Graph

In [23]:
kg = kglab.KnowledgeGraph().load_rdf(Path(ttl_file))

In [24]:
# Node colour and size used for the visualisation, keyed by "wd" and "dbo"
# (presumably matched against the namespace prefixes of the graph - verify
# against the kglab documentation)
VIS_STYLE = {
    "wd": {
        "color": "orange",
        "size": 40,
    },
    "dbo":{
        "color": "blue",
        "size": 50,
    },
}

# Build a pyvis graph from the knowledge graph using the style above
subgraph = kglab.SubgraphTensor(kg)
pyvis_graph = subgraph.build_pyvis_graph(notebook=True, style=VIS_STYLE)



In [25]:
# Select the force_atlas_2based layout, then write the visualisation to an
# HTML file named like the .ttl file with the suffix swapped
pyvis_graph.force_atlas_2based()
pyvis_graph.show(str(ttl_file).replace('.ttl', '.html'))

seattleLib_2021.html


## **Wikidata Entity Linking** for Top 1000 Creators in 2021

In [84]:
def link_my_df(filename: str, 
               rows: int, 
               columns=['creator', 'materialtype', 'publisher_clean', 'title_clean', 'subjects'], 
               threads=32):
    """Run the whole pipeline for one checkouts file: aggregate, clean, link 
    to wikidata, export the triples as turtle RDF and visualize the graph.

    The feather file has to contain the raw checkout columns used by 
    `create_dataframe` (`materialtype`, `creator`, `publisher`, 
    `publicationyear`, `subjects`, `title`, `checkouts`). After cleaning, the 
    DataFrame handed to the linker looks like this:

    pd.DataFrame({'creator': ['Barack Obama', 'Stephen King'],
                  'materialtype': ['Book', 'Ebook'], 
                  'publisher_clean': ['Random House', 'Penguin USA'], 
                  'title_clean': ['A Promised Land', 'Carry'], 
                  'subjects': [['Fiction', 'Politics'], ['Fiction', 'Horror']]})

    Args:
        filename (str): Filename of an existing feathered pd.DataFrame (e.g. `DataFrame.feather`)
        rows (int): Number of aggregated rows (highest checkouts first) that should be considered for linking
        columns (list, optional): Specifies the columns for entity linking. Defaults to ['creator', 'materialtype', 'publisher_clean', 'title_clean', 'subjects'].
        threads (int, optional): Number of threads handed to the linker. Defaults to 32.

    Returns:
        df: pd.DataFrame
        ents: dict that was handed to the linker as cache
    """
    # Load 2021 checkouts .feather and create a DF
    checkouts2021_df = pd.read_feather(Path(filename))

    # Create new DF with rows=rows of old DF and clean data
    df = create_dataframe(checkouts2021_df, rows=rows)
    df = clean_dataframe(df)
    print('Created new DF and cleaned data ...')

    # Entity linking
    print('Started entity linking ...')
    # Keep a local reference to the cache so the returned `ents` belongs to
    # this run and does not depend on a variable of the same name existing in
    # the notebook session.
    ents = {}
    # NOTE(review): `fw` is not defined in the visible part of this file and
    # its `link_data` is not shown; presumably it fills `ents` in place and
    # returns the DataFrame - confirm against that module.
    df = fw.link_data(df, ents=ents, columns=columns, threads=threads)
    print('Finished entity linking ...')

    # Create graph, triples (turtle rdf) and parse those triples
    g = Graph()
    graph_data = create_triples(df)
    g.parse(data=graph_data, format="turtle")

    # Save turtle rdf 
    ttl_file = Path(str(filename).replace('.feather', f'_top{rows}.ttl'))
    g.serialize(destination=ttl_file)
    print(f'Saved turtle RDF to {ttl_file} ...')

    # Visualize the graph
    html_file = str(ttl_file).replace('.ttl', '.html')
    kg = kglab.KnowledgeGraph().load_rdf(Path(ttl_file))
    subgraph = kglab.SubgraphTensor(kg)
    pyvis_graph = subgraph.build_pyvis_graph(notebook=True, style=VIS_STYLE)
    pyvis_graph.force_atlas_2based()
    pyvis_graph.show(html_file)
    print(f"Saved graph visualization to {html_file} ...")
    return df, ents

In [92]:
# Run the whole pipeline for the 100 items with the most checkouts;
# 'subjects' is left out of the linked columns here
df, ents = link_my_df(filename='2021_seattle_lib_checkouts.feather', 
                rows=100,
                columns=['creator', 'materialtype', 'publisher_clean', 'title_clean'],
                threads=64)

Created new DF and cleaned data ...
Started entity linking ...


100%|██████████| 4/4 [00:06<00:00,  1.69s/it]

Finished entity linking ...
Saved turtle RDF to 2021_seattle_lib_checkouts_top100.ttl ...
2021_seattle_lib_checkouts_top100.html
Saved graph visualization to 2021_seattle_lib_checkouts_top100.html ...





# **Dev**

In [9]:
import pandas as pd
import re
import kglab
import entityLinker as el

from pathlib import Path
from rdflib import Graph
from tqdm.auto import tqdm

In [10]:
def create_dataframe(dataframe: pd.DataFrame, rows: int):
    """Aggregate checkouts per item and keep the `rows` highest totals.

    Rows are grouped by material type, creator, publisher, publication year,
    subjects and title; the `checkouts` of each group are summed, the sums are
    sorted in descending order and the first `rows` entries are returned as a
    flat DataFrame (group keys become regular columns again).
    """
    group_keys = ['materialtype',
                  'creator',
                  'publisher',
                  'publicationyear',
                  'subjects',
                  'title']
    totals = dataframe.groupby(group_keys)['checkouts'].sum()
    ranked = totals.sort_values(ascending=False)
    return ranked.iloc[:rows].reset_index()

def reverse_name(name):
    """Reverse the ', '-separated parts of `name` and join them with spaces.

    'Last, First' becomes 'First Last'; a name without ', ' is returned
    unchanged apart from surrounding whitespace being stripped.
    """
    return ' '.join(name.split(', ')[::-1]).strip()

def clean_title(title):
    """Return `title` without its subtitle.

    Everything from a colon up to the end of that line is removed and the
    remainder is stripped of surrounding whitespace.
    """
    main_title = re.sub(r':.*', '', title)
    return main_title.strip()

def clean_publisher(publisher):
    """Reduce a publisher string to its bare name.

    Removes everything from the first comma onwards, a literal ``Inc.`` and
    everything from an opening ``(U`` (e.g. ``(USA)``) onwards, then strips
    surrounding whitespace.

    Args:
        publisher (str): Raw publisher string, e.g. ``'Penguin Group (USA), Inc.'``.

    Returns:
        str: Cleaned publisher name, e.g. ``'Penguin Group'``.
    """
    # The dot is escaped so that only the literal "Inc." is removed; an
    # unescaped dot would also eat e.g. the "Incl" of "Incline Press".
    cleaned = re.sub(r',.*|Inc\.|\(U.*', '', publisher)
    # Cutting at "(" or "Inc." leaves the blank that preceded it behind.
    return cleaned.strip()

def clean_dataframe(df: pd.DataFrame):
    """Normalise the text columns of `df` in place and return the same frame.

    * `creator`: a trailing "(...)" part and every digit and hyphen are removed,
      then the ', '-separated name parts are reversed.
    * `publicationyear`: all non-word characters are removed.
    * `title_clean` / `publisher_clean`: new columns derived from `title` and
      `publisher`.
    * `subjects`: split on ', ' into a list per row.
    """
    # Strip life dates such as "1871-1945" or "(1871-1945)" from the creator.
    # NOTE(review): the pattern removes *every* hyphen, so hyphenated names
    # lose their hyphen as well.
    without_dates = df['creator'].str.replace(r'\(.*|[\d-]*', '', regex=True)
    # Turn "Last, First" into "First Last"
    df['creator'] = without_dates.apply(reverse_name)

    # Drop brackets, dots etc. around the year
    df['publicationyear'] = df['publicationyear'].str.replace(r'\W*', '', regex=True)

    # Derived columns used for entity linking; the raw columns are kept
    for source, cleaner in (('title', clean_title), ('publisher', clean_publisher)):
        df[f'{source}_clean'] = df[source].apply(cleaner)

    # One list of subject strings per row
    df['subjects'] = df['subjects'].str.split(', ')
    return df

In [11]:
# Load DataFrame
filename = '2021_seattle_lib_checkouts.feather'
df = pd.read_feather(filename)

In [12]:
# Sum the checkouts per item, keep the 30000 highest totals and clean the
# columns that are used for linking to wikidata
df = create_dataframe(df, rows=30000)
df = clean_dataframe(df)
df.head()

Unnamed: 0,materialtype,creator,publisher,publicationyear,subjects,title,checkouts,title_clean,publisher_clean
0,EBOOK,Barack Obama,"Random House, Inc.",2020,"[Biography & Autobiography, History, Politics,...",A Promised Land,2041,A Promised Land,Random House
1,AUDIOBOOK,Brit Bennett,Books on Tape,2020,"[Fiction, Literature]",The Vanishing Half: A Novel (unabridged),1629,The Vanishing Half,Books on Tape
2,EBOOK,Matt Haig,"Penguin Group (USA), Inc.",2020,"[Fiction, Literature, Science Fiction]",The Midnight Library: A Novel,1505,The Midnight Library,Penguin Group
3,AUDIOBOOK,Malcolm Gladwell,"Blackstone Audio, Inc.",2019,"[Psychology, Sociology, Nonfiction]",Talking to Strangers: What We Should Know abou...,1435,Talking to Strangers,Blackstone Audio
4,EBOOK,Jessica Bruder,W. W. Norton & Company,2017,"[Business, Sociology, Nonfiction]",Nomadland: Surviving America in the Twenty-Fir...,1434,Nomadland,W. W. Norton & Company


In [13]:
df.to_feather('test_df.feather')

In [6]:
# Link four columns with the entityLinker module (imported as `el`),
# starting from an empty cache
df, ents = el.link_to_wikidata(df, 
                               columns=['creator', 'title_clean', 'materialtype', 'publisher_clean'],
                               threads=32,
                               ents={})

  0%|          | 0/4 [00:00<?, ?it/s]

Linking batch 0 ...
Linking batch 2000 ...
Linking batch 4000 ...
Linking batch 6000 ...
Linking batch 8000 ...


 25%|██▌       | 1/4 [02:30<07:31, 150.34s/it]

Linking batch 0 ...
Linking batch 2000 ...
Linking batch 4000 ...
Linking batch 6000 ...
Linking batch 8000 ...


 50%|█████     | 2/4 [05:00<05:00, 150.37s/it]

Linking batch 0 ...
Linking batch 2000 ...
Linking batch 4000 ...
Linking batch 6000 ...
Linking batch 8000 ...


 75%|███████▌  | 3/4 [07:31<02:30, 150.37s/it]

Linking batch 0 ...
Linking batch 2000 ...
Linking batch 4000 ...
Linking batch 6000 ...
Linking batch 8000 ...


100%|██████████| 4/4 [10:01<00:00, 150.35s/it]


In [8]:
df

Unnamed: 0,materialtype,creator,publisher,publicationyear,subjects,title,checkouts,title_clean,publisher_clean,c_wd,t_wd,m_wd,p_wd
0,EBOOK,Barack Obama,"Random House, Inc.",2020,"[Biography & Autobiography, History, Politics,...",A Promised Land,2041,A Promised Land,Random House,Q76,Q101438737,Q128093,Q744182
1,AUDIOBOOK,Brit Bennett,Books on Tape,2020,"[Fiction, Literature]",The Vanishing Half: A Novel (unabridged),1629,The Vanishing Half,Books on Tape,Q27449519,Q98476957,Q106833,Q4943292
2,EBOOK,Matt Haig,"Penguin Group (USA), Inc.",2020,"[Fiction, Literature, Science Fiction]",The Midnight Library: A Novel,1505,The Midnight Library,Penguin Group,Q926682,Q106814534,Q128093,Q3374730
3,AUDIOBOOK,Malcolm Gladwell,"Blackstone Audio, Inc.",2019,"[Psychology, Sociology, Nonfiction]",Talking to Strangers: What We Should Know abou...,1435,Talking to Strangers,Blackstone Audio,Q318429,Q69523853,Q106833,Q4923391
4,EBOOK,Jessica Bruder,W. W. Norton & Company,2017,"[Business, Sociology, Nonfiction]",Nomadland: Surviving America in the Twenty-Fir...,1434,Nomadland,W. W. Norton & Company,Q63098703,Q61740820,Q128093,Q1282208
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,EBOOK,Elizabeth Lim,"Random House, Inc.",2020,"[Fantasy, Romance, Young Adult Fiction, Young ...","Unravel the Dusk: Blood of Stars Series, Book 2",26,Unravel the Dusk,Random House,Q95933481,,Q128093,Q744182
9996,AUDIOBOOK,C. J. Box,"Recorded Books, LLC",2019,"[Fiction, Mystery, Thriller]","Breaking Point: Joe Pickett Series, Book 13 (u...",26,Breaking Point,Recorded Books,Q856006,Q2311935,Q106833,Q17109628
9997,BOOK,Mark Bittman,"Houghton Mifflin Harcourt,",2017,"[Vegetarian cooking, Cookbooks]",How to cook everything vegetarian : simple mea...,26,How to cook everything vegetarian,Houghton Mifflin Harcourt,Q3294002,,Q571,Q390074
9998,BOOK,Danielle Steel,"Delacorte Press,",2020,"[Families Fiction, Grief Fiction, Friendship F...",All that glitters : a novel / Danielle Steel.,26,All that glitters,Delacorte Press,Q72653,Q114247367,Q571,Q87935328
