In [3]:
import pickle
wikidata_candidates_per_name= pickle.load(open('wikidata_candidates_per_name.p','rb'))

In [4]:
import pandas as pd

df_graph = pd.read_csv('/ivi/ilps/personal/vprovat/valkema_graph_directed_final_final.tsv',sep='\t')

In [5]:
import numpy as np
df_graph['candidates1'] = df_graph['person1'].map(lambda x: wikidata_candidates_per_name[x]
                                                 if x in wikidata_candidates_per_name and wikidata_candidates_per_name[x]!=[]
                                                  else np.nan)
df_graph['candidates2'] = df_graph['person2'].map(lambda x: wikidata_candidates_per_name[x]
                                                 if x in wikidata_candidates_per_name and wikidata_candidates_per_name[x]!=[]
                                                  else np.nan)
df_graph_wiki= df_graph[~(df_graph['candidates1'].isnull()) &~(df_graph['candidates2'].isnull())]
df_graph_wiki.head()

Unnamed: 0,person1,person2,document_ids,candidates1,candidates2
2,Haan,Spier,['0001246757-0001246758-0001246759-0001246760-...,"[http://www.wikidata.org/entity/Q86819494, htt...",[http://www.wikidata.org/entity/Q105457909]
311,Berg,Sybren Valkema,['0001246763-0001246764-0001246765-0001246766-...,"[http://www.wikidata.org/entity/Q123450260, ht...","[http://www.wikidata.org/entity/Q2618110, http..."
314,Berg,Wim Crouwel,['0001246763-0001246764-0001246765-0001246766-...,"[http://www.wikidata.org/entity/Q123450260, ht...","[http://www.wikidata.org/entity/Q555389, http:..."
318,Arts,Berg,['0001246763-0001246764-0001246765-0001246766-...,[http://www.wikidata.org/entity/Q110168132],"[http://www.wikidata.org/entity/Q123450260, ht..."
322,Berg,Liang,['0001246763-0001246764-0001246765-0001246766-...,"[http://www.wikidata.org/entity/Q123450260, ht...",[http://www.wikidata.org/entity/Q45441271]


In [6]:
### Try text similarity 

In [7]:
from sentence_transformers import SentenceTransformer, util

# Load SBERT model
model = SentenceTransformer('sentence-transformers/paraphrase-xlm-r-multilingual-v1')


In [8]:
import requests
from functools import lru_cache

@lru_cache(maxsize=1000)  # Cache up to 1000 results
def fetch_wikidata_entity(entity_id):
    url = f"https://www.wikidata.org/w/api.php?action=wbgetentities&ids={entity_id}&format=json&props=descriptions|claims"
    response = requests.get(url)
    response.raise_for_status()  # Raise an HTTPError for bad responses
    return response.json()

@lru_cache(maxsize=1000)  # Cache up to 1000 results
def fetch_profession_labels(profession_ids, language='en'):
    ids = "|".join(profession_ids)
    url = f"https://www.wikidata.org/w/api.php?action=wbgetentities&ids={ids}&format=json&props=labels"
    response = requests.get(url)
    response.raise_for_status()  # Raise an HTTPError for bad responses
    return response.json()

def get_wikidata_info(entity_id, language='en'):
    data = fetch_wikidata_entity(entity_id)

    if entity_id not in data['entities']:
        return 'Entity not found.'

    entity_data = data['entities'][entity_id]

    # Get the description in the specified language
    description = entity_data.get('descriptions', {}).get(language, {}).get('value', 'Description not found.')

    # Get the profession (occupation) - Property P106
    claims = entity_data.get('claims', {})
    professions = []
    if 'P106' in claims:
        for profession_claim in claims['P106']:
            if 'mainsnak' in profession_claim and 'datavalue' in profession_claim['mainsnak']:
                profession_id = profession_claim['mainsnak']['datavalue']['value']['id']
                professions.append(profession_id)

    # Fetch the labels for the professions
    if professions:
        profession_data = fetch_profession_labels(tuple(professions), language)  # Use tuple for caching
        labels = []
        for pid in professions:
            label = profession_data['entities'][pid]['labels'].get(language, {}).get('value', 'Unknown profession')
            labels.append(label)
        professions = labels

    return description, ', '.join(professions)

# Example usage with Wikidata ID
entity_id = 'Q2618110'  # Sybren
description, professions = get_wikidata_info(entity_id)
print(f"Description: {description}")
print(f"Professions: {professions}")

Description: Dutch glass artist (1916-1996)
Professions: glass artist, ceramicist, textile artist


In [12]:
Valkema_qid = 'Q2618110'
Valkema_description, Valkema_professions = get_wikidata_info(Valkema_qid)
Valkema_description_embedding = model.encode([Valkema_description], convert_to_tensor=True)[0]
Valkema_profession_embedding = model.encode([Valkema_professions], convert_to_tensor=True)[0]

In [19]:
def disambiguate(name):
    qids = [link.split('/')[-1] for link in wikidata_candidates_per_name[name]]
#     descriptions_and_professions = [get_wikidata_info(qid) for qid in qids]
    descriptions, professions = zip(*[get_wikidata_info(qid) for qid in qids])

    description_embeddings = model.encode(list(descriptions), convert_to_tensor=True)
    profession_embeddings = model.encode(list(professions), convert_to_tensor=True)
    
    similarity_scores_descriptions = np.array([util.pytorch_cos_sim(embedding, Valkema_description_embedding).item()
                                         for embedding in description_embeddings])
    similarity_scores_professions = np.array([util.pytorch_cos_sim(embedding, Valkema_profession_embedding).item()
                                         for embedding in profession_embeddings])
    
    similarity_scores = (similarity_scores_descriptions+similarity_scores_professions)/2
    best_ind = np.argmax(similarity_scores)
    
#     print(descriptions, professions, similarity_scores) # todo remove
    return qids[best_ind], descriptions[best_ind]
    
    

In [17]:
# Valkema_description

'Dutch glass artist (1916-1996)'

In [18]:
disambiguate('Wim Crouwel')

('Dutch graphic designer, type designer, and typographer', 'Dutch baseball player') ('type designer, graphic designer, postage stamp designer, university teacher, designer, typographer, drawer, costume designer', 'baseball player') [0.53427383 0.32968343]


('Q555389', 'Dutch graphic designer, type designer, and typographer')

In [20]:
wiki_people = set(df_graph_wiki['person1'].tolist()+df_graph_wiki['person1'].tolist())
wiki_people_fullnames = [person for person in wiki_people if person.count(' ')>0] # only names longer than 1 word

wiki_people_fullnames

['John Schlesinger',
 'Yvonne Brunhammer',
 'Dorothy Miller',
 'Alice Rooney',
 'Joey Kirkpatrick',
 'Richard Price',
 'Kim MacConnel',
 'Thomas McEvilley',
 'Kenneth M. Wilson',
 'Rudy Autio',
 'Leendert van der Linden',
 'Kees Slegt',
 'Gerrit Komrij',
 'Mary Shaffer',
 'H. Bakker',
 'Dan Hogan',
 'Patrick Reyntiens',
 'Piet Calis',
 'Adriek Westenenk',
 'David Van Arsdale',
 'Gerard Cox',
 'Anton Philips',
 'Seth Gaaikema',
 'Nicolas Morin',
 'Anne Gould',
 'Roberto Niederer',
 'Wim Brusse',
 'Paul Gardner',
 'Nick Mount',
 'Isabelle Monod',
 'Henry Miller',
 'Frank van den Ham',
 'George Ravenscroft',
 'Heikki Kallio',
 'Zamira Jonker',
 'Charles Johnson',
 'Jim Melchert',
 'Claude Bernard',
 'Lambert van Meerten',
 'Marijke Linthorst',
 'Mitchell Wolfson, Jr.',
 'Michael Brown',
 'John Cook',
 'Mark Rosenbaum',
 'Charles Cowles',
 'Harry Turner',
 'Ruth Allen',
 'Margit Kocsis',
 'George Rochester',
 'Seaver Leslie',
 'Chris Steenbergen',
 'Gerard van Westerloo',
 'Jan Rot',
 'Sus

In [21]:
from tqdm.auto import tqdm

qid_per_name = {
    name: disambiguate(name)
    for name in tqdm(wiki_people_fullnames)
}

  0%|          | 0/873 [00:00<?, ?it/s]

In [None]:
qid_per_name

In [None]:
import json

json.dump(qid_per_name, open('wikidata_id_per_name_latest.json','w'),indent=6)

In [None]:
df_graph_with_qids = df_graph[df_graph['person1'].isin(qid_per_name) & df_graph['person2'].isin(qid_per_name)]

In [None]:
df_graph_with_qids

In [None]:
df_graph_with_qids['wikidata_id1'] = df_graph_with_qids['person1'].map(lambda x: qid_per_name[x][0])
df_graph_with_qids['wikidata_description1'] = df_graph_with_qids['person1'].map(lambda x: qid_per_name[x][1])

df_graph_with_qids['wikidata_id2'] = df_graph_with_qids['person2'].map(lambda x: qid_per_name[x][0])
df_graph_with_qids['wikidata_description2'] = df_graph_with_qids['person2'].map(lambda x: qid_per_name[x][1])

df_graph_with_qids.drop(['candidates1','candidates2'],axis=1,inplace=True)

In [None]:
df_graph_with_qids

In [None]:
df_graph_with_qids.to_csv('df_graph_with_wikidata.tsv',sep='\t',index=False)

In [None]:
# todo fix some errors: eg. Wim Crouwel

In [None]:
### Old code begins here


%
# https://graphvite.io/docs/latest/pretrained_model.html

# import sys
# !{sys.executable} -m pip install easydict

import pickle

with open("/ivi/ilps/personal/vprovat/rotate_wikidata5m.pkl", "rb") as fin:
    model = pickle.load(fin)
entity2id = model.graph.entity2id
relation2id = model.graph.relation2id
entity_embeddings = model.solver.entity_embeddings
relation_embeddings = model.solver.relation_embeddings

%
import numpy as np
from scipy.spatial.distance import cosine

# Example embeddings (vectors)
embedding1 = np.array([0.1, 0.2, 0.5, 0.8])
embedding2 = np.array([0.3, 0.4, 0.7, 0.9])

# Calculate Cosine Similarity
cosine_similarity = 1 - cosine(embedding1, embedding2)
print(f"Cosine Similarity: {cosine_similarity}")


%
def similarity(q1, q2):
    if q1 not in entity2id:
        print('No embedding for ',q1)
        return 0
    if q2 not in entity2id:
        print('No embedding for ',q2)
        return 0
    
    id1, id2 = entity2id[q1], entity2id[q2]
    embedding1, embedding2 = entity_embeddings[id1],entity_embeddings[id2]
    
    return 1 - cosine(embedding1, embedding2)

similarity('Q15488059','Q2618110')

In [None]:
from sentence_transformers import SentenceTransformer, util

# Example sentences
sentence1 = "This is sentence one."
sentence2 = "This is sentence two."

# Load SBERT model
model = SentenceTransformer('sentence-transformers/paraphrase-xlm-r-multilingual-v1')

# Encode sentences into embeddings
embeddings = model.encode([sentence1, sentence2], convert_to_tensor=True)

# Calculate cosine similarity between the embeddings
cosine_scores = util.pytorch_cos_sim(embeddings[0], embeddings[1])
similarity_score = cosine_scores.item()

print(f"Cosine Similarity: {similarity_score}")


In [None]:
def text_similarity(q1, q2):
    s1 = get_wikidata_description(q1)
    s2 = get_wikidata_description(q2)
    embeddings = model.encode([s1, s2], convert_to_tensor=True)

    cosine_scores = util.pytorch_cos_sim(embeddings[0], embeddings[1])
#     print(cosine_scores)
    similarity_score = cosine_scores.item()
    return similarity_score

for link in wikidata_candidates_per_name['Sybren Valkema']:
    qid = link.split('/')[-1]
    print(link)
    print(text_similarity(qid,'Q29422094'))