In [1]:
import pandas as pd
# Load the entity table; the next cell reads its 'wiki_id' column.
df_e = pd.read_csv('e.csv')
# Print the schema summary, then show the first rows as the cell's output.
df_e.info()
df_e.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424 entries, 0 to 423
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   wiki_id  424 non-null    object
dtypes: object(1)
memory usage: 3.4+ KB


Unnamed: 0,wiki_id
0,Q43436
1,Q43088
2,Q43513
3,Q5283
4,Q4700957


In [2]:
# Materialise the 'wiki_id' column as a plain Python list of ids to query.
entity_list = list(df_e['wiki_id'])
# Sanity check: how many ids there are, then a peek at the first three.
print(len(entity_list))
entity_list[:3]

424


['Q43436', 'Q43088', 'Q43513']

In [3]:
from SPARQLWrapper import SPARQLWrapper, JSON
def query(entity_id='Q43088'):
    """Fetch the English-labelled direct statements of one Wikidata entity.

    Sends a SPARQL query to the public Wikidata endpoint selecting every
    ``?subject ?predicate ?object`` statement whose subject is ``entity_id``,
    whose predicate is the direct-claim form of a property, and whose
    subject, property and object all carry an English ``rdfs:label``.
    At most 10000 rows are requested.

    Args:
        entity_id: Wikidata identifier, e.g. ``'Q43088'``.

    Returns:
        The response converted by SPARQLWrapper from the JSON result format.

    Raises:
        ValueError: if ``entity_id`` is not a well-formed Wikidata id. The id
            is spliced into the query text, so anything else could alter or
            break the query.
    """
    import re  # local import keeps this function self-contained in the notebook

    # Accept item (Q), property (P) and lexeme (L, optionally -F/-S sub-entity) ids only.
    if not isinstance(entity_id, str) or not re.fullmatch(
        r"[QPL][0-9]+(-[FS][0-9]+)?", entity_id
    ):
        raise ValueError(f"Not a valid Wikidata entity id: {entity_id!r}")

    # SPARQL endpoint for Wikidata
    # NOTE(review): Wikimedia asks clients to send a descriptive User-Agent;
    # consider passing agent=... here once a contact string is agreed on.
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

    # BIND comes first: SPARQL 1.1 does not allow binding a variable that has
    # already been used earlier in the same group graph pattern.
    sparql_query = f"""
    SELECT ?subjectLabel ?predicateLabel ?objectLabel ?subject ?object 
    WHERE {{
      BIND(wd:{entity_id} AS ?subject)
      ?subject ?predicate ?object.
      ?subject rdfs:label ?subjectLabel.
      ?x wikibase:directClaim ?predicate.
      ?x rdfs:label ?predicateLabel.
      ?object rdfs:label ?objectLabel.
      FILTER(LANG(?subjectLabel) = "en").
      FILTER(LANG(?predicateLabel) = "en").
      FILTER(LANG(?objectLabel) = "en").
    }}
    LIMIT 10000
    """
    # Set the query and return format
    sparql.setQuery(sparql_query)
    sparql.setReturnFormat(JSON)
    # Execute the query
    results = sparql.query().convert()
    return results

# query()

In [4]:
import pandas as pd
def e2spo(entity_id='Q43088'):
    """Return the statements of one entity as a DataFrame of SPO rows.

    Runs ``query(entity_id)`` and turns every result binding into one row
    holding the three labels plus the subject and object identifiers (the
    last path segment of their URIs).

    Args:
        entity_id: Wikidata identifier forwarded to ``query``.

    Returns:
        DataFrame with columns ``subjectLabel``, ``predicateLabel``,
        ``objectLabel``, ``subject`` and ``object``; empty when the query
        yields no bindings.
    """
    def _last_segment(uri):
        # 'http://www.wikidata.org/entity/Q43088' -> 'Q43088'
        return uri.split('/')[-1]

    bindings = query(entity_id)["results"]["bindings"]
    rows = [
        [
            binding["subjectLabel"]["value"],
            binding["predicateLabel"]["value"],
            binding["objectLabel"]["value"],
            _last_segment(binding["subject"]["value"]),
            _last_segment(binding["object"]["value"]),
        ]
        for binding in bindings
    ]
    column_names = ["subjectLabel", "predicateLabel", "objectLabel", "subject", "object"]
    return pd.DataFrame(rows, columns=column_names)
    
# Smoke-test the helper on a single entity before running it over the whole list.
df = e2spo(entity_id='Q43088')
# Print the schema summary, then show the first rows as the cell's output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   subjectLabel    23 non-null     object
 1   predicateLabel  23 non-null     object
 2   objectLabel     23 non-null     object
 3   subject         23 non-null     object
 4   object          23 non-null     object
dtypes: object(5)
memory usage: 1.0+ KB


Unnamed: 0,subjectLabel,predicateLabel,objectLabel,subject,object
0,ruby,instance of,mineral variety,Q43088,Q429795
1,ruby,described by source,Brockhaus and Efron Encyclopedic Dictionary,Q43088,Q602358
2,ruby,described by source,Encyclopædia Britannica 11th edition,Q43088,Q867541
3,ruby,described by source,Explanatory Dictionary of the Living Great Rus...,Q43088,Q1970746
4,ruby,described by source,The Nuttall Encyclopædia,Q43088,Q3181656


In [5]:
from tqdm.auto import tqdm
# Query every entity in turn; each per-entity frame is tagged with the id it
# was requested for in a 'source' column.
spo_frames = []
progress = tqdm(entity_list)
for wiki_id in progress:
    progress.set_description(f'Query entity: {wiki_id:<15}')
    spo_frames.append(e2spo(entity_id=wiki_id).assign(source=wiki_id))

# Stack the per-entity frames into one table with a fresh 0..n-1 index.
df = pd.concat(spo_frames, axis=0, ignore_index=True)
df.info()
df.head()

  0%|          | 0/424 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7367 entries, 0 to 7366
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   subjectLabel    7367 non-null   object
 1   predicateLabel  7367 non-null   object
 2   objectLabel     7367 non-null   object
 3   subject         7367 non-null   object
 4   object          7367 non-null   object
 5   source          7367 non-null   object
dtypes: object(6)
memory usage: 345.5+ KB


Unnamed: 0,subjectLabel,predicateLabel,objectLabel,subject,object,source
0,pearl,instance of,spherical body,Q43436,Q112511193,Q43436
1,pearl,subclass of,organic gem,Q43436,Q110688445,Q43436
2,pearl,has use,jewelry,Q43436,Q161439,Q43436
3,pearl,made from material,nacre,Q43436,Q215865,Q43436
4,pearl,subclass of,animal product,Q43436,Q629103,Q43436


In [6]:
# Count the triples per predicate: reset_index() exposes the row labels as an
# 'index' column, which is then counted within each predicateLabel group.
df_pivot = df.reset_index().pivot_table(
    index=['predicateLabel'],
    values=['index'],
    aggfunc={'index': ["count"]},
    fill_value=0,
)
# Flatten the two-level ('index', 'count') header into the single name 'index_count'.
df_pivot.columns = [
    '_'.join(levels).strip() for levels in df_pivot.columns.to_flat_index()
]
# Show the 50 most frequent predicates.
df_pivot.sort_values(by='index_count', ascending=False).head(50)

Unnamed: 0_level_0,index_count
predicateLabel,Unnamed: 1_level_1
diplomatic relation,1028
described by source,709
subclass of,600
member of,471
contains the administrative territorial entity,443
language used,380
instance of,344
different from,230
topic's main category,174
has part(s),134


In [7]:
# Predicate labels whose triples are excluded from the exported graph.
values_to_drop = {
    'diplomatic relation',
    'contains the administrative territorial entity',
    'emergency phone number',
    "topic's main Wikimedia portal",
    'public holiday',
    'highest judicial authority',
    'Wikidata property',
    'Wikimedia outline',
    'described by source',
    'different from',
    "topic's main category",
    'on focus list of Wikimedia project',
    "topic's main template",
    'maintained by WikiProject',
    'located in the administrative territorial entity',
    'top-level Internet domain',
    'flag',
    'currency',
    'language used',
}

# Keep only the rows whose predicate is not in the drop set; the original row
# labels are preserved, so the index of df2 has gaps.
is_dropped = df['predicateLabel'].isin(values_to_drop)
df2 = df[~is_dropped]

# Recount the triples per predicate on the filtered table.
df_pivot = df2.reset_index().pivot_table(
    index=['predicateLabel'],
    values=['index'],
    aggfunc={'index': ["count"]},
    fill_value=0,
)
# Flatten the two-level ('index', 'count') header into the single name 'index_count'.
df_pivot.columns = [
    '_'.join(levels).strip() for levels in df_pivot.columns.to_flat_index()
]
# Show the 50 most frequent remaining predicates.
df_pivot.sort_values(by='index_count', ascending=False).head(50)

Unnamed: 0_level_0,index_count
predicateLabel,Unnamed: 1_level_1
subclass of,600
member of,471
instance of,344
has part(s),134
part of,106
shares border with,97
significant event,86
properties for this type,69
taxon range,65
country,65


In [8]:
# Persist the filtered triples together with their Wikidata ids and source entity.
# NOTE(review): to_csv writes the DataFrame index by default, so the file gets
# an unnamed first column holding the non-contiguous row labels left by the
# filter; pass index=False if that column is not wanted.
df2.to_csv('spo_e.csv')
# Print the schema summary, then show the first rows as the cell's output.
df2.info()
df2.head()

<class 'pandas.core.frame.DataFrame'>
Index: 3927 entries, 0 to 7366
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   subjectLabel    3927 non-null   object
 1   predicateLabel  3927 non-null   object
 2   objectLabel     3927 non-null   object
 3   subject         3927 non-null   object
 4   object          3927 non-null   object
 5   source          3927 non-null   object
dtypes: object(6)
memory usage: 214.8+ KB


Unnamed: 0,subjectLabel,predicateLabel,objectLabel,subject,object,source
0,pearl,instance of,spherical body,Q43436,Q112511193,Q43436
1,pearl,subclass of,organic gem,Q43436,Q110688445,Q43436
2,pearl,has use,jewelry,Q43436,Q161439,Q43436
3,pearl,made from material,nacre,Q43436,Q215865,Q43436
4,pearl,subclass of,animal product,Q43436,Q629103,Q43436


In [9]:
import json
def spo2kg(df_spo_wiki):
    """Split a triples table into label-only triples and per-label metadata.

    Args:
        df_spo_wiki: DataFrame with the columns ``subjectLabel``,
            ``predicateLabel``, ``objectLabel``, ``subject`` and ``object``
            (the last two holding Wikidata ids).

    Returns:
        A ``(df_spo, metadata)`` tuple. ``df_spo`` is the input restricted to
        the three label columns. ``metadata`` maps every subject or object
        label to ``{"url": ..., "wiki_id": ...}``. When the same label occurs
        more than once, the first occurrence (row order, subject before
        object) wins.

    Notes:
        The URL is derived from the label: spaces become underscores and the
        result is percent-encoded, so labels containing characters such as
        ``#``, ``?`` or ``%`` still yield a link to the intended title rather
        than a truncated one. Plain ASCII labels produce the same URL as an
        unencoded concatenation would.
    """
    from urllib.parse import quote  # local import keeps the function self-contained

    # Base Wikipedia URL for metadata
    wiki_base_url = "https://en.wikipedia.org/wiki/"

    def _entry(label, wiki_id):
        # Build the metadata record for one label.
        title = quote(label.replace(' ', '_'), safe="/:(),")
        return {"url": f"{wiki_base_url}{title}", "wiki_id": wiki_id}

    df_spo = df_spo_wiki[['subjectLabel', 'predicateLabel', 'objectLabel']]

    metadata = {}
    rows = zip(
        df_spo_wiki['subjectLabel'],
        df_spo_wiki['subject'],
        df_spo_wiki['objectLabel'],
        df_spo_wiki['object'],
    )
    for subject_label, subject_id, object_label, object_id in rows:
        # Add subject metadata (Wikidata ID and Wikipedia URL)
        if subject_label not in metadata:
            metadata[subject_label] = _entry(subject_label, subject_id)

        # Add object metadata (Wikidata ID and Wikipedia URL)
        if object_label not in metadata:
            metadata[object_label] = _entry(object_label, object_id)

    return df_spo, metadata

# Split the filtered triples into the label-only table and the per-label metadata.
df_spo, metadata = spo2kg(df2)
# Print the schema summary, then show the first rows as the cell's output.
df_spo.info()
df_spo.head()

<class 'pandas.core.frame.DataFrame'>
Index: 3927 entries, 0 to 7366
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   subjectLabel    3927 non-null   object
 1   predicateLabel  3927 non-null   object
 2   objectLabel     3927 non-null   object
dtypes: object(3)
memory usage: 122.7+ KB


Unnamed: 0,subjectLabel,predicateLabel,objectLabel
0,pearl,instance of,spherical body
1,pearl,subclass of,organic gem
2,pearl,has use,jewelry
3,pearl,made from material,nacre
4,pearl,subclass of,animal product


In [10]:
# Output files; the following cell passes these same file names to the `kg` CLI.
csv_file = 'data_e.csv'
metadata_file = 'metadata_e.json'

# Label triples as a bare CSV: no header row and no index column.
df_spo.to_csv(path_or_buf=csv_file, index=False, header=False)

# Label -> {url, wiki_id} mapping as indented JSON.
with open(metadata_file, mode='w', encoding='utf-8') as out:
    json.dump(metadata, out, indent=2)

In [11]:
# Feed the exported triples and metadata to the `kg` command-line tool, then
# launch its app.
# NOTE: `kg start` serves a Flask app and keeps running until interrupted
# (the captured log says "Press CTRL+C to quit"), so this cell does not finish
# on its own.
!kg add -f data_e.csv
!kg meta -f metadata_e.json
!kg start

🎉 Starting the app.
 * Serving Flask app 'kgsearch.app.app' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off
 * Running on http://127.0.0.1:5000
[33mPress CTRL+C to quit[0m
127.0.0.1 - - [01/Oct/2024 18:45:57] "GET /search/1/1/1/blue HTTP/1.1" 200 -
127.0.0.1 - - [01/Oct/2024 18:46:01] "GET /search/1/1/1/blue;gemstone HTTP/1.1" 200 -
127.0.0.1 - - [01/Oct/2024 18:47:17] "GET /search/1/1/1/blue;gemstone;Sapphire HTTP/1.1" 200 -
127.0.0.1 - - [01/Oct/2024 18:47:33] "GET /search/1/1/1/blue;gemstone;Sapphire;Aquamarine HTTP/1.1" 200 -
127.0.0.1 - - [01/Oct/2024 18:48:10] "GET /search/1/1/1/blue;gemstone;Sapphire;%20beryl HTTP/1.1" 200 -
127.0.0.1 - - [01/Oct/2024 18:48:27] "GET /search/1/1/1/blue;gemstone;Sapphire;%20beryl;Tanzanite HTTP/1.1" 200 -
127.0.0.1 - - [01/Oct/2024 18:48:54] "GET /search/1/1/1/blue;gemstone;Sapphire;%20beryl;zoisite HTTP/1.1" 200 -
127.0.0.1 - - [01/Oct/2024 18:49:04] "GET /search/1/1/1/blue;gemstone;