In [1]:
import csv
import json
import os
import re

import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm.auto import tqdm

In [2]:
def query(my_wiki_id='Q83437'):
    """Fetch labelled triples for a Wikidata class and everything below it.

    Runs a SPARQL query against the public Wikidata endpoint that selects
    every statement whose subject is linked to ``my_wiki_id`` through zero
    or more ``wdt:P279`` hops. Only rows where the subject, the predicate's
    property entity and the object all carry an English ``rdfs:label`` are
    kept, and the result set is capped at 10000 rows by the query's LIMIT.

    Parameters
    ----------
    my_wiki_id : str
        Wikidata entity ID such as ``'Q83437'`` (a letter Q, P or L followed
        by digits). It is inserted verbatim into the query text.

    Returns
    -------
    dict
        The endpoint's JSON response as converted by SPARQLWrapper; the rows
        are found under ``["results"]["bindings"]``.

    Raises
    ------
    ValueError
        If ``my_wiki_id`` is not a plain entity ID. Raised before any
        network request is made.
    """
    # The ID is substituted into the query string, so anything other than a
    # bare entity ID could change the meaning of the query (SPARQL injection)
    # or simply produce a syntax error on the server side.
    if not isinstance(my_wiki_id, str) or not re.fullmatch(r"[QPL][0-9]+", my_wiki_id):
        raise ValueError(
            f"my_wiki_id must be a Wikidata entity ID like 'Q83437', got {my_wiki_id!r}"
        )

    # SPARQL endpoint for Wikidata
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    # Named sparql_text rather than `query` so the function's own name is not shadowed.
    sparql_text = """
    SELECT ?subjectLabel ?predicateLabel ?objectLabel ?subject ?object
    WHERE {
      ?subject ?predicate ?object.
      ?subject wdt:P279* wd:[[my_wiki_id]].
      ?subject rdfs:label ?subjectLabel.
      ?object rdfs:label ?objectLabel.
      ?x wikibase:directClaim ?predicate.
      ?x rdfs:label ?predicateLabel.
      FILTER((LANG(?subjectLabel) = "en" )).
      FILTER((LANG(?predicateLabel) = "en" )).
      FILTER((LANG(?objectLabel) = "en" )).
    }
    LIMIT 10000
    """.replace("[[my_wiki_id]]", my_wiki_id)

    # Set the query and return format
    sparql.setQuery(sparql_text)
    sparql.setReturnFormat(JSON)
    # Execute the query
    results = sparql.query().convert()
    return results
#query()

In [3]:
# def get_spo(results):

def class2spo(my_wiki_id='Q83437'):
    """Return the subject-predicate-object triples of a Wikidata class as a DataFrame.

    Calls ``query(my_wiki_id)`` and flattens each result binding into one row.

    Parameters
    ----------
    my_wiki_id : str
        Wikidata ID passed through to ``query``.

    Returns
    -------
    pandas.DataFrame
        One row per binding with the columns ``subjectLabel``,
        ``predicateLabel``, ``objectLabel`` (label text as returned) and
        ``subject``, ``object`` (the last ``/``-separated segment of the
        returned value, e.g. ``Q15315``).
    """
    def _last_segment(uri):
        # Keep only what follows the final slash of the entity URI.
        return uri.split('/')[-1]

    bindings = query(my_wiki_id)["results"]["bindings"]

    rows = [
        [
            binding["subjectLabel"]["value"],
            binding["predicateLabel"]["value"],
            binding["objectLabel"]["value"],
            _last_segment(binding["subject"]["value"]),
            _last_segment(binding["object"]["value"]),
        ]
        for binding in bindings
    ]

    column_names = ["subjectLabel", "predicateLabel", "objectLabel", "subject", "object"]
    return pd.DataFrame(rows, columns=column_names)

# Try the pipeline on a single class first: metamorphic rock (Q47069).
df = class2spo(my_wiki_id='Q47069')
df

Unnamed: 0,subjectLabel,predicateLabel,objectLabel,subject,object
0,crystalline schist,subclass of,metamorphic rock,Q15315,Q47069
1,Carrara marble,country of origin,Italy,Q40088,Q38
2,crystalline schist,different from,slate,Q15315,Q207079
3,crystalline schist,different from,shale,Q15315,Q751300
4,nephrite,subclass of,jade,Q138979,Q175089
...,...,...,...,...,...
678,argillisite,subclass of,metasomatic rock,Q56327448,Q12122900
679,alkaline metasomatite,subclass of,metasomatic rock,Q56327480,Q12122900
680,rodingite,subclass of,metasomatic rock,Q58371939,Q12122900
681,verd antique,topic's main category,Category:Verde Antico,Q1223325,Q13336910


In [4]:
# Root classes to harvest; class2spo is called once per entry.
S = [
    'Q83437',     # gemstone
    'Q161439',    # jewelry
    'Q2519376',   # jewelry designer
    'Q429795',    # mineral variety
    'Q12089225',  # mineral species
    'Q47069',     # metamorphic rock
    'Q429635',    # nesosilicates
]
# For a quick single-class run, temporarily use: S = ['Q47069']

OUTPUT_CSV = './data/1_spo_wiki.csv'

list_df = []
pbar = tqdm(S)
for s in pbar:
    pbar.set_description(f'Query class: {s:<15}')
    df = class2spo(my_wiki_id=s)
    df['source'] = s  # record which root class produced each row
    list_df.append(df)

# Concatenate the DataFrames row-wise (axis=0)
df_concatenated = pd.concat(list_df, axis=0, ignore_index=True)

# DataFrame.to_csv does not create missing parent directories, so make sure
# ./data exists; otherwise a fresh checkout fails here after all queries ran.
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
df_concatenated.to_csv(OUTPUT_CSV, index=False)
df_concatenated.info()
df_concatenated.head()

  0%|          | 0/7 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2921 entries, 0 to 2920
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   subjectLabel    2921 non-null   object
 1   predicateLabel  2921 non-null   object
 2   objectLabel     2921 non-null   object
 3   subject         2921 non-null   object
 4   object          2921 non-null   object
 5   source          2921 non-null   object
dtypes: object(6)
memory usage: 137.0+ KB


Unnamed: 0,subjectLabel,predicateLabel,objectLabel,subject,object,source
0,pearl,has use,jewelry,Q43436,Q161439,Q83437
1,pearl,described by source,Brockhaus and Efron Encyclopedic Dictionary,Q43436,Q602358,Q83437
2,pearl,described by source,Encyclopædia Britannica 11th edition,Q43436,Q867541,Q83437
3,ruby,color,red,Q43088,Q3142,Q83437
4,nephrite,described by source,Brockhaus and Efron Encyclopedic Dictionary,Q138979,Q602358,Q83437
