In [8]:
from SPARQLWrapper import SPARQLWrapper, JSON
import csv
import json
import pandas as pd
from tqdm import tqdm

In [13]:
def query(my_wiki_id='Q83437'):
    """Fetch labelled triples for a Wikidata class and everything beneath it.

    The query selects every statement whose subject is connected to
    ``my_wiki_id`` through zero or more ``wdt:P279`` hops (P279 is Wikidata's
    "subclass of" property), so the class itself is included. Only
    direct-claim predicates are matched, and subject, predicate and object
    must each carry an English ``rdfs:label``. At most 10000 rows are
    returned (``LIMIT 10000``).

    Parameters
    ----------
    my_wiki_id : str
        Wikidata item ID of the root class, e.g. ``'Q83437'``. Surrounding
        whitespace is ignored.

    Returns
    -------
    The endpoint response converted by SPARQLWrapper for the JSON return
    format; rows are read by callers from ``result["results"]["bindings"]``.

    Raises
    ------
    ValueError
        If ``my_wiki_id`` is not a string of the form ``Q<digits>``.
    """
    import re  # local import: only needed for the ID check below

    # The ID is pasted into the query text verbatim, so anything other than a
    # plain item ID would either break the query with an opaque endpoint error
    # or silently change its structure. Fail early with a clear message.
    if not isinstance(my_wiki_id, str) or not re.fullmatch(r"Q[0-9]+", my_wiki_id.strip()):
        raise ValueError(
            f"my_wiki_id must be a Wikidata item ID such as 'Q83437', got {my_wiki_id!r}"
        )
    item_id = my_wiki_id.strip()

    # SPARQL endpoint for Wikidata
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

    # Named query_text (not `query`) so the function's own name is not shadowed.
    query_text = """
    SELECT ?subjectLabel ?predicateLabel ?objectLabel ?subject ?object
    WHERE {
      ?subject ?predicate ?object.
      ?subject wdt:P279* wd:[[my_wiki_id]].
      ?subject rdfs:label ?subjectLabel.
      ?object rdfs:label ?objectLabel.
      ?x wikibase:directClaim ?predicate.
      ?x rdfs:label ?predicateLabel.
      FILTER((LANG(?subjectLabel) = "en" )).
      FILTER((LANG(?predicateLabel) = "en" )).
      FILTER((LANG(?objectLabel) = "en" )).
    }
    LIMIT 10000
    """.replace("[[my_wiki_id]]", item_id)

    # Set the query and return format, then execute and decode the response
    sparql.setQuery(query_text)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()
#query()

In [24]:
# def get_spo(results):

def class2spo(my_wiki_id='Q83437'):
    """Run ``query`` for one Wikidata class and tabulate the result.

    Each binding in the response becomes one row holding the three labels
    (subject, predicate, object) followed by the subject and object IDs,
    where an ID is the last ``/``-separated segment of the entity URI.

    Returns a DataFrame with columns
    ``subjectLabel, predicateLabel, objectLabel, subject, object``.
    """
    def last_segment(uri):
        # Keep only the trailing path component of the URI.
        return uri.split('/')[-1]

    bindings = query(my_wiki_id)["results"]["bindings"]

    rows = [
        [
            binding["subjectLabel"]["value"],
            binding["predicateLabel"]["value"],
            binding["objectLabel"]["value"],
            last_segment(binding["subject"]["value"]),
            last_segment(binding["object"]["value"]),
        ]
        for binding in bindings
    ]

    column_names = ["subjectLabel", "predicateLabel", "objectLabel", "subject", "object"]
    return pd.DataFrame(rows, columns=column_names)

# Try the extraction on a single class (Q83437) and display the frame.
df = class2spo('Q83437')
df

Unnamed: 0,subjectLabel,predicateLabel,objectLabel,subject,object
0,pearl,instance of,spherical body,Q43436,Q112511193
1,ruby,streak color,white,Q43088,Q23444
2,emerald,streak color,white,Q43513,Q23444
3,ruby,color,red,Q43088,Q3142
4,pearl,made from material,nacre,Q43436,Q215865
...,...,...,...,...,...
454,iolite,subclass of,cordierite,Q122998805,Q410336
455,pink diamond,color,pink,Q16191831,Q429220
456,cat's eye quartz,subclass of,cat's eye,Q1736875,Q25510612
457,engraved gem,described by source,Real'nyj slovar' klassicheskih drevnostej po L...,Q1501187,Q30059240


In [27]:
# Candidate root classes. The entries marked "+" are the ones listed in S
# below; the others are not queried.
# gemstone (Q83437) +
# jewelry (Q161439) +
# jewelry designer (Q2519376)+
# mineral variety (Q429795)
# mineral species (Q12089225)
# metamorphic rock
S=['Q83437','Q161439','Q2519376']

# Query each class separately and tag every row with the class it came from.
list_df=[]
for s in tqdm(S):
    # tqdm.write instead of print: a plain print() inside the loop splits the
    # progress bar into fragments in the cell output.
    tqdm.write(f'Query class: {s}')
    df=class2spo(my_wiki_id=s)
    df['class']=s
    list_df.append(df)

# Concatenate the DataFrames row-wise (axis=0)
df_concatenated = pd.concat(list_df, axis=0, ignore_index=True)
df_concatenated.info()
df_concatenated.head()

  0%|                                                                                                                                                              | 0/3 [00:00<?, ?it/s]

Query class: Q83437


 33%|██████████████████████████████████████████████████                                                                                                    | 1/3 [00:00<00:00,  3.68it/s]

Query class: Q161439


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  4.30it/s]

Query class: Q2519376
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   subjectLabel    768 non-null    object
 1   predicateLabel  768 non-null    object
 2   objectLabel     768 non-null    object
 3   subject         768 non-null    object
 4   object          768 non-null    object
 5   class           768 non-null    object
dtypes: object(6)
memory usage: 36.1+ KB





Unnamed: 0,subjectLabel,predicateLabel,objectLabel,subject,object,class
0,pearl,instance of,spherical body,Q43436,Q112511193,Q83437
1,ruby,color,red,Q43088,Q3142,Q83437
2,ruby,described by source,Brockhaus and Efron Encyclopedic Dictionary,Q43088,Q602358,Q83437
3,emerald,streak color,white,Q43513,Q23444,Q83437
4,diamond,streak color,white,Q5283,Q23444,Q83437


In [32]:
# Persist the combined triples; index=False keeps the row index out of the file.
df_concatenated.to_csv(
    "spo_wiki.csv",
    index=False,
)