In [16]:
from SPARQLWrapper import SPARQLWrapper, JSON

def get_wikiurl_wikidata(my_wiki_id='Q83437'):
    """Return the English Wikipedia URL for a Wikidata item.

    Sends a SPARQL query to ``https://query.wikidata.org/sparql`` asking for
    the page that is ``schema:about`` the item and ``schema:isPartOf``
    ``https://en.wikipedia.org/``.

    Args:
        my_wiki_id: Wikidata item ID as a string: the letter ``Q`` followed
            by one or more ASCII digits (default ``'Q83437'``). Leading and
            trailing whitespace is ignored.

    Returns:
        The ``wikipedia_url`` value of the first result binding.

    Raises:
        ValueError: If ``my_wiki_id`` is not a string of the form ``Q<digits>``.
            Raised before any request is made.
        LookupError: If the query returns no binding with a URL.
    """
    # The ID is spliced into the query text, so reject anything that is not a
    # plain item ID; otherwise arbitrary SPARQL could be injected or the query
    # would simply be malformed (e.g. NaN coming from a DataFrame).
    if not isinstance(my_wiki_id, str):
        raise ValueError(f'invalid Wikidata item ID: {my_wiki_id!r}')
    wiki_id = my_wiki_id.strip()
    digits = wiki_id[1:]
    if not (wiki_id.startswith('Q') and digits.isascii() and digits.isdigit()):
        raise ValueError(f'invalid Wikidata item ID: {my_wiki_id!r}')

    query = """
    SELECT ?entity ?entityLabel ?wikipedia_url WHERE {
      BIND(wd:[[my_wiki_id]] AS ?entity).
      ?wikipedia_url schema:about ?entity;
                     schema:isPartOf <https://en.wikipedia.org/>.
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    """.replace('[[my_wiki_id]]', wiki_id)

    # Set up the SPARQL query
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    # Execute the query and retrieve the results
    results = sparql.query().convert()

    # Return the first URL found; the label column is not needed here.
    for result in results["results"]["bindings"]:
        wikipedia_url = result["wikipedia_url"]["value"]
        if wikipedia_url is not None:
            return wikipedia_url
    # LookupError is an Exception subclass, so existing `except Exception`
    # handlers keep working.
    raise LookupError('wikipedia_url is not found')

# Quick check: look up the function's default ID ('Q83437') and display the URL.
get_wikiurl_wikidata()

'https://en.wikipedia.org/wiki/Gemstone'

In [17]:
import pandas as pd
# Load the entity IDs from e.csv; the 'wiki_id' column is the one used downstream.
df_e=pd.read_csv('e.csv')
# Print column names, non-null counts and dtypes.
df_e.info()
# Display the first rows as the cell output.
df_e.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424 entries, 0 to 423
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   wiki_id  424 non-null    object
dtypes: object(1)
memory usage: 3.4+ KB


Unnamed: 0,wiki_id
0,Q43436
1,Q43088
2,Q43513
3,Q5283
4,Q4700957


In [18]:
# Prerequisite for the next cell — install once with: %pip install -U monadsquishy

In [19]:
from monadsquishy import Squishy, sf
# Squishy pipeline configuration: a single transformation over df_e that
# produces two output columns, both fed from the 'wiki_id' input column.
sq_config = {
    'transformations':[
        {
            'input_table': df_e,
            # NOTE(review): both paths point at the same directory — confirm
            # that the library does not expect them to differ.
            'transformed_path':'./staging/url',
            'exploded_path':'./staging/url',
            'out_columns': {
                 # Identity function: carries the original ID through unchanged.
                 'wiki_id': {
                     'input':'wiki_id',
                     'funcs':[lambda x:x, ],
                 },
                 # Wikipedia URL resolved by get_wikiurl_wikidata, which sends a
                 # SPARQL query to the Wikidata endpoint.
                 # NOTE(review): presumably Squishy calls each func once per
                 # input value, i.e. one network request per row — confirm.
                 'url': {
                     'input':'wiki_id',
                     'funcs':[get_wikiurl_wikidata, ],
                 }
             }
        }
    ]
}
# Build the pipeline from the config and execute it.
sq=Squishy(sq_config)
sq.run()

1/2 Output: wiki_id
Input: wiki_id              -->['<lambda>']                                                                                                                                                                                            

2/2 Output: url
Input: wiki_id              -->['get_wikiurl_wikidata']                                                                                                                                                                                

Finished transformations!


In [20]:
# Show the pipeline's clean report (in the output below: a count of passed values per output column).
sq.clean_report()

Unnamed: 0,input_column,output_column,message,clean_count
1,wiki_id,wiki_id,Passed: <lambda>(),424
0,wiki_id,url,Passed: get_wikiurl_wikidata(),271


In [21]:
# Show the pipeline's dirty report (in the output below: input values listed with a dirty count).
# NOTE(review): presumably these are the IDs for which get_wikiurl_wikidata raised — confirm.
sq.dirty_report()

Unnamed: 0,input_column,output_column,input_value,dirty_count
0,wiki_id,url,Q10350376,1
1,wiki_id,url,Q10428697,1
2,wiki_id,url,Q104708160,1
3,wiki_id,url,Q1058175,1
4,wiki_id,url,Q106878323,1
...,...,...,...,...
148,wiki_id,url,Q89675973,1
149,wiki_id,url,Q9206678,1
150,wiki_id,url,Q9324384,1
151,wiki_id,url,Q97097388,1


In [22]:
# Display the transformed table with the 'wiki_id' and 'url' output columns.
sq.output()

Unnamed: 0,wiki_id,url
0,Q43436,https://en.wikipedia.org/wiki/Pearl
1,Q43088,https://en.wikipedia.org/wiki/Ruby
2,Q43513,https://en.wikipedia.org/wiki/Emerald
3,Q5283,https://en.wikipedia.org/wiki/Diamond
4,Q4700957,https://en.wikipedia.org/wiki/Aqeeq
...,...,...
419,Q59922524,
420,Q47458415,
421,Q88789639,
422,Q113921884,


In [23]:
# Take the first three entity IDs as a small sample for the manual lookup loop.
# Slice before converting so the whole column is not turned into a list first.
entity_list = df_e['wiki_id'].iloc[:3].tolist()
# Display the sample (its length is evident from the output).
entity_list

['Q43436', 'Q43088', 'Q43513']

In [24]:
from tqdm.auto import tqdm
# Resolve each sampled entity ID to its Wikipedia URL, with a tqdm progress bar.
urls={}
pbar = tqdm(entity_list)
for e in pbar:
    # Show the ID currently being queried, left-aligned in a 15-character field.
    pbar.set_description(f'Query entity: {e:<15}')
    # get_wikiurl_wikidata raises when no URL is found, so a single failed
    # lookup aborts the loop.
    urls[e] = get_wikiurl_wikidata(e)

# Display the resulting ID -> URL mapping.
urls

  0%|          | 0/3 [00:00<?, ?it/s]

{'Q43436': 'https://en.wikipedia.org/wiki/Pearl',
 'Q43088': 'https://en.wikipedia.org/wiki/Ruby',
 'Q43513': 'https://en.wikipedia.org/wiki/Emerald'}