In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON

def get_wikiurl_wikidata(my_wiki_id='Q83437'):
    """Look up the English Wikipedia article URL of a Wikidata entity.

    One SPARQL query is sent to the public Wikidata endpoint asking for the
    page that is ``schema:about`` the entity and ``schema:isPartOf``
    ``https://en.wikipedia.org/``.

    Args:
        my_wiki_id: Wikidata id such as ``'Q83437'``. Surrounding whitespace
            is ignored.

    Returns:
        The URL (``str``) from the first result binding that carries one.

    Raises:
        ValueError: if ``my_wiki_id`` is not a string shaped like ``Q123``,
            ``P123`` or ``L123``.
        Exception: with the message ``'wikipedia_url is not found'`` when the
            query returns no binding with a Wikipedia URL.
    """
    import re  # local import so this cell does not depend on another cell's imports

    # Validate BEFORE touching the network: the id is spliced into the query
    # text, so anything other than a plain entity id (a NaN read from a CSV,
    # stray SPARQL syntax, ...) must be rejected rather than sent.
    if not isinstance(my_wiki_id, str):
        raise ValueError(f'invalid Wikidata id: {my_wiki_id!r}')
    wiki_id = my_wiki_id.strip()
    if re.fullmatch(r'[QPL][0-9]+', wiki_id) is None:
        raise ValueError(f'invalid Wikidata id: {my_wiki_id!r}')

    # NOTE(review): no custom User-Agent is set here; check the Wikimedia
    # User-Agent policy before running this over many ids.
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

    query = """
    SELECT ?entity ?entityLabel ?wikipedia_url WHERE {
      BIND(wd:[[my_wiki_id]] AS ?entity).
      ?wikipedia_url schema:about ?entity;
                     schema:isPartOf <https://en.wikipedia.org/>.
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    """.replace('[[my_wiki_id]]', wiki_id)

    # Set up the SPARQL query
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    # Execute the query and retrieve the results
    results = sparql.query().convert()

    # Return the first binding that has a URL; a binding without one is
    # skipped instead of raising KeyError.
    for result in results["results"]["bindings"]:
        wikipedia_url = result.get("wikipedia_url", {}).get("value")
        if wikipedia_url is not None:
            return wikipedia_url
    raise Exception('wikipedia_url is not found')

# Smoke test: call the lookup with its default id ('Q83437') and display the
# returned URL as the cell output.
get_wikiurl_wikidata()

'https://en.wikipedia.org/wiki/Gemstone'

In [2]:
import pandas as pd
# Load the table of Wikidata ids to resolve (path is relative to the notebook).
df_e = pd.read_csv('./data/3_e.csv')

# Print the schema summary, then show the first rows as the cell output.
df_e.info()
df_e.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1372 entries, 0 to 1371
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   wiki_id  1372 non-null   object
dtypes: object(1)
memory usage: 10.8+ KB


Unnamed: 0,wiki_id
0,Q43436
1,Q43088
2,Q5283
3,Q573870
4,Q612430


In [3]:
pip install monadsquishy -U

Note: you may need to restart the kernel to use updated packages.


In [4]:
from monadsquishy import Squishy, sf
# Output columns of the transformation, both derived from the 'wiki_id' input:
#   wiki_id - passed through unchanged (identity lambda)
#   url     - result of get_wikiurl_wikidata for that id
url_out_columns = {
    'wiki_id': {'input': 'wiki_id', 'funcs': [lambda x: x]},
    'url': {'input': 'wiki_id', 'funcs': [get_wikiurl_wikidata]},
}

url_transformation = {
    'input_table': df_e,
    # NOTE(review): both paths point at the same directory - confirm this is intended.
    'transformed_path': './staging/url',
    'exploded_path': './staging/url',
    'out_columns': url_out_columns,
}

sq_config = {'transformations': [url_transformation]}

# Build the pipeline from the config and execute it.
sq = Squishy(sq_config)
sq.run()

1/2 Output: wiki_id
Input: wiki_id             
Process: ['<lambda>']


  0%|          | 0/1372 [00:00<?, ?it/s]

2/2 Output: url
Input: wiki_id             
Process: ['get_wikiurl_wikidata']


  0%|          | 0/1372 [00:00<?, ?it/s]

>> Finished transformations!


In [5]:
# Display the clean report: in the recorded output it lists, per output
# column, a "Passed: <function>()" message with a clean_count.
sq.clean_report()

Unnamed: 0,input_column,output_column,message,clean_count
1,wiki_id,wiki_id,Passed: <lambda>(),1372
0,wiki_id,url,Passed: get_wikiurl_wikidata(),682


In [6]:
# Display the dirty report: in the recorded output it lists input values with
# a dirty_count - presumably the ids for which the url lookup raised; verify.
sq.dirty_report()

Unnamed: 0,input_column,output_column,input_value,dirty_count
0,wiki_id,url,P1088,1
1,wiki_id,url,P1632,1
2,wiki_id,url,P2054,1
3,wiki_id,url,P2177,1
4,wiki_id,url,P462,1
...,...,...,...,...
685,wiki_id,url,Q98194857,1
686,wiki_id,url,Q98592852,1
687,wiki_id,url,Q98876273,1
688,wiki_id,url,Q98878795,1


In [7]:
# Display the transformed table (wiki_id, url); in the recorded output the url
# is blank for some ids.
sq.output()

Unnamed: 0,wiki_id,url
0,Q43436,https://en.wikipedia.org/wiki/Pearl
1,Q43088,https://en.wikipedia.org/wiki/Ruby
2,Q5283,https://en.wikipedia.org/wiki/Diamond
3,Q573870,https://en.wikipedia.org/wiki/Bi_(jade)
4,Q612430,
...,...,...
1367,Q483958,"https://en.wikipedia.org/wiki/Okanogan_County,..."
1368,Q122510,
1369,Q7972122,https://en.wikipedia.org/wiki/Washington_Pass
1370,Q6731423,https://en.wikipedia.org/wiki/Magnet_Cove_igne...


In [8]:
# Take only the first three ids as a small sample for the manual lookup loop.
# Slicing the Series first avoids turning the whole column into a list.
entity_list = df_e['wiki_id'].head(3).tolist()
entity_list

['Q43436', 'Q43088', 'Q5283']

In [12]:
from tqdm.auto import tqdm
# Resolve every id in entity_list one by one, collecting id -> URL in `urls`.
# The progress bar label is updated to show the id currently being queried.
urls = dict()
pbar = tqdm(entity_list)
for e in pbar:
    progress_label = f'Query entity: {e:<15}'
    pbar.set_description(progress_label)
    urls[e] = get_wikiurl_wikidata(e)

urls

  0%|          | 0/3 [00:00<?, ?it/s]

{'Q43436': 'https://en.wikipedia.org/wiki/Pearl',
 'Q43088': 'https://en.wikipedia.org/wiki/Ruby',
 'Q5283': 'https://en.wikipedia.org/wiki/Diamond'}