In [1]:
from ast import literal_eval
from tqdm.notebook import tqdm

from pathlib import Path

from time import sleep

import json
import jsonlines
import pandas as pd

# import stanza
from SPARQLWrapper import SPARQLWrapper, JSON

from utills import get_prop_info


In [2]:
# Root folder for generated artefacts. Every directory string below ends with
# '/' so that file names can be appended directly.
output_dir = '../../outputs/'

# Per-datatype working directories
stats_dir = output_dir + 'prop_stats/wikibase-item_quantity_time/'
conllu_dir = output_dir + 'conllu/wikibase-item_quantity_time/'
conllu_all_dir = output_dir + 'conllu/all_datatypes/'
results_dir = output_dir + 'RDF/wikibase-item_quantity_time/'

# Input files read by the cells below
entities_info_path = stats_dir + 'entities_info.json'
data_path = stats_dir + 'relevent_prop.csv'
conllu_ar_path = conllu_dir + 'udp_ar.conllu'
conllu_all_ar_path = conllu_all_dir + 'udp_ar.conllu'


In [3]:
# Header-style mapping that identifies this client when querying the endpoint.
USER_AGENT = {
    'User-Agent': (
        'A bot to get subclass/isinstence. '
        '(guszarzmo@student.gu.se)'
    ),
}


In [4]:


def sparql_query(ent: str):
    """Fetch the statements of one Wikidata entity through the SPARQL endpoint.

    The query walks every statement node of ``wd:<ent>`` and returns the
    predicate and statement-value labels in English and Arabic. These four
    label patterns are mandatory, so statements missing any of them are not
    returned. Qualifier labels, qualifier value-node labels and quantity-unit
    labels are fetched inside OPTIONAL groups.

    Args:
        ent: Entity id interpolated after the ``wd:`` prefix (e.g. ``"Q1189"``).

    Returns:
        The ``results -> bindings`` entry of the JSON response converted by
        SPARQLWrapper.

    Raises:
        Any exception raised by SPARQLWrapper while sending the request or
        converting the response propagates to the caller.
    """
    endpoint_url = "https://query.wikidata.org/sparql"

    query_rdf = f"""SELECT ?object ?p ?pq ?predicateLabel_en ?predicateLabel_ar ?statementLabel_en ?statementLabel_ar ?qualifierLabel_en ?qualifierLabel_ar ?valuenode ?valuenodeLabel_en ?valuenodeLabel_ar ?unitLabel_en ?unitLabel_ar{{

    VALUES ?subject {{wd:{ent}}}
    ?subject ?p ?object .
    ?object ?ps ?statement .

    ?predicate wikibase:claim ?p .
    ?predicate wikibase:statementProperty ?ps .

    ?predicate rdfs:label ?predicateLabel_en filter (lang(?predicateLabel_en) = "en") .
    ?statement rdfs:label ?statementLabel_en filter (lang(?statementLabel_en) = "en") .
    ?statement rdfs:label ?statementLabel_ar filter (lang(?statementLabel_ar) = "ar") .
    ?predicate rdfs:label ?predicateLabel_ar filter (lang(?predicateLabel_ar) = "ar") .

    OPTIONAL {{
        ?object ?pq ?valuenode .
        ?qualifier wikibase:qualifier ?pq .
        ?qualifier wikibase:qualifierValue ?pqv .

        ?qualifier rdfs:label ?qualifierLabel_en filter (lang(?qualifierLabel_en) = "en") .
        ?qualifier rdfs:label ?qualifierLabel_ar filter (lang(?qualifierLabel_ar) = "ar") .

        OPTIONAL {{
        ?object ?pqv [wikibase:quantityUnit ?unit] .
        ?unit rdfs:label ?unitLabel_en filter (lang(?unitLabel_en) = "en") .
        ?unit rdfs:label ?unitLabel_ar filter (lang(?unitLabel_ar) = "ar") .
        }}

        OPTIONAL {{
        ?valuenode rdfs:label ?valuenodeLabel_en filter (lang(?valuenodeLabel_en) = "en") .
        ?valuenode rdfs:label ?valuenodeLabel_ar filter (lang(?valuenodeLabel_ar) = "ar") .
        }}

    }}
    }}"""  # noqa

    # USER_AGENT is a header-style dict. Formatting the whole dict would send
    # its repr ("{'User-Agent': ...}") as the agent, so pass only the value.
    if isinstance(USER_AGENT, dict):
        user_agent = USER_AGENT['User-Agent']
    else:
        user_agent = str(USER_AGENT)

    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query_rdf)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    return results["results"]["bindings"]



In [5]:
# Load the per-entity label dictionary. The file holds Arabic text, so the
# encoding is pinned instead of relying on the platform default.
with open(entities_info_path, "r", encoding="utf-8") as f:
    entities_info = json.load(f)

# Property information parsed from the CoNLL-U files by the project helper.
prop_ar_info = get_prop_info(conllu_ar_path)
# Only the keys are kept for the all-datatypes file.
prop_all_ar_ids = set(get_prop_info(conllu_all_ar_path))


In [6]:
def prop_label_ar(idx):
    """Return the ``prop_ar_info`` entry for property id ``idx``.

    Presumably the Arabic property label (it fills the ``prop_label_ar``
    column) -- confirm against ``get_prop_info``. Raises ``KeyError`` for an
    unknown id.
    """
    return prop_ar_info[idx]


def ent_label_en(e):
    """Return the ``'en'`` field stored in ``entities_info`` for entity ``e``.

    Raises ``KeyError`` if the entity or its ``'en'`` field is missing.
    """
    return entities_info[e]['en']


def ent_label_ar(e):
    """Return the ``'ar'`` field stored in ``entities_info`` for entity ``e``.

    Raises ``KeyError`` if the entity or its ``'ar'`` field is missing.
    """
    return entities_info[e]['ar']


In [7]:
# Raw property table; its first CSV column is used as the index.
prop_data_df = pd.read_csv(data_path, index_col=0)

# One row per (property, entity) pair, enriched with labels:
#   1. drop the count column,
#   2. parse the stringified entity lists and explode them into rows,
#   3. attach entity / property labels,
#   4. give the id and label columns their final names.
prop_ent_df = (
    prop_data_df
    .drop(columns=['entity_count'])
    .assign(entity=lambda frame: frame['entity'].apply(literal_eval))
    .explode('entity', ignore_index=True)
    .assign(
        ent_label_en=lambda frame: frame['entity'].apply(ent_label_en),
        ent_label_ar=lambda frame: frame['entity'].apply(ent_label_ar),
        prop_label_ar=lambda frame: frame['property_id'].apply(prop_label_ar),
    )
    .rename(columns={"prop_label": "prop_label_en", "entity": "entity_id"})
)


In [11]:
# Display the exploded property/entity table (rendered as the cell output).
prop_ent_df

Unnamed: 0,property_id,entity_id,prop_label_en,ent_label_en,ent_label_ar,prop_label_ar
0,P1000,Q1189,record held,Usain Bolt,يوسين بولت,رقم قياسي حققه
1,P1000,Q1659,record held,David Rudisha,ديفيد روديشا,رقم قياسي حققه
2,P1000,Q1665,record held,Taoufik Makhloufi,توفيق مخلوفي,رقم قياسي حققه
3,P1000,Q1668,record held,Leonel Manzano,ليونيل مانزانو,رقم قياسي حققه
4,P1000,Q1679,record held,Mahiedine Mekhissi-Benabbad,محيي الدين مخيسي,رقم قياسي حققه
...,...,...,...,...,...,...
156608,P9929,Q111718690,madhhab,Hassan Kacimi,حسن القاسمي,مذاهب إسلامية
156609,P9929,Q112623711,madhhab,Boualem Boushaki,بوعلام بوسحاقي,مذاهب إسلامية
156610,P9929,Q113129815,madhhab,Muhammad Taha al-Huwayzi,محمد طه الحويزي,مذاهب إسلامية
156611,P9929,Q113771520,madhhab,Ibn Yuzayy,أبو عبد الله بن جزي,مذاهب إسلامية


In [8]:
# Distinct entity ids in ascending order, and how many there are.
entity_ids = (
    prop_ent_df['entity_id']
    .drop_duplicates()
    .sort_values()
    .tolist()
)
length = len(entity_ids)


In [9]:
# JSON Lines file, located in results_dir, that receives the SPARQL results
api_results = Path('{}api_results_sparql.jsonl'.format(results_dir))


In [10]:
# Query the RDF statements of every entity with SPARQL and append them to the
# JSON Lines output, one line per entity.
#
# Resuming counts the lines already in the output file, so the file must hold
# exactly one line per processed entity, in the order of `entity_ids`.

query = False
if query:
    # Make sure the output directory exists before the first append.
    api_results.parent.mkdir(parents=True, exist_ok=True)

    if api_results.is_file():
        with jsonlines.open(str(api_results), 'r') as reader:
            resume_idx = sum(1 for _ in reader.iter())
    else:
        resume_idx = 0

    # Slice into a new name: re-assigning `entity_ids` would make a second run
    # of this cell slice an already-sliced list and silently skip entities.
    pending_ids = entity_ids[resume_idx:]

    max_attempts = 5  # attempts per entity before giving up
    with tqdm(total=len(entity_ids), initial=resume_idx) as pbar:
        for ent_id in pending_ids:
            results = None
            for attempt in range(1, max_attempts + 1):
                pbar.set_description(f'{ent_id}')
                try:
                    results = sparql_query(ent_id)
                except Exception as e:
                    e_name = type(e).__name__
                    pbar.set_description(
                        f'{ent_id} - Exception: {e_name} - retry: {attempt}')
                    # Back off before retrying (or before the next entity).
                    sleep(90)
                else:
                    # Short pause between successful requests.
                    sleep(1.1)
                    break

            # A line is written even when every attempt failed (`results` is
            # then None), which keeps the line count aligned with the entity
            # order that the resume logic relies on.
            with jsonlines.open(str(api_results), mode='a') as writer:
                writer.write({ent_id: results})

            pbar.update()
