In [2]:
from ast import literal_eval
from tqdm.notebook import tqdm

from pathlib import Path

from time import sleep, time

from math import ceil
from random import uniform

import json
import jsonlines
import pandas as pd

# import stanza
from SPARQLWrapper import SPARQLWrapper, JSON

from utills import get_prop_info


In [3]:
# Root of all generated artefacts; every other location is derived from it.
output_dir = '../../outputs/'

# Working directories.
stats_dir = output_dir + 'prop_stats/wikibase-item_quantity_time/'
conllu_dir = output_dir + 'conllu/wikibase-item_quantity_time/'
conllu_all_dir = output_dir + 'conllu/all_datatypes/'
results_dir = output_dir + 'RDF/wikibase-item_quantity_time/'

# Individual files inside those directories.
entities_info_path = stats_dir + 'entities_info.json'
data_path = stats_dir + 'relevent_prop.csv'
conllu_ar_path = conllu_dir + 'udp_ar.conllu'
conllu_all_ar_path = conllu_all_dir + 'udp_ar.conllu'


In [4]:
# Flattened binding columns that are kept from each SPARQL result; any other
# column is dropped and any of these that is absent is added empty.
relevent_fields = [
    # subject
    'subject.value', 'subjectLabel_ar.value', 'subjectLabel_en.value',
    # predicate
    'predicate.value', 'predicateLabel_ar.value', 'predicateLabel_en.value',
    'ptype.value',
    # statement node
    'statement.value',
    # object
    'object.value', 'object.xml:lang', 'object.datatype',
    'objectLabel_ar.value', 'objectLabel_en.value',
    # qualifier
    'qualifier.value', 'qualifierLabel_ar.value', 'qualifierLabel_en.value',
    'pqtype.value',
    # qualifier value
    'object_sub.value', 'object_sub.datatype', 'object_sub.xml:lang',
    'object_subLabel_ar.value', 'object_subLabel_en.value',
    # unit
    'unit.value', 'unitLabel_ar.value', 'unitLabel_en.value',
]


In [5]:
def split_string(value):
    """Reduce a recognised URI to its trailing identifier.

    * Non-string values are returned untouched.
    * ``http://www.wikidata.org/entity/...`` yields the text after the last
      ``/``.
    * URIs in the XML Schema, GeoSPARQL or Wikibase ontology namespaces yield
      the text after the last ``#``.
    * Every other string is returned unchanged.
    """
    if not isinstance(value, str):
        return value

    if value.startswith('http://www.wikidata.org/entity/'):
        return value.rsplit('/', 1)[-1]

    hash_namespaces = (
        'http://www.w3.org/2001/XMLSchema#',
        'http://www.opengis.net/ont/geosparql#',
        'http://wikiba.se/ontology#',
    )
    if value.startswith(hash_namespaces):
        return value.rsplit('#', 1)[-1]

    return value

In [6]:
# Identification used when querying the SPARQL endpoint.
USER_AGENT = dict([
    ('User-Agent',
     'A bot to get statements in Arabic and English. (guszarzmo@student.gu.se)'),
])  # noqa:


In [7]:
def sparql_query(ent: str):
    """Fetch the statements of one Wikidata entity with Arabic/English labels.

    Parameters
    ----------
    ent : str
        Wikidata entity id (e.g. ``"Q1189"``); it is interpolated into the
        query as ``wd:{ent}``.

    Returns
    -------
    list
        The ``results.bindings`` list of the endpoint's JSON response.

    Notes
    -----
    ``USER_AGENT`` is a header-style dict. Only its ``'User-Agent'`` value is
    handed to SPARQLWrapper; formatting the whole dict into a string would
    send its ``repr`` (``"{'User-Agent': '...'}"``) as the header value.
    Any exception raised by SPARQLWrapper propagates to the caller.
    """
    endpoint_url = "https://query.wikidata.org/sparql"

    query_rdf = f"""SELECT ?subject    ?subjectLabel_en    ?subjectLabel_ar 
?predicate  ?predicateLabel_en  ?predicateLabel_ar  ?ptype 
?object     ?objectLabel_en     ?objectLabel_ar 
?qualifier  ?qualifierLabel_en  ?qualifierLabel_ar  ?pqtype 
?object_sub ?object_subLabel_en ?object_subLabel_ar 
?unit       ?unitLabel_en       ?unitLabel_ar
?statement
{{
  VALUES (?subject) {{(wd:{ent})}}

  ?subject ?p ?statement .
  ?predicate wikibase:claim ?p.

  ?subject rdfs:label ?subjectLabel_ar filter (lang(?subjectLabel_ar) = "ar") .
  ?subject rdfs:label ?subjectLabel_en filter (lang(?subjectLabel_en) = "en") .

  {{
    ?statement ?ps ?object .
    ?predicate wikibase:statementProperty ?ps .

    ?predicate wikibase:propertyType ?ptype .
    filter (?ptype != wikibase:ExternalId) .
    filter (?ptype != wikibase:CommonsMedia) .

    OPTIONAL {{
      ?statement ?pq ?object_sub .
      ?qualifier wikibase:qualifier ?pq .
      ?qualifier wikibase:propertyType ?pqtype .

      ?qualifier rdfs:label ?qualifierLabel_ar filter (lang(?qualifierLabel_ar) = "ar") .
      ?qualifier rdfs:label ?qualifierLabel_en filter (lang(?qualifierLabel_en) = "en") .
      
      ?object_sub rdfs:label ?object_subLabel_ar filter (lang(?object_subLabel_ar) = "ar") .
      ?object_sub rdfs:label ?object_subLabel_en filter (lang(?object_subLabel_en) = "en") .
    }}

    ?predicate rdfs:label ?predicateLabel_ar filter (lang(?predicateLabel_ar) = "ar") .
    ?predicate rdfs:label ?predicateLabel_en filter (lang(?predicateLabel_en) = "en") .

    OPTIONAL {{
      ?object rdfs:label ?objectLabel_en filter (lang(?objectLabel_en) = "en") .
      ?object rdfs:label ?objectLabel_ar filter (lang(?objectLabel_ar) = "ar") .
    }}
  }}
  UNION {{
    ?statement ?psv ?valuenode .
    ?predicate_sub wikibase:statementValue ?psv .
    ?statement ?psv [wikibase:quantityUnit ?unit] .

    ?unit rdfs:label ?unitLabel_ar filter (lang(?unitLabel_ar) = "ar") .
    ?unit rdfs:label ?unitLabel_en filter (lang(?unitLabel_en) = "en") .
  }}
}}
"""  # noqa

    # SPARQLWrapper's ``agent`` argument is the header value itself, so pull
    # it out of the dict (a plain string is accepted as-is).
    if isinstance(USER_AGENT, dict):
        agent = USER_AGENT['User-Agent']
    else:
        agent = str(USER_AGENT)

    sparql = SPARQLWrapper(endpoint_url, agent=agent)
    sparql.setQuery(query_rdf)
    sparql.setReturnFormat(JSON)
    response = sparql.query().convert()

    return response["results"]["bindings"]

In [8]:
# Entity id -> labels lookup. The labels include Arabic text, so decode the
# file as UTF-8 explicitly instead of relying on the platform default.
with open(entities_info_path, "r", encoding="utf-8") as f:
    entities_info = json.load(f)

# Property information extracted from the CoNLL-U files by the project helper
# `get_prop_info`; for the all-datatypes file only the keys are kept.
prop_ar_info = get_prop_info(conllu_ar_path)
prop_all_ar_ids = set(get_prop_info(conllu_all_ar_path))


In [9]:
def prop_label_ar(idx):
    """Return the ``prop_ar_info`` entry stored for property id ``idx``."""
    return prop_ar_info[idx]


def ent_label_en(e):
    """Return the ``'en'`` label stored in ``entities_info`` for entity ``e``."""
    return entities_info[e]['en']


def ent_label_ar(e):
    """Return the ``'ar'`` label stored in ``entities_info`` for entity ``e``."""
    return entities_info[e]['ar']


In [10]:
prop_data_df = pd.read_csv(data_path, index_col=0)

# One row per (property, entity) pair: the 'entity' column is parsed with
# literal_eval, exploded, and the labels are attached.
prop_ent_df = (
    prop_data_df
    .drop(columns='entity_count')
    .assign(entity=lambda frame: frame['entity'].apply(literal_eval))
    .explode('entity', ignore_index=True)
    .assign(
        ent_label_en=lambda frame: frame['entity'].apply(ent_label_en),
        ent_label_ar=lambda frame: frame['entity'].apply(ent_label_ar),
        prop_label_ar=lambda frame: frame['property_id'].apply(prop_label_ar),
    )
    .rename(columns={"prop_label": "prop_label_en", "entity": "entity_id"})
)


In [11]:
# Display the per-(property, entity) table built in the previous cell.
prop_ent_df

Unnamed: 0,property_id,entity_id,prop_label_en,ent_label_en,ent_label_ar,prop_label_ar
0,P1000,Q1189,record held,Usain Bolt,يوسين بولت,رقم قياسي حققه
1,P1000,Q1659,record held,David Rudisha,ديفيد روديشا,رقم قياسي حققه
2,P1000,Q1665,record held,Taoufik Makhloufi,توفيق مخلوفي,رقم قياسي حققه
3,P1000,Q1668,record held,Leonel Manzano,ليونيل مانزانو,رقم قياسي حققه
4,P1000,Q1679,record held,Mahiedine Mekhissi-Benabbad,محيي الدين مخيسي,رقم قياسي حققه
...,...,...,...,...,...,...
156608,P9929,Q111718690,madhhab,Hassan Kacimi,حسن القاسمي,مذاهب إسلامية
156609,P9929,Q112623711,madhhab,Boualem Boushaki,بوعلام بوسحاقي,مذاهب إسلامية
156610,P9929,Q113129815,madhhab,Muhammad Taha al-Huwayzi,محمد طه الحويزي,مذاهب إسلامية
156611,P9929,Q113771520,madhhab,Ibn Yuzayy,أبو عبد الله بن جزي,مذاهب إسلامية


In [63]:
# Distinct entity ids in sorted order, plus how many there are.
unique_entity_ids = prop_ent_df['entity_id'].drop_duplicates()
entity_ids_ = unique_entity_ids.sort_values().tolist()
length = len(entity_ids_)

In [24]:
# output file path
api_results_raw = Path(f'{results_dir}api_results_sparql_v.2.jsonl')
api_results = Path(f'{results_dir}api_results_processed_v.1.jsonl')

In [64]:
# Resume support: each line of the processed file holds the results of one
# entity, so the last line identifies the last entity that was written.
jline = None
if api_results.is_file():
    with jsonlines.open(str(api_results), 'r') as reader:
        # Walk to the end of the file; only the final record is needed.
        for jline in tqdm(reader.iter()):
            pass

if jline is not None:
    # A record is the JSON text of a DataFrame, hence the second decoding
    # step; row '0' of 'subject.value' carries the entity id.
    ent_id_resume = dict(json.loads(jline))['subject.value']['0']
    resume_idx = entity_ids_.index(ent_id_resume)
    entity_ids = entity_ids_[resume_idx + 1:]
else:
    # No results file yet, or it exists but has no records: start over.
    entity_ids = entity_ids_[::]
    resume_idx = 0
    ent_id_resume = None

# entity_ids is empty once every entity has been processed, so guard the
# lookup of the next id instead of indexing unconditionally.
next_ent_id = entity_ids[0] if entity_ids else None
print(resume_idx, ent_id_resume, next_ent_id)

0it [00:00, ?it/s]

70827 Q600568 Q60059


In [65]:
# query rdf using SPARQL
relevent_columns_set = set(relevent_fields)
query = True
results = []
duration = -1
if query:
    pbar = tqdm(total=len(entity_ids_), initial=resume_idx + 1)
    for ent_id in entity_ids:
        retry = True
        retry_num = 0
        results = None
        while retry:
            try:
                start_time = time()
                pbar.set_description(f'd: {duration:.2f} - {ent_id:>10}')
                results = sparql_query(ent_id)
                retry = False
                if duration == -1:
                    sleep(1.15)
                else:
                    r = round(uniform(.1, .4), 2)
                    if duration >= 1.15:
                        sleep(r)
                    else:
                        sleep(round(1.15 - duration + r, 2))

            except Exception as e:
                e_name = type(e).__name__
                descr = pbar.desc
                pbar.set_description(
                    f'{descr} - Exception: {e_name} - retry: {retry_num + 1}')
                sleep(90)
                retry_num += 1
                if retry_num > 4:
                    retry = False
        if results:
            result_df = pd.json_normalize(results)
            # remove uncessary field, and add missing field from nesseary
            # fields.
            df_fields = set(result_df.columns)
            missing_fields = list(relevent_columns_set.difference(df_fields))
            if missing_fields:
                result_df[missing_fields] = pd.DataFrame(
                    [[None] * len(missing_fields)], index=result_df.index)
            unrelevent_columns = df_fields.difference(relevent_columns_set)
            if unrelevent_columns:
                result_df.drop(unrelevent_columns,
                               axis='columns',
                               inplace=True)

            result_df['subject.value'] = result_df['subject.value'].apply(
                split_string)
            result_df['statement.value'] = result_df['statement.value'].apply(
                split_string)
            result_df['predicate.value'] = result_df['predicate.value'].apply(
                split_string)
            result_df['unit.value'] = result_df['unit.value'].apply(
                split_string)
            result_df['object.value'] = result_df['object.value'].apply(
                split_string)
            result_df['ptype.value'] = result_df['ptype.value'].apply(
                split_string)
            result_df['object.datatype'] = result_df['object.datatype'].apply(
                split_string)
            result_df['object_sub.value'] = result_df[
                'object_sub.value'].apply(split_string)
            result_df['qualifier.value'] = result_df['qualifier.value'].apply(
                split_string)
            result_df['pqtype.value'] = result_df['pqtype.value'].apply(
                split_string)

            results_dict = result_df.to_json(force_ascii=False)

            with jsonlines.open(str(api_results), mode='a') as writer:
                writer.write(results_dict)

            with jsonlines.open(str(api_results_raw), mode='a') as writer:
                writer.write(results)

        end_time = time()
        duration = round(end_time - start_time, 2)
        pbar.update()


 78%|#######7  | 70828/91179 [00:00<?, ?it/s]