Visualisation des différentes statistiques de Dbnary
=============

In [9]:
# RDF
import rdflib

# Other
import ipywidgets as widgets
from IPython.display import display, Markdown

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
# Select seaborn's "darkgrid" theme for the plots drawn by this notebook.
sns.set_theme(style="darkgrid")

### Fonction qui retourne un DataFrame des résultats d'une requête SPARQL

In [20]:
# %load ../../src/SPARQL_query
import time as tm
import pandas as pd
from SPARQLWrapper import SPARQLWrapper
from typing import NoReturn


class SPARQLquery:
    """
    Run a query against a remote SPARQL endpoint and return the answer as a `pandas` data frame.

    Main characteristics:
     - Large answers are fetched page by page (``LIMIT``/``OFFSET``) and concatenated
     - The number of rows a SELECT query yields is available as ``resultSize``
     - The response is returned in `pandas` data frame format
    """

    def __init__(self, endpoint: str, query: str, verbose: bool = False, step: int = 5000) -> None:
        """
        Prepare the connection and immediately ask the endpoint how many rows the query yields.

        :param endpoint: Url to the remote SPARQL service
        :param query: The query
        :param verbose: Whether progress messages are printed
        :param step: The maximum number of rows requested per round trip
        :raises ValueError: if ``step`` is not strictly positive
        """
        if step <= 0:
            # A non-positive step cannot drive the paging loop in do_query().
            raise ValueError(f"step must be a positive integer, got {step}")

        self.sparql = SPARQLWrapper(endpoint)
        self.sparql.setReturnFormat("json")

        self.query: str = query
        self.verbose: bool = verbose
        self.step: int = step
        self.resultSize: int = self.get_result_size()

    def _log(self, message: str, prefix: str = "", end: str = "\n") -> None:
        """
        Print ``message`` preceded by a ``[HH:MM:SS]`` timestamp, only in verbose mode.

        The timestamp is formatted on its own so that a ``%`` inside ``message`` is never
        interpreted as a ``strftime`` directive.

        :param message: Text to print after the timestamp
        :param prefix: Text emitted before the timestamp (used for the carriage return)
        :param end: Line terminator handed to ``print``
        """
        if self.verbose:
            print(f"{prefix}[{tm.strftime('%H:%M:%S')}] {message}", end=end)

    def get_result_size(self) -> int:
        """
        Return the number of rows the query yields, or -1 when it is not a SELECT query.

        The original query is left untouched and wrapped as a sub-query of a ``COUNT(*)``
        query, so ``SELECT *``, several projected variables and ``DISTINCT`` are all
        counted correctly.
        """
        # SPARQL keywords are case-insensitive, so "select" must be recognised too.
        if not self.query.lstrip().upper().startswith("SELECT"):
            return -1

        self._log("Obtention du nombre de résultats avant exécuter la requête")

        # Newlines around the inner query keep a trailing '#' comment in it from
        # swallowing the closing braces.
        count_query = "SELECT (COUNT(*) AS ?cnt) WHERE {\n{\n" + self.query + "\n}\n}"
        self.sparql.setQuery(count_query)
        processed_results: dict = self.sparql.query().convert()  # Do the query
        number_of_results: int = int(processed_results['results']['bindings'][0]['cnt']['value'])

        self._log(f"Il y a  {number_of_results} résultats...")

        return number_of_results

    def get_sparql_dataframe(self, query: str, text: str = "") -> pd.DataFrame:
        """
        Helper function to convert SPARQL results into a Pandas data frame.

        Credit: Douglas Fils

        :param query: The query to perform
        :param text: optional text to print in verbose mode
        :return: One row per binding and one column per projected variable; a variable
                 that is unbound in a row is stored as ``None``
        """
        self._log(f"Transmission {text} en cours...", end='')

        self.sparql.setQuery(query)
        processed_results: dict = self.sparql.query().convert()

        # The leading carriage return rewinds to the start of the "en cours" line.
        self._log(f"Transmission {text} réussi, conversion en Data Frame...", prefix="\r", end='')

        cols = processed_results['head']['vars']
        out = [[row.get(c, {}).get('value') for c in cols] for row in processed_results['results']['bindings']]

        if self.verbose:
            print(" Effectué")

        return pd.DataFrame(out, columns=cols)

    def do_query(self) -> pd.DataFrame:
        """
        Execute the query and return the complete answer as a single data frame.

        When the answer holds more than ``step`` rows it is fetched in pages of ``step``
        rows, which are concatenated under one continuous 0..n-1 index.

        NOTE(review): pages are requested without an ORDER BY clause, so this presumably
        relies on the endpoint returning rows in a stable order - confirm.
        """
        if self.resultSize <= self.step:
            return self.get_sparql_dataframe(self.query)

        paged_query = f"{self.query} LIMIT {self.step}"
        pages = [
            self.get_sparql_dataframe(f"{paged_query} OFFSET {offset}", f"{offset:6} sur {self.resultSize}")
            for offset in range(0, self.resultSize, self.step)
        ]
        # ignore_index avoids repeating the labels 0..step-1 once per page.
        return pd.concat(pages, ignore_index=True)


### On commence par chercher tous les différents types de datasets

In [23]:
ENDPOINT: str = "http://kaiko.getalp.org/sparql"

# Ask the endpoint for every resource typed qb:DataSet (all the DataSet structures).
print(tm.strftime("[%H:%M:%S] Requête au serveur des différents datasets disponible... "))
list_datasets = query = SPARQLquery(
    ENDPOINT,
    "SELECT DISTINCT ?values WHERE {?values a qb:DataSet }",
    verbose=True,
).do_query()
print(tm.strftime(f"[%H:%M:%S] Il y a {len(list_datasets)} datasets disponibles"))

# Keep only the last path segment of each dataset URI as its short display name.
list_datasets_short = pd.DataFrame(
    list_datasets['values'].map(lambda uri: uri.rsplit('/', 1)[-1])
)
list_datasets_short

[10:46:20] Requête au serveur des différents datasets disponible... 
[10:46:20] Obtention du nombre de résultats avant exécuter la requête
[10:46:21] Il y a  5 résultats...
[10:46:21] Transmission  réussi, conversion en Data Frame... Effectué
[10:46:21] Transmission  réussi, conversion en Data Frame... Effectué
[10:46:21] Il y a 5 datasets disponibles


Unnamed: 0,values
0,dbnaryNymRelationsCube
1,dbnaryStatisticsCube
2,dbnaryTranslationsCube
3,enhancementConfidenceDataCube
4,translationGlossesCube


In [24]:
# Output area that receives whatever the click handler prints.
output = widgets.Output()

# Each dropdown entry pairs a short dataset name (label) with the full value from list_datasets.
dataset_user_choice = widgets.Dropdown(
    options=list(zip(list_datasets_short.values.reshape(-1), list_datasets.values.reshape(-1))),
    description="Choix:",
    layout=widgets.Layout(width='50%'),
)
user_choice_confirm = widgets.Button(description='Soumettre')

def user_choice_confirm_eventhandler(obj):
    """
    Click handler: load the dataset currently selected in the dropdown and print it as Turtle.

    Everything printed here is captured by the ``output`` widget.

    :param obj: argument supplied by the button's click callback (unused)
    """
    choice: str = dataset_user_choice.value
    with output:
        g = rdflib.Graph()
        g.parse(choice)

        turtle_dump = g.serialize(format="turtle")
        # rdflib < 6 returns bytes here while rdflib >= 6 returns str; only bytes need decoding.
        if isinstance(turtle_dump, bytes):
            turtle_dump = turtle_dump.decode("utf-8")
        print(turtle_dump)

# Run the handler each time the "Soumettre" button is clicked.
user_choice_confirm.on_click(user_choice_confirm_eventhandler)

# Show the prompt, then the dropdown, the button and the output area, in that order.
display(Markdown("#### Veuillez choisir un DataSet à étudier: "))
display(dataset_user_choice, user_choice_confirm, output)

#### Veuillez choisir un DataSet à étudier: 

Dropdown(description='Choix:', layout=Layout(width='50%'), options=(('dbnaryNymRelationsCube', 'http://kaiko.g…

Button(description='Soumettre', style=ButtonStyle())

Output()

In [None]:
from rdflib.namespace import *

# Load the RDF/XML description of the statistics data structure and print it as Turtle.
g = rdflib.Graph()
g.parse("http://kaiko.getalp.org/dbnary#dbnaryStatisticsDataStructure", format = 'xml')
turtle_dump = g.serialize(format="turtle")
# rdflib < 6 returns bytes here while rdflib >= 6 returns str; only bytes need decoding.
if isinstance(turtle_dump, bytes):
    turtle_dump = turtle_dump.decode("utf-8")
print(turtle_dump)

In [None]:
# Send a DESCRIBE query for the statistics data structure and print the answer as Turtle.
sparql = SPARQLWrapper("http://kaiko.getalp.org/sparql/")
sparql.setQuery("describe <http://kaiko.getalp.org/dbnary#dbnaryStatisticsDataStructure>")
sparql.setReturnFormat("xml")
# NOTE(review): the result is used as a graph below (.serialize), so it is presumably an
# rdflib graph rather than a dict - confirm against the SPARQLWrapper documentation.
processed_results: rdflib.Graph = sparql.query().convert()
turtle_dump = processed_results.serialize(format="turtle")
# rdflib < 6 returns bytes here while rdflib >= 6 returns str; only bytes need decoding.
if isinstance(turtle_dump, bytes):
    turtle_dump = turtle_dump.decode("utf-8")
print(turtle_dump)

In [None]:
# Scratch cell: DESCRIBE a blank node by its nodeID.
# NOTE(review): no free function `get_sparql_dataframe(endpoint, query)` is defined in the
# visible notebook (only the SPARQLquery method of that name) - this cell presumably predates
# the class; confirm, then adapt or remove it.
get_sparql_dataframe("http://kaiko.getalp.org/sparql/", "describe <nodeID:b10849>")

In [None]:
# Scratch cell: show the interactive help for a freshly created rdflib Graph instance.
help(rdflib.Graph())

In [None]:
# Scratch cell: DESCRIBE the statistics data structure.
# NOTE(review): no free function `get_sparql_dataframe(endpoint, query)` is defined in the
# visible notebook (only the SPARQLquery method of that name) - this cell presumably predates
# the class; confirm, then adapt or remove it.
get_sparql_dataframe("http://kaiko.getalp.org/sparql", "DESCRIBE <http://kaiko.getalp.org/dbnary#dbnaryStatisticsDataStructure>")

In [None]:
# Debugging aid: print the Python type of the value currently selected in the dropdown.
print(type(dataset_user_choice.value))

In [None]:
# Load the nym-relations data structure description into a local rdflib graph.
uri: str = "http://kaiko.getalp.org/dbnary#dbnaryNymRelationsDataStructure"
g = rdflib.Graph()  # start from an empty graph
result: rdflib.Graph = g.parse(uri, format="xml")  # fetch the URI and parse it as RDF/XML into g

In [None]:
# Query the local graph `g` for every resource typed as a qb:DataStructureDefinition.
res = g.query("""SELECT ?a WHERE {?a a <http://purl.org/linked-data/cube#DataStructureDefinition>}""")

In [None]:
# Serialise the query result as JSON and decode it as UTF-8; as the last expression of the
# cell, the resulting text is what the cell displays.
res.serialize(format="json").decode("utf-8")