### Télécharger des données statistiques de Dbnary

In [2]:
from ipywidgets import Layout, Box, HBox, VBox
from IPython.display import Markdown, clear_output
import warnings
# Suppress every Python warning for the rest of the session (no category filter is given).
warnings.filterwarnings("ignore")

# URL of the SPARQL service; it is passed as the `endpoint` argument of the query helpers below.
ENDPOINT: str = "http://kaiko.getalp.org/sparql"

In [8]:
# %load ../../src/SPARQL_query
import time as tm
from typing import NoReturn

import pandas as pd
from IPython.display import display
from SPARQLWrapper import SPARQLWrapper
from ipywidgets import widgets


def add_progress_bar(fun: callable):
    """
    Decorator showing an ``IntProgress`` bar while ``fun`` is running.

    The bar is created and displayed before the call, handed to ``fun`` through the ``widget``
    keyword argument (replacing any value supplied by the caller) and closed once ``fun`` is
    done, whether it returned normally or raised.

    :param fun: The function to wrap; it must accept a ``widget`` keyword argument
    :return: The wrapped function, which returns whatever ``fun`` returns
    """
    import functools  # local import so the decorator does not rely on the file's import cell

    @functools.wraps(fun)  # keep the name and docstring of the decorated function
    def function_modif(*args, **kwargs):
        progress_bar = widgets.IntProgress(bar_style='success', description='Loading:')
        display(progress_bar)
        kwargs['widget'] = progress_bar
        try:
            return fun(*args, **kwargs)
        finally:
            # Always remove the bar: without this a failing call leaves it stuck on screen.
            progress_bar.close()

    return function_modif


class SPARQLquery:
    """
    Run a query on a remote SPARQL server and expose the answer as a `pandas` data frame.

    Main characteristics:
     - big answers are fetched page by page (``LIMIT`` / ``OFFSET``) and concatenated;
     - the number of rows a ``SELECT`` query returns is computed up front (``resultSize``);
     - an optional ``IntProgress`` widget is updated while the pages are received.
    """

    def __init__(self, endpoint: str, query: str, verbose: bool = False, step: int = 5000,
                 widget: widgets.IntProgress = None) -> None:
        """
        Prepare the connection and count the expected results.

        Building the object already sends one counting request to the server when ``query``
        is a ``SELECT`` query (see :meth:`get_result_size`).

        :param endpoint: Url to the remote SPARQL service
        :param query: The query
        :param verbose: If the detail text will be displayed
        :param step: The max number of result to receive per request
        :param widget: Optional progress bar; its ``max`` is set to the number of results
        """
        self.sparql = SPARQLWrapper(endpoint)
        self.sparql.setReturnFormat("json")

        self.query: str = query
        self.verbose: bool = verbose
        self.step: int = step
        self.resultSize: int = self.get_result_size()
        self.is_widget: bool = False

        if widget is not None:
            self.widget = widget
            self.widget.max = self.resultSize
            self.widget.value = 0
            self.is_widget = True

    @staticmethod
    def _timestamped(message: str) -> str:
        """Prefix ``message`` with the current time formatted as ``[HH:MM:SS] ``."""
        # Only the clock goes through strftime, so a '%' inside the message is printed verbatim.
        return f"[{tm.strftime('%H:%M:%S')}] {message}"

    @staticmethod
    def _build_count_query(query: str) -> str:
        """
        Rewrite a ``SELECT`` query so that it returns its number of rows as ``?cnt``.

        Everything between the first variable (or ``*``) of the projection and the ``WHERE``
        keyword (or the opening ``{`` when the keyword is omitted) is replaced by
        ``(COUNT (*) as ?cnt)``; the rest of the query is left untouched.

        :param query: The query to rewrite
        :return: The counting query
        :raises ValueError: If ``query`` is not a ``SELECT`` query or has no recognisable
                            projection / ``WHERE`` clause
        """
        import re

        select = re.match(r"\s*select\b", query, flags=re.IGNORECASE)
        if select is None:
            raise ValueError("not a SELECT query")

        # The look-behind prevents matching a variable called ?where... ; '{' covers SELECT
        # queries written without the optional WHERE keyword.
        boundary = re.compile(r"(?<![?$\w])where\b|\{", flags=re.IGNORECASE).search(query, select.end())
        if boundary is None:
            raise ValueError("no WHERE clause found")

        projection = re.compile(r"[?$*]").search(query, select.end(), boundary.start())
        if projection is None:
            raise ValueError("no projection found")

        selected: str = query[projection.start():boundary.start()]
        # Keep whatever whitespace separated the projection from the WHERE clause.
        separator: str = selected[len(selected.rstrip()):]
        return f"{query[:projection.start()]}(COUNT (*) as ?cnt){separator}{query[boundary.start():]}"

    def get_result_size(self) -> int:
        """
        Return the number of rows produced by the query.

        Only ``SELECT`` queries are counted, by sending a rewritten counting query to the
        server. For any other query, or when the ``SELECT`` cannot be rewritten, 1 is returned
        so that :meth:`do_query` executes it in a single request.

        :return: The number of results, or 1 when it cannot be computed
        """
        try:
            count_query: str = self._build_count_query(self.query)
        except ValueError:
            return 1

        if self.verbose:
            print(self._timestamped("Obtention du nombre de résultats avant exécuter la requête"))

        self.sparql.setQuery(count_query)
        processed_results: dict = self.sparql.query().convert()  # Do the query
        number_of_results: int = int(processed_results['results']['bindings'][0]['cnt']['value'])

        if self.verbose:
            print(self._timestamped(f"Il y a  {number_of_results} résultats..."))

        return number_of_results

    def get_sparql_dataframe(self, query: str, text: str = "") -> pd.DataFrame:
        """
        Helper function to convert SPARQL results into a Pandas data frame.

        Credit: Douglas Fils

        :param query: The query to perform
        :param text: optional text to print in verbose mode; when a progress bar is attached,
                     its first space-separated word is read as the new value of the bar
        :return: One column per variable of the answer, one row per binding; variables that
                 are unbound in a row are filled with ``None``
        """

        if self.verbose:
            print(self._timestamped(f"Transmission {text} en cours..."), end='')

        self.sparql.setQuery(query)

        processed_results: dict = self.sparql.query().convert()

        if self.verbose:
            print("\r" + self._timestamped(f"Transmission {text} réussi, conversion en Data Frame..."), end='')

        cols = processed_results['head']['vars']

        out = [[row.get(c, {}).get('value') for c in cols] for row in processed_results['results']['bindings']]

        if self.is_widget:
            if text == "":
                # Single-request mode: the whole answer has been received.
                self.widget.value = self.widget.max
            else:
                # Paged mode: do_query() puts the current offset at the start of the text.
                self.widget.value = int(text.split(' ')[0])

        if self.verbose:
            print(" Effectué")

        return pd.DataFrame(out, columns=cols)

    def do_query(self) -> pd.DataFrame:
        """
        Performs the query all at once if the result is not too big or little by little otherwise,
        if the query is not a selection it will be done all at once.

        In paged mode ``LIMIT`` / ``OFFSET`` are appended to the query as written (no ordering
        clause is added) and the pages are concatenated with their own row indexes.

        :return: The result of the query
        """
        if self.resultSize > self.step:
            query = self.query + f" LIMIT {self.step}"
            return pd.concat(
                [self.get_sparql_dataframe(query + f" OFFSET {value}", f"{value} sur {self.resultSize}") for value in
                 range(0, self.resultSize, self.step)])
        return self.get_sparql_dataframe(self.query)


@add_progress_bar
def get_datasets(endpoint: str, verbose: bool = False, widget: widgets.IntProgress = None):
    """
    Dbnary specific helper.

    Ask the server for every ``qb:DataSet`` resource together with its ``rdfs:comment``.

    :param endpoint: The address of the SPARQL server
    :param verbose: Whether the timestamped progress messages are printed
    :param widget: Progress bar forwarded to the query object
    :return: The data frame holding the result of that query
    """
    if verbose:
        print(tm.strftime("[%H:%M:%S] Requête au serveur des différents datasets disponible... "))

    # The message above must be printed before the query object is built, because building
    # it already talks to the server.
    runner = SPARQLquery(
        endpoint,
        "SELECT ?dataset ?commentaire WHERE {?dataset a qb:DataSet ; rdfs:comment ?commentaire}",
        verbose=verbose,
        widget=widget,
    )
    catalogue: pd.DataFrame = runner.do_query()

    if verbose:
        found = len(catalogue)
        print(tm.strftime(f"[%H:%M:%S] Il y a {found} datasets disponibles"))

    return catalogue


@add_progress_bar
def get_features(endpoint: str, dataset_name: str, widget: widgets.IntProgress = None) -> pd.DataFrame:
    """
    Dbnary specific helper.

    Describe one item attached to ``dataset_name`` and return the ``p`` column of the answer
    as the list of features of that dataset.

    :param endpoint: The address of the SPARQL server
    :param dataset_name: The URI of the dataset whose features are wanted
    :param widget: Progress bar forwarded to the query object
    :return: A one-column data frame named "Caractéristiques"
    """
    # A single item is enough: LIMIT 1 keeps the DESCRIBE answer small.
    describe_query: str = f"DESCRIBE ?item WHERE {{ ?item qb:dataSet <{dataset_name}> }} LIMIT 1"
    described: pd.DataFrame = SPARQLquery(endpoint, describe_query, widget=widget).do_query()

    predicates = described['p'].to_frame(name=None)
    return predicates.set_axis(["Caractéristiques"], axis=1)


def _sparql_variable_names(features_names: list[str]) -> list[str]:
    """Derive one valid and unique SPARQL variable name per feature URI."""
    import re

    taken: set[str] = {"o"}  # ?o is the observation variable used by download_dataset
    names: list[str] = []
    for uri in features_names:
        local: str = uri.split('#')[-1]
        if re.search(r"\W", local):
            # No clean fragment: fall back on the last path segment and replace the
            # characters that are not allowed in a variable name.
            local = re.sub(r"\W", "_", local.rstrip('/').split('/')[-1])
        local = local or "var"

        candidate, suffix = local, 2
        while candidate in taken:
            candidate = f"{local}_{suffix}"
            suffix += 1
        taken.add(candidate)
        names.append(candidate)
    return names


@add_progress_bar
def download_dataset(endpoint: str, dataset_name: str, features_names: list[str],
                     widget: widgets.IntProgress = None) -> pd.DataFrame:
    """
    Dbnary specific function;

    Download and return all selected features of a dataset

    The column names are the parts of the feature URIs that follow ``#``; they are only
    altered when they would not be valid SPARQL variable names or when two features would
    otherwise share the same name.

    :param endpoint: The address of the SPARQL server
    :param dataset_name: The name of the dataset where you want to download its features
    :param features_names: The names (URIs) of features to download, at least one
    :param widget: If the detail widget will be displayed
    :return: The data frame of selected and downloaded characteristics of a dataset
    :raises ValueError: If ``features_names`` is empty
    """
    if not features_names:
        # "SELECT WHERE {...}" is not a valid query, fail early with a clear message.
        raise ValueError("features_names must contain at least one feature")

    # We will build the query
    vars_list: list[str] = _sparql_variable_names(features_names)
    projection: str = "".join(f"?{name} " for name in vars_list)
    patterns: str = "".join(f"?o <{uri}> ?{name} . " for uri, name in zip(features_names, vars_list))
    query: str = f"SELECT {projection}WHERE {{ ?o qb:dataSet <{dataset_name}> . {patterns}}} "

    # Do the query
    return SPARQLquery(endpoint, query, widget=widget).do_query()


In [9]:
# Fetch the available datasets, then build a second frame where each dataset URI is reduced
# to the text following its last '/' (used as the label shown in the dropdown).
list_datasets = get_datasets(ENDPOINT, verbose = False)
list_datasets_short = list_datasets['dataset'].map(lambda x: x.split('/')[-1]).to_frame().join(list_datasets['commentaire'].to_frame())

# Global holder for the downloaded data; it is replaced by the download callback below.
DataFrame = pd.DataFrame()
# Output area in which the feature-selection UI is rendered after the first confirmation.
output = widgets.Output()

# Dropdown options are (label, value) pairs: short name shown, full URI returned by `.value`.
dataset_user_choice = widgets.Dropdown(options = [(name, full_name) for name, full_name in zip(list_datasets_short['dataset'].values.reshape(-1),
                                                                                               list_datasets['dataset'].values.reshape(-1))], 
                                       description="Choix:", layout=Layout(flex='1 3 auto', width='auto'))
user_choice_confirm = widgets.Button(description='Soumettre', icon='check', layout=Layout(flex='1 1 auto', width='auto'),
                                     tooltip = "Cliquer ici pour confirmer votre choix")
# Label initialised with the comment of the dataset currently selected in the dropdown.
description = widgets.Label(value=f"Description: {list_datasets[list_datasets['dataset'] == dataset_user_choice.value]['commentaire'].values[0]}",
                            layout=Layout(flex='1 1 auto', width='auto'))

items = [dataset_user_choice, user_choice_confirm]

# Shared row layout, also reused by the feature-selection UI built in the confirm handler.
box_layout = Layout(display='flex', flex_flow='row', align_items='stretch', width='90%')
                            
# First row: dropdown + confirm button; second row: description label.
line_1 = Box(children=items, layout=box_layout)
line_2 = Box(children=[description], layout=box_layout)
ui = VBox([line_1, line_2])

def user_choice_change(obj):
    """Refresh the description label with the comment of the dataset now selected."""
    matching_rows = list_datasets[list_datasets['dataset'] == dataset_user_choice.value]
    comment = matching_rows['commentaire'].values[0]
    description.value = f"Description: {comment}"


def user_choice_confirm_eventhandler(obj):
    """
    Click handler of the dataset confirmation button.

    Lists the features of the chosen dataset inside `output`, then shows a multiple-selection
    widget and a second confirmation button which downloads the selected features.
    """
    choice: str = dataset_user_choice.value
    with output:
        clear_output()
        download_area = widgets.Output()  # receives the preview of the downloaded data

        display(Markdown(data="#### Les différentes catégories de ce dataset sont: "))
        features_full = get_features(ENDPOINT, choice)
        # Keep only the text following '#' of every feature URI for display purposes.
        features_short = features_full["Caractéristiques"].map(lambda uri: uri.split('#')[-1]).to_frame()

        display(features_short)

        # (label, value) pairs: short name shown, full URI returned by the widget.
        picker_options = list(zip(features_short.values.reshape(-1), features_full.values.reshape(-1)))
        picker = widgets.SelectMultiple(options=picker_options, description='Critères: ', disabled=False,
                                        layout=Layout(flex='1 1 auto', width='auto', height="auto"),
                                        style={'description_width': 'initial'})
        picker_row = Box(children=[picker], layout=box_layout)

        display(Markdown("####  Veuillez choisir au moins deux critères à télécharger:"))

        download_button = widgets.Button(description='Soumettre', icon='check',
                                         layout=Layout(flex='1 1 auto', width='auto'),
                                         tooltip="Cliquer ici pour confirmer votre choix")
        button_row = Box(children=[download_button], layout=box_layout)
        picker_ui = VBox([picker_row, button_row])

        def on_download_click(obj):
            """Download the selected features once at least two of them are selected."""
            global DataFrame
            with download_area:
                if len(picker.value) < 2:
                    return
                clear_output()
                print(choice, picker.value)
                DataFrame = download_dataset(ENDPOINT, choice, picker.value)
                display(DataFrame.head())

        download_button.on_click(on_download_click)
        display(picker_ui, download_area)
        
        
# Wire the callbacks: the button triggers the feature listing, and any change of the
# dropdown's `value` refreshes the description label.
user_choice_confirm.on_click(user_choice_confirm_eventhandler)
dataset_user_choice.observe(user_choice_change, 'value')

# Render the title, the selection UI and the (initially empty) output area.
display(Markdown(data="#### Veuillez choisir un DataSet à étudier: "))
display(ui, output)

IntProgress(value=0, bar_style='success', description='Loading:')

#### Veuillez choisir un DataSet à étudier: 

VBox(children=(Box(children=(Dropdown(description='Choix:', layout=Layout(flex='1 3 auto', width='auto'), opti…

Output()

In [13]:
# Show the downloaded data ordered by the "wiktionaryDumpVersion" column (returns a sorted
# copy; the global DataFrame itself is not modified).
DataFrame.sort_values(by="wiktionaryDumpVersion")

Unnamed: 0,translationsWithSenseNumber,translationsWithSenseNumberAndTextualGloss,translationsWithTextualGloss,wiktionaryDumpVersion,observationLanguage,translationsWithNoGloss
323,0,0,0,20200701,mg,295688
415,522582,0,0,20200701,pl,2
277,0,0,0,20200701,la,4583
369,5937,72034,6417,20200701,pt,227145
139,0,0,67647,20200701,it,49663
...,...,...,...,...,...,...
460,0,0,0,20210601,sh,606
253,7343,5313,39250,20210601,ja,119724
299,0,0,159,20210601,no,299
345,0,0,170639,20210601,nl,56588


In [12]:
# List the distinct values of the column; the output below shows them as strings, not numbers.
DataFrame.translationsWithSenseNumber.unique()

array(['0', '397', '324', '316', '317', '318', '325', '326', '384', '381',
       '379', '391', '393', '395', '392', '399', '11', '17', '756980',
       '677498', '679271', '680443', '683532', '687357', '693835',
       '696287', '700830', '702878', '707139', '711843', '714769',
       '717569', '720254', '722512', '728864', '731706', '738602',
       '741801', '747787', '750689', '754206', '97106', '94489', '94691',
       '94806', '94883', '94888', '95278', '95622', '95789', '95939',
       '96015', '96103', '96232', '96339', '96410', '96543', '96681',
       '96735', '96916', '96873', '96905', '96951', '97056', '13', '10',
       '12', '5716', '5242', '5261', '5272', '5284', '5304', '5327',
       '5309', '5322', '5329', '5353', '5448', '5416', '5425', '5440',
       '5464', '5469', '5476', '5487', '5513', '5572', '5694', '76', '67',
       '75', '7343', '6433', '6444', '6441', '6496', '6522', '6655',
       '6735', '6751', '7116', '7121', '7342', '6094', '5937', '5942',
       '594