Visualisation des différentes statistiques de Dbnary
=============

In [1]:
# Data analysis
import numpy as np

# Plotting
import bqplot as bq
from ipywidgets import Layout, Box, HBox, VBox


from IPython.display import display, Markdown, clear_output

import warnings
warnings.filterwarnings("ignore")

import datetime

ENDPOINT: str = "http://kaiko.getalp.org/sparql"

### Classe qui retourne un DataFrame des résultats d'une requête SPARQL et autres fonctions utilitaires

In [2]:
# %load ../../src/SPARQL_query
import time as tm
from typing import NoReturn

import pandas as pd
from SPARQLWrapper import SPARQLWrapper
from ipywidgets import widgets


class SPARQLquery:
    """
    Class allowing to make a query on a remote SPARQL server, its main characteristics are :
     - Taking into account the big answers by fetching them page by page (``LIMIT``/``OFFSET``)
       and concatenating the pages as they are received
     - Ability to access the size of the answer (``resultSize``)
     - Ability to retrieve the response in `pandas` data frame format
     - Optional progress reporting through an ``ipywidgets`` progress bar
    """

    def __init__(self, endpoint: str, query: str, verbose: bool = False, step: int = 1000,
                 widget: "widgets.IntProgress | None" = None) -> None:
        """
        Prepare the connection and ask the server how many results the query yields.

        :param endpoint: Url to the remote SPARQL service
        :param query: The query
        :param verbose: If the detail text will be displayed
        :param step: The max number of result to receive per request
        :param widget: Optional progress bar; its ``max`` is set to the result size and its
                       ``value`` is updated while results are received
        """
        self.sparql = SPARQLWrapper(endpoint)
        self.sparql.setReturnFormat("json")

        self.query: str = query
        self.verbose: bool = verbose
        self.step: int = step
        self.resultSize: int = self.get_result_size()  # one extra COUNT request for SELECT queries
        self.widget = widget
        self.is_widget: bool = widget is not None

        if self.is_widget:
            self.widget.max = self.resultSize
            self.widget.value = 0

    @staticmethod
    def _stamp() -> str:
        """Return the current time formatted as ``[HH:MM:SS]`` for the verbose messages."""
        # Only the time pattern goes through strftime, so a '%' inside a message is never
        # mistaken for a format directive.
        return tm.strftime("[%H:%M:%S]")

    @staticmethod
    def _build_count_query(query: str) -> str:
        """
        Rewrite a SELECT query so that it returns the number of its results.

        The projection located between the first projected variable and the ``WHERE`` keyword
        is replaced by ``(COUNT (*) as ?cnt)``. When no variable is projected (``SELECT *``),
        the whole projection is replaced.

        :param query: The original query
        :return: The counting query
        :raises ValueError: If the query does not start with SELECT or has no WHERE keyword
        """
        import re  # local import: `re` is not part of the imports visible at file level

        select_keyword = re.match(r"\s*select\b", query, re.IGNORECASE)
        if select_keyword is None:
            raise ValueError("not a SELECT query")

        where_keyword = re.compile(r"\bwhere\b", re.IGNORECASE).search(query, select_keyword.end())
        if where_keyword is None:
            raise ValueError("no WHERE keyword found after SELECT")

        # Bounded search: never look for the variable past the WHERE keyword.
        start = query.find("?", select_keyword.end(), where_keyword.start())
        if start == -1:
            projection = query[select_keyword.end():where_keyword.start()]
            start = select_keyword.end() + len(projection) - len(projection.lstrip())

        return f"{query[:start]}(COUNT (*) as ?cnt) {query[where_keyword.start():]}"

    @staticmethod
    def _results_to_dataframe(processed_results: dict) -> pd.DataFrame:
        """
        Convert the JSON answer of the server into a data frame.

        Credit: Douglas Fils

        :param processed_results: The decoded answer (``head.vars`` and ``results.bindings``)
        :return: One column per variable, one row per binding; unbound variables are left empty
        """
        cols = processed_results['head']['vars']
        rows = [[row.get(c, {}).get('value') for c in cols] for row in processed_results['results']['bindings']]
        return pd.DataFrame(rows, columns=cols)

    def _update_progress(self, text: str) -> None:
        """
        Report progress on the widget, if any.

        :param text: Empty for a single-shot query (the bar is filled), otherwise a text whose
                     first space-separated token is the offset of the page just received
        """
        if not self.is_widget:
            return
        if text == "":
            self.widget.value = self.widget.max
            return
        try:
            self.widget.value = int(text.split(' ')[0])
        except ValueError:
            # Free-form text that does not start with a number: nothing to report.
            pass

    def get_result_size(self) -> int:
        """
        Function return the size of a query (only in SELECT query).

        :return: The number of results announced by the server, or 1 when the query is not a
                 SELECT query that can be rewritten into a counting query
        """
        try:
            count_query = self._build_count_query(self.query)
        except ValueError:
            # Unknown size: `do_query` will then perform the query all at once.
            return 1

        if self.verbose:
            print(f"{self._stamp()} Obtention du nombre de résultats avant exécuter la requête")

        self.sparql.setQuery(count_query)
        processed_results: dict = self.sparql.query().convert()  # Do the query
        number_of_results: int = int(processed_results['results']['bindings'][0]['cnt']['value'])

        if self.verbose:
            print(f"{self._stamp()} Il y a  {number_of_results} résultats...")

        return number_of_results

    def get_sparql_dataframe(self, query: str, text: str = "") -> pd.DataFrame:
        """
        Helper function to convert SPARQL results into a Pandas data frame.

        Credit: Douglas Fils

        :param query: The query to perform
        :param text: optional text to print in verbose mode; it also drives the progress widget
        :return: The answer of the server as a data frame
        """
        if self.verbose:
            print(f"{self._stamp()} Transmission {text} en cours...", end='')

        self.sparql.setQuery(query)
        processed_results: dict = self.sparql.query().convert()

        if self.verbose:
            print(f"\r{self._stamp()} Transmission {text} réussi, conversion en Data Frame...", end='')

        frame = self._results_to_dataframe(processed_results)
        self._update_progress(text)

        if self.verbose:
            print(" Effectué")

        return frame

    def do_query(self) -> pd.DataFrame:
        """
        Performs the query all at once if the result is not too big or little by little otherwise,
        if the query is not a selection it will be done all at once.

        :return: The result of the query, indexed from 0 to the number of rows - 1
        """
        if self.resultSize > self.step:
            paged_query = f"{self.query} LIMIT {self.step}"
            pages = [
                self.get_sparql_dataframe(f"{paged_query} OFFSET {offset}", f"{offset} sur {self.resultSize}")
                for offset in range(0, self.resultSize, self.step)
            ]
            if self.is_widget:
                # Every page has been received: fill the bar completely.
                self.widget.value = self.widget.max
            # Each page is indexed from 0; renumber so that row labels stay unique.
            return pd.concat(pages, ignore_index=True)
        return self.get_sparql_dataframe(self.query)


def get_datasets(endpoint: str, verbose: bool = False, widget: widgets.IntProgress = None):
    """
    Dbnary specific function;

    Ask the server for every resource typed ``qb:DataSet`` together with its ``rdfs:comment``.

    :param endpoint: The address of the SPARQL server
    :param verbose: If the detail text will be displayed
    :param widget: Optional progress bar forwarded to the query object
    :return: A data frame with the columns ``dataset`` and ``commentaire``
    """
    if verbose:
        print(tm.strftime("[%H:%M:%S] Requête au serveur des différents datasets disponible... "))

    # A single SELECT retrieves the URI of each dataset and its comment.
    request = SPARQLquery(
        endpoint,
        "SELECT ?dataset ?commentaire WHERE {?dataset a qb:DataSet ; rdfs:comment ?commentaire}",
        verbose=verbose,
        widget=widget,
    )
    available: pd.DataFrame = request.do_query()

    if verbose:
        print(tm.strftime(f"[%H:%M:%S] Il y a {len(available)} datasets disponibles"))

    return available


def get_features(endpoint: str, dataset_name: str, widget: widgets.IntProgress = None) -> pd.DataFrame:
    """
    Dbnary specific function;

    List the features of a dataset by describing one item attached to it and keeping the
    ``p`` column of the answer.

    :param endpoint: The address of the SPARQL server
    :param dataset_name: The name of the dataset where you want to have its features
    :param widget: Optional progress bar forwarded to the query object
    :return: A data frame with the single column ``Caractéristiques``
    """
    # Doubled braces are literal braces inside an f-string.
    describe_query = f"DESCRIBE ?item WHERE {{ ?item qb:dataSet <{dataset_name}> }} LIMIT 1"
    described = SPARQLquery(endpoint, describe_query, widget=widget).do_query()

    predicates = described['p'].to_frame(name=None)
    return predicates.set_axis(["Caractéristiques"], axis=1)


def download_dataset(endpoint: str, dataset_name: str, features_names: list[str],
                     widget: widgets.IntProgress = None) -> pd.DataFrame:
    """
    Dbnary specific function;

    Download the selected features of every item attached to a dataset.

    :param endpoint: The address of the SPARQL server
    :param dataset_name: The name of the dataset where you want to download its features
    :param features_names: The URIs of the features to download
    :param widget: Optional progress bar forwarded to the query object
    :return: A data frame with one column per feature, named after the text following the
             last ``#`` of its URI
    """
    # Variable names are the fragment of each feature URI.
    variables = [uri.split('#')[-1] for uri in features_names]

    projection = "".join(f"?{name} " for name in variables)
    patterns = "".join(f"?o <{uri}> ?{name} . " for uri, name in zip(features_names, variables))
    select_query = f"SELECT {projection}WHERE {{ ?o qb:dataSet <{dataset_name}> . {patterns}}} "

    return SPARQLquery(endpoint, select_query, widget=widget).do_query()


### On commence par chercher tous les différents types de datasets, puis on propose à l'utilisateur de choisir quel dataset télécharger

In [3]:
# Fetch the list of datasets, showing a progress bar while the request runs.
ui = widgets.IntProgress(bar_style='success', description='Loading:',)
display(ui)
try:
    list_datasets = get_datasets(ENDPOINT, verbose=False, widget=ui)
finally:
    # Remove the progress bar even when the request fails.
    ui.close()

# Same table with the dataset URI shortened to its last path segment.
list_datasets_short = (
    list_datasets['dataset']
    .map(lambda x: x.split('/')[-1])
    .to_frame()
    .join(list_datasets['commentaire'].to_frame())
)

# Module-level holder replaced by the downloaded data once the user confirms a selection.
DataFrame = pd.DataFrame()
output = widgets.Output()

# The dropdown shows the short name and stores the full URI as its value.
dataset_user_choice = widgets.Dropdown(
    options=list(zip(list_datasets_short['dataset'].values.reshape(-1),
                     list_datasets['dataset'].values.reshape(-1))),
    description="Choix:", layout=Layout(flex='1 3 auto', width='auto'))
user_choice_confirm = widgets.Button(description='Soumettre', icon='check', layout=Layout(flex='1 1 auto', width='auto'),
                                     tooltip="Cliquer ici pour confirmer votre choix")
# Comment of the dataset currently selected in the dropdown.
description = widgets.Label(value=f"Description: {list_datasets[list_datasets['dataset'] == dataset_user_choice.value]['commentaire'].values[0]}",
                            layout=Layout(flex='1 1 auto', width='auto'))

items = [dataset_user_choice, user_choice_confirm]

box_layout = Layout(display='flex', flex_flow='row', align_items='stretch', width='90%')

# First row: dropdown and confirmation button; second row: description.
line_1 = Box(children=items, layout=box_layout)
line_2 = Box(children=[description], layout=box_layout)
ui = VBox([line_1, line_2])

def user_choice_change(obj):
    """
    Refresh the description label with the comment of the dataset selected in the dropdown.

    :param obj: The change notification sent by the widget (unused)
    """
    is_selected = list_datasets['dataset'] == dataset_user_choice.value
    comment = list_datasets[is_selected]['commentaire'].values[0]
    description.value = f"Description: {comment}"


def user_choice_confirm_eventhandler(obj):
    """
    Click handler of the dataset confirmation button.

    Lists the features of the chosen dataset inside `output`, then lets the user pick the
    features to download; the downloaded table is stored in the module-level `DataFrame`.

    :param obj: The button that was clicked (unused)
    """
    choice: str = dataset_user_choice.value
    with output:
        clear_output()
        output2 = widgets.Output()
        display(Markdown(data="#### Les différentes catégories de ce dataset sont: "))
        progress_bar = widgets.IntProgress(bar_style='success', description='Loading:')
        display(progress_bar)
        try:
            # The bar is handed to the query so that it reflects the request.
            categories_long = get_features(ENDPOINT, choice, widget=progress_bar)
        finally:
            # Remove the bar even when the request fails.
            progress_bar.close()
        categories_short = categories_long["Caractéristiques"].map(lambda x: x.split('#')[-1]).to_frame()

        display(categories_short)

        # The list shows the short feature name and stores the full URI as its value.
        select = widgets.SelectMultiple(options=list(zip(categories_short.values.reshape(-1), categories_long.values.reshape(-1))),
                                        description='Critères: ', disabled=False, layout=Layout(flex='1 1 auto', width='auto', height="auto"),
                                        style={'description_width': 'initial'})
        select_box = Box(children=[select], layout=box_layout)
        display(Markdown("####  Veuillez choisir au moins deux critères à télécharger:"))
        select_confirm = widgets.Button(description='Soumettre', icon='check', layout=Layout(flex='1 1 auto', width='auto'),
                                        tooltip="Cliquer ici pour confirmer votre choix")
        select_confirm_box = Box(children=[select_confirm], layout=box_layout)
        select_ui = VBox([select_box, select_confirm_box])

        def selection_confirm_eventhandler(obj):
            """Download the selected features once at least two of them are chosen."""
            global DataFrame
            with output2:
                if len(select.value) < 2:
                    # Fewer than two features: the click is ignored.
                    return
                clear_output()
                download_bar = widgets.IntProgress(bar_style='success', description='Loading:')
                display(download_bar)
                try:
                    DataFrame = download_dataset(ENDPOINT, choice, select.value, download_bar)
                finally:
                    download_bar.close()
                display(DataFrame.head())

        select_confirm.on_click(selection_confirm_eventhandler)
        display(select_ui, output2)
        
        
# Keep the description label in sync with the dropdown, and react to the confirmation click.
user_choice_confirm.on_click(user_choice_confirm_eventhandler)
dataset_user_choice.observe(user_choice_change, names='value')

# Show the heading, then the selection form followed by the area filled by the handlers.
display(Markdown("#### Veuillez choisir un DataSet à étudier: "))
display(ui, output)

IntProgress(value=0, bar_style='success', description='Loading:')

#### Veuillez choisir un DataSet à étudier: 

VBox(children=(Box(children=(Dropdown(description='Choix:', layout=Layout(flex='1 3 auto', width='auto'), opti…

Output()

### Traitement de certains datasets particuliers ; le code ci-dessous n'est pas généralisable
#### 1. dbnaryNymRelationsCube

In [5]:
# Counts are converted to integers before being aggregated.
DataFrame['count'] = DataFrame['count'].astype(int)

relations = DataFrame['nymRelation'].unique()
# Short relation names (text after the last '#') used as legend labels.
labels = [item.split('#')[-1] for item in relations]
# One row per (dump version, language) and one column per relation.
# Both sort keys are given at once: two chained single-key sorts would lose the language
# order, because the default sort algorithm is not stable.
data1 = (
    DataFrame
    .pivot_table(columns='nymRelation', index=['wiktionaryDumpVersion', 'observationLanguage'],
                 aggfunc=lambda x: max(x))
    .reset_index()
    .sort_values(by=['wiktionaryDumpVersion', 'observationLanguage'])
)

def transformation_date(date: "str | int") -> datetime.datetime:
    """
    Convert a dump version written as ``YYYYMMDD`` into a ``datetime``.

    A day of ``00`` is mapped to the first day of the month.

    :param date: The dump version, as text or as an integer
    :return: The corresponding date at midnight
    """
    stamp = str(date)  # an integer cannot be sliced, so work on its text form
    year, month, day = int(stamp[:4]), int(stamp[4:6]), int(stamp[6:])
    return datetime.datetime(year=year, month=month, day=day or 1)

# Replace the textual dump versions by datetime objects.
data1["wiktionaryDumpVersion"] = data1["wiktionaryDumpVersion"].map(transformation_date)


# Area in which the selected view is drawn.
out = widgets.Output()

# Switch between the two views handled by `event`.
choice = widgets.ToggleButtons(
    options=[('Statistiques globales', 'glob'), ('Par pays', 'pays')],
    description='Choix:',
    disabled=False,
    tooltips=['Statistiques de tout les pays par années', "Statistiques d' pays au cours du temps"],
)

def event(obj):
    """
    Redraw the content of `out` according to the view selected in `choice`.

    "pays" draws one line per relation over the dump versions for one language; "glob" draws
    stacked bars per language for one dump version. In both views a dropdown lets the user
    change the language or the dump version, which updates the figure in place.

    :param obj: The change notification sent by the widget (unused)
    """

    def language_view():
        # Build the dropdown, the figure and the refresh callback of the "pays" view.
        selector = widgets.Dropdown(options=list(data1["observationLanguage"].unique()), description="Choix:")
        rows = data1[data1["observationLanguage"] == selector.value]

        value_scale = bq.LinearScale()
        date_scale = bq.scales.DateScale()

        curves = bq.Lines(x=rows["wiktionaryDumpVersion"], y=rows["count"][relations].T, stroke_width=1,
                          display_legend=True, labels=labels, scales={'x': date_scale, 'y': value_scale})
        date_axis = bq.Axis(scale=date_scale, grid_lines='solid', label='Date', tick_format='%m %Y',
                            tick_style={'font-size': 18})
        value_axis = bq.Axis(scale=value_scale, orientation='vertical', grid_lines='solid', label='Valeur',
                             label_offset='-50')
        figure = bq.Figure(marks=[curves], axes=[date_axis, value_axis],
                           title=f"Différentes relations lexicales dans l'extraction {selector.value}",
                           animation_duration=1000)

        def refresh(change):
            # Load the rows of the newly selected language into the existing mark.
            rows = data1[data1["observationLanguage"] == selector.value]
            curves.y = rows["count"][relations].T
            curves.x = rows["wiktionaryDumpVersion"]
            figure.title = f"Différentes relations lexicales dans l'extraction {selector.value}"

        return selector, figure, refresh

    def global_view():
        # Build the dropdown, the figure and the refresh callback of the "glob" view.
        versions = data1["wiktionaryDumpVersion"].unique()
        selector = widgets.Dropdown(options=[(np.datetime_as_string(item, unit='D'), item) for item in versions],
                                    description="Choix:", value=max(data1["wiktionaryDumpVersion"].unique()))

        language_scale = bq.OrdinalScale()
        value_scale = bq.LinearScale()

        rows = data1[data1["wiktionaryDumpVersion"] == selector.value]

        bars = bq.Bars(x=rows["observationLanguage"].values, y=rows["count"][relations].T,
                       scales={'x': language_scale, 'y': value_scale}, type='stacked', labels=labels,
                       color_mode='element', display_legend=True,
                       colors=["red", "blue", "cyan", "pink", "lime", "purple", "orange", "brown"])
        language_axis = bq.Axis(scale=language_scale, grid_lines='solid', label='Pays', tick_style={'font-size': 18})
        value_axis = bq.Axis(scale=value_scale, orientation='vertical', grid_lines='solid', label='Valeur',
                             label_offset='-50')
        figure = bq.Figure(marks=[bars], axes=[language_axis, value_axis],
                           title=f"Nombre de relations lexicales dans l'extraction du {np.datetime_as_string(selector.value, unit='D')}",
                           animation_duration=1000)

        def refresh(change):
            # Load the rows of the newly selected dump version into the existing mark.
            rows = data1[data1["wiktionaryDumpVersion"] == selector.value]
            bars.x = rows["observationLanguage"].values
            bars.y = rows["count"][relations].T
            figure.title = f"Nombre de relations lexicales dans l'extraction du {np.datetime_as_string(selector.value, unit='D')}"

        return selector, figure, refresh

    with out:
        clear_output()
        mode = choice.value
        if mode == "pays":
            user_choice, fig, edit_graph = language_view()
        if mode == "glob":
            user_choice, fig, edit_graph = global_view()

        display(user_choice)
        user_choice.observe(edit_graph, 'value')
        display(fig)
    
# Redraw whenever another view is selected.
choice.observe(event, names='value')
display(choice, out)
# Draw the initial view without waiting for a user action.
event(None)

ToggleButtons(description='Choix:', options=(('Statistiques globales', 'glob'), ('Par pays', 'pays')), tooltip…

Output()

#### 2. dbnaryStatisticsCube

In [6]:
# Numeric columns plotted by this cell.
categories = ["lexicalEntryCount", "translationsCount", "lexicalSenseCount", "pageCount"]

# Rows ordered by dump version, with the plotted columns converted to integers.
data2 = DataFrame.sort_values(by='wiktionaryDumpVersion').astype(dict.fromkeys(categories, int))

def transformation_date(date: "str | int") -> datetime.datetime:
    """
    Convert a dump version written as ``YYYYMMDD`` into a ``datetime``.

    A day of ``00`` is mapped to the first day of the month.

    :param date: The dump version, as text or as an integer
    :return: The corresponding date at midnight
    """
    stamp = str(date)  # an integer cannot be sliced, so work on its text form
    year, month, day = int(stamp[:4]), int(stamp[4:6]), int(stamp[6:])
    return datetime.datetime(year=year, month=month, day=day or 1)

# Replace the textual dump versions by datetime objects.
data2["wiktionaryDumpVersion"] = data2["wiktionaryDumpVersion"].map(transformation_date)

# Area in which the selected view is drawn.
out = widgets.Output()

# Switch between the two views handled by `event`.
choice = widgets.ToggleButtons(
    options=[('Statistiques globales', 'glob'), ('Par pays', 'pays')],
    description='Choix:',
    disabled=False,
    tooltips=['Statistiques de tout les pays par années', "Statistiques d' pays au cours du temps"],
)

def event(obj):
    """
    Redraw the content of `out` according to the view selected in `choice`.

    "pays" draws one line per category over the dump versions for one language; "glob" draws
    stacked bars per language for one dump version. In both views a dropdown lets the user
    change the language or the dump version, which updates the figure in place.

    :param obj: The change notification sent by the widget (unused)
    """
    with out:
        clear_output()
        if choice.value == "pays":
            # The choices come from `data2`, the table plotted by this cell.
            user_choice = widgets.Dropdown(options=list(data2["observationLanguage"].unique()), description="Choix:")

            choosed_data = data2[data2["observationLanguage"] == user_choice.value]

            y_sc = bq.LinearScale()
            x_ord = bq.scales.DateScale()

            line = bq.Lines(x=choosed_data["wiktionaryDumpVersion"], y=choosed_data[categories].T, stroke_width=1,
                            display_legend=True, labels=categories, scales={'x': x_ord, 'y': y_sc})
            ax_x = bq.Axis(scale=x_ord, grid_lines='solid', label='Date', tick_format='%m %Y', tick_style={'font-size': 18})
            ax_y = bq.Axis(scale=y_sc, orientation='vertical', grid_lines='solid', label='Valeur', label_offset='-50')
            fig = bq.Figure(marks=[line], axes=[ax_x, ax_y],
                            title=f"Nombre d'éléments dans l'extraction {user_choice.value}", animation_duration=1000)

            def edit_graph(obj):
                # Load the rows of the newly selected language into the existing mark.
                choosed_data = data2[data2["observationLanguage"] == user_choice.value]
                line.y = choosed_data[categories].T
                line.x = choosed_data["wiktionaryDumpVersion"]
                # Keep the title in sync with the selected language.
                fig.title = f"Nombre d'éléments dans l'extraction {user_choice.value}"

        elif choice.value == "glob":
            # The choices come from `data2`, the table plotted by this cell.
            versions = data2["wiktionaryDumpVersion"].unique()
            user_choice = widgets.Dropdown(options=[(np.datetime_as_string(item, unit='D'), item) for item in versions],
                                           description="Choix:", value=max(versions))

            x_ord = bq.OrdinalScale()
            y_sc = bq.LinearScale()

            choosed_data = data2[data2["wiktionaryDumpVersion"] == user_choice.value]

            x = choosed_data["observationLanguage"].values
            y = choosed_data[categories].T

            bar = bq.Bars(x=x, y=y, scales={'x': x_ord, 'y': y_sc}, type='stacked', labels=categories,
                          color_mode='element', display_legend=True,
                          colors=["red", "blue", "cyan", "pink", "lime", "purple", "orange", "brown"])
            ax_x = bq.Axis(scale=x_ord, grid_lines='solid', label='Pays', tick_style={'font-size': 18})
            ax_y = bq.Axis(scale=y_sc, orientation='vertical', grid_lines='solid', label='Valeur', label_offset='-50')
            # Same wording as the per-language view: this cell plots element counts.
            fig = bq.Figure(marks=[bar], axes=[ax_x, ax_y],
                            title=f"Nombre d'éléments dans l'extraction du {np.datetime_as_string(user_choice.value, unit='D')}",
                            animation_duration=1000)

            def edit_graph(obj):
                # Load the rows of the newly selected dump version into the existing mark.
                choosed_data = data2[data2["wiktionaryDumpVersion"] == user_choice.value]
                bar.x = choosed_data["observationLanguage"].values
                bar.y = choosed_data[categories].T
                fig.title = f"Nombre d'éléments dans l'extraction du {np.datetime_as_string(user_choice.value, unit='D')}"

        else:
            # Neither of the two known views: nothing to draw.
            return

        display(user_choice)
        user_choice.observe(edit_graph, 'value')
        display(fig)
        
# Redraw whenever another view is selected.
choice.observe(event, names='value')
display(choice, out)
# Draw the initial view without waiting for a user action.
event(None)

ToggleButtons(description='Choix:', options=(('Statistiques globales', 'glob'), ('Par pays', 'pays')), tooltip…

Output()

#### 3. dbnaryTranslationsCube