In [1]:
!pip install --upgrade git+https://github.com/unimib-datAI/Semtui-python.git

Collecting git+https://github.com/unimib-datAI/Semtui-python.git
  Cloning https://github.com/unimib-datAI/Semtui-python.git to /private/var/folders/tj/464jl5ls75l9zfnpxmsx2lrh0000gn/T/pip-req-build-hel6x798
  Running command git clone --filter=blob:none --quiet https://github.com/unimib-datAI/Semtui-python.git /private/var/folders/tj/464jl5ls75l9zfnpxmsx2lrh0000gn/T/pip-req-build-hel6x798
  Resolved https://github.com/unimib-datAI/Semtui-python.git to commit 4bcb80041f0aa3c5fbcecf702f720de44db0d8e3
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [2]:
import os 
import json 
import numpy as np
import pandas as pd

# Set pandas display options

pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.max_rows', 20)  # Limit to 20 rows for display


# API Authentication Setup and Token Retrieval

This segment of code is responsible for setting up the API URL and credentials, initializing the `TokenManager` class, and retrieving an authentication token. This token will be used for subsequent API requests.

### Code Explanation
- Import necessary modules from the `semtui_refactored` package.
- Define the API URL and user credentials.
- Initialize the `TokenManager` with the API URL, credentials, and required headers.
- Retrieve the authentication token using the `TokenManager`.


In [3]:
#Testing the semtui package
import semtui_refactored

In [4]:
# Import necessary classes and functions from the semtui_refactored package
from semtui_refactored.data_handler import DataHandler
from semtui_refactored.token_manager import TokenManager
from semtui_refactored.extension_manager import ExtensionManager
from semtui_refactored.reconciliation_manager import ReconciliationManager
from semtui_refactored.utils import Utility
from semtui_refactored.dataset_manager import DatasetManager
from semtui_refactored.semtui_evals import EvaluationManager
from semtui_refactored.data_modifier import DataModifier


In [5]:
# Set up the API URL and credentials

# Request credentials from inside.disco.unimib.it

base_url = "http://vm.chronos.disco.unimib.it:3003"
api_url = "http://vm.chronos.disco.unimib.it:3003/api"
username = ""
password = ""

token_manager = TokenManager(api_url, username, password)
# Get the token
token = token_manager.get_token()
utility = Utility(base_url, token_manager)


# Initialization of Managers

This segment of code initializes various manager classes needed for handling data operations, reconciliation, dataset management, evaluations, and extensions. Each manager class is configured with the necessary API URL and authentication token.

### Code Explanation
- Initialize the `DataManager` for handling data-related operations.
- Initialize the `ReconciliationManager` for managing data reconciliation tasks.
- Initialize the `DatasetManager` for managing datasets.
- Initialize the `EvaluationManager` for handling evaluations.
- Initialize the `ExtensionManager` for managing extensions.


In [6]:
# Initialize ReconciliationManager
reconciliation_manager = ReconciliationManager(base_url, token_manager)  # Create an instance of ReconciliationManager with API URL and token manager

# Initialize DatasetManager
dataset_manager = DatasetManager(base_url, token_manager)  # Create an instance of DatasetManager with API URL and token manager


# Initialize ExtensionManager
extension_manager = ExtensionManager(base_url, token)  # Create an instance of ExtensionManager with API URL and token

# Instantiate the utility class
utility = Utility(base_url, token_manager)

# Initialize the DataModifier
data_modifier_manager = DataModifier()  # Create an instance of EvaluationManager

# Importing and Displaying CSV Data

This segment of code handles the importation of data from a CSV file using the `DataManager` class. It reads the CSV file into a Pandas DataFrame and displays the first few rows. Error handling is included to catch and report any issues that arise during the import process.

### Code Explanation
- Define the path to the CSV file.
- Attempt to read the CSV file using the `DataManager` and store it in a DataFrame.
- Print a success message and display the first few rows of the DataFrame.
- Catch and print any errors that occur during the CSV import.


In [7]:
# Get the current working directory
cwd = os.getcwd()
print("Current working directory:", cwd)

# Path to your CSV file
csv_file_path = "sample_table.csv"  # Define the path to the CSV file


# Read CSV data using DataManager
try:
    df = pd.read_csv(csv_file_path)  # Read the CSV file into a DataFrame using DataManager
    print("CSV file imported successfully!")  # Print success message
    display(df.head())  # Display the first few rows of the DataFrame
except Exception as e:
    print(f"Error importing CSV file: {e}")  # Print error message if CSV import fails


Current working directory: /Users/abubakarialidu/Documents/SEMT-py/semtui1.1/sample Notebooks
CSV file imported successfully!


Unnamed: 0,Fecha_id,Keyword,Impresiones,Clicks,City,County,Country
0,20230101,alquiler pisos colindres,1,0,Madrid,Community of Madrid,Spain
1,20230101,alquiler pisos sestao,1,0,Barcelona,Catalonia,Spain
2,20230101,steelcraft pedal car,1,0,Buffalo,New York,United States


# Processing DataFrame for Date Conversion

This segment of code processes the DataFrame to convert the 'Fecha_id' column to ISO date format using the `DataManager` class. It includes error handling to manage any issues that arise during the data processing.

### Code Explanation
- Attempt to process the DataFrame to convert the 'Fecha_id' column to ISO format.
- Print a success message and display the first few rows of the processed DataFrame.
- Catch and print any errors that occur during the data processing.


In [8]:
# Use DataModifier to modify the DataFrame
try:
    # Convert the 'Fecha_id' column to ISO date format
    df = DataModifier.iso_date(df, date_col='Fecha_id')

    # Uncomment the following lines if you need to perform the respective operations

    # df = DataModifier.lower_case(df, column='column_name')  # Convert column values to lowercase
    # df = DataModifier.drop_na(df)  # Remove rows with missing values
    # df = DataModifier.rename_columns(df, column_rename_dict={'old_name': 'new_name'})  # Rename columns
    # df = DataModifier.convert_dtypes(df, dtype_dict={'column_name': 'int'})  # Convert column data types
    # df = DataModifier.reorder_columns(df, new_column_order=['col1', 'col2', 'col3'])  # Reorder columns

    print("DataFrame modification successful!")

except ValueError as e:
    print(e)

DataFrame modification successful!


In [9]:
df

Unnamed: 0,Fecha_id,Keyword,Impresiones,Clicks,City,County,Country
0,2023-01-01,alquiler pisos colindres,1,0,Madrid,Community of Madrid,Spain
1,2023-01-01,alquiler pisos sestao,1,0,Barcelona,Catalonia,Spain
2,2023-01-01,steelcraft pedal car,1,0,Buffalo,New York,United States


# Retrieving and Displaying the List of Datasets

This segment of code retrieves the list of datasets from the server using the `DatasetManager` class. It then displays the retrieved datasets in a DataFrame format. Error handling is included to manage any issues during the retrieval process.

### Code Explanation
- Attempt to retrieve the list of datasets using the `DatasetManager`.
- Print a success message and display the DataFrame if datasets are retrieved successfully.
- Print a failure message if the retrieval fails.
- Catch and print any errors that occur during the retrieval process.


In [10]:
# Get the list of datasets
try:
    df_datasets = dataset_manager.get_database_list()  # Retrieve the list of datasets
    if df_datasets is not None:
        print("Datasets retrieved successfully!")  # Print success message
        display(df_datasets)  # Display the DataFrame containing the datasets
    else:
        print("Failed to retrieve datasets.")  # Print failure message if no datasets are retrieved
except Exception as e:
    print(f"Error retrieving datasets: {e}")  # Print error message if dataset retrieval fails


Datasets retrieved successfully!


(  id  userId                 name  nTables          lastModifiedDate
 0  4       0  Museums_correct-try        7  2024-04-27T18:54:00.810Z
 1  5       0             Tutorial       18  2024-06-17T17:32:50.388Z
 2  6       0     Tutorial Museums        4  2024-05-23T11:50:05.536Z
 3  7       0            All_Cases       21  2024-06-25T15:31:11.216Z,
 {'name': {'label': 'Name',
   'type': 'link',
   'props': {'url': '/datasets/:id/tables'}},
  'description': {'label': 'Description'},
  'nTables': {'label': 'N. Tables'},
  'mentions': {'label': 'N. Mentions'},
  'lastModifiedDate': {'label': 'Last Modified', 'type': 'date'}})

# Adding a Table to a Dataset

This segment of code demonstrates how to add a table to an existing dataset on the server using the `DatasetManager` class. It specifies the dataset ID and table name, then attempts to add the DataFrame as a table to the specified dataset. Error handling is included to manage any issues that arise during this process.

### Code Explanation
- Define the dataset ID and table name.
- Attempt to add the DataFrame as a table to the specified dataset using the `DatasetManager`.
- NOTE: this adding creates the JSON version of the table to be used in the next enrichment steps (reconciliation and extension)
- Catch and print any errors that occur during the process.

In [11]:
# Add the table to the dataset
dataset_id = "7"  # Replace with the actual dataset ID
table_name = "Bucharest_Jot"  # Define the name of the new table to add

In [12]:

try:
    dataset_manager.add_table_to_dataset(dataset_id, df, table_name)  # Attempt to add the DataFrame as a table to the dataset
    print(f"Table '{table_name}' added to dataset ID {dataset_id} successfully.")  # Print success message
except Exception as e:
    print(f"Error adding table to dataset: {e}")  # Print error message if adding table fails


Table added successfully!
New table added: ID: 169, Name: Bucharest_Jot
Table 'Bucharest_Jot' added to dataset ID 7 successfully.


# Listing Tables in a Dataset

This segment of code retrieves and lists the tables within a specified dataset using the `DatasetManager` class. It specifies the dataset ID and attempts to retrieve the list of tables. Error handling is included to manage any issues that arise during this process.

### Code Explanation
- Define the dataset ID.
- Attempt to list the tables in the specified dataset using the `DatasetManager`.
- Catch and print any errors that occur during the process.


In [13]:
try:
    dataset_manager.list_tables_in_dataset(dataset_id) # Attempt to list the tables in the dataset
except Exception as e:
    print(f"Error listing tables in dataset: {e}")

Tables in dataset 7:
ID: 90, Name: JOT_Case
ID: 100, Name: SpendNetwork-TenderData Alligator
ID: 102, Name: JOT_Case
ID: 103, Name: JOT_Example
ID: 112, Name: JOT data tutorial fixed
ID: 114, Name: JOT data tutorial mini
ID: 115, Name: JOT small table
ID: 147, Name: New_JOT_tiny_OK_reconciled
ID: 148, Name: New_JOT_tiny_20240624_UI
ID: 149, Name: New_JOT_tiny_reconciled_20240624
ID: 150, Name: New_JOT_tiny_fixed reconciled
ID: 151, Name: table 1
ID: 152, Name: New_JOT_tiny_reconciled_20240628
ID: 153, Name: New_JOT_tiny_reconciled_final
ID: 154, Name: New_JOT_tiny_reconciled_last
ID: 155, Name: New_JOT_tiny_reconciled_OK
ID: 164, Name: New_tiny_Bucharest
ID: 165, Name: New_test_Bucharest
ID: 166, Name: New_test2_Bucharest
ID: 167, Name: New_Bucharest3
ID: 168, Name: New_JOT_19_sep
ID: 169, Name: Bucharest_Jot


In [14]:
try:
    table_data = dataset_manager.get_table_by_name(dataset_id, table_name)  # Attempt to retrieve the specified table from the dataset
    if table_data:
        print(f"Table '{table_name}' retrieved successfully!")  # Print success message if table is retrieved
        # No need to display the DataFrame
    else:
        print(f"Table '{table_name}' not found in the dataset.")  # Print message if table is not found
except Exception as e:
    print(f"Error retrieving table '{table_name}': {e}")  # Print error message if retrieving the table fails

Table 'Bucharest_Jot' retrieved successfully!


In [16]:
table_data

{'table': {'id': '246',
  'idDataset': '27',
  'name': 'New_JOT_tiny_02072024_4',
  'nCols': 7,
  'nRows': 3,
  'nCells': 21,
  'nCellsReconciliated': 0,
  'lastModifiedDate': '2024-07-02T19:52:13.354Z'},
 'columns': {'Fecha_id': {'id': 'Fecha_id',
   'label': 'Fecha_id',
   'status': 'empty',
   'context': {},
   'metadata': []},
  'Keyword': {'id': 'Keyword',
   'label': 'Keyword',
   'status': 'empty',
   'context': {},
   'metadata': []},
  'Impresiones': {'id': 'Impresiones',
   'label': 'Impresiones',
   'status': 'empty',
   'context': {},
   'metadata': []},
  'Clicks': {'id': 'Clicks',
   'label': 'Clicks',
   'status': 'empty',
   'context': {},
   'metadata': []},
  'City': {'id': 'City',
   'label': 'City',
   'status': 'empty',
   'context': {},
   'metadata': []},
  'County': {'id': 'County',
   'label': 'County',
   'status': 'empty',
   'context': {},
   'metadata': []},
  'Country': {'id': 'Country',
   'label': 'Country',
   'status': 'empty',
   'context': {},
   'me

# Retrieving a Table by Name from a Dataset

This segment of code demonstrates how to retrieve a specific table from an existing dataset on the server using the `DatasetManager` class. It specifies the dataset ID and table name, then attempts to fetch the table data. Error handling is included to manage any issues that arise during this process.

### Code Explanation
- Define the dataset ID and the table name to be retrieved.
- Attempt to retrieve the specified table from the dataset using the `DatasetManager`.
- NOTE: the retrived table is in JSON format, ready to be reconciled, and/or extended.
- Print a success message if the table is retrieved successfully.
- Print a failure message if the table is not found in the dataset.
- Catch and print any errors that occur during the retrieval process.


In [12]:
try:
    table_data = dataset_manager.get_table_by_name(dataset_id, table_name)  # Attempt to retrieve the specified table from the dataset
    if table_data:
        print(f"Table '{table_name}' retrieved successfully!")  # Print success message if table is retrieved
        # No need to display the DataFrame
    else:
        print(f"Table '{table_name}' not found in the dataset.")  # Print message if table is not found
except Exception as e:
    print(f"Error retrieving table '{table_name}': {e}")  # Print error message if retrieving the table fails


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): 149.132.176.67:3001
DEBUG:urllib3.connectionpool:http://149.132.176.67:3001 "GET /api/dataset/27/table HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): 149.132.176.67:3001
DEBUG:urllib3.connectionpool:http://149.132.176.67:3001 "GET /api/dataset/27/table/242 HTTP/1.1" 200 None


Table 'New_JOT_tiny_02072024_2' retrieved successfully!


In [18]:
# Show the retrieved table
#table_data

# Retrieving the List of Reconcilers

This segment of code tests the retrieval of the list of reconcilers using the `ReconciliationManager` class. It attempts to fetch and display the reconcilers in a DataFrame. Error handling is included to manage any issues that arise during the retrieval process.

### Code Explanation
- Attempt to retrieve the list of reconcilers using the `ReconciliationManager`.
- Print a success message and display the DataFrame if reconcilers are retrieved successfully.
- Print a failure message if the retrieval fails.
- Catch and print any errors that occur during the retrieval process.


In [17]:
# Test get_reconciliators_list
try:
    reconciliators_list = reconciliation_manager.get_reconciliators_list()  # Attempt to retrieve the list of reconciliators
    if reconciliators_list is not None:
        print("Reconciliators retrieved successfully!")  # Print success message
        display(reconciliators_list.head())  # Display the first few rows of the DataFrame
    else:
        print("Failed to retrieve reconciliators.")  # Print failure message if no reconciliators are retrieved
except Exception as e:
    print(f"Error retrieving reconciliators: {e}")  # Print error message if retrieving reconciliators fails

DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): 149.132.176.67:3001
DEBUG:urllib3.connectionpool:http://149.132.176.67:3001 "GET /api/reconciliators/list HTTP/1.1" 200 None


Reconciliators retrieved successfully!


Unnamed: 0,id,relativeUrl,name
0,geocodingGeonames,/dataset,Geocoding: geo coordinates (GeoNames)
1,geocodingHere,/here,Geocoding: geo coordinates (HERE)
2,geonames,/dataset,Linking: GeoNames (GeoNames)
3,wikidataAlligator,/dataset,Linking: Wikidata (Alligator)
4,wikidataOpenRefine,/wikidata,Linking: Wikidata (OpenRefine)


In [38]:
# Test get_reconciler_parameters
reconciliator_id = "geocodingHere"  # Replace with the actual reconciliator ID

# Get the reconciliator parameters
try:
    params = reconciliation_manager.get_reconciliator_parameters(id_reconciliator=reconciliator_id, print_params=True)  # Attempt to retrieve parameters
    if params:
        print(f"Parameters for reconciliator '{reconciliator_id}' retrieved successfully!")  # Print success message
    else:
        print(f"Failed to retrieve parameters for reconciliator '{reconciliator_id}'.")  # Print failure message if retrieval fails
except Exception as e:
    print(f"Error retrieving parameters for reconciliator '{reconciliator_id}': {e}")  # Print error message if retrieving parameters fails

Parameters for reconciliator 'geocodingHere':
Mandatory parameters:
- table (json): Mandatory
  Description: The table data in JSON format
- columnName (string): Mandatory
  Description: The name of the column to reconcile
- idReconciliator (string): Mandatory
  Description: The ID of the reconciliator to use

Optional parameters:
- secondPart (selectColumns): Optional
  Description: Optional column to add information to support reconciliation.
  Label: Select a column with information about the location to reconcile
  Info Text: 
- thirdPart (selectColumns): Optional
  Description: Optional column to add information to support reconciliation.
  Label: Select a column with information about the location to reconcile
  Info Text: 
- fourthPart (selectColumns): Optional
  Description: Optional column to add information to support reconciliation.
  Label: Select a column with information about the location to reconcile
  Info Text: 
Parameters for reconciliator 'geocodingHere' retrieved s

# Reconciling a Column in a Table

This segment of code tests the reconciliation of a specific column in a table using the `ReconciliationManager` class. It specifies the dataset ID, table name, column name, and reconciliator ID, then attempts to reconcile the column. Error handling is included to manage any issues that arise during the reconciliation process.

### Code Explanation
- Define the table name and the column name to be reconciled.
- Define the ID of the reconciliator.
- Attempt to retrieve the table data.
- Attempt to reconcile the specified column using the `ReconciliationManager`.
- Print a success message if the column is reconciled successfully.
- Print a failure message if the reconciliation fails.
- Catch and print any errors that occur during the reconciliation process.


In [58]:
column_name = "City"
reconciliator_id = "geocodingHere"
optional_columns = ['County', 'Country']

try:
    reconciled_table, backend_payload = reconciliation_manager.reconcile(
        table_data, 
        column_name, 
        reconciliator_id, 
        optional_columns
    )
    if reconciled_table and backend_payload:
        print("Column reconciled successfully and backend payload created!")
    else:
        print("Failed to reconcile column or create backend payload.")
except Exception as e:
    print(f"Error during reconciliation process: {e}")


Column reconciled successfully and backend payload created!


In [59]:
# Show the reconciled table
reconciled_table

{'table': {'id': '246',
  'idDataset': '27',
  'name': 'New_JOT_tiny_02072024_4',
  'nCols': 7,
  'nRows': 3,
  'nCells': 21,
  'nCellsReconciliated': 0,
  'lastModifiedDate': '2024-07-02T19:52:13.354Z'},
 'columns': {'Fecha_id': {'id': 'Fecha_id',
   'label': 'Fecha_id',
   'status': 'empty',
   'context': {},
   'metadata': []},
  'Keyword': {'id': 'Keyword',
   'label': 'Keyword',
   'status': 'empty',
   'context': {},
   'metadata': []},
  'Impresiones': {'id': 'Impresiones',
   'label': 'Impresiones',
   'status': 'empty',
   'context': {},
   'metadata': []},
  'Clicks': {'id': 'Clicks',
   'label': 'Clicks',
   'status': 'empty',
   'context': {},
   'metadata': []},
  'City': {'id': 'City',
   'label': 'City',
   'status': 'pending',
   'context': {'georss': {'uri': 'http://www.google.com/maps/place/',
     'total': 3,
     'reconciliated': 3}},
   'metadata': [{'id': '',
     'match': True,
     'score': 0,
     'name': {'value': '', 'uri': ''},
     'entity': [{'id': 'wd:Q29

In [None]:
# Show the reconciled payload
# backend_payload

# Push reconciled table to the backend

In [18]:
# Push to backend
successMessage, sentPayload = utility.push_to_backend(
    dataset_id="27", 
    table_id="238", 
    payload=backend_payload, 
    enable_logging=True
)

print(successMessage)

Payload being sent:
{
  "tableInstance": {
    "id": "245",
    "idDataset": "27",
    "name": "New_JOT_tiny_02072024_3",
    "nCols": 7,
    "nRows": 3,
    "nCells": 21,
    "nCellsReconciliated": 3,
    "lastModifiedDate": "2024-07-02T18:58:23.750Z",
    "minMetaScore": 0,
    "maxMetaScore": 1
  },
  "columns": {
    "byId": {
      "Fecha_id": {
        "id": "Fecha_id",
        "label": "Fecha_id",
        "status": "empty",
        "context": {},
        "metadata": []
      },
      "Keyword": {
        "id": "Keyword",
        "label": "Keyword",
        "status": "empty",
        "context": {},
        "metadata": []
      },
      "Impresiones": {
        "id": "Impresiones",
        "label": "Impresiones",
        "status": "empty",
        "context": {},
        "metadata": []
      },
      "Clicks": {
        "id": "Clicks",
        "label": "Clicks",
        "status": "empty",
        "context": {},
        "metadata": []
      },
      "City": {
        "id": "City",
 

# Retrieving the List of Extenders

This segment of code retrieves the list of extenders using the `ExtensionManager` class. It attempts to fetch and display the extenders in a DataFrame. Error handling is included to manage any issues that arise during the retrieval process.

### Code Explanation
- Attempt to retrieve the list of extenders using the `ExtensionManager`.
- Print a success message and display the DataFrame if extenders are retrieved successfully.
- Print a failure message if the retrieval fails.
- Catch and print any errors that occur during the retrieval process.


In [19]:
# Get Extender List
try:
    extenders_list = extension_manager.get_extenders_list()  # Attempt to retrieve the list of extenders
    if extenders_list is not None:
        print("Extenders retrieved successfully!")  # Print success message
        display(extenders_list.head())  # Display the first few rows of the DataFrame
    else:
        print("Failed to retrieve extenders.")  # Print failure message if no extenders are retrieved
except Exception as e:
    print(f"Error retrieving extenders: {e}")  # Print error message if retrieving extenders fails

Extenders retrieved successfully!


Unnamed: 0,id,relativeUrl,name
0,geoPropertiesWikidata,/wikidata/entities,Geo Properties (Wikidata)
1,geoRouteHere,,Geo Route (HERE)
2,meteoPropertiesOpenMeteo,,Meteo Properties (OpenMeteo)
3,reconciledColumnExt,,Annotation properties
4,reconciledColumnExtWikidata,/entity/labels,Annotation properties (Wikidata)


# Retrieving Parameters for a Specific Extender

This segment of code tests the retrieval of parameters for a specific extender using the `ExtensionManager` class. It specifies the extender ID and attempts to fetch its parameters. The parameters are printed if the retrieval is successful. Error handling is included to manage any issues that arise during the retrieval process.

### Code Explanation
- Define the ID of the extender.
- Attempt to retrieve the parameters for the specified extender using the `ExtensionManager`.
- Print a success message if the parameters are retrieved successfully.
- Print a failure message if the retrieval fails.
- Catch and print any errors that occur during the retrieval process.


In [20]:
# Test get_extender_parameters
extender_id = "meteoPropertiesOpenMeteo"  # Replace with the actual extender ID

# Get the extender parameters
try:
    params = extension_manager.get_extender_parameters(extender_id, print_params=True)  # Attempt to retrieve parameters
    if params:
        print(f"Parameters for extender '{extender_id}' retrieved successfully!")  # Print success message
    else:
        print(f"Failed to retrieve parameters for extender '{extender_id}'.")  # Print failure message if retrieval fails
except Exception as e:
    print(f"Error retrieving parameters for extender '{extender_id}': {e}")  # Print error message if retrieving parameters fails

Parameters for extender 'meteoPropertiesOpenMeteo':
Mandatory parameters:
- dates (selectColumns): Mandatory
  Description: Select a column with the days on which to retrieve the weather data:
  Label: Select a column with days in ISO8601 format (yyyy-mm-dd)
  Info Text: Only dates prior to 10 days are covered (ISO8601 format yyyy-mm-dd)
  Options: []

- weatherParams (checkbox): Mandatory
  Description: Select one or more <b>weather</b> parameters:
  Label: Weather parameters
  Info Text: Meteo parameters to extend the table
  Options: [{'id': 'daylight_duration', 'label': 'Number of seconds of daylight', 'value': 'daylight_duration'}, {'id': 'light_hours', 'label': 'Sun rise and set times UTC in ISO8601', 'value': 'light_hours'}, {'id': 'apparent_temperature_max', 'label': 'Maximum daily temperature in °C', 'value': 'apparent_temperature_max'}, {'id': 'apparent_temperature_min', 'label': 'Minimum daily temperature in °C', 'value': 'apparent_temperature_min'}, {'id': 'precipitation_su

In [21]:

try:
    # Get options for the 'weatherParams' parameter of 'meteoPropertiesOpenMeteo' extender
    options = extension_manager.get_parameter_options('meteoPropertiesOpenMeteo', 'weatherParams')
    print(f"Options for 'weatherParams' parameter of 'meteoPropertiesOpenMeteo' extender: {options}")
except Exception as e:
    print(f"Error retrieving options for 'meteoPropertiesOpenMeteo' extender: {e}")


try:
    # Get options for the 'property' parameter of 'geoPropertiesWikidata' extender
    options = extension_manager.get_parameter_options('geoPropertiesWikidata', 'property')
    print(f"Options for 'property' parameter of 'geoPropertiesWikidata' extender: {options}")
except Exception as e:
    print(f"Error retrieving options for 'geoPropertiesWikidata' extender: {e}")

try:
    # Get options for the 'labels' parameter of 'reconciledColumnExtWikidata' extender
    options = extension_manager.get_parameter_options('reconciledColumnExtWikidata', 'labels')
    print(f"Options for 'labels' parameter of 'reconciledColumnExtWikidata' extender: {options}")
except Exception as e:
    print(f"Error retrieving options for 'reconciledColumnExtWikidata' extender: {e}")

try:
    # Get options for the 'property' parameter of 'geoRouteHere' extender
    options = extension_manager.get_parameter_options('geoRouteHere', 'property')
    print(f"Options for 'property' parameter of 'geoRouteHere' extender: {options}")
except Exception as e:
    print(f"Error retrieving options for 'geoRouteHere' extender: {e}")

try:
    # Get options for the 'poi_property' parameter of 'geoRouteHere' extender
    options = extension_manager.get_parameter_options('geoRouteHere', 'poi_property')
    print(f"Options for 'poi_property' parameter of 'geoRouteHere' extender: {options}")
except Exception as e:
    print(f"Error retrieving options for 'poi_property' parameter of 'geoRouteHere' extender: {e}")

try:
    # Get options for the 'property' parameter of 'reconciledColumnExt' extender
    options = extension_manager.get_parameter_options('reconciledColumnExt', 'property')
    print(f"Options for 'property' parameter of 'reconciledColumnExt' extender: {options}")
except Exception as e:
    print(f"Error retrieving options for 'reconciledColumnExt' extender: {e}")


Options for 'weatherParams' parameter of 'meteoPropertiesOpenMeteo' extender: ['daylight_duration', 'light_hours', 'apparent_temperature_max', 'apparent_temperature_min', 'precipitation_sum', 'precipitation_hours']
Options for 'property' parameter of 'geoPropertiesWikidata' extender: ['P625', 'P421', 'P281']
Options for 'labels' parameter of 'reconciledColumnExtWikidata' extender: ['id', 'url', 'name', 'description']
Options for 'property' parameter of 'geoRouteHere' extender: ['duration', 'length', 'route']
Options for 'poi_property' parameter of 'geoRouteHere' extender: ['poi']
Options for 'property' parameter of 'reconciledColumnExt' extender: ['id', 'name']


# Extending a Column with Additional Properties

This segment of code tests the extension of a column in the reconciled table using the `ExtensionManager` class. It specifies the column containing reconciled IDs, the properties to extend, the new column names, the date column, and the extender ID. The properties are added to the DataFrame, creating new columns. Error handling is included to manage any issues that arise during the extension process.

### Code Explanation
- Define the column containing reconciled IDs.
- Specify the properties to be added and their corresponding new column names.
- Define the date column name.
- Define the ID of the extender.
- Attempt to extend the specified column using the `ExtensionManager`.
- Print a success message if the column is extended successfully.
- Print a failure message if the extension fails.
- Catch and print any errors that occur during the extension process.


In [22]:
# Mandatory parameters
reconciliated_column_name = 'City'  # Column that contains reconciled IDs
extender_id = "meteoPropertiesOpenMeteo"  # ID for Open Meteo Properties extender
properties = ['apparent_temperature_max', 'apparent_temperature_min', 'precipitation_sum', 'precipitation_hours']  # Properties to extend

# Optional parameters
date_column_name = "Fecha_id"  # Date column name for weather data
separator_format = "comma"  # Decimal format, options are "default" or "comma"

try:
    extended_table, extension_payload = extension_manager.extend_column(
        table=reconciled_table,
        reconciliated_column_name=reconciliated_column_name,
        id_extender=extender_id,
        properties=properties,
        date_column_name=date_column_name,
        decimal_format=separator_format
    )

    if extended_table:
        print("Column extended successfully!")
    else:
        print("Failed to extend column.")
except Exception as e:
    print(f"Error extending column: {str(e)}")

Column extended successfully!


In [23]:
extended_table

{'table': {'id': '245',
  'idDataset': '27',
  'name': 'New_JOT_tiny_02072024_3',
  'nCols': 7,
  'nRows': 3,
  'nCells': 21,
  'nCellsReconciliated': 0,
  'lastModifiedDate': '2024-07-02T18:58:23.750Z'},
 'columns': {'Fecha_id': {'id': 'Fecha_id',
   'label': 'Fecha_id',
   'status': 'empty',
   'context': {},
   'metadata': []},
  'Keyword': {'id': 'Keyword',
   'label': 'Keyword',
   'status': 'empty',
   'context': {},
   'metadata': []},
  'Impresiones': {'id': 'Impresiones',
   'label': 'Impresiones',
   'status': 'empty',
   'context': {},
   'metadata': []},
  'Clicks': {'id': 'Clicks',
   'label': 'Clicks',
   'status': 'empty',
   'context': {},
   'metadata': []},
  'City': {'id': 'City',
   'label': 'City',
   'status': 'pending',
   'context': {'georss': {'uri': 'http://www.google.com/maps/place/',
     'total': 3,
     'reconciliated': 3}},
   'metadata': [{'id': '',
     'match': True,
     'score': 0,
     'name': {'value': '', 'uri': ''},
     'entity': [{'id': 'wd:Q29

In [31]:
#extension_payload

In [24]:
# Call for reconciledColumnExt
properties = ['id', 'name']
reconciliated_column_name = 'City'
extender_id = 'reconciledColumnExt'

try:
    extended_table, backend_payload = extension_manager.extend_reconciledColumnExt(
        table=extended_table,
        reconciliated_column_name=reconciliated_column_name,
        id_extender=extender_id,
        properties=properties
    )

    if extended_table and backend_payload:
        print("Column extended successfully!")
        print("Extended table:", json.dumps(extended_table, indent=2))
        print("Backend payload:", json.dumps(backend_payload, indent=2))
    else:
        print("Failed to extend column.")
except Exception as e:
    print(f"Error extending column: {str(e)}")

API Response: {
  "columns": {
    "id_City": {
      "label": "id_City",
      "metadata": [],
      "cells": {
        "r0": {
          "label": "40.41955,-3.69196",
          "metadata": []
        },
        "r1": {
          "label": "41.38804,2.17001",
          "metadata": []
        },
        "r2": {
          "label": "42.88545,-78.87846",
          "metadata": []
        }
      }
    },
    "name_City": {
      "label": "name_City",
      "metadata": [],
      "cells": {
        "r0": {
          "label": "Madrid, Community of Madrid, Spain",
          "metadata": []
        },
        "r1": {
          "label": "Barcelona, Catalonia, Spain",
          "metadata": []
        },
        "r2": {
          "label": "Buffalo, NY, United States",
          "metadata": []
        }
      }
    }
  },
  "meta": {}
}
Column extended successfully!
Extended table: {
  "table": {
    "id": "245",
    "idDataset": "27",
    "name": "New_JOT_tiny_02072024_3",
    "nCols": 9,
    "nRows

In [25]:
extended_table

{'table': {'id': '245',
  'idDataset': '27',
  'name': 'New_JOT_tiny_02072024_3',
  'nCols': 9,
  'nRows': 3,
  'nCells': 27,
  'nCellsReconciliated': 3,
  'lastModifiedDate': '2024-07-02T18:58:23.750Z',
  'minMetaScore': 0,
  'maxMetaScore': 1},
 'columns': {'Fecha_id': {'id': 'Fecha_id',
   'label': 'Fecha_id',
   'status': 'empty',
   'context': {},
   'metadata': []},
  'Keyword': {'id': 'Keyword',
   'label': 'Keyword',
   'status': 'empty',
   'context': {},
   'metadata': []},
  'Impresiones': {'id': 'Impresiones',
   'label': 'Impresiones',
   'status': 'empty',
   'context': {},
   'metadata': []},
  'Clicks': {'id': 'Clicks',
   'label': 'Clicks',
   'status': 'empty',
   'context': {},
   'metadata': []},
  'City': {'id': 'City',
   'label': 'City',
   'status': 'reconciliated',
   'context': {'georss': {'uri': 'http://www.google.com/maps/place/',
     'total': 3,
     'reconciliated': 3}},
   'metadata': [{'id': '',
     'match': True,
     'score': 0,
     'name': {'value'

In [26]:
# Push to backend
successMessage, sentPayload = utility.push_to_backend(
    dataset_id="27", 
    table_id="245", 
    payload=backend_payload, 
    enable_logging=True
)

print(successMessage)

Payload being sent:
{
  "tableInstance": {
    "id": "245",
    "idDataset": "27",
    "name": "New_JOT_tiny_02072024_3",
    "nCols": 9,
    "nRows": 3,
    "nCells": 27,
    "nCellsReconciliated": 3,
    "lastModifiedDate": "2024-07-02T18:58:23.750Z",
    "minMetaScore": 0,
    "maxMetaScore": 1
  },
  "columns": {
    "byId": {
      "Fecha_id": {
        "id": "Fecha_id",
        "label": "Fecha_id",
        "status": "empty",
        "context": {},
        "metadata": []
      },
      "Keyword": {
        "id": "Keyword",
        "label": "Keyword",
        "status": "empty",
        "context": {},
        "metadata": []
      },
      "Impresiones": {
        "id": "Impresiones",
        "label": "Impresiones",
        "status": "empty",
        "context": {},
        "metadata": []
      },
      "Clicks": {
        "id": "Clicks",
        "label": "Clicks",
        "status": "empty",
        "context": {},
        "metadata": []
      },
      "City": {
        "id": "City",
 

In [26]:
import requests
import json

def extract_payload(input_data):
    city_data = input_data['columns']['City']
    rows = input_data['rows']
    
    items = {}
    column = {}
    
    for row_id, row_data in rows.items():
        city_cell = row_data['cells']['City']
        city_name = city_cell['label']
        city_metadata = city_cell['metadata'][0]
        
        items.setdefault('City', {})[row_id] = city_metadata['id']
        column[row_id] = [
            city_name,
            [city_metadata],
            "City"
        ]
    
    return {
        "serviceId": "reconciledColumnExt",
        "items": items,
        "column": column,
        "property": ["id"]
    }



# Extract the payload for the POST request
post_payload = extract_payload(extended_table)


In [27]:
# Define the URL
url = "http://149.132.176.67:3001/api/extenders"

# Define the headers
headers = {
    "Content-Type": "application/json;charset=UTF-8",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Origin": "http://149.132.176.67:3000",
    "Referer": "http://149.132.176.67:3000/"
}

# Send the POST request
response = requests.post(url, headers=headers, data=json.dumps(post_payload))

# Check the response
if response.status_code == 200:
    print("Request successful!")
    print("Response:", response.json())
else:
    print("Request failed with status code:", response.status_code)
    print("Response:", response.text)

DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): 149.132.176.67:3001
DEBUG:urllib3.connectionpool:http://149.132.176.67:3001 "POST /api/extenders HTTP/1.1" 200 224


Request successful!
Response: {'columns': {'id_City': {'label': 'id_City', 'metadata': [], 'cells': {'r0': {'label': '40.41955,-3.69196', 'metadata': []}, 'r1': {'label': '41.38804,2.17001', 'metadata': []}, 'r2': {'label': '42.88545,-78.87846', 'metadata': []}}}}, 'meta': {}}


In [34]:
import copy

def merge_data(input_data, api_response):
    output = copy.deepcopy(input_data)
    
    # Update table data
    output['table']['nCols'] += 2  # Adding id_City and name_City
    output['table']['nCells'] = output['table']['nRows'] * output['table']['nCols']
    output['table']['nCellsReconciliated'] = 3
    output['table']['minMetaScore'] = 0
    output['table']['maxMetaScore'] = 1

    # Update columns
    output['columns']['City']['status'] = 'reconciliated'
    output['columns']['City']['metadata'] = [{
        'id': 'None:',
        'match': True,
        'score': 0,
        'name': {'value': '', 'uri': ''},
        'entity': [
            {
                'id': 'wd:Q29934236',
                'name': {'value': 'GlobeCoordinate', 'uri': 'http://149.132.176.67:3002/map?polyline=Q29934236'},
                'score': 0,
                'match': True,
                'type': []
            },
            {
                'id': 'georss:point',
                'name': {'value': 'point', 'uri': 'http://149.132.176.67:3002/map?polyline=point'},
                'score': 0,
                'match': True,
                'type': []
            }
        ]
    }]
    output['columns']['City']['annotationMeta'] = {
        'annotated': True,
        'match': {'value': True, 'reason': 'reconciliator'},
        'lowestScore': 0,
        'highestScore': 0
    }
    
    # Remove unnecessary fields from City column
    output['columns']['City'].pop('kind', None)
    
    # Add new columns
    new_columns = ['id_City', 'name_City']
    for col in new_columns:
        output['columns'][col] = {
            'id': col,
            'label': col,
            'metadata': [],
            'status': 'empty',
            'context': {}
        }

    # Update rows
    for row_id, row_data in output['rows'].items():
        city_cell = row_data['cells']['City']
        city_metadata = city_cell['metadata'][0]
        
        # Update City cell
        city_cell['metadata'] = [{
            'id': city_metadata['id'],
            'name': {
                'value': city_metadata['name']['value'],
                'uri': f"http://149.132.176.67:3002/map?polyline={city_metadata['id'].split(':')[1]}"
            },
            'feature': city_metadata['feature'],
            'score': city_metadata['score'],
            'match': city_metadata['match'],
            'type': city_metadata['type']
        }]
        city_cell['annotationMeta'] = {
            'annotated': True,
            'match': {'value': True, 'reason': 'reconciliator'},
            'lowestScore': 1,
            'highestScore': 1
        }
        
        # Add new cells
        row_data['cells']['id_City'] = {
            'id': f"{row_id}$id_City",
            'label': api_response['columns']['id_City']['cells'][row_id]['label'],
            'metadata': []
        }
        row_data['cells']['name_City'] = {
            'id': f"{row_id}$name_City",
            'label': city_metadata['name']['value'],
            'metadata': []
        }

        # Keep existing temperature and precipitation fields
        for field in ['City_apparent_temperature_max', 'City_apparent_temperature_min', 
                      'City_precipitation_sum', 'City_precipitation_hours']:
            if field in row_data['cells']:
                row_data['cells'][field]['label'] = row_data['cells'][field]['label'][0]  # Convert list to string

    return output

In [35]:
# Your input payload and API response (as before)
input_payload = extended_table
api_response = response.json()

# Merge the data
output_data = merge_data(input_payload, api_response)

# Print or process the output_data as needed
print(json.dumps(output_data, indent=2))

{
  "table": {
    "id": "240",
    "idDataset": "27",
    "name": "New_JOT_tiny_02072024_1",
    "nCols": 9,
    "nRows": 3,
    "nCells": 27,
    "nCellsReconciliated": 3,
    "lastModifiedDate": "2024-07-02T12:35:13.235Z",
    "minMetaScore": 0,
    "maxMetaScore": 1
  },
  "columns": {
    "Fecha_id": {
      "id": "Fecha_id",
      "label": "Fecha_id",
      "status": "empty",
      "context": {},
      "metadata": []
    },
    "Keyword": {
      "id": "Keyword",
      "label": "Keyword",
      "status": "empty",
      "context": {},
      "metadata": []
    },
    "Impresiones": {
      "id": "Impresiones",
      "label": "Impresiones",
      "status": "empty",
      "context": {},
      "metadata": []
    },
    "Clicks": {
      "id": "Clicks",
      "label": "Clicks",
      "status": "empty",
      "context": {},
      "metadata": []
    },
    "City": {
      "id": "City",
      "label": "City",
      "status": "reconciliated",
      "context": {
        "georss": {
        

# Push extended table to the backend

In [32]:
# Push to backend
successMessage, sentPayload = utility.push_to_backend(
    dataset_id="27", 
    table_id="238", 
    payload=extension_payload, 
    enable_logging=True
)

print(successMessage)

DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): 149.132.176.67:3001
DEBUG:urllib3.connectionpool:http://149.132.176.67:3001 "PUT /api/dataset/27/table/238 HTTP/1.1" 200 197


Payload being sent:
{
  "tableInstance": {
    "id": "238",
    "idDataset": "27",
    "name": "New_JOT_tiny_02072024",
    "nCols": 7,
    "nRows": 3,
    "nCells": 21,
    "nCellsReconciliated": 3,
    "lastModifiedDate": "2024-07-02T08:23:37.013Z",
    "minMetaScore": 0,
    "maxMetaScore": 1
  },
  "columns": {
    "byId": {
      "Fecha_id": {
        "id": "Fecha_id",
        "label": "Fecha_id",
        "status": "empty",
        "context": {},
        "metadata": []
      },
      "Keyword": {
        "id": "Keyword",
        "label": "Keyword",
        "status": "empty",
        "context": {},
        "metadata": []
      },
      "Impresiones": {
        "id": "Impresiones",
        "label": "Impresiones",
        "status": "empty",
        "context": {},
        "metadata": []
      },
      "Clicks": {
        "id": "Clicks",
        "label": "Clicks",
        "status": "empty",
        "context": {},
        "metadata": []
      },
      "City": {
        "id": "City",
   

# Download Final Table

In [34]:
#downloaded_file = utility.download_csv(dataset_id=30, table_id=228)
downloaded_file = utility.download_csv(dataset_id=27, table_id=238, output_file="JoT_Extended.csv")

DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): 149.132.176.67:3001
DEBUG:urllib3.connectionpool:http://149.132.176.67:3001 "GET /api/dataset/27/table/238/export?format=csv HTTP/1.1" 200 567


CSV file has been downloaded successfully and saved as JoT_Extended.csv


In [35]:
# Download W3C JSON
json_file = utility.download_w3c_json(dataset_id='27', table_id='238')

# If you want to parse the JSON into a DataFrame
with open(json_file, 'r') as f:
    json_data = json.load(f)

df = utility.parse_w3c_json(json_data)

DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): 149.132.176.67:3001
DEBUG:urllib3.connectionpool:http://149.132.176.67:3001 "GET /api/dataset/27/table/238/export?format=w3c HTTP/1.1" 200 None


W3C JSON file has been downloaded successfully and saved as downloaded_data.json
     Fecha_id                   Keyword Impresiones Clicks       City  \
0  2023-01-01  alquiler pisos colindres           1      0     Madrid   
1  2023-01-01     alquiler pisos sestao           1      0  Barcelona   
2  2023-01-01      steelcraft pedal car           1      0    Buffalo   

                County        Country City_apparent_temperature_max  \
0  Community of Madrid          Spain                        [12,7]   
1            Catalonia          Spain                        [16,5]   
2             New York  United States                         [2,6]   

  City_apparent_temperature_min City_precipitation_sum  \
0                         [0,8]                    [0]   
1                         [4,2]                    [0]   
2                        [-2,8]                  [2,4]   

  City_precipitation_hours  
0                      [0]  
1                      [0]  
2                    

In [36]:
df

Unnamed: 0,Fecha_id,Keyword,Impresiones,Clicks,City,County,Country,City_apparent_temperature_max,City_apparent_temperature_min,City_precipitation_sum,City_precipitation_hours
0,2023-01-01,alquiler pisos colindres,1,0,Madrid,Community of Madrid,Spain,"[12,7]","[0,8]",[0],[0]
1,2023-01-01,alquiler pisos sestao,1,0,Barcelona,Catalonia,Spain,"[16,5]","[4,2]",[0],[0]
2,2023-01-01,steelcraft pedal car,1,0,Buffalo,New York,United States,"[2,6]","[-2,8]","[2,4]",[5]
