In [67]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import Optional
from omegaconf import DictConfig
from ydata_profiling import ProfileReport, compare

%matplotlib inline

from src.utils import load_data
from src.utils import load_config

In [36]:
# Environment whose configuration files and data folders are used by the cells below.
env = "prod"
# NOTE(review): model_type is not referenced by any cell visible in this notebook — confirm it is still needed.
model_type = "XGBoost"

In [37]:
# Load the data and processing configurations for the selected environment.
# Both files are read from the same folder, so they are loaded in one pass and
# unpacked in the order the names are listed.
data_config, processing_config = (
    load_config(file_name=config_name, env=env, folder="../../config")
    for config_name in ("data_config", "processing_config")
)
# The training configuration is currently not loaded in this notebook:
# training_config = load_config(file_name="training_config", env=env, folder='../../config')

# Raw dataset

In [39]:
def __load_incidencias_data(
    config: DictConfig,
    tables: list[str],
    raw_data_folder: str,
) -> tuple[Optional[pd.DataFrame], ...]:
    """
    Load one raw CSV file per requested table.

    Each table name is resolved to ``<raw_data_folder>/<table>.csv`` and read
    with ``load_data``, using that table's entry in ``config.processing`` as
    the step configuration.

    :param config: Configuration exposing a ``processing`` section indexed by table name
    :param tables: Names of the tables to load
    :param raw_data_folder: Folder containing the ``<table>.csv`` files
    :return: Tuple with one ``load_data`` result per table, in the order of ``tables``
    """
    loaded_tables = []
    for table_name in tables:
        csv_path = os.path.join(raw_data_folder, f"{table_name}.csv")
        table_config = config.processing[table_name]
        loaded_tables.append(load_data(data_path=csv_path, step_config=table_config))
    return tuple(loaded_tables)

In [40]:
# Raw tables to load. The order matters: the loaded results are unpacked
# positionally into incidencias, piezas, estados and incidencias_tipo.
raw_tables = [
    "incidencias",
    "piezas",
    "estados",
    "incidencias_tipo",
]

# Folder holding one "<table>.csv" file per raw table for the selected environment.
raw_data_folder = f"../../raw_data/{env}/"

In [41]:
# Get the data: one loaded result per entry of raw_tables, unpacked in the same order
incidencias, piezas, estados, incidencias_tipo = __load_incidencias_data(
    config=processing_config, tables=raw_tables, raw_data_folder=raw_data_folder
)

In [42]:
# Merge the data: start from incidencias and left-join the three other tables,
# then parse modification_date. Left joins keep every incidencias row; columns
# of the right-hand tables that clash with existing names receive a suffix.
dataset = (
    incidencias
    # piezas rows are matched on the incident code (one incident can match several rows)
    .merge(
        piezas,
        how="left",
        left_on="codigo",
        right_on="codigo_incidencia",
        suffixes=(None, "_pieza"),
    )
    # estados is joined through the incident's estado value
    .merge(
        estados,
        how="left",
        left_on="estado",
        right_on="id",
        suffixes=(None, "_estado"),
    )
    # incidencias_tipo is joined through the incident's tipo value
    .merge(
        incidencias_tipo,
        how="left",
        left_on="tipo",
        right_on="id",
        suffixes=(None, "_tipo"),
    )
    # Convert modification_date to datetime; unparseable values become NaT
    .assign(
        modification_date=lambda merged: pd.to_datetime(
            merged["modification_date"], errors="coerce"
        )
    )
)

In [56]:
# Coerce the tipo and estado columns to numeric; values that cannot be parsed become NaN.
dataset = dataset.assign(
    tipo=pd.to_numeric(dataset["tipo"], errors="coerce"),
    estado=pd.to_numeric(dataset["estado"], errors="coerce"),
)

In [57]:
# Preview the first rows of the merged raw dataset.
# NOTE(review): the rendered preview shows free-text customer references (ref_cliente) — confirm the output is safe to share before committing it.
dataset.head()

Unnamed: 0,id,web_id,codigo,creation_date,modification_date,company_id,user_id,ref_cliente,portes_airzone,devaluacion,...,titulo_en,titulo_fr,titulo_it,titulo_pt,id_tipo,titulo_es_tipo,titulo_en_tipo,titulo_fr_tipo,titulo_it_tipo,titulo_pt_tipo
0,19552,1,MGHQM2LT55,2020-01-02 09:04:37,2020-01-20 10:06:04,208,314,PAL190646,0,0,...,Accepted pickup,Retour accepté,Verifica reso,,2,devolucion,return,retour,ritorno,
1,19552,1,MGHQM2LT55,2020-01-02 09:04:37,2020-01-20 10:06:04,208,314,PAL190646,0,0,...,Accepted pickup,Retour accepté,Verifica reso,,2,devolucion,return,retour,ritorno,
2,19553,1,LMPOM2TR8B,2020-01-02 09:34:16,2020-02-07 12:40:37,31,73,CAMBIO TERMOSTATO LITE,0,0,...,Closed,Fermée,Chiusa,,2,devolucion,return,retour,ritorno,
3,19554,1,LMNWLG1U1A,2020-01-02 10:52:38,2020-01-28 07:03:28,67,4168,PASARELAS SAMSUNG,0,0,...,Closed,Fermée,Chiusa,,2,devolucion,return,retour,ritorno,
4,19554,1,LMNWLG1U1A,2020-01-02 10:52:38,2020-01-28 07:03:28,67,4168,PASARELAS SAMSUNG,0,0,...,Closed,Fermée,Chiusa,,2,devolucion,return,retour,ritorno,


### Profiling

In [68]:
# Generate a profile report.
# infer_dtypes=False makes the profiler use the dtypes already set on the
# DataFrame instead of re-inferring them.
raw_data_profile = ProfileReport(
    dataset, title="Raw Dataset Profile", explorative=True, infer_dtypes=False
)

# Save the report. The output folder is created first so that a missing
# directory cannot make the write fail after the (expensive) profile
# computation has already finished.
raw_profile_path = f"../../data_profiling/{env}/raw_dataset_profile.html"
os.makedirs(os.path.dirname(raw_profile_path), exist_ok=True)
raw_data_profile.to_file(raw_profile_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Preprocessed dataset

In [64]:
# Load the preprocessed dataset from the output folder of the selected
# environment, using the preprocessed_data entry of the processing configuration.
preprocessed_dataset = load_data(
    data_path=f"../../output_data/{env}/preprocessed_data.csv",
    step_config=processing_config.processing.preprocessed_data,
)

In [65]:
# Preview the first rows of the preprocessed dataset.
preprocessed_dataset.head()

Unnamed: 0,codigo,id_pieza,cod_articulo,desc_problema_translated,descripcion_translated,problema_translated,CODART_A3,Fuzzy_Score,CODART,DESCART,CAR1,CAR2,CAR3,CAR4,text_to_analyse,processed_text_to_analyse
0,MMZPL2LO50,29479,AZXWSCLOUDWIFI,Después del diagnóstico de HOTLINE,,"No más comunicación, asociación WIFI imposible.",AZXWSCLOUDWIFI,100.0,AZXWSCLOUDWIFI,Webserver Airzone Cloud Wi-Fi (2013),1,260,93,,Después del diagnóstico de HOTLINE No más com...,diagnostico hotline comunicacion asociacion wi...
1,MMZPL2LO50,29480,AZX6QADAPTHIT,Después del diagnóstico de HOTLINE,Pasarela de comunicaciones HITACHI RPI,problema de comunicacion,AZX6QADAPTHIT,100.0,AZX6QADAPTHIT,Pasarela comunicaciones Airzone-Hitachi RPI,1,260,49,,Después del diagnóstico de HOTLINE Pasarela de...,diagnostico hotline pasarela comunicaciones hi...
2,MMZPL2LO50,29481,AZX6QADAPTHIT,Después del diagnóstico de HOTLINE,Puerta de enlace de comunicación,problema de comunicacion,AZX6QADAPTHIT,100.0,AZX6QADAPTHIT,Pasarela comunicaciones Airzone-Hitachi RPI,1,260,49,,Después del diagnóstico de HOTLINE Puerta de e...,diagnostico hotline puerta enlace comunicacion...
3,L2VQL2LVF3,29482,AZX6CCP,CAMBIO CPP EN GARANTIA POR PROBLEMAS COMUNICACION,CENTRAL DE PRODUCCION,fallo en comunicacion central de produccion,AZX6CCP,100.0,AZX6CCP,Central de control de producción Airzone,1,260,92,,CAMBIO CPP EN GARANTIA POR PROBLEMAS COMUNICAC...,cambio cpp garantia problemas comunicacion cen...
4,LGRPLMLUAE,29487,AZCE6EXP8Z,TARJETA POTENTE PARA 8 ZONAS,0000,RAS,AZCE6EXP8Z,100.0,AZCE6EXP8Z,Módulo de expansión Airzone 2 zonas (7 y 8),1,250,90,,TARJETA POTENTE PARA 8 ZONAS 0000 RAS,tarjeta potente 8 zonas 0000 ras


### Profiling

In [69]:
# Generate a profile report.
# infer_dtypes=False makes the profiler use the dtypes already set on the
# DataFrame instead of re-inferring them.
processed_data_profile = ProfileReport(
    preprocessed_dataset,
    title="Preprocessed Dataset Profile",
    explorative=True,
    infer_dtypes=False,
)

# Save the report. The output folder is created first so that a missing
# directory cannot make the write fail after the (expensive) profile
# computation has already finished.
preprocessed_profile_path = (
    f"../../data_profiling/{env}/preprocessed_dataset_profile.html"
)
os.makedirs(os.path.dirname(preprocessed_profile_path), exist_ok=True)
processed_data_profile.to_file(preprocessed_profile_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Comparing the raw and preprocessed datasets

In [70]:
# Build a comparison report from the raw and preprocessed profile reports.
# NOTE(review): the two profiled frames have different column sets (see the
# two head() previews) — confirm that compare() yields a meaningful
# side-by-side view for columns that exist in only one of them.
comparison_report = compare([raw_data_profile, processed_data_profile])

# Obtain merged statistics
statistics = comparison_report.get_description()

# Save report to file. The output folder is created first so that a missing
# directory cannot make the write fail once the report has been rendered.
comparison_profile_path = f"../../data_profiling/{env}/comparison_profile.html"
os.makedirs(os.path.dirname(comparison_profile_path), exist_ok=True)
comparison_report.to_file(comparison_profile_path)



Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]