# Split datasets into training and testing

In [1]:
import os
import pandas as pd
from sqlalchemy import create_engine

In [2]:
def query_data(query):
    """
    Run a SQL query against the ``myzone`` MySQL database.

    :param query: SQL statement passed to ``pd.read_sql``
    :return: pd.DataFrame with the result, or None when the password is not
        configured or the query fails (the error is printed)

    Connection parameters:
    user = readmyzone
    password = (get from environment variable MYSQL_PASSWORD)
    host = 192.168.2.7
    port = 3306
    database = myzone
    """
    # Local import so the block does not depend on a file-level import.
    from urllib.parse import quote

    user = "readmyzone"
    password = os.environ.get("MYSQL_PASSWORD")
    host = "192.168.2.7"
    port = "3306"
    db = "myzone"

    # Without this guard the literal string "None" would be sent as password
    # and the failure would surface as an unrelated authentication error.
    if password is None:
        print("Environment variable MYSQL_PASSWORD is not set")
        return None

    # Percent-encode the password: characters such as "@", "/" or ":" would
    # otherwise be parsed as part of the URL structure.
    connection_string = (
        f"mysql+pymysql://{user}:{quote(password, safe='')}@{host}:{port}/{db}"
    )

    # Create the engine
    engine = create_engine(connection_string)

    try:
        # Query the data
        data = pd.read_sql(query, engine)
    except Exception as e:
        # Best-effort contract kept from the original: report and return None.
        print(e)
        data = None
    finally:
        # Release the pooled connections; a new engine is built on every call.
        engine.dispose()

    return data

In [3]:
# Load the four after-sales tables in full, one query per table.
(
    sav_incidencias,
    sav_piezas,
    sav_estados,
    sav_incidencias_tipo,
) = map(
    query_data,
    (
        "SELECT * FROM sav_incidencias",
        "SELECT * FROM sav_piezas",
        "SELECT * FROM sav_estados",
        "SELECT * FROM sav_incidencias_tipo",
    ),
)

In [4]:
# Left-join incidents with their parts, state and incident type. Columns of
# the right-hand table that clash with existing names get a suffix.
dataset = (
    sav_incidencias.merge(
        sav_piezas,
        how="left",
        left_on="codigo",
        right_on="codigo_incidencia",
        suffixes=(None, "_pieza"),
    )
    .merge(
        sav_estados,
        how="left",
        left_on="estado",
        right_on="id",
        suffixes=(None, "_estado"),
    )
    .merge(
        sav_incidencias_tipo,
        how="left",
        left_on="tipo",
        right_on="id",
        suffixes=(None, "_tipo"),
    )
)

In [78]:
# Parse both date columns; values that cannot be parsed become NaT.
modification_dates = pd.to_datetime(dataset["modification_date"], errors="coerce")
dataset["modification_date"] = modification_dates
creation_dates = pd.to_datetime(dataset["creation_date"], errors="coerce")
dataset["creation_date"] = creation_dates

# Keep incidents of type 1, in state 2 or 6, last modified before 2024-05-09.
# Rows with a NaT modification date fail the comparison and are dropped.
clean_dataset = dataset[
    (dataset["modification_date"] < "2024-05-09")
    & dataset["estado"].isin([2, 6])
    & (dataset["tipo"] == 1)
]

In [79]:
# Load from disk the text-to-translate dictionary: one dataframe per field,
# read from ../DATA/<field>.csv.
fields_to_translate = ["desc_problema", "problema", "descripcion"]
# engine="python" is explicit because "¬" is two bytes in UTF-8, which the C
# parser does not support; without it pandas emits a ParserWarning and falls
# back to the python engine anyway.
text_to_translate = {
    field: pd.read_csv(
        f"../DATA/{field}.csv", sep="¬", encoding="utf-8-sig", engine="python"
    )
    for field in fields_to_translate
}

In [80]:
# Reader options shared by the three translated CSV files.
_translated_csv_options = {
    "sep": "¬",
    "encoding": "utf-8-sig",
    "engine": "python",
}

desc_problema_translated = pd.read_csv(
    "../DATA/desc_problema_translated.csv", **_translated_csv_options
)
descripcion_translated = pd.read_csv(
    "../DATA/descripcion_translated.csv", **_translated_csv_options
)
problema_translated = pd.read_csv(
    "../DATA/problema_translated.csv", **_translated_csv_options
)
# Data preprocessing (Merging the translated text)

In [81]:
def _drop_rows_equal_to_column_name(frame, column):
    """Return `frame` without the rows whose `column` value is the column's own name."""
    return frame[~frame[column].isin([column])]


# Remove rows that merely repeat the column name (presumably header lines
# written more than once into the translated CSV files).
desc_problema_translated = _drop_rows_equal_to_column_name(
    desc_problema_translated, "desc_problema_translated"
)
descripcion_translated = _drop_rows_equal_to_column_name(
    descripcion_translated, "descripcion_translated"
)
problema_translated = _drop_rows_equal_to_column_name(
    problema_translated, "problema_translated"
)

In [82]:
# Show the number of non-null values in each column.
desc_problema_translated.count()

In [83]:
# Attach each translation to the full list of source texts. The left join
# keeps texts that have no translation; their translated column is NaN.
desc_problema_translated = text_to_translate["desc_problema"].merge(
    desc_problema_translated, on="desc_problema", how="left"
)
descripcion_translated = text_to_translate["descripcion"].merge(
    descripcion_translated, on="descripcion", how="left"
)
problema_translated = text_to_translate["problema"].merge(
    problema_translated, on="problema", how="left"
)

In [84]:
# Where no translation is available, fall back to the original text.
desc_problema_translated["desc_problema_translated"] = desc_problema_translated[
    "desc_problema_translated"
].fillna(desc_problema_translated["desc_problema"])
descripcion_translated["descripcion_translated"] = descripcion_translated[
    "descripcion_translated"
].fillna(descripcion_translated["descripcion"])
problema_translated["problema_translated"] = problema_translated[
    "problema_translated"
].fillna(problema_translated["problema"])

In [85]:
# Preview the first five rows.
desc_problema_translated.head(5)

In [86]:
# Add the three translated columns to the filtered dataset. Left joins keep
# every incident even when a text has no entry in the lookup.
clean_dataset = (
    clean_dataset.merge(desc_problema_translated, on="desc_problema", how="left")
    .merge(descripcion_translated, on="descripcion", how="left")
    .merge(problema_translated, on="problema", how="left")
)

In [87]:
# Summary statistics for every column, numeric and non-numeric.
clean_dataset.describe(include="all")

In [88]:
# List every column name of the merged dataset, one per line.
for column in list(clean_dataset.columns):
    line = f"Column: {column}"
    print(line)

In [89]:
# Chronological split: incidents created before SPLIT_DATE go to training,
# the rest to testing. Rows whose creation_date is NaT (unparseable dates were
# coerced when the column was parsed) satisfy neither comparison and end up in
# neither set.
SPLIT_DATE = "2024-04-16"
# .copy() makes both sets independent dataframes, so adding or replacing
# columns on them later does not assign into a slice of clean_dataset.
train_dataset = clean_dataset[clean_dataset["creation_date"] < SPLIT_DATE].copy()
test_dataset = clean_dataset[clean_dataset["creation_date"] >= SPLIT_DATE].copy()

In [90]:
# Number of rows in the training and the test set.
train_dataset["codigo"].size, test_dataset["codigo"].size

In [91]:
# Summary statistics for every column of the test set.
test_dataset.describe(include="all")

In [92]:
# Inspect ten random rows of the test set.
test_dataset.sample(10)

In [93]:
# Test rows that have no pedido_a3 value.
test_dataset[test_dataset["pedido_a3"].isna()]

## Load data from A3

In [47]:
import pyodbc


# Def function to connect with sql server using pyodbc and query data
def query_data_a3(query, database):
    r"""
    Run a SQL query against one database of the A3 SQL Server instance.

    :param query: SQL statement passed to ``pd.read_sql``
    :param database: name of the database to connect to
    :return: pd.DataFrame with the result, or None when the password is not
        configured, the connection cannot be opened or the query fails
        (the error is printed)

        Connection parameters:
        - user: voliveira
        - password: (get password from environment variable SQL_PASSWORD)
        - host: ROMPETECHOS\REPLICA
        - port: 53373
    """
    # Create the connection string
    user = "voliveira"
    password = os.environ.get("SQL_PASSWORD")
    host = r"ROMPETECHOS"
    instance = "REPLICA"
    port = "53373"

    # Without this guard the literal string "None" would be sent as password.
    if password is None:
        print(
            "Error creating connection: "
            "environment variable SQL_PASSWORD is not set"
        )
        return None

    conn_str = f"DRIVER=SQL Server;SERVER={host}\\{instance},{port};DATABASE={database};UID={user};PWD={password}"

    # Create the connection
    try:
        conn = pyodbc.connect(conn_str)
    except Exception as e:
        print(f"Error creating connection: {e}")
        return None

    # query the data
    try:
        data = pd.read_sql(query, conn)
    except Exception as e:
        print(f"Error: {e}")
        data = None
    finally:
        # Always release the connection; one is opened on every call.
        conn.close()

    return data

In [48]:
# Query data from the database: the same two queries are run against each of
# the five A3 databases. Plain string constants replace the repeated f-strings
# that had no placeholders.
_CABE_QUERY = "SELECT * FROM dbo.CABEPEDV"
# Query only lines with IDPEDV not null (exclude PEDC)
_LINE_QUERY = "SELECT * FROM dbo.LINEPEDI WHERE IDPEDV IS NOT NULL"

cabe_airzone = query_data_a3(_CABE_QUERY, "Airzone")
cabe_airzone_france = query_data_a3(_CABE_QUERY, "AirzoneFrance")
cabe_airzone_italia = query_data_a3(_CABE_QUERY, "AirzoneItalia")
cabe_airzone_usa = query_data_a3(_CABE_QUERY, "AirzoneUsa")
cabe_airzone_altra = query_data_a3(_CABE_QUERY, "Altra")

lin_airzone = query_data_a3(_LINE_QUERY, "Airzone")
lin_airzone_france = query_data_a3(_LINE_QUERY, "AirzoneFrance")
lin_airzone_italia = query_data_a3(_LINE_QUERY, "AirzoneItalia")
lin_airzone_usa = query_data_a3(_LINE_QUERY, "AirzoneUsa")
lin_airzone_altra = query_data_a3(_LINE_QUERY, "Altra")

In [49]:
# Tag every header dataframe with a Country label identifying its source.
for _frame, _country in zip(
    (
        cabe_airzone,
        cabe_airzone_france,
        cabe_airzone_italia,
        cabe_airzone_usa,
        cabe_airzone_altra,
    ),
    ("Spain", "France", "Italia", "USA", "Altra"),
):
    _frame["Country"] = _country
# Drop the loop names so they do not keep the last dataframe referenced.
del _frame, _country

In [50]:
def _add_month_year(frame):
    """Parse FECHA as datetime in place and add the derived Month and Year columns."""
    frame["FECHA"] = pd.to_datetime(frame["FECHA"])
    frame["Month"] = frame["FECHA"].dt.month
    frame["Year"] = frame["FECHA"].dt.year


# Add month and year columns based on FECHA to the cabe dataframes
_add_month_year(cabe_airzone)
_add_month_year(cabe_airzone_france)
_add_month_year(cabe_airzone_italia)
_add_month_year(cabe_airzone_usa)
_add_month_year(cabe_airzone_altra)

In [51]:
def _print_missing_columns(left_name, left, right_name, right):
    """Print the columns present in `left` but absent from `right`."""
    print(f"Columns in {left_name} that are not in {right_name}:")
    print(set(left.columns) - set(right.columns))


# Check whether the dataframes share the same columns by comparing them
# pairwise; the last two calls compare usa and altra in both directions.
_print_missing_columns(
    "cabe_airzone", cabe_airzone, "cabe_airzone_france", cabe_airzone_france
)
_print_missing_columns(
    "cabe_airzone_france",
    cabe_airzone_france,
    "cabe_airzone_italia",
    cabe_airzone_italia,
)
_print_missing_columns(
    "cabe_airzone_italia", cabe_airzone_italia, "cabe_airzone_usa", cabe_airzone_usa
)
_print_missing_columns(
    "cabe_airzone_usa", cabe_airzone_usa, "cabe_airzone_altra", cabe_airzone_altra
)
_print_missing_columns(
    "cabe_airzone_altra", cabe_airzone_altra, "cabe_airzone_usa", cabe_airzone_usa
)

In [52]:
# Stack the per-database order headers (cabe) and order lines (lineas) into
# one dataframe each. The original row indexes are kept as they are.
cabe = pd.concat(
    objs=[
        cabe_airzone,
        cabe_airzone_france,
        cabe_airzone_italia,
        cabe_airzone_usa,
        cabe_airzone_altra,
    ],
)
lineas = pd.concat(
    objs=[
        lin_airzone,
        lin_airzone_france,
        lin_airzone_italia,
        lin_airzone_usa,
        lin_airzone_altra,
    ],
)

In [53]:
# Clean the SERIE field: remove leading and trailing whitespace.
cabe["SERIE"] = cabe["SERIE"].str.strip()

In [54]:
# Define some constants
# NOTE(review): neither list is referenced in the visible part of this
# notebook; presumably they hold SERIE codes for sales and after-sales
# orders — confirm against the code that uses them.
SALES_CODES = ["1", "2", "C", "FA", "A"]
AFTER_SALES_CODE = ["3", "6", "FR", "FV"]

In [55]:
# Clean memory: the per-database dataframes are no longer needed once they
# have been concatenated into `cabe` and `lineas`.
del cabe_airzone, cabe_airzone_france, cabe_airzone_italia
del cabe_airzone_usa, cabe_airzone_altra
del lin_airzone, lin_airzone_france, lin_airzone_italia
del lin_airzone_usa, lin_airzone_altra

In [56]:
# Join every order line that has an IDPEDV with its order header, then store
# in "count" how many joined rows share the same IDPEDV.
# NOTE(review): the join key is IDPEDV alone although both frames stack five
# databases; verify that IDPEDV values cannot repeat across databases.
pedidos = (
    lineas.loc[lineas["IDPEDV"].notna()]
    .merge(cabe, how="left", on="IDPEDV")
    .assign(count=lambda df: df.groupby("IDPEDV")["IDPEDV"].transform("count"))
)

In [57]:
# Inspect ten random rows of the merged orders.
pedidos.sample(10)

In [58]:
# Keep only the IDPEDV values that occur exactly once in the merged frame.
# .copy() makes the result an independent dataframe, so columns can be
# replaced on it later without assigning into a slice.
pedidos = pedidos[pedidos["count"] == 1].copy()

In [94]:
# Make sure that NUMDOC and pedido_a3 are the same dtype (str) before joining.
# NUMDOC is converted through a numeric type so a value such as 24001234.0
# becomes "24001234". Rows without a usable NUMDOC are dropped first: they
# could never match and would make the integer conversion fail.
_numdoc = pd.to_numeric(pedidos["NUMDOC"], errors="coerce")
_has_numdoc = _numdoc.notna()
# .assign() builds new dataframes, so nothing is written into a slice.
pedidos = pedidos[_has_numdoc].assign(
    NUMDOC=_numdoc[_has_numdoc].astype("int64").astype(str)
)
# Keep the first run of 8 digits found in pedido_a3; values without one
# become NaN. expand=False returns a Series rather than a one-column frame.
test_dataset = test_dataset.assign(
    pedido_a3=test_dataset["pedido_a3"]
    .astype(str)
    .str.extract(r"(\d{8})", expand=False)
)
# Merge the test dataset with the A3 orders to fetch ERROR_POSTVENTA_y.
# NOTE(review): if NUMDOC is not unique in `pedidos`, this left join
# duplicates test rows — confirm uniqueness.
test_dataset = test_dataset.merge(
    pedidos[["NUMDOC", "ERROR_POSTVENTA_y"]],
    left_on="pedido_a3",
    right_on="NUMDOC",
    how="left",
)

In [95]:
# Summary statistics of the test set after adding the A3 order columns.
test_dataset.describe(include="all")

In [96]:
# Keep the identifier columns of the test rows that have an ERROR_POSTVENTA_y
# value, and expose that column under the name ERROR_POSTVENTA.
test_dataset_ids = test_dataset.loc[
    test_dataset["ERROR_POSTVENTA_y"].notna(),
    ["codigo", "NUMDOC", "ERROR_POSTVENTA_y"],
].rename(columns={"ERROR_POSTVENTA_y": "ERROR_POSTVENTA"})

In [97]:
# Persist the labelled test ids. The target folder is created first because
# to_csv fails when the directory does not exist. The dataframe index is
# written as the first, unnamed column.
_test_ids_path = "../DATA/processed/test_dataset_ids.csv"
os.makedirs(os.path.dirname(_test_ids_path), exist_ok=True)
test_dataset_ids.to_csv(_test_ids_path)

In [98]:
# Number of test incidents per ERROR_POSTVENTA value, most frequent first.
group = (
    test_dataset_ids.groupby("ERROR_POSTVENTA")["codigo"]
    .count()
    .sort_values(ascending=False)
    .to_frame()
)
# The total is the sum of the per-value counts already computed above, so the
# groupby is not run a second time.
total = group["codigo"].sum()
# Plain Series arithmetic: the share of each value and its running total.
group["%"] = (group["codigo"] / total) * 100
group["cumulative %"] = group["%"].cumsum()
group

In [99]:
# Plot the distribution of the ERROR_POSTVENTA: bars with the count per value
# and, on a secondary axis, the cumulative percentage.
import matplotlib.pyplot as plt

plt.figure(figsize=(20, 6))
plt.bar(group.index, group["codigo"])
plt.xlabel("ERROR_POSTVENTA")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.title("Distribution of the ERROR_POSTVENTA field in Test dataset")

plt.twinx()
plt.plot(group.index, group["cumulative %"], color="red", marker="o")
# Data labels for the first seven points only. Values are read by position
# with .iloc: indexing the Series with a bare integer is label-based when the
# index is integer and a deprecated positional fallback otherwise.
# NOTE(review): x=position assumes the labels are drawn at 0..n-1, which is
# how matplotlib lays out a string (categorical) axis.
for position, value in enumerate(group["cumulative %"].iloc[:7]):
    plt.text(position, value, f"{value:.2f}", ha="center")
plt.ylabel("%")
plt.show()

In [104]:
# Show the incident code and serial number columns of the test set.
test_dataset[["codigo", "num_serie"]]