# After sales text clustering using Doc2Vec
## Adding CODART and Characteristics from A3ERP and improving the text preprocessing

## Data preprocessing (Merging the translated text)

In [1]:
import os
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine

In [2]:
def query_data(query):
    """
    Run a SQL query against the `myzone` MySQL database and return the result.

    :param query: SQL text (or any object accepted by ``pd.read_sql``)
    :return: pd.DataFrame with the result set, or None when the password is
             missing or the query fails (the error is printed)

    Connection parameters:
    user = readmyzone
    password = (get from environment variable MYSQL_PASSWORD)
    host = 192.168.2.7
    port = 3306
    """
    # Local import keeps this cell self-contained (stdlib only).
    from urllib.parse import quote

    user = 'readmyzone'
    password = os.environ.get('MYSQL_PASSWORD')
    host = '192.168.2.7'
    port = '3306'
    db = 'myzone'

    # Without this guard the f-string below would silently use the literal
    # password "None" and fail later with a confusing authentication error.
    if password is None:
        print('MYSQL_PASSWORD environment variable is not set')
        return None

    # Percent-encode the password so characters such as '@', ':' or '/'
    # cannot be mistaken for URL delimiters.
    connection_string = f'mysql+pymysql://{user}:{quote(password, safe="")}@{host}:{port}/{db}'

    # Create the engine
    engine = create_engine(connection_string)

    try:
        # Query the data
        data = pd.read_sql(query, engine)
    except Exception as e:
        # Best effort by design: report the problem and hand back None.
        print(e)
        data = None
    finally:
        # A new engine is created on every call; release its pooled
        # connections instead of leaving them open.
        engine.dispose()

    return data

## Load the data

In [3]:
# Pull the four after-sales tables in full. The table names are listed in the
# same order as the variables they are unpacked into.
sav_incidencias, sav_piezas, sav_estados, sav_incidencias_tipo = [
    query_data(f'SELECT * FROM {table_name}')
    for table_name in ('sav_incidencias', 'sav_piezas', 'sav_estados', 'sav_incidencias_tipo')
]

In [4]:
# One wide table: incidents, then their parts, status and incident type.
# Every join is a left join, so incidents without a match are kept; clashing
# column names coming from the right-hand table get the given suffix.
dataset = (
    sav_incidencias
    .merge(sav_piezas, how='left', left_on='codigo', right_on='codigo_incidencia', suffixes=(None, '_pieza'))
    .merge(sav_estados, how='left', left_on='estado', right_on='id', suffixes=(None, '_estado'))
    .merge(sav_incidencias_tipo, how='left', left_on='tipo', right_on='id', suffixes=(None, '_tipo'))
)

In [5]:
# Unparseable dates become NaT and therefore fail the cutoff comparison below.
dataset['modification_date'] = pd.to_datetime(dataset['modification_date'], errors='coerce')

# NOTE(review): the meaning of tipo 1 and estados 2/6 is not defined in this
# notebook - confirm against sav_incidencias_tipo / sav_estados.
is_selected_tipo = dataset["tipo"] == 1
is_selected_estado = dataset["estado"].isin([2,6])
is_before_cutoff = dataset['modification_date'] < '2024-05-09'

clean_dataset = dataset[is_selected_tipo & is_selected_estado & is_before_cutoff]

In [6]:
# Load from disk the text to translate dictionary (one DataFrame per field).
# engine='python' is requested explicitly: '¬' is more than one byte in UTF-8,
# which the default C parser does not support, so pandas would otherwise fall
# back to the python engine with a ParserWarning on every file.
fields_to_translate = ["desc_problema", "problema", "descripcion"]
text_to_translate = {
    text: pd.read_csv(f"../DATA/{text}.csv", sep='¬', encoding='utf-8-sig', engine='python')
    for text in fields_to_translate
}

  text_to_translate[text] = pd.read_csv(f"../DATA/{text}.csv", sep='¬', encoding='utf-8-sig')
  text_to_translate[text] = pd.read_csv(f"../DATA/{text}.csv", sep='¬', encoding='utf-8-sig')
  text_to_translate[text] = pd.read_csv(f"../DATA/{text}.csv", sep='¬', encoding='utf-8-sig')


In [7]:
# The three translated files share the same on-disk format.
translated_csv_options = {"sep": '¬', "encoding": 'utf-8-sig', "engine": 'python'}

desc_problema_translated = pd.read_csv("../DATA/desc_problema_translated.csv", **translated_csv_options)
descripcion_translated = pd.read_csv("../DATA/descripcion_translated.csv", **translated_csv_options)
problema_translated = pd.read_csv("../DATA/problema_translated.csv", **translated_csv_options)

# Data preprocessing (Merging the translated text)

In [8]:
# Drop rows whose translated value is literally the column name
# (presumably header lines repeated inside the CSV - confirm how the files were written).
desc_problema_translated = desc_problema_translated.loc[
    desc_problema_translated["desc_problema_translated"].ne("desc_problema_translated")
]
descripcion_translated = descripcion_translated.loc[
    descripcion_translated["descripcion_translated"].ne("descripcion_translated")
]
problema_translated = problema_translated.loc[
    problema_translated["problema_translated"].ne("problema_translated")
]

In [9]:
desc_problema_translated.count()

desc_problema               18099
desc_problema_translated    18099
dtype: int64

In [10]:
# Attach each translation to its source text. The left join keeps every source
# text; texts without a translation get NaN in the *_translated column.
desc_problema_translated = pd.merge(
    text_to_translate["desc_problema"], desc_problema_translated, on="desc_problema", how="left"
)
descripcion_translated = pd.merge(
    text_to_translate["descripcion"], descripcion_translated, on="descripcion", how="left"
)
problema_translated = pd.merge(
    text_to_translate["problema"], problema_translated, on="problema", how="left"
)

In [11]:
# Where no translation is available, fall back to the original text.
desc_problema_translated["desc_problema_translated"] = (
    desc_problema_translated["desc_problema_translated"].fillna(desc_problema_translated["desc_problema"])
)
descripcion_translated["descripcion_translated"] = (
    descripcion_translated["descripcion_translated"].fillna(descripcion_translated["descripcion"])
)
problema_translated["problema_translated"] = (
    problema_translated["problema_translated"].fillna(problema_translated["problema"])
)

In [12]:
desc_problema_translated.head(5)

Unnamed: 0,desc_problema,desc_problema_lg,desc_problema_translated
0,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...,es,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...
1,NO FUNCIONA,es,NO FUNCIONA
2,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...,es,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...
3,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...,es,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...
4,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR",es,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR"


In [13]:
# Merge the translated text with the original dataset.
# Each lookup is reduced to one row per source text first: a left merge
# against a lookup with repeated keys would silently multiply incident rows.
translation_lookups = {
    "desc_problema": desc_problema_translated,
    "descripcion": descripcion_translated,
    "problema": problema_translated,
}
rows_before_merge = len(clean_dataset)
for key_column, lookup in translation_lookups.items():
    clean_dataset = clean_dataset.merge(
        lookup.drop_duplicates(subset=key_column), on=key_column, how="left"
    )

# With unique lookup keys a left merge cannot add or remove rows.
assert len(clean_dataset) == rows_before_merge, "Translation merge changed the number of rows"

In [14]:
clean_dataset.describe(include='all')

Unnamed: 0,id,web_id,codigo,creation_date,modification_date,company_id,user_id,ref_cliente,portes_airzone,devaluacion,...,titulo_en_tipo,titulo_fr_tipo,titulo_it_tipo,titulo_pt_tipo,desc_problema_lg,desc_problema_translated,descripcion_lg,descripcion_translated,problema_lg,problema_translated
count,37221.0,37221.0,37221,37221,37221,37221.0,37221.0,37201,37221.0,37221.0,...,37221,37221,37221,0.0,34962,34962,29338,27750,37111,35523
unique,,,28550,28512,,,,24543,,,...,1,1,1,0.0,33,23091,33,10157,32,24467
top,,,MPMAMZZN0E,2023-11-19 14:06:28,,,,RESO MATERIALE ASSISTENZE,,,...,guarantee,garantie,garanzia,,es,NO FUNCIONA,en,TERMOSTATO,es,NO FUNCIONA
freq,,,38,38,,,,278,,,...,37221,37221,37221,,14061,507,7144,632,13496,891
mean,29511.273797,1.586846,,,2021-03-26 09:53:20.839606784,1351.335805,2609.786142,,0.999248,0.000537,...,,,,,,,,,,
min,5.0,1.0,,,2015-04-16 16:51:24,0.0,1.0,,0.0,0.0,...,,,,,,,,,,
25%,14649.0,1.0,,,2019-04-15 12:50:55,237.0,446.0,,1.0,0.0,...,,,,,,,,,,
50%,29853.0,2.0,,,2021-10-11 09:23:47,494.0,1531.0,,1.0,0.0,...,,,,,,,,,,
75%,43691.0,2.0,,,2023-03-21 14:14:27,2020.0,4217.0,,1.0,0.0,...,,,,,,,,,,
max,58867.0,5.0,,,2024-05-08 15:17:04,7667.0,10289.0,,1.0,20.0,...,,,,,,,,,,


In [15]:
# List every column name on its own line (the describe() table truncates them).
print("\n".join(f"Column: {column}" for column in clean_dataset.columns))

Column: id
Column: web_id
Column: codigo
Column: creation_date
Column: modification_date
Column: company_id
Column: user_id
Column: ref_cliente
Column: portes_airzone
Column: devaluacion
Column: pedido_sage
Column: abono_sage
Column: pedido_a3
Column: abono_a3
Column: tipo
Column: estado
Column: personaaz
Column: dire_envio_id
Column: dire_recogida_id
Column: peso3
Column: volumen3
Column: estadofr
Column: c_mail
Column: c_tel
Column: c_obs
Column: accepted_client
Column: desc_problema
Column: codigo_incidencia
Column: id_pieza
Column: user_id_pieza
Column: cod_articulo
Column: descripcion
Column: num_serie
Column: factura_albaran
Column: problema
Column: is_replacement
Column: creation_date_pieza
Column: modification_date_pieza
Column: id_estado
Column: ref
Column: color
Column: valor
Column: titulo_es
Column: titulo_en
Column: titulo_fr
Column: titulo_it
Column: titulo_pt
Column: id_tipo
Column: titulo_es_tipo
Column: titulo_en_tipo
Column: titulo_fr_tipo
Column: titulo_it_tipo
Colum

In [16]:
# Keep only the fields of interest and replace missing values with an empty
# string so the columns can be concatenated later.
fields_of_interest = ['desc_problema_translated', 'descripcion_translated', 'problema_translated', 'cod_articulo']
text_to_analyse = clean_dataset[fields_of_interest].fillna("")

In [17]:
# Join the three translated texts and the article code into one
# space-separated document per row.
source_columns = ['desc_problema_translated', 'descripcion_translated', 'problema_translated', 'cod_articulo']
joined_text = text_to_analyse[source_columns[0]]
for column_name in source_columns[1:]:
    joined_text = joined_text + ' ' + text_to_analyse[column_name]
text_to_analyse['text_to_analyse'] = joined_text

In [18]:
text_to_analyse.head(10)

Unnamed: 0,desc_problema_translated,descripcion_translated,problema_translated,cod_articulo,text_to_analyse
0,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...,MODULO ALARMAS TECNICAS,INDICAN SE HA ESTROPEADO EN LA INSTALACION Y ...,MATS,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...
1,NO FUNCIONA,,NO FUNCIONA,AZC3TACTOCSB,NO FUNCIONA NO FUNCIONA AZC3TACTOCSB
2,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...,,TACTO BLOQUEADO. NO ENCIENDE-APAGA NI DEJA CAM...,AZATACTORSB,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...
3,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...,MODELO 2 PERSIANAS,UNA DE LAS 2 PERSIANAS NO FUNCIONA CON PULSADO...,PER2,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...
4,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR",,Rafael el viernes llego mi pedido nº 23349 pa...,RINT040015BKMTE,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR Rafa..."
5,MODULO DE FANCOIL DA PROBLEMAS LAS COMUNICACI...,AZAMFANCOILC,MODULO DE FANCOIL DE ZONA 32Z,AZAMFANCOILC,MODULO DE FANCOIL DA PROBLEMAS LAS COMUNICACI...
6,MODULO DE FANCOIL NO SACA 7V POR EL BUS EN VEZ...,MODULO DE FANCOIL DE ZONA 32Z,"SE DETECTA QUE EL BLUEFACE AL ENCENDERLO, SE Q...",AZAMFANCOILC,MODULO DE FANCOIL NO SACA 7V POR EL BUS EN VEZ...
7,"Error al pedir la pasarela, a última hora hubo...",Pasarela Mitsubishi Heavy,"Error comercial, por cambio de última hora en ...",AZXEQADAPMHI,"Error al pedir la pasarela, a última hora hubo..."
8,SE HAN CAMBIADO LAS TAPAS TRASERAS VARIAS VECE...,,SE ROMPEN TAPAS TRASERAS. CAMBIAR TERMOSTATO.,AZC3BLUEFECOSB,SE HAN CAMBIADO LAS TAPAS TRASERAS VARIAS VECE...
9,SE HAN CAMBIADO LAS TAPAS TRASERAS VARIAS VECE...,,5 UNIDADES. SE ROMPEN TAPAS TRASERAS.,AZC3TACTOCSB,SE HAN CAMBIADO LAS TAPAS TRASERAS VARIAS VECE...


## Add CODART from A3ERP

In [19]:
import pyodbc
# Def function to connect with sql server using pyodbc and query data
def query_data_a3(query,database):
    r"""
    Run a SQL query against a SQL Server database through pyodbc.

    :param query: SQL text to execute
    :param database: name of the database to connect to
    :return: pd.DataFrame with the result set, or None when the password is
             missing, the connection fails or the query fails (the error is printed)

        Connection parameters:
        - user: voliveira
        - password: (get password from environment variable SQL_PASSWORD)
        - host: ROMPETECHOS\REPLICA
        - port: 53373
    """
    # Create the connection string
    user = 'voliveira'
    password = os.environ.get('SQL_PASSWORD')
    host = r'ROMPETECHOS'
    instance = 'REPLICA'
    port = '53373'

    # Without this guard the connection string would carry the literal
    # password "None".
    if password is None:
        print('Error creating connection: SQL_PASSWORD environment variable is not set')
        return None

    conn_str = f"DRIVER=SQL Server;SERVER={host}\\{instance},{port};DATABASE={database};UID={user};PWD={password}"

    # Create the connection
    try:
        conn = pyodbc.connect(conn_str)
    except Exception as e:
        print(f'Error creating connection: {e}')
        return None

    # Query the data
    # NOTE(review): pandas documents SQLAlchemy connectables (and sqlite3) for
    # read_sql; a raw pyodbc connection appears to work here but may emit a
    # UserWarning - confirm whether wrapping it in an engine is wanted.
    try:
        data = pd.read_sql(query, conn)
    except Exception as e:
        print(f'Error: {e}')
        data = None
    finally:
        # A connection is opened on every call; close it so repeated calls
        # do not leak server sessions.
        conn.close()

    return data

In [20]:
# Query the data from A3ERP: the article master and the characteristics catalogue.
ARTICULO_QUERY = 'SELECT CODART, DESCART, CAR1, CAR2, CAR3, CAR4 FROM dbo.ARTICULO'
CARACTERISTICAS_QUERY = 'SELECT * FROM dbo.CARACTERISTICAS;'

articulos = query_data_a3(ARTICULO_QUERY, 'Altra')
caracteristicas = query_data_a3(CARACTERISTICAS_QUERY, 'Altra')

  data = pd.read_sql(query, conn)


In [21]:
articulos.sample(10)

Unnamed: 0,CODART,DESCART,CAR1,CAR2,CAR3,CAR4
17243,AZXEZTEST01M2,Airzone Easyzone Standard + VMC sin electrónic...,1,253,24,30
36311,PERDA01L4,PACK ESSENTIEL RADIO DAIKIN 01L4,1,253,98,182
43158,RDHV075020AKMTE,Rejilla 2 deflex H/V Airzone motor 750x200 mm ...,1,264,31,77
71511,RTAE035065BKX,Rejilla tae malla antipajaro Airzone 350x650 b...,2,262,31,82
18258,AZZS6DAIBS07L5,Airzone Easyzone Medium ZS6 Daikin 5x200 07L,1,261,24,32
25427,CPRC065010MTE,Compuerta rectangular conducto motor Airzone 6...,1,264,19,96
42087,RDHV050045BPX,Rejilla 2 deflex H/V Airzone 500x450 mm blanco...,2,262,31,77
48156,RDVH090035BTMTE,Rejilla 2 deflex V/H Airzone motor 900x350 mm ...,1,264,31,78
43788,RDHV085040ATMTE,Rejilla 2 deflex H/V Airzone motor 850x400 mm ...,1,264,31,77
57322,RLC2070015BPX,Rejilla lama curva 2dir. Airzone 700x150 bl pe...,2,262,31,80


In [22]:
caracteristicas.sample(10)

Unnamed: 0,CODCAR,DESCCAR,NUMCAR,TIPCAR,ID
29,243,UNIVERSAL,2,A,398
196,142,MP_CONECTORES,4,A,89
69,35,OBS_AISLADO,3,A,22
247,408,ELECTROMECÁNICA,4,A,379
393,1,ACAE/Bases Precios,6,F,10
412,6,CALENER,8,F,57
151,80,RLC2,4,A,272
208,155,MP_REMACHES,4,A,267
114,82,Soporte técnico,3,A,407
129,97,BOCAS DE VENTILACION,3,A,387


In [24]:
def get_unique_caracteristicas(numcar):
    """
    Return the code/description lookup for one characteristic slot.

    Reads the module-level ``caracteristicas`` frame and keeps the rows whose
    NUMCAR equals ``numcar`` and whose TIPCAR is 'A'
    (presumably the article-level characteristics - confirm against A3ERP).

    :param numcar: characteristic number to select (compared with NUMCAR)
    :return: pd.DataFrame with columns CODCAR and DESCCAR, one row per CODCAR
    """
    selected_rows = (caracteristicas['NUMCAR'] == numcar) & (caracteristicas['TIPCAR'] == 'A')
    return (
        caracteristicas.loc[selected_rows, ['CODCAR', 'DESCCAR']]
        # The result is used as the right side of left merges on CODCAR; a
        # repeated code would duplicate article rows, so keep the first only.
        .drop_duplicates(subset='CODCAR')
    )



# Merging the characteristics with the articles: for each slot n (1-4) look up
# the description of the code stored in CARn and add it as DESCCARn.
for slot in range(1, 5):
    slot_lookup = get_unique_caracteristicas(slot).rename(columns={'DESCCAR': f'DESCCAR{slot}'})
    articulos = (
        articulos
        .merge(slot_lookup, left_on=f'CAR{slot}', right_on='CODCAR', how='left')
        # Clean useless column: CODCAR only repeats the value already in CARn
        .drop(columns='CODCAR')
    )

In [28]:
articulos.sample(10)

Unnamed: 0,CODART,DESCART,CAR1,CAR2,CAR3,CAR4,DESCCAR1,DESCCAR2,DESCCAR3,DESCCAR4
8171,AZEX8KAY03MST4,Cuello Kit Combo Easyzone Airzone CAI Standard...,1,253,98,30.0,SISTEMAS DE ZONAS,EASYZONE (EZ6),PACKS,Plénum Standar (ST)
23482,C2EXPOAIDIT01,EXPOSITOR CARTON AIDOO IT01 MKT,6,17,71,172.0,MATERIAS PRIMAS,MATERIAS PRIMAS,MP_CONSUMIBLES,MP_MKT
30032,E7004163,PCB FUJITSU ADAPTER V1_0_1(OBS),6,17,73,138.0,MATERIAS PRIMAS,MATERIAS PRIMAS,MP_ELECTRONICA,MP_PCB
53746,RL0V080015BKMTE,Rejilla lineal 0º deflex vertical motorizada A...,1,264,31,86.0,SISTEMAS DE ZONAS,DIFUSION MOTORIZADA,REJILLAS,RL00
53516,RL0V060015AKRT,Rejilla lineal 0º deflex V. Airzone +Reg 600x1...,2,262,31,86.0,DIFUSIÓN,DIFUSION NO MOTORIZADA,REJILLAS,RL00
72894,SMDAIRQBOX_TI,SMD AIRQBOX MODULO CENTRAL (485 Texas),1,9,95,,SISTEMAS DE ZONAS,OBS_SISTEMAS,MODULOS DE ZONA,
14116,AZEZ8SABST05M6,Airzone Easyzone CAI Standard + VMC IB8 Sabian...,1,269,24,30.0,SISTEMAS DE ZONAS,EASYZONE CAI,PLENUM MOTORIZADO,Plénum Standar (ST)
32924,M57361BL,PERFIL MARCO DFLI RAL 9010 (BARRA 6M)(OBS),6,17,61,160.0,MATERIAS PRIMAS,MATERIAS PRIMAS,MP_MECANICOS,MP_ALUMINIO
29824,E4BH4VRBBGR,BORNA HEMBRA AEREO E4B350H4VR + PEGATINA LB014...,6,17,80,142.0,MATERIAS PRIMAS,MATERIAS PRIMAS,REEMPLAZOS,MP_CONECTORES
56372,RLC1095010BTMTE,Rejilla lama curva 1dir. motorizada Airzone 95...,1,264,31,79.0,SISTEMAS DE ZONAS,DIFUSION MOTORIZADA,REJILLAS,RLC1


In [29]:
from thefuzz import fuzz, process

# Reference only (not executed): the fuzzy matching below is presumably how
# ../DATA/fuzzy_matches.csv was produced - confirm before regenerating it.
#
#   # Function to find best CODART match
#   def find_best_match(cod, codart_list, return_score=True):
#       best_match, score = process.extractOne(cod, codart_list, scorer=fuzz.token_set_ratio)
#       if return_score:
#           return best_match, score
#       else:
#           return best_match
#
#   # Add the CODART to the text_to_analyse dataframe
#   text_to_analyse.loc[:, 'CODART_A3'] = text_to_analyse['cod_articulo'].apply(
#       lambda x: find_best_match(x, articulos['CODART'].values, return_score=False))

# Load fuzzy matches from disk. engine='python' is explicit because '¬' is
# more than one byte in UTF-8, which the default C parser does not support
# (pandas would fall back to the python engine with a ParserWarning).
fuzzy_matches = pd.read_csv("../DATA/fuzzy_matches.csv", sep='¬', encoding='utf-8-sig', engine='python')

# Merge the fuzzy matches with the text_to_analyse dataframe; codes without a
# match in the file end up with NaN in the added columns.
text_to_analyse = text_to_analyse.merge(fuzzy_matches, on='cod_articulo', how='left')

  fuzzy_matches = pd.read_csv("../DATA/fuzzy_matches.csv", sep='¬', encoding='utf-8-sig')


In [31]:
text_to_analyse.sample(20)

Unnamed: 0,desc_problema_translated,descripcion_translated,problema_translated,cod_articulo,text_to_analyse,CODART_A3
2597,EL WEB SERVER SE CONECTA A LA RED PERO NO SE P...,SERVIDOR WEB WIFI,"SE CONECTA A LA RED, PERO AL INTENTAR ACCEDER ...",AZXWSCLOUDWIFI,EL WEB SERVER SE CONECTA A LA RED PERO NO SE P...,AZXWSCLOUDWIFI
15468,AQUAZONE CON PROBLEMAS,ACUAZONA,"ACUAZONE DEFECTUOSO,CUANDO LLEVA UN TIEMPO FUS...",AZDI6ACUAZONE,AQUAZONE CON PROBLEMAS ACUAZONA ACUAZONE DEFEC...,AZDI6ACUAZONE
35295,termostatos x2 HS,TERMOSTATO,HS.,AZCE6THINKRB,termostatos x2 HS TERMOSTATO HS. AZCE6THINKRB,AZCE6THINKRB
1950,"NECESITO UNA PASARELA FUJITSU EN GARANTÍA, YA ...",ESCAPAR DE LA PUERTA DE ENTRADA,"NECESITO UNA PASARELA FUJITSU EN GARANTÍA, YA ...",X,"NECESITO UNA PASARELA FUJITSU EN GARANTÍA, YA ...",AUX
17232,"CENTRAL FLEXA, BLUEFACE Y WS. VERIFICADO POR M...",TERMOSTATO BLUEFACE FLEXA,NO FUNCIONA,ACE6BLUEFACECB,"CENTRAL FLEXA, BLUEFACE Y WS. VERIFICADO POR M...",AZCE6BLUEFACECB
9685,Modulo zona Acuazone no tiene luz de Bus y el ...,MODULO ZONA CABLE,MODULO ZONA CABLE NO TIENE LUZ DE BUS Y EL TER...,AZDI6MZZONC,Modulo zona Acuazone no tiene luz de Bus y el ...,AZDI6MZZONC
31341,BOLETO N° 1774961809,AMORTIGUADOR DE PURIFICACIÓN Y MOTOR DE 200MM,BOLETO N° 1774961809,AZPV6CAM200ION,BOLETO N° 1774961809 AMORTIGUADOR DE PURIFICAC...,AZPV6CAM200ION
22301,,,...,****,... ****,0
35767,SEGÚN ACUERDOS CON DOMENICO UTANO ESTAMOS SOLI...,,SEGÚN ACUERDO CON DOMENICO UTANO SOLICITAMOS E...,AZX6AC1VALR,SEGÚN ACUERDOS CON DOMENICO UTANO ESTAMOS SOLI...,AZX6AC1VALR
10283,BUENOS DIAS\r\n¿PODRÍAN ENVIARME EL MATERIAL A...,PIENSA TERMOSTATO CON CABLE BLANCO,X,AZCE6THINKC,BUENOS DIAS\r\n¿PODRÍAN ENVIARME EL MATERIAL A...,AZCE6THINKCB


In [36]:
# Count the number of NaN values in column CODART_A3
text_to_analyse['CODART_A3'].isna().sum()

1759

In [38]:
# Mark article codes without a fuzzy match (NaN) with the "0" sentinel that
# the cleaning cell below filters on. Only CODART_A3 is touched so the
# sentinel can never leak into the text columns.
text_to_analyse['CODART_A3'] = text_to_analyse['CODART_A3'].fillna("0")

In [39]:
# Clean the text_to_analyse dataframe: keep rows that have a matched article
# code and enough text to be informative.
article_code = text_to_analyse['CODART_A3']
full_text = text_to_analyse['text_to_analyse']
rows_to_keep = (
    (article_code != '0')
    & article_code.notna()
    & (full_text.str.len() > 25)
    # Rows with little text besides "NO FUNCIONA" are discarded as well
    & (full_text.str.replace('NO FUNCIONA', '').str.len() > 25)
)
text_to_analyse = text_to_analyse[rows_to_keep]
text_to_analyse.head()

Unnamed: 0,desc_problema_translated,descripcion_translated,problema_translated,cod_articulo,text_to_analyse,CODART_A3
0,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...,MODULO ALARMAS TECNICAS,INDICAN SE HA ESTROPEADO EN LA INSTALACION Y ...,MATS,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...,MATS
2,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...,,TACTO BLOQUEADO. NO ENCIENDE-APAGA NI DEJA CAM...,AZATACTORSB,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...,AZATACTORSB
3,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...,MODELO 2 PERSIANAS,UNA DE LAS 2 PERSIANAS NO FUNCIONA CON PULSADO...,PER2,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...,PER2
4,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR",,Rafael el viernes llego mi pedido nº 23349 pa...,RINT040015BKMTE,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR Rafa...",RINT040015BKMTE
5,MODULO DE FANCOIL DA PROBLEMAS LAS COMUNICACI...,AZAMFANCOILC,MODULO DE FANCOIL DE ZONA 32Z,AZAMFANCOILC,MODULO DE FANCOIL DA PROBLEMAS LAS COMUNICACI...,AZAMFANCOILC


In [41]:
# Enrich every row with the A3ERP article data of its matched code.
text_to_analyse = pd.merge(
    text_to_analyse,
    articulos,
    how='left',
    left_on='CODART_A3',
    right_on='CODART',
)
text_to_analyse.head()

Unnamed: 0,desc_problema_translated,descripcion_translated,problema_translated,cod_articulo,text_to_analyse,CODART_A3,CODART,DESCART,CAR1,CAR2,CAR3,CAR4,DESCCAR1,DESCCAR2,DESCCAR3,DESCCAR4
0,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...,MODULO ALARMAS TECNICAS,INDICAN SE HA ESTROPEADO EN LA INSTALACION Y ...,MATS,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...,MATS,MATS,Central de alarmas técnicas multifunción,3,265,94,,AT HOME,AT HOME,MODULOS DE CONTROL,
1,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...,,TACTO BLOQUEADO. NO ENCIENDE-APAGA NI DEJA CAM...,AZATACTORSB,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...,AZATACTORSB,AZATACTORSB,Termostato Tacto superficie radio (AZA) - Blanco,1,251,91,4.0,SISTEMAS DE ZONAS,ACUAZONE (DI6),TERMOSTATOS,TACTO
2,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...,MODELO 2 PERSIANAS,UNA DE LAS 2 PERSIANAS NO FUNCIONA CON PULSADO...,PER2,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...,PER2,PER2,Módulo de control de 2 persianas con pulsador,3,265,94,,AT HOME,AT HOME,MODULOS DE CONTROL,
3,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR",,Rafael el viernes llego mi pedido nº 23349 pa...,RINT040015BKMTE,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR Rafa...",RINT040015BKMTE,RINT040015BKMTE,Rejilla Inteligente doble Airzone motorizada 4...,1,264,31,92.0,SISTEMAS DE ZONAS,DIFUSION MOTORIZADA,REJILLAS,RINT
4,MODULO DE FANCOIL DA PROBLEMAS LAS COMUNICACI...,AZAMFANCOILC,MODULO DE FANCOIL DE ZONA 32Z,AZAMFANCOILC,MODULO DE FANCOIL DA PROBLEMAS LAS COMUNICACI...,AZAMFANCOILC,AZAMFANCOILC,Módulo de zona fancoil cableado Airzone (AZA),1,251,95,,SISTEMAS DE ZONAS,ACUAZONE (DI6),MODULOS DE ZONA,


In [44]:
# fillna returns a new frame, so the assignment below works on an independent copy.
text_to_analyse = text_to_analyse.fillna('')

# Rebuild the document text, now using the matched A3ERP code and its
# description instead of the raw cod_articulo.
document_columns = [
    'desc_problema_translated',
    'descripcion_translated',
    'problema_translated',
    'CODART_A3',
    'DESCART',
]
document_text = text_to_analyse[document_columns[0]]
for column_name in document_columns[1:]:
    document_text = document_text + ' ' + text_to_analyse[column_name]
text_to_analyse['text_to_analyse'] = document_text

text_to_analyse.head(10)

Unnamed: 0,desc_problema_translated,descripcion_translated,problema_translated,cod_articulo,text_to_analyse,CODART_A3,CODART,DESCART,CAR1,CAR2,CAR3,CAR4,DESCCAR1,DESCCAR2,DESCCAR3,DESCCAR4
0,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...,MODULO ALARMAS TECNICAS,INDICAN SE HA ESTROPEADO EN LA INSTALACION Y ...,MATS,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...,MATS,MATS,Central de alarmas técnicas multifunción,3,265,94,,AT HOME,AT HOME,MODULOS DE CONTROL,
1,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...,,TACTO BLOQUEADO. NO ENCIENDE-APAGA NI DEJA CAM...,AZATACTORSB,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...,AZATACTORSB,AZATACTORSB,Termostato Tacto superficie radio (AZA) - Blanco,1,251,91,4.0,SISTEMAS DE ZONAS,ACUAZONE (DI6),TERMOSTATOS,TACTO
2,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...,MODELO 2 PERSIANAS,UNA DE LAS 2 PERSIANAS NO FUNCIONA CON PULSADO...,PER2,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...,PER2,PER2,Módulo de control de 2 persianas con pulsador,3,265,94,,AT HOME,AT HOME,MODULOS DE CONTROL,
3,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR",,Rafael el viernes llego mi pedido nº 23349 pa...,RINT040015BKMTE,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR Rafa...",RINT040015BKMTE,RINT040015BKMTE,Rejilla Inteligente doble Airzone motorizada 4...,1,264,31,92.0,SISTEMAS DE ZONAS,DIFUSION MOTORIZADA,REJILLAS,RINT
4,MODULO DE FANCOIL DA PROBLEMAS LAS COMUNICACI...,AZAMFANCOILC,MODULO DE FANCOIL DE ZONA 32Z,AZAMFANCOILC,MODULO DE FANCOIL DA PROBLEMAS LAS COMUNICACI...,AZAMFANCOILC,AZAMFANCOILC,Módulo de zona fancoil cableado Airzone (AZA),1,251,95,,SISTEMAS DE ZONAS,ACUAZONE (DI6),MODULOS DE ZONA,
5,MODULO DE FANCOIL NO SACA 7V POR EL BUS EN VEZ...,MODULO DE FANCOIL DE ZONA 32Z,"SE DETECTA QUE EL BLUEFACE AL ENCENDERLO, SE Q...",AZAMFANCOILC,MODULO DE FANCOIL NO SACA 7V POR EL BUS EN VEZ...,AZAMFANCOILC,AZAMFANCOILC,Módulo de zona fancoil cableado Airzone (AZA),1,251,95,,SISTEMAS DE ZONAS,ACUAZONE (DI6),MODULOS DE ZONA,
6,"Error al pedir la pasarela, a última hora hubo...",Pasarela Mitsubishi Heavy,"Error comercial, por cambio de última hora en ...",AZXEQADAPMHI,"Error al pedir la pasarela, a última hora hubo...",AZXEQADAPMHI,AZXEQADAPMHI,Pasarela de control MITSUBISHI HEAVY (AZXE),1,260,49,,SISTEMAS DE ZONAS,COMUNES,PASARELAS,
7,SE HAN CAMBIADO LAS TAPAS TRASERAS VARIAS VECE...,,SE ROMPEN TAPAS TRASERAS. CAMBIAR TERMOSTATO.,AZC3BLUEFECOSB,SE HAN CAMBIADO LAS TAPAS TRASERAS VARIAS VECE...,AZC3BLUEFECOSB,AZC3BLUEFECOSB,Termostato Inteligente Blueface superficie cab...,1,250,91,1.0,SISTEMAS DE ZONAS,FLEXA (CE6),TERMOSTATOS,BLUEFACE
8,SE HAN CAMBIADO LAS TAPAS TRASERAS VARIAS VECE...,,5 UNIDADES. SE ROMPEN TAPAS TRASERAS.,AZC3TACTOCSB,SE HAN CAMBIADO LAS TAPAS TRASERAS VARIAS VECE...,AZC3TACTOCSB,AZC3TACTOCSB,Termostato Tacto Superficie Cable (C3) - Blanco,1,250,91,4.0,SISTEMAS DE ZONAS,FLEXA (CE6),TERMOSTATOS,TACTO
9,SE HAN ROTO LAS TAPAS TRASERAS EN MAS DE UNA O...,,2 UNIDADES. TAPAS TRASERAS ROTAS EN VARIAS OCA...,AZC3BLUEFECOSB,SE HAN ROTO LAS TAPAS TRASERAS EN MAS DE UNA O...,AZC3BLUEFECOSB,AZC3BLUEFECOSB,Termostato Inteligente Blueface superficie cab...,1,250,91,1.0,SISTEMAS DE ZONAS,FLEXA (CE6),TERMOSTATOS,BLUEFACE


In [45]:
from datetime import date
# Save to disk, in a folder named after today's date (ISO format).
today_date = date.today().isoformat()
base_path = "../MODELS/" + today_date
os.makedirs(base_path, exist_ok=True)
text_to_analyse.to_csv(
    base_path + "/text_to_analyse.csv",
    sep='¬',
    encoding='utf-8-sig',
    index=False,
)

In [None]:
import multiprocessing
from collections import OrderedDict
import gensim.models.doc2vec
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
class CommentedDoc2Vec(Doc2Vec):
    """Doc2Vec subclass that stores a free-text label in ``self.comment``.

    The notebook reads ``model.comment`` to print which model is in use and
    to build the file name each model is saved under.

    NOTE(review): gensim's own Doc2Vec appears to accept a ``comment``
    argument as well - confirm whether this subclass is still required.
    """
    def __init__(self, comment: str = "", **kwargs) -> None:
        # Every other keyword is forwarded to Doc2Vec unchanged; the label is
        # attached afterwards so it overrides whatever the parent set.
        super().__init__(**kwargs)
        self.comment = comment

## Train Doc2Vec and save the model

In [None]:
# Hyper-parameters shared by both model variants.
common_kwargs = {
    'vector_size': 100,
    'epochs': 20,
    'min_count': 2,
    'sample': 0,
    'workers': multiprocessing.cpu_count(),
    'negative': 5,
    'hs': 0,
}

# Create models
pv_dbow_model = CommentedDoc2Vec(dm=0, comment="PV-DBOW plain", **common_kwargs)
# PV-DM w/ default averaging; a higher starting alpha may improve CBOW/PV-DM modes
pv_dm_model = CommentedDoc2Vec(dm=1, window=10, alpha=0.05, comment="PV-DM averaging", **common_kwargs)
simple_models = [pv_dbow_model, pv_dm_model]

# Create TaggedDocument objects: whitespace tokens, tagged with the row position.
# NOTE(review): tokens are case-sensitive (no lower-casing) - confirm this is intended.
tagged_data = []
for doc_position, doc_text in enumerate(text_to_analyse['text_to_analyse']):
    tagged_data.append(TaggedDocument(words=doc_text.split(), tags=[doc_position]))

# Build the vocabulary
for model in simple_models:
    model.build_vocab(tagged_data)
    print("%s vocabulary scanned & state initialized" % model)

In [None]:
# Train the models
for model in simple_models:
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    print("%s training completed" % model)

In [None]:
for model in simple_models:
    print(model.comment)

In [None]:
from datetime import date
# Save the models under a folder named after today's date; the file name is
# built from the class name and the model comment (spaces become underscores).
today_date = date.today().isoformat()
base_path = "../MODELS/" + today_date
os.makedirs(base_path, exist_ok=True)
for model in simple_models:
    file_stem = "_".join([type(model).__name__, model.comment.replace(' ', '_')])
    model_name = f"{base_path}/{file_stem}.model"
    model.save(model_name)
    print(f"Model saved at {model_name}")

## Load the models

In [None]:
# Load the models.
# NOTE(review): the folder is pinned to 2024-05-08 while the save cell writes
# to today's date - confirm which trained models are meant to be used.
models_dir = "../MODELS/2024-05-08"
pv_dbow = models_dir + "/CommentedDoc2Vec_PV-DBOW_plain.model"
pv_dm = models_dir + "/CommentedDoc2Vec_PV-DM_averaging.model"

loaded_models = []
for model_path in (pv_dbow, pv_dm):
    loaded_models.append(CommentedDoc2Vec.load(model_path))
    print(f"Model loaded from {model_path}")

In [None]:
# Get the vectors
vectors = [model.dv.vectors for model in loaded_models]

In [None]:
# Ensure text_to_analyse and the document vectors of EVERY model have the same
# length: the vectors are attached by position, so a model trained on a
# different set of rows would silently misalign texts and vectors.
for model_position, model_vectors in enumerate(vectors):
    assert len(text_to_analyse) == len(model_vectors), (
        f"Mismatched document counts: {len(text_to_analyse)} rows in text_to_analyse "
        f"vs {len(model_vectors)} vectors in model #{model_position}"
    )

# Add vectors to the text_to_analyse dataframe (one array per row)
text_to_analyse['PV-DBOW'] = list(vectors[0])
text_to_analyse['PV-DM'] = list(vectors[1])

In [None]:
# Turn each column of per-row vectors into a 2D matrix (one row per document)
vectors_pv_dbow = np.vstack(text_to_analyse['PV-DBOW'].tolist())
vectors_pv_dm = np.vstack(text_to_analyse['PV-DM'].tolist())

# Place the two embeddings side by side: one long feature vector per document
combined_vectors = np.concatenate((vectors_pv_dbow, vectors_pv_dm), axis=1)

In [None]:
# Create a sklearn pipeline to apply a clustering algorithm
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV

# Fixed seed: k-means starts from random centroids, so without it every run
# of the notebook would produce different clusters.
pipeline = Pipeline([
    ('kmeans', KMeans(random_state=42))
])

# Create grid search parameters: 200, 220, ..., 400 clusters
parameters = {
    'kmeans__n_clusters': list(range(200, 401, 20))
}

# NOTE(review): no `scoring` is given, so the estimator's own score is used;
# for KMeans that is presumably the negative inertia, which tends to improve
# as n_clusters grows - confirm this is the selection criterion wanted.
grid_search = GridSearchCV(pipeline, parameters, cv=2, n_jobs=-1, verbose=1)
grid_search.fit(combined_vectors)

In [None]:
# Check the best parameters
grid_search.best_params_

In [None]:
# Add the cluster to the dataset
text_to_analyse['cluster'] = grid_search.best_estimator_.predict(combined_vectors)

In [None]:
# Plot PCA of the vectors in 3 dimensions
%matplotlib qt
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
vectors_df_pca = pca.fit_transform(vectors_pv_dbow)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(vectors_df_pca[:, 0], vectors_df_pca[:, 1], vectors_df_pca[:, 2], c=text_to_analyse['cluster'])
plt.title("PCA of the vectors")
plt.show()

In [None]:
from sklearn.manifold import TSNE

# Fixed seed: t-SNE is stochastic, so without it the embedding (and the plot
# built from it) changes on every run.
tsne = TSNE(n_components=3, verbose=1, perplexity=40, random_state=42)
tsne_vector = tsne.fit_transform(combined_vectors)
# Named columns make the plotting cell easier to read.
tsne_vector = pd.DataFrame(tsne_vector, columns=['TSNE1', 'TSNE2', 'TSNE3'])

In [None]:
# Plot TSNE of the vectors in 3 dimensions, coloured by cluster
fig, ax = plt.subplots(subplot_kw={'projection': '3d'})
ax.scatter(
    tsne_vector['TSNE1'],
    tsne_vector['TSNE2'],
    tsne_vector['TSNE3'],
    c=text_to_analyse['cluster'],
)
ax.set_title("TSNE of the vectors")
plt.show()

## Tests with actual standard error text

In [None]:
# Get the most similar texts
def get_similar_texts(text, model, topn=5):
    """
    Find the training documents closest to a free text.

    The text is split on whitespace, turned into a vector with
    ``model.infer_vector`` and looked up with ``model.dv.most_similar``.

    :param text: text to look up
    :param model: trained model exposing ``infer_vector`` and ``dv.most_similar``
    :param topn: number of neighbours to return
    :return: whatever ``model.dv.most_similar`` returns for the inferred vector
    """
    query_vector = model.infer_vector(text.split())
    return model.dv.most_similar([query_vector], topn=topn)

In [None]:
# Standard error descriptions used as queries
test_texts = [
    "Fallo de comunicaciones con la central El dispositivo intenta comunicar con la central, pero no la detecta. Los leds sí que parpadean.",
    "Fallo de comunicaciones con la máquina, el dispositivo intenta comunicar con la maquina, pero no la detecta. Los leds sí que parpadean.",
    "Unidad no arranca, las comunicaciones y los leds son correctos, pero la máquina no arranca.",
    "Error apertura/cierre. No muestra error en el sistema. La compuerta o rejilla no abre ni cierra, pero no se muesrta ningún error en el sistema."
]

#test_text = test_texts[random.randint(0, len(test_texts)-1)]
test_text = test_texts[3]

print(f'TARGET TEXT: {test_text} \n')

# Only the first loaded model is inspected
for model in loaded_models[:1]:
    similar_texts = get_similar_texts(test_text, model)
    print(f"Model: {model.comment}")
    for rank, (index, similarity) in enumerate(similar_texts, start=1):
        # `index` is the document tag, used here as the row label of text_to_analyse
        print(f"Similar text {rank}: {text_to_analyse['text_to_analyse'][index]} with similarity {similarity}\n")

## Calculate the similarity between the texts

In [None]:
# Read list of errors
errors = pd.read_csv("../DATA/errors.csv", sep=';')

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

from functools import lru_cache


@lru_cache(maxsize=256)
def _infer_reference_vectors(model, text, n):
    """Infer `text` with `model` n times and stack the results as an (n, dim) float array."""
    tokens = text.split()
    return np.vstack([np.asarray(model.infer_vector(tokens), dtype=float) for _ in range(n)])


def calculate_mean_cosine_score(vector, text, model, n=5):
    """
    Mean cosine similarity between a document vector and a text.

    The text is inferred ``n`` times with ``model.infer_vector`` and the
    cosine similarities against ``vector`` are averaged. The inferred vectors
    are cached per (model, text, n): when the function is applied to every
    row of a DataFrame, the text is inferred n times in total instead of
    n times per row, and every row is compared against the same vectors.

    :param vector: 1D array-like document vector
    :param text: text to compare against (split on whitespace)
    :param model: object exposing ``infer_vector(list_of_tokens)``
    :param n: number of inferences to average over
    :return: mean cosine similarity (NaN when n <= 0)
    """
    if n <= 0:
        return np.nan

    try:
        reference = _infer_reference_vectors(model, text, n)
    except TypeError:
        # Unhashable arguments cannot be cached: compute without the cache.
        reference = _infer_reference_vectors.__wrapped__(model, text, n)

    query = np.asarray(vector, dtype=float).reshape(-1)
    dots = reference @ query
    norms = np.linalg.norm(reference, axis=1) * np.linalg.norm(query)
    # A zero-length vector has no direction: report similarity 0 instead of dividing by 0.
    similarities = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)
    return np.mean(similarities)

In [None]:
# For each error description, score every row of text_to_analyse by the cosine
# similarity between its PV-DBOW vector and the description (first model only).
total_errors = len(errors)
for position, id_error in enumerate(errors['ID_ERROR'], start=1):
    # First description registered for this error id
    error_description = errors.loc[errors['ID_ERROR'] == id_error, 'DESCRIPCION'].iloc[0]
    score_column = f'cosine_similarity_{id_error}'
    text_to_analyse[score_column] = text_to_analyse['PV-DBOW'].apply(
        lambda doc_vector: calculate_mean_cosine_score(doc_vector, error_description, loaded_models[0])
    )
    print(f"Error {position} of {total_errors} calculated")

In [None]:
# Save text_to_analyse to disk
# text_to_analyse.to_csv("../DATA/text_to_analyse.csv", sep='¬', encoding='utf-8-sig', index=False)

In [None]:
# Load text_to_analyse from disk
# text_to_analyse = pd.read_csv("../DATA/text_to_analyse.csv", sep='¬', encoding='utf-8-sig')

In [None]:
text_to_analyse.loc[:,'cosine_similarity'] = text_to_analyse['PV-DBOW'] \
    .apply(lambda x: calculate_mean_cosine_score(x, test_text, loaded_models[0]))

In [None]:
text_to_analyse[['text_to_analyse','cosine_similarity']] \
    .sort_values(by='cosine_similarity', ascending=False) \
    .head(10)

In [None]:
# Per-error score columns are named 'cosine_similarity_<ID_ERROR>'.
COSINE_PREFIX = 'cosine_similarity_'
cosine_columns = [col for col in text_to_analyse.columns if col.startswith(COSINE_PREFIX)]

# Best score of each row and the error id it belongs to
text_to_analyse['highest_score'] = text_to_analyse[cosine_columns].max(axis=1)
# Strip the known prefix instead of splitting on '_': an error id that itself
# contains an underscore would otherwise be cut down to its last fragment.
text_to_analyse['highest_score_error'] = (
    text_to_analyse[cosine_columns].idxmax(axis=1).str[len(COSINE_PREFIX):]
)

In [None]:
text_to_analyse.head(10)

In [None]:
# Ten best-scoring texts for every error id, ordered by error id and then by
# descending score. sort + groupby.head replaces groupby.apply(nlargest):
# applying a function to groups that include the grouping column is deprecated
# in recent pandas, and newer versions drop that column from the result.
top10_per_error = (
    text_to_analyse[['text_to_analyse', 'highest_score', 'highest_score_error']]
    .dropna(subset=['highest_score_error'])
    .sort_values(['highest_score_error', 'highest_score'], ascending=[True, False])
    .groupby('highest_score_error', sort=False)
    .head(10)
    .reset_index(drop=True)
)

top10_per_error.head(500)