# After sales text clustering using Transformers
## Sentence Transformers - BERT embeddings

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer

import os

pd.options.mode.chained_assignment = None

data_version = "2024-05-14"
data_base_path = "../DATA/processed"

In [2]:
# Preprocess text
def preprocess_text(text, words_to_remove=None):
    if words_to_remove is None:
        words_to_remove = []
    
    text = text.lower()
    words = text.split()
    text = [word for word in words if word not in words_to_remove]
    return ' '.join(text)

# Load data
data_path = os.path.join(data_base_path, f"{data_version}", "text_to_analyse_clean.csv")
text_to_analyse = pd.read_csv(data_path, sep="¬", engine="python")

words_to_remove = ['averia', 'averías', 'avería', 'defecto', 'defectos', 'error', 'errores', 'fallo', 'fallos', 'falla', 'motivo', 'motivos', 'proble', 'problema', 'problemas']

text_to_analyse["processed_text"] = text_to_analyse["text_to_analyse"].apply(preprocess_text)

In [3]:
text_to_analyse.head()

Unnamed: 0,codigo,id_pieza,desc_problema_translated,descripcion_translated,problema_translated,cod_articulo,text_to_analyse,CODART_A3,Fuzzy_Score,CODART,DESCART,CAR1,CAR2,CAR3,CAR4,DESCCAR1,DESCCAR2,DESCCAR3,DESCCAR4,processed_text
0,YZ2YZZUU16,70.0,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...,MODULO ALARMAS TECNICAS,INDICAN SE HA ESTROPEADO EN LA INSTALACION Y ...,MATS,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...,MATS,100.0,MATS,Central de alarmas técnicas multifunción,3.0,265.0,94.0,,AT HOME,AT HOME,MODULOS DE CONTROL,,indican exclusivamente que se ha estropeado la...
1,ZP2CZZYVBD,71.0,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...,,TACTO BLOQUEADO. NO ENCIENDE-APAGA NI DEJA CAM...,AZATACTORSB,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...,AZATACTORSB,100.0,AZATACTORSB,Termostato Tacto superficie radio (AZA) - Blanco,1.0,251.0,91.0,4.0,SISTEMAS DE ZONAS,ACUAZONE (DI6),TERMOSTATOS,TACTO,azatactorsb se ha quedado bloqueado. no permit...
2,YPUEA5WZ10,67.0,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...,MODELO 2 PERSIANAS,UNA DE LAS 2 PERSIANAS NO FUNCIONA CON PULSADO...,PER2,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...,PER2,100.0,PER2,Módulo de control de 2 persianas con pulsador,3.0,265.0,94.0,,AT HOME,AT HOME,MODULOS DE CONTROL,,desde pulsadores no se puede subir la persiana...
3,ZPWBA5ETF7,72.0,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR",,Rafael el viernes llego mi pedido nº 23349 pa...,RINT040015BKMTE,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR Rafa...",RINT040015BKMTE,100.0,RINT040015BKMTE,Rejilla Inteligente doble Airzone motorizada 4...,1.0,264.0,31.0,92.0,SISTEMAS DE ZONAS,DIFUSION MOTORIZADA,REJILLAS,RINT,"rejilla motorizada defectuosa, sustituir rafae..."
4,ZP2CAPUAA9,74.0,MODULO DE FANCOIL DA PROBLEMAS LAS COMUNICACI...,AZAMFANCOILC,MODULO DE FANCOIL DE ZONA 32Z,AZAMFANCOILC,MODULO DE FANCOIL DA PROBLEMAS LAS COMUNICACI...,AZAMFANCOILC,100.0,AZAMFANCOILC,Módulo de zona fancoil cableado Airzone (AZA),1.0,251.0,95.0,,SISTEMAS DE ZONAS,ACUAZONE (DI6),MODULOS DE ZONA,,modulo de fancoil da problemas las comunicacio...


In [4]:
# Load model
model = SentenceTransformer("all-mpnet-base-v2")

# Compute embeddings
embeddings = model.encode(text_to_analyse["processed_text"].tolist())
text_to_analyse['embeddings'] = embeddings.tolist()



In [5]:
# Load Errors
errors = pd.read_csv("../DATA/TablaTipoErrorPostventa.csv", sep=';', header=0)[['Código','CODCAR3','CODCAR2','DESCFAM','Motivo General','DESCRIPCION']]
errors.columns = ['ID_ERROR','CODCAR3','CODCAR2','DESCFAM','MOTIVO','DESCRIPCION'] # Rename columns
errors['DESCRIPCION_DETAILED']  = errors['MOTIVO'] #+ ' ' + errors['DESCRIPCION'].fillna('') # Concatenate MOTIVO and DESCRIPCION
errors['CODCAR2'] = errors['CODCAR2'].str.replace('-','0').astype(int) # Clean CODCAR2

In [6]:
# Calculate embeddings for errors
errors["description_processed"] = errors["DESCRIPCION_DETAILED"].apply(preprocess_text)
errors_embeddings = model.encode(errors["description_processed"].tolist())
errors["embeddings"] = errors_embeddings.tolist()

In [7]:
# Calculate similarity
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(embeddings, error_embeddings):
    return cosine_similarity([embeddings], [error_embeddings])[0][0]

In [8]:
# Calculate the cosine similarity between the text_to_analyse and the errors
for index, row in errors.iterrows():
    # Create a condition for filtering
    condition = (text_to_analyse['CAR3'] == row['CODCAR3'])
    if row['CODCAR2']:
        condition &= (text_to_analyse['CAR2'] == row['CODCAR2'])
    
    if not text_to_analyse.loc[condition, 'embeddings'].empty:
        text_to_analyse.loc[condition, f'cosine_similarity_{row["ID_ERROR"]}'] = text_to_analyse.loc[condition, 'embeddings'].apply(lambda x: calculate_similarity(x, row['embeddings']))
    
    print(f"Error {row['ID_ERROR']} calculated")

Error 3.01 calculated
Error 3.02 calculated
Error 3.03 calculated
Error 3.04 calculated
Error 3.05 calculated
Error 3.08 calculated
Error 3.09 calculated
Error 3.21 calculated
Error 3.22 calculated
Error 3.23 calculated
Error 3.24 calculated
Error 3.28 calculated
Error 3.29 calculated
Error 3.41 calculated
Error 3.42 calculated
Error 3.43 calculated
Error 3.44 calculated
Error 3.45 calculated
Error 3.48 calculated
Error 3.49 calculated
Error 3.61 calculated
Error 3.62 calculated
Error 3.63 calculated
Error 3.64 calculated
Error 3.68 calculated
Error 3.69 calculated
Error 3.61 calculated
Error 3.62 calculated
Error 3.63 calculated
Error 3.64 calculated
Error 3.68 calculated
Error 3.69 calculated
Error 3.61 calculated
Error 3.62 calculated
Error 3.63 calculated
Error 3.64 calculated
Error 3.68 calculated
Error 3.69 calculated
Error 3.61 calculated
Error 3.62 calculated
Error 3.63 calculated
Error 3.64 calculated
Error 3.68 calculated
Error 3.69 calculated
Error 3.61 calculated
Error 3.62

In [9]:
text_to_analyse.sample(10)

Unnamed: 0,codigo,id_pieza,desc_problema_translated,descripcion_translated,problema_translated,cod_articulo,text_to_analyse,CODART_A3,Fuzzy_Score,CODART,...,cosine_similarity_4.21,cosine_similarity_4.29,cosine_similarity_4.41,cosine_similarity_4.42,cosine_similarity_4.43,cosine_similarity_4.49,cosine_similarity_4.61,cosine_similarity_4.62,cosine_similarity_4.63,cosine_similarity_4.69
20301,MPSENJLP76,67239.0,Problema en la visualización del termostato,Termostato pensar,Pantalla congelada,azce6thinkrb,Problema en la visualización del termostato Te...,AZCE6THINKRB,100.0,AZCE6THINKRB,...,,,,,,,,,,
77,YJUAZZWA5D,391.0,"por favor sacar este material a mi nombre, es ...",1 bf aza superficie,incompatibilidad cen y pasarela 0-10v,azabluefecos,"por favor sacar este material a mi nombre, es ...",AZABLUEFECOSB,96.0,AZABLUEFECOSB,...,,,,,,,,,,
20755,LZICNZPN2F,68926.0,producto defectuoso // equipo en garantía ya e...,,2 tarjetas de servidor web\r\n2 pasarelas\r\n1...,AZX6WSPHUB,producto defectuoso // equipo en garantía ya e...,AZX6WSPHUB,100.0,AZX6WSPHUB,...,,,,,,,,,,
6116,MWADNJDP6D,18245.0,"Después de la asociación, los Radio Termostato...",TERMOSTATO THINK RADIO BLANCO,"Después de la asociación, el Radio Termostato ...",AZCE6THINKRB,"Después de la asociación, los Radio Termostato...",AZCE6THINKRB,100.0,AZCE6THINKRB,...,,,,,,,,,,
11758,LMPXNWDO74,37401.0,Los mandos BlueFace no se dejan direccionar so...,TERMOSTATO AZUL,Cambio unidad por no poder direccionar,AZDI6BLUEFACECB,Los mandos BlueFace no se dejan direccionar so...,AZDI6BLUEFACECB,100.0,AZDI6BLUEFACECB,...,,,,,,,,,,
22370,NJKBLZPL10,76469.0,"Dos termostatos blueface que no funcionan, sol...",Termostato Airzone Blueface Zero Cable Blanco 32Z,No funciona,AZDI6BLUEZEROCB,"Dos termostatos blueface que no funcionan, sol...",AZDI6BLUEZEROCB,100.0,AZDI6BLUEZEROCB,...,,,,,,,,,,
10846,LWLRLW9REC,34692.0,BOLETO 9020,REMOTO,BOLETO 9020,AZCEBLUEFACECN,BOLETO 9020 REMOTO BOLETO 9020 AZCE6BLUEFACECN,AZCE6BLUEFACECN,97.0,AZCE6BLUEFACECN,...,,,,,,,,,,
7744,AWZNAGRRC2,23707.0,- Componente (condensador) de la pasarela de c...,Compuerta circular motorizada 200 mm.,Pata del eje rota,CPCC200MTE,- Componente (condensador) de la pasarela de c...,CPCC200MTE,100.0,CPCC200MTE,...,,,,,,,,,,
18178,Z2TMZWOZ74,58491.0,NO FUNCIONA LA ANTENA RADIO DE LA CENTRAL FLEX...,FLEXA CENTRAL 3,ENVIAR UNA CENTRAL FLEXA 3 NUEVA,AZCE6FLEXA3,NO FUNCIONA LA ANTENA RADIO DE LA CENTRAL FLEX...,AZCE6FLEXA3,100.0,AZCE6FLEXA3,...,,,,,,,,,,
3957,A2IXAPHK86,11729.0,TERMOSTATO AZCE6BLUEFACECN----NO FUNCIONA,TERMOSTATO CABLE BLUEFACE 8Z NEGRO,TERMOSTATO BLUEFACE AZCE6BLUEFACECN---NO FUNCIONA,AZCE6BLUEFACECN,TERMOSTATO AZCE6BLUEFACECN----NO FUNCIONA TERM...,AZCE6BLUEFACECN,100.0,AZCE6BLUEFACECN,...,,,,,,,,,,


In [10]:
cosine_columns = [col for col in text_to_analyse.columns if 'cosine_similarity_' in col]
text_to_analyse[cosine_columns] = text_to_analyse[cosine_columns].fillna(0) # Fill NA with 0
text_to_analyse.loc[:, 'highest_score'] = text_to_analyse[cosine_columns].max(axis=1)
text_to_analyse.loc[:, 'highest_score_error'] = text_to_analyse[cosine_columns].idxmax(axis=1).apply(lambda x: x.split('_')[-1])

In [11]:
errors['ID_ERROR'] = errors['ID_ERROR'].astype(str)
text_to_analyse = text_to_analyse.merge(errors[['ID_ERROR','MOTIVO']], left_on='highest_score_error', right_on='ID_ERROR', how='left')

In [12]:
text_to_analyse.sample(10)

Unnamed: 0,codigo,id_pieza,desc_problema_translated,descripcion_translated,problema_translated,cod_articulo,text_to_analyse,CODART_A3,Fuzzy_Score,CODART,...,cosine_similarity_4.43,cosine_similarity_4.49,cosine_similarity_4.61,cosine_similarity_4.62,cosine_similarity_4.63,cosine_similarity_4.69,highest_score,highest_score_error,ID_ERROR,MOTIVO
36115,Y2HGZGKA5F,64120.0,visto con el número de billete,PASARELAS CONT. ZONA AÉREA/DAIKIN,visto con el número de billete,AZX6GTCDA1,visto con el número de billete PASARELAS CONT....,AZX6GTCDA1,100.0,AZX6GTCDA1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.428879,3.44,3.44,Unidad no arranca
35089,ZW5HZGIT0F,62244.0,COMPRAMOS 4 MODULOS COD. AZDI6ZMOPANC PARA UNA...,AIRZONE MODULO ZONA CABLE U.INDIVIDUAL PANASONIC,COMPRAMOS 4 MODULOS COD. AZDI6ZMOPANC PARA UNA...,AZDI6ZMOPANC,COMPRAMOS 4 MODULOS COD. AZDI6ZMOPANC PARA UNA...,AZDI6ZMOPANC,100.0,AZDI6ZMOPANC,...,0.0,0.0,0.0,0.0,0.0,0.0,0.477414,3.82,3.82,Fallo de comunicaciones cableadas
32204,AMZOAWKACA,56792.0,"PCC NO FUNCIONA, REEMPLAZO DE PRODUCTO POR PAR...",,"PCC NO FUNCIONA, REEMPLAZO DE PRODUCTO POR PAR...",AZX6CCP,"PCC NO FUNCIONA, REEMPLAZO DE PRODUCTO POR PAR...",AZX6CCP,100.0,AZX6CCP,...,0.0,0.0,0.0,0.0,0.0,0.0,0.330356,3.82,3.82,Fallo de comunicaciones cableadas
19909,MWXUNWLOC1,34218.0,PASARELA DE COMUNICACIONES AIRZONE-PANASONIC (...,PASARELA COMUNICACIONES AIRZONE-PANASONIC,REPOSICIÓN EN GARANTÍA POR FUNCIONAMIENTO INC...,AZX6QADAPTPAN,PASARELA DE COMUNICACIONES AIRZONE-PANASONIC (...,AZX6QADAPTPAN,100.0,AZX6QADAPTPAN,...,0.0,0.0,0.0,0.0,0.0,0.0,0.379059,3.42,3.42,Fallo de comunicaciones con la central
26223,A21LBGPUF0,45328.0,NO FUNCIONA TERMOSTATO THINK,,NO FUNCIONA,AZCE6THINKRB,NO FUNCIONA TERMOSTATO THINK NO FUNCIONA AZCE...,AZCE6THINKRB,100.0,AZCE6THINKRB,...,0.0,0.0,0.0,0.0,0.0,0.0,0.608139,3.01,3.01,Termostato bloqueado
6112,AMSYA55J4C,9621.0,TARJETA DE COMUNICACIONES HS - NO FUNCIONA. FA...,PUERTA DE COMUNICACIÓN AIRZONE-TOSHIBA,TARJETA DE COMUNICACIONES HS - NO FUNCIONA. FA...,AZX6QADAPTTOS,TARJETA DE COMUNICACIONES HS - NO FUNCIONA. FA...,AZX6QADAPTTOS,100.0,AZX6QADAPTTOS,...,0.0,0.0,0.0,0.0,0.0,0.0,0.282467,3.48,3.48,Requiere actualización software/hardware
16872,AMLOAWRRC7,28661.0,"Buenos dias,\r\n\r\nTras el correo electrónico...",termostato azul,ver billete 453,azce6bluefacecb,"Buenos dias,\r\n\r\nTras el correo electrónico...",AZCE6BLUEFACECB,100.0,AZCE6BLUEFACECB,...,0.0,0.0,0.0,0.0,0.0,0.0,0.445244,3.01,3.01,Termostato bloqueado
40371,L5ICNZDH42,73995.0,AZX6QADAPT3MEL NO FUNCIONA CORRECTAMENTE.,PASARELA MITSUBISHI,AZX6QADAPT3MEL NO FUNCIONA CORRECTAMENTE.,AZX6QADAPT3MEL,AZX6QADAPT3MEL NO FUNCIONA CORRECTAMENTE. PASA...,AZX6QADAPT3MEL,100.0,AZX6QADAPT3MEL,...,0.0,0.0,0.0,0.0,0.0,0.0,0.352021,3.44,3.44,Unidad no arranca
2413,CWZPBPTN6B,3928.0,,Termostato maestro,Diagnóstico del Sr. Veyssiere después de la as...,AZXCSMASTERSB,Termostato maestro Diagnóstico del Sr. Veyssi...,AZXCSMASTERSB,100.0,AZXCSMASTERSB,...,0.0,0.0,0.0,0.0,0.0,0.0,0.260337,3.82,3.82,Fallo de comunicaciones cableadas
14534,AGZQY2NNAC,23882.0,Número de expediente 1114,,Número de expediente 1114.,AZCE6IBPR06,Número de expediente 1114 Número de expedient...,AZCE6IBPRO6,91.0,AZCE6IBPRO6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.340198,3.81,3.81,Central bloqueada o no alimenta


In [13]:
top10_per_error = text_to_analyse[['codigo','text_to_analyse', 'highest_score', 'highest_score_error']] \
    .groupby('highest_score_error', group_keys=False) \
    .apply(lambda x: x.nlargest(10, 'highest_score')) \
    .reset_index(drop=True)

top10_per_error.head(500)

  top10_per_error = text_to_analyse[['codigo','text_to_analyse', 'highest_score', 'highest_score_error']] \


Unnamed: 0,codigo,text_to_analyse,highest_score,highest_score_error
0,BGRPY2DRA7,TERMOSTATO BLOQUEADO TERMOSTATO ESTA BLOQUEADO...,0.873891,3.01
1,CW1LAP5IED,TERMOSTATO BLOQUEADO TERMOSTATO THINKC BLANCO ...,0.847682,3.01
2,N2LTAGVMBD,TERMOSTATO BLOQUEADO TERMOSTATO PENSAR El mand...,0.804312,3.01
3,NGIXMPVOD4,"Termostato bloqueado, imposible cambiar de mod...",0.796784,3.01
4,MWDVAMTL6E,LOS TERMOSTATOS SE QUEDAN BLOQUEADOS TERMOSTAT...,0.790993,3.01
...,...,...,...,...
409,B2LRZWLX04,"La sonda ambiente no funciona, no manda temper...",0.240174,4.63
410,MWQDMZTU95,2 RECEPTORES AVERIADOS SENSOR DE TEMPERATURA E...,0.231570,4.63
411,ZMTMZ2KT54,"N°48315 Cable BUS 2x0,5+2x0,22 - 15M N°48315\r...",0.217293,4.63
412,MPGVL5HMAF,N°83681 BYPASS CIRCULAR DN150 PARA PLENUM N°83...,0.214375,4.63


In [14]:
text_to_analyse[text_to_analyse['codigo'] == 'MMHSNG1V2C'][['codigo','text_to_analyse', 'highest_score', 'highest_score_error']]

Unnamed: 0,codigo,text_to_analyse,highest_score,highest_score_error
20651,MMHSNG1V2C,WEBSERVER QUE HA ESTADO FUNCIONANDO DURANTE TR...,0.507571,3.23


# Save results in Qdrant database

In [15]:
from qdrant_client import QdrantClient
from qdrant_client.http import models as qmodels
from qdrant_client.http.models import Filter, FieldCondition, MatchValue

qdrant_client = QdrantClient(":memory:")

qdrant_client.create_collection(
    collection_name="MyZone-DefectClassification",
    vectors_config=qmodels.VectorParams(
        size=768,
        distance=qmodels.Distance.COSINE
    )
)

True

In [16]:
# Insert vectors into Qdrant
points = [
    qmodels.PointStruct(
        id=id, 
        vector=row['embeddings'], 
        payload={
            "error": row['highest_score_error'], 
            "codigo": row['codigo'],
            "error_description": row['MOTIVO'],
            "text": row['text_to_analyse'],
            "family": row['CAR3'],
            "sistema": row['CAR2']
        }
    )
    for id, row in text_to_analyse.iterrows()
]

qdrant_client.upsert(collection_name="MyZone-DefectClassification", points=points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [17]:
# Define a function to search for errors
def define_error(text, family):
    query_embedding = model.encode(text).tolist()
    
    # Define the filter
    filter_condition = Filter(
        must=[
            FieldCondition(
                key="family",
                match=MatchValue(value=int(family))
            )
        ]
    )
    
    search_result = qdrant_client.search(
        collection_name="MyZone-DefectClassification",
        query_vector=query_embedding,
        limit=20,
        query_filter=filter_condition,
    )
    df = pd.DataFrame([(result.payload['error_description'], result.score) for result in search_result], columns=['Error', 'Score'])
    
    group = df.groupby("Error").max().sort_values(by="Score",ascending=False)

    #total = group[1].sum()
    #group['percentage'] = group[group[1] > 0.2][1].apply(lambda x: x/total)
    
    return group['Score'].to_dict()

In [18]:
define_error("Fallo de la pantalla", "91")

{'Fallo de pantalla': 0.8171876550333411}

In [21]:
import gradio as gr

# Create the Gradio interface
iface = gr.Interface(
    fn=define_error, 
    inputs=[gr.Textbox(lines=1, placeholder="Enter error description here", label="Error Description", value="Fallo de la pantalla"),
            gr.Textbox(lines=1, placeholder="Product Family", label="Product Family", value="91")], 
    outputs=gr.Label(num_top_classes=5),
    title="Error probability calculator",
    description="Enter error description and the product family to get the most probable error."
)

# Launch the interface
iface.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


