# After sales text clustering using Transformers
## Sentence Transformers - BERT embeddings

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer

import os

pd.options.mode.chained_assignment = None

data_version = "2024-05-14"
data_base_path = "../DATA/processed"

In [2]:
# Preprocess text
def preprocess_text(text, words_to_remove=None):
    if words_to_remove is None:
        words_to_remove = []
    
    text = text.lower()
    words = text.split()
    text = [word for word in words if word not in words_to_remove]
    return ' '.join(text)

# Load data
data_path = os.path.join(data_base_path, f"{data_version}", "text_to_analyse_clean.csv")
text_to_analyse = pd.read_csv(data_path, sep="¬", engine="python")

words_to_remove = ['averia', 'averías', 'avería', 'defecto', 'defectos', 'error', 'errores', 'fallo', 'fallos', 'falla', 'motivo', 'motivos', 'proble', 'problema', 'problemas']

text_to_analyse["processed_text"] = text_to_analyse["text_to_analyse"].apply(preprocess_text)

In [3]:
text_to_analyse.head()

Unnamed: 0,codigo,id_pieza,desc_problema_translated,descripcion_translated,problema_translated,cod_articulo,text_to_analyse,CODART_A3,Fuzzy_Score,CODART,DESCART,CAR1,CAR2,CAR3,CAR4,DESCCAR1,DESCCAR2,DESCCAR3,DESCCAR4,processed_text
0,YZ2YZZUU16,70.0,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...,MODULO ALARMAS TECNICAS,INDICAN SE HA ESTROPEADO EN LA INSTALACION Y ...,MATS,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...,MATS,100.0,MATS,Central de alarmas técnicas multifunción,3.0,265.0,94.0,,AT HOME,AT HOME,MODULOS DE CONTROL,,indican exclusivamente que se ha estropeado la...
1,ZP2CZZYVBD,71.0,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...,,TACTO BLOQUEADO. NO ENCIENDE-APAGA NI DEJA CAM...,AZATACTORSB,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...,AZATACTORSB,100.0,AZATACTORSB,Termostato Tacto superficie radio (AZA) - Blanco,1.0,251.0,91.0,4.0,SISTEMAS DE ZONAS,ACUAZONE (DI6),TERMOSTATOS,TACTO,azatactorsb se ha quedado bloqueado. no permit...
2,YPUEA5WZ10,67.0,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...,MODELO 2 PERSIANAS,UNA DE LAS 2 PERSIANAS NO FUNCIONA CON PULSADO...,PER2,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...,PER2,100.0,PER2,Módulo de control de 2 persianas con pulsador,3.0,265.0,94.0,,AT HOME,AT HOME,MODULOS DE CONTROL,,desde pulsadores no se puede subir la persiana...
3,ZPWBA5ETF7,72.0,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR",,Rafael el viernes llego mi pedido nº 23349 pa...,RINT040015BKMTE,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR Rafa...",RINT040015BKMTE,100.0,RINT040015BKMTE,Rejilla Inteligente doble Airzone motorizada 4...,1.0,264.0,31.0,92.0,SISTEMAS DE ZONAS,DIFUSION MOTORIZADA,REJILLAS,RINT,"rejilla motorizada defectuosa, sustituir rafae..."
4,ZP2CAPUAA9,74.0,MODULO DE FANCOIL DA PROBLEMAS LAS COMUNICACI...,AZAMFANCOILC,MODULO DE FANCOIL DE ZONA 32Z,AZAMFANCOILC,MODULO DE FANCOIL DA PROBLEMAS LAS COMUNICACI...,AZAMFANCOILC,100.0,AZAMFANCOILC,Módulo de zona fancoil cableado Airzone (AZA),1.0,251.0,95.0,,SISTEMAS DE ZONAS,ACUAZONE (DI6),MODULOS DE ZONA,,modulo de fancoil da problemas las comunicacio...


In [4]:
# Load model
model = SentenceTransformer("all-mpnet-base-v2")

# Compute embeddings
embeddings = model.encode(text_to_analyse["processed_text"].tolist())
text_to_analyse['embeddings'] = embeddings.tolist()



In [5]:
# Load Errors
errors = pd.read_csv("../DATA/TablaTipoErrorPostventa.csv", sep=';', header=0)[['Código','CODCAR3','CODCAR2','DESCFAM','Motivo General','DESCRIPCION']]
errors.columns = ['ID_ERROR','CODCAR3','CODCAR2','DESCFAM','MOTIVO','DESCRIPCION'] # Rename columns
errors['DESCRIPCION_DETAILED']  = errors['MOTIVO'] #+ ' ' + errors['DESCRIPCION'].fillna('') # Concatenate MOTIVO and DESCRIPCION
errors['CODCAR2'] = errors['CODCAR2'].str.replace('-','0').astype(int) # Clean CODCAR2

In [6]:
# Calculate embeddings for errors
errors["description_processed"] = errors["DESCRIPCION_DETAILED"].apply(preprocess_text)
errors_embeddings = model.encode(errors["description_processed"].tolist())
errors["embeddings"] = errors_embeddings.tolist()

In [7]:
# Calculate similarity
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(embeddings, error_embeddings):
    return cosine_similarity([embeddings], [error_embeddings])[0][0]

In [8]:
# Calculate the cosine similarity between the text_to_analyse and the errors
for index, row in errors.iterrows():
    # Create a condition for filtering
    condition = (text_to_analyse['CAR3'] == row['CODCAR3'])
    if row['CODCAR2']:
        condition &= (text_to_analyse['CAR2'] == row['CODCAR2'])
    
    if not text_to_analyse.loc[condition, 'embeddings'].empty:
        text_to_analyse.loc[condition, f'cosine_similarity_{row["ID_ERROR"]}'] = text_to_analyse.loc[condition, 'embeddings'].apply(lambda x: calculate_similarity(x, row['embeddings']))
    
    print(f"Error {row['ID_ERROR']} calculated")

Error 3.01 calculated
Error 3.02 calculated
Error 3.03 calculated
Error 3.04 calculated
Error 3.05 calculated
Error 3.08 calculated
Error 3.09 calculated
Error 3.21 calculated
Error 3.22 calculated
Error 3.23 calculated
Error 3.24 calculated
Error 3.28 calculated
Error 3.29 calculated
Error 3.41 calculated
Error 3.42 calculated
Error 3.43 calculated
Error 3.44 calculated
Error 3.45 calculated
Error 3.48 calculated
Error 3.49 calculated
Error 3.61 calculated
Error 3.62 calculated
Error 3.63 calculated
Error 3.64 calculated
Error 3.68 calculated
Error 3.69 calculated
Error 3.61 calculated
Error 3.62 calculated
Error 3.63 calculated
Error 3.64 calculated
Error 3.68 calculated
Error 3.69 calculated
Error 3.61 calculated
Error 3.62 calculated
Error 3.63 calculated
Error 3.64 calculated
Error 3.68 calculated
Error 3.69 calculated
Error 3.61 calculated
Error 3.62 calculated
Error 3.63 calculated
Error 3.64 calculated
Error 3.68 calculated
Error 3.69 calculated
Error 3.61 calculated
Error 3.62

In [9]:
text_to_analyse.sample(10)

Unnamed: 0,codigo,id_pieza,desc_problema_translated,descripcion_translated,problema_translated,cod_articulo,text_to_analyse,CODART_A3,Fuzzy_Score,CODART,...,cosine_similarity_4.21,cosine_similarity_4.29,cosine_similarity_4.41,cosine_similarity_4.42,cosine_similarity_4.43,cosine_similarity_4.49,cosine_similarity_4.61,cosine_similarity_4.62,cosine_similarity_4.63,cosine_similarity_4.69
2270,A2KFAZDL2A,6891.0,línea directa de expedientes 14246,carta,línea directa de expedientes 14246,AZCE6IBPRO6,línea directa de expedientes 14246 carta línea...,AZCE6IBPRO6,100.0,AZCE6IBPRO6,...,,,,,,,,,,
21538,LPADL51N10,72455.0,NO FUNCIONA,CENTRALITA FLEXA3,NO FUNCIONA,AZCE6FLEXA3,NO FUNCIONA CENTRALITA FLEXA3 NO FUNCIONA AZCE...,AZCE6FLEXA3,100.0,AZCE6FLEXA3,...,,,,,,,,,,
1130,A2TKCZ5I39,3494.0,CAMBIO PASARELAS,,2 pasarelas y 5 pilas,AZX6QADAPTSAM,CAMBIO PASARELAS 2 pasarelas y 5 pilas AZX6QA...,AZX6QADAPTSAM,100.0,AZX6QADAPTSAM,...,,,,,,,,,,
11282,MMZSNW1T62,36045.0,NUESTRO CLIENTE EBCM DISPONÍA DE LA LÍNEA DIRE...,PLACA CENTRAL,VER NÚMERO DE BOLETO 10454,AZCE6IBPR,NUESTRO CLIENTE EBCM DISPONÍA DE LA LÍNEA DIRE...,AZCE6IBPRO6,90.0,AZCE6IBPRO6,...,,,,,,,,,,
6985,Z2TUZGNR00,21205.0,LE ENVIAMOS UN TERMOSTATO DE UNA REPARACIÓN Y ...,TERMOSTATO PLUS CABLE,TERMOSTATO EN GARANTIA,ZONAPL,LE ENVIAMOS UN TERMOSTATO DE UNA REPARACIÓN Y ...,ZONAPL,100.0,ZONAPL,...,,,,,,,,,,
2865,AMCBAZPM81,8659.0,EL TERMOSTATO NO SE DEJA PROGRAMAR,TERMOSTATO GRUESO,EL TERMOSTATO NO SE DEJA PRORAMAR,AZCE6THINKCB,EL TERMOSTATO NO SE DEJA PROGRAMAR TERMOSTATO ...,AZCE6THINKCB,100.0,AZCE6THINKCB,...,,,,,,,,,,
9627,M2TXMM9R9B,30839.0,CON SU ALBARAN N.1/21804988 DEL 10/05/18 SE EN...,TERMOSTATO DE CABLE MONOCROMO AIRZONE THINK (DI6),CON EL ALBARAN INDICADO SUMINISTRARON VDS EN N...,AZDI6THINKCB,CON SU ALBARAN N.1/21804988 DEL 10/05/18 SE EN...,AZDI6THINKCB,100.0,AZDI6THINKCB,...,,,,,,,,,,
23011,L5SDMPTI9C,79331.0,PLATINO HS,,PLATINO HS,ARZAZDI6IBPRO32,PLATINO HS PLATINO HS AZDI6IBPRO32,AZDI6IBPRO32,89.0,AZDI6IBPRO32,...,,,,,,,,,,
11680,M2HQMGPUF9,37174.0,WS ETHERNET - Platino de comunicación Pb IP6,SERVIDOR WEB ETHERNET,WS ETHERNET - Platino de comunicación Pb IP6,AZX6WEBSCLOUDC,WS ETHERNET - Platino de comunicación Pb IP6 S...,AZX6WEBSCLOUDC,100.0,AZX6WEBSCLOUDC,...,,,,,,,,,,
3206,ZGIYBPLM9A,9646.0,TERMOSTATO CARA AZUL SI APAGA,TERMOSTATO BLUEFACE CABLE FLEXA,TERMOSTATO CARA AZUL SI APAGA,AZCE6BLUEFACECB,TERMOSTATO CARA AZUL SI APAGA TERMOSTATO BLUEF...,AZCE6BLUEFACECB,100.0,AZCE6BLUEFACECB,...,,,,,,,,,,


In [10]:
cosine_columns = [col for col in text_to_analyse.columns if 'cosine_similarity_' in col]
text_to_analyse[cosine_columns] = text_to_analyse[cosine_columns].fillna(0) # Fill NA with 0
text_to_analyse.loc[:, 'highest_score'] = text_to_analyse[cosine_columns].max(axis=1)
text_to_analyse.loc[:, 'highest_score_error'] = text_to_analyse[cosine_columns].idxmax(axis=1).apply(lambda x: x.split('_')[-1])

In [11]:
errors['ID_ERROR'] = errors['ID_ERROR'].astype(str)
text_to_analyse = text_to_analyse.merge(errors[['ID_ERROR','MOTIVO']], left_on='highest_score_error', right_on='ID_ERROR', how='left')

In [12]:
text_to_analyse.sample(10)

Unnamed: 0,codigo,id_pieza,desc_problema_translated,descripcion_translated,problema_translated,cod_articulo,text_to_analyse,CODART_A3,Fuzzy_Score,CODART,...,cosine_similarity_4.43,cosine_similarity_4.49,cosine_similarity_4.61,cosine_similarity_4.62,cosine_similarity_4.63,cosine_similarity_4.69,highest_score,highest_score_error,ID_ERROR,MOTIVO
12206,MWAXMP1Q66,19539.0,"PROBLEMA TARJETA DE RADIO DEFECTUOSA, SEÑAL MU...",,"PROBLEMA TARJETA DE RADIO DEFECTUOSA, SEÑAL MU...",AZCE6IBPRO6,"PROBLEMA TARJETA DE RADIO DEFECTUOSA, SEÑAL MU...",AZCE6IBPRO6,100.0,AZCE6IBPRO6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.558152,3.83,3.83,Fallo de comunicaciones radio
6429,AGMZCPTO78,10213.0,TERMOSTATO AZCE6BLUEFACECB----NO FUNCIONA,TERMOSTATO BLUEFACE 8Z CABLE BLANCO,TERMOSTATO BLUEFACE 8Z CALBLE BLANCO---NO FUNC...,AZCE6BLUEFACECB,TERMOSTATO AZCE6BLUEFACECB----NO FUNCIONA TERM...,AZCE6BLUEFACECB,100.0,AZCE6BLUEFACECB,...,0.0,0.0,0.0,0.0,0.0,0.0,0.536686,3.01,3.01,Termostato bloqueado
42243,NJIDLP5JCD,78116.0,La puerta de enlace AZX6GTCHIT ya no se comuni...,Puente,La puerta de enlace AZX6GTCHIT ya no se comuni...,AZX6GTCHIT,La puerta de enlace AZX6GTCHIT ya no se comuni...,AZX6GTCHIT,100.0,AZX6GTCHIT,...,0.0,0.0,0.0,0.0,0.0,0.0,0.319333,3.45,3.45,Datos de unidad y sistema no coinciden
20494,MMHQNG9QDA,35278.0,Pantalla táctil desplazada.,CARA AZUL NOIR,PANTALLA TÁCTIL DESPLAZADA.,AZCE6BLUEFACECN,Pantalla táctil desplazada. CARA AZUL NOIR PAN...,AZCE6BLUEFACECN,100.0,AZCE6BLUEFACECN,...,0.0,0.0,0.0,0.0,0.0,0.0,0.613944,3.03,3.03,Fallo de pantalla
34254,ZMXJZ2WY0F,60406.0,"Buenos dias,\r\n\r\nCDE de sidv 66: 1431094 BL...",,consulte su línea directa 68601,AZCE6CB1IAQE,"Buenos dias,\r\n\r\nCDE de sidv 66: 1431094 BL...",AZCE6CB1IAQE,100.0,AZCE6CB1IAQE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.311201,3.84,3.84,No da señal a los motores
41527,MJKCNZLMB3,76372.0,CARTA M7RE hs,lleno,placa base HS\r\nen PAQUETE PLENO PP8PANBS08L6,AZCE8CB1IAQE,CARTA M7RE hs lleno placa base HS\r\nen PAQUET...,AZCE8CB1IAQE,100.0,AZCE8CB1IAQE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.450701,3.81,3.81,Central bloqueada o no alimenta
20163,M2PTNG5QF7,34680.0,REJILLA QUE HA LLEGADO GOLPEADA EN TRANSPORTE....,REJILLA LINEAL CERO GRADOS TRAMO TERMINAL,REJILLA DEL PEDIDO 22010621 QUE HA LLEGADO GOL...,RL001750BKXT,REJILLA QUE HA LLEGADO GOLPEADA EN TRANSPORTE....,RL00175007BKXT,92.0,RL00175007BKXT,...,0.0,0.0,0.0,0.0,0.0,0.0,0.292236,4.21,4.21,Error estético
37716,MZKZNZLNAE,67783.0,DEVOLUCIÓN DE MATERIAL DE ASISTENCIA,,LA UNIDAD DE CONTROL NO FUNCIONA,AZCE6FLEXA3,DEVOLUCIÓN DE MATERIAL DE ASISTENCIA LA UNIDA...,AZCE6FLEXA3,100.0,AZCE6FLEXA3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.259832,3.82,3.82,Fallo de comunicaciones cableadas
45089,M5OBM5ZI4F,84842.0,,Termostato IBPR06 monocromático airzone think ...,intercambio anticipado,AZCE6THINKRB,Termostato IBPR06 monocromático airzone think...,AZCE6THINKRB,100.0,AZCE6THINKRB,...,0.0,0.0,0.0,0.0,0.0,0.0,0.432238,3.01,3.01,Termostato bloqueado
21177,LMLVNWPP06,36453.0,PLACA FLEXA AVERIADA. SE SUSTITUYE PLACA FLEXA...,PLACA FLEXIBLE,PLACA FLEXA AVERIADA. SE SUSTITUYE PLACA FLEXA...,AZCE6FLEXA3,PLACA FLEXA AVERIADA. SE SUSTITUYE PLACA FLEXA...,AZCE6FLEXA3,100.0,AZCE6FLEXA3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.274284,3.81,3.81,Central bloqueada o no alimenta


In [13]:
top10_per_error = text_to_analyse[['codigo','text_to_analyse', 'highest_score', 'highest_score_error']] \
    .groupby('highest_score_error', group_keys=False) \
    .apply(lambda x: x.nlargest(10, 'highest_score')) \
    .reset_index(drop=True)

top10_per_error.head(500)

  top10_per_error = text_to_analyse[['codigo','text_to_analyse', 'highest_score', 'highest_score_error']] \


Unnamed: 0,codigo,text_to_analyse,highest_score,highest_score_error
0,BGRPY2DRA7,TERMOSTATO BLOQUEADO TERMOSTATO ESTA BLOQUEADO...,0.873891,3.01
1,CW1LAP5IED,TERMOSTATO BLOQUEADO TERMOSTATO THINKC BLANCO ...,0.847682,3.01
2,N2LTAGVMBD,TERMOSTATO BLOQUEADO TERMOSTATO PENSAR El mand...,0.804312,3.01
3,NGIXMPVOD4,"Termostato bloqueado, imposible cambiar de mod...",0.796784,3.01
4,MWDVAMTL6E,LOS TERMOSTATOS SE QUEDAN BLOQUEADOS TERMOSTAT...,0.790993,3.01
...,...,...,...,...
409,B2LRZWLX04,"La sonda ambiente no funciona, no manda temper...",0.240174,4.63
410,MWQDMZTU95,2 RECEPTORES AVERIADOS SENSOR DE TEMPERATURA E...,0.231570,4.63
411,ZMTMZ2KT54,"N°48315 Cable BUS 2x0,5+2x0,22 - 15M N°48315\r...",0.217293,4.63
412,MPGVL5HMAF,N°83681 BYPASS CIRCULAR DN150 PARA PLENUM N°83...,0.214375,4.63


In [14]:
text_to_analyse[text_to_analyse['codigo'] == 'MMHSNG1V2C'][['codigo','text_to_analyse', 'highest_score', 'highest_score_error']]

Unnamed: 0,codigo,text_to_analyse,highest_score,highest_score_error
20651,MMHSNG1V2C,WEBSERVER QUE HA ESTADO FUNCIONANDO DURANTE TR...,0.507571,3.23


In [15]:
"""import csv
top50_per_error = text_to_analyse[['codigo','text_to_analyse', 'highest_score', 'highest_score_error']] \
    .groupby('highest_score_error', group_keys=False) \
    .apply(lambda x: x.nlargest(50, 'highest_score')) \
    .reset_index(drop=True)

top50_per_error.to_csv("../DATA/processed/2024-05-14/top50_per_error.csv", index=False, encoding="utf-8", quoting=csv.QUOTE_ALL)"""

'import csv\ntop50_per_error = text_to_analyse[[\'codigo\',\'text_to_analyse\', \'highest_score\', \'highest_score_error\']]     .groupby(\'highest_score_error\', group_keys=False)     .apply(lambda x: x.nlargest(50, \'highest_score\'))     .reset_index(drop=True)\n\ntop50_per_error.to_csv("../DATA/processed/2024-05-14/top50_per_error.csv", index=False, encoding="utf-8", quoting=csv.QUOTE_ALL)'

# Save results in Qdrant database

In [16]:
from qdrant_client import QdrantClient
from qdrant_client.http import models as qmodels
from qdrant_client.http.models import Filter, FieldCondition, MatchValue

qdrant_client = QdrantClient(":memory:")

qdrant_client.create_collection(
    collection_name="MyZone-DefectClassification",
    vectors_config=qmodels.VectorParams(
        size=768,
        distance=qmodels.Distance.COSINE
    )
)

True

In [17]:
# Insert vectors into Qdrant
points = [
    qmodels.PointStruct(
        id=id, 
        vector=row['embeddings'], 
        payload={
            "error": row['highest_score_error'], 
            "codigo": row['codigo'],
            "error_description": row['MOTIVO'],
            "text": row['text_to_analyse'],
            "family": row['CAR3'],
            "sistema": row['CAR2']
        }
    )
    for id, row in text_to_analyse.iterrows()
]

qdrant_client.upsert(collection_name="MyZone-DefectClassification", points=points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [18]:
# Define a function to search for errors
def define_error(text, family):
    query_embedding = model.encode(text).tolist()
    
    # Define the filter
    filter_condition = Filter(
        must=[
            FieldCondition(
                key="family",
                match=MatchValue(value=int(family))
            )
        ]
    )
    
    search_result = qdrant_client.search(
        collection_name="MyZone-DefectClassification",
        query_vector=query_embedding,
        limit=20,
        query_filter=filter_condition,
    )
    df = pd.DataFrame([(result.payload['error_description'], result.score) for result in search_result], columns=['Error', 'Score'])
    
    group = df.groupby("Error").max().sort_values(by="Score",ascending=False)

    #total = group[1].sum()
    #group['percentage'] = group[group[1] > 0.2][1].apply(lambda x: x/total)
    
    return group['Score'].to_dict()

In [19]:
define_error("Fallo de la pantalla", "91")

{'Fallo de pantalla': 0.8171876550333411}

In [20]:
import gradio as gr

# Create the Gradio interface
iface = gr.Interface(
    fn=define_error, 
    inputs=[gr.Textbox(lines=1, placeholder="Enter error description here", label="Error Description", value="Fallo de la pantalla"),
            gr.Textbox(lines=1, placeholder="Product Family", label="Product Family", value="91")], 
    outputs=gr.Label(num_top_classes=5),
    title="Error probability calculator",
    description="Enter error description and the product family to get the most probable error."
)

# Launch the interface
iface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




## Evaluate the model

In [21]:
# Separate the data into train and test
test_dataset_ids = pd.read_csv("../DATA/processed/test_dataset_ids.csv")
test_dataset = text_to_analyse[text_to_analyse['codigo'].isin(test_dataset_ids['codigo'])]

In [22]:
test_dataset = test_dataset.merge(test_dataset_ids, on='codigo', how='inner')
test_dataset['ERROR_POSTVENTA'] = test_dataset['ERROR_POSTVENTA'].astype(str)
test_dataset['highest_score_error'] = test_dataset['highest_score_error'].astype(str)

In [26]:
test_dataset[['highest_score_error','ERROR_POSTVENTA']]

Unnamed: 0,highest_score_error,ERROR_POSTVENTA
0,3.44,3.28
1,3.01,3.08
2,3.03,3.08
3,3.03,3.08
4,3.23,3.08
...,...,...
909,3.81,3.22
910,3.81,3.22
911,3.81,3.22
912,3.81,3.22


In [24]:
from sklearn.metrics import classification_report

print(classification_report(test_dataset['ERROR_POSTVENTA'], test_dataset['highest_score_error']))

              precision    recall  f1-score   support

        3.01       0.06      0.74      0.12        19
        3.02       0.00      0.00      0.00         9
        3.03       0.05      0.11      0.07         9
        3.04       0.04      0.50      0.07         2
        3.05       0.00      0.00      0.00         0
        3.07       0.00      0.00      0.00         4
        3.08       0.00      0.00      0.00       320
        3.09       0.00      0.00      0.00       145
        3.21       0.00      0.00      0.00         8
        3.22       0.00      0.00      0.00        31
        3.23       0.07      1.00      0.12         2
        3.24       0.00      0.00      0.00        10
        3.28       0.00      0.00      0.00        25
        3.29       0.00      0.00      0.00        37
        3.41       0.00      0.00      0.00         1
        3.42       0.12      0.50      0.20         2
        3.43       0.00      0.00      0.00         3
        3.44       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
test_dataset.count()

codigo                      914
id_pieza                    914
desc_problema_translated    874
descripcion_translated      799
problema_translated         912
                           ... 
ID_ERROR                    914
MOTIVO                      914
Unnamed: 0                  914
NUMDOC                      914
ERROR_POSTVENTA             914
Length: 76, dtype: int64