# After sales text clustering using Doc2Vec

## Data preprocessing (Merging the translated text)

In [2]:
import os
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine

In [2]:
def query_data(query):
    """
    Function to query data from the database using sqlalchemy
    :param query: 
    :return: pd.DataFrame
    
    Connection parameters:
    user = readmyzone
    password = (get from environment variable MYSQL_PASSWORD)
    host = 192.168.2.7
    port = 3306
    """
    
    # Create the connection string
    user = 'readmyzone'
    password = os.environ.get('MYSQL_PASSWORD')
    host = '192.168.2.7'
    port = '3306'
    db = 'myzone'
    connection_string = f'mysql+pymysql://{user}:{password}@{host}:{port}/{db}'
    
    # Create the engine
    engine = create_engine(connection_string)
    
    try:
        # Query the data
        data = pd.read_sql(query, engine)
    except Exception as e:
        print(e)
        data = None
    
    return data

## Load the data

In [3]:
sav_incidencias = query_data('SELECT * FROM sav_incidencias')
sav_piezas = query_data('SELECT * FROM sav_piezas')
sav_estados = query_data('SELECT * FROM sav_estados')
sav_incidencias_tipo = query_data('SELECT * FROM sav_incidencias_tipo')

In [4]:
dataset = sav_incidencias.merge(sav_piezas, left_on='codigo', right_on='codigo_incidencia', how='left', suffixes=(None, '_pieza'))
dataset = dataset.merge(sav_estados, left_on='estado', right_on='id', how='left', suffixes=(None, '_estado'))
dataset = dataset.merge(sav_incidencias_tipo, left_on='tipo', right_on='id', how='left', suffixes=(None, '_tipo'))

In [5]:
clean_dataset = dataset[(dataset["tipo"] == 1) & (dataset["estado"].isin([2,6]))]

In [6]:
# Load from disk the text to translate dictionary
fields_to_translate = ["desc_problema", "problema", "descripcion"]
text_to_translate = {}
for text in fields_to_translate:
    text_to_translate[text] = pd.read_csv(f"../DATA/{text}.csv", sep='¬', encoding='utf-8-sig')

  text_to_translate[text] = pd.read_csv(f"../DATA/{text}.csv", sep='¬', encoding='utf-8-sig')
  text_to_translate[text] = pd.read_csv(f"../DATA/{text}.csv", sep='¬', encoding='utf-8-sig')
  text_to_translate[text] = pd.read_csv(f"../DATA/{text}.csv", sep='¬', encoding='utf-8-sig')


In [7]:
desc_problema_translated = pd.read_csv("../DATA/desc_problema_translated.csv", sep='¬', encoding='utf-8-sig', engine='python')
descripcion_translated = pd.read_csv("../DATA/descripcion_translated.csv", sep='¬', encoding='utf-8-sig', engine='python')
problema_translated = pd.read_csv("../DATA/problema_translated.csv", sep='¬', encoding='utf-8-sig', engine='python')# Data preprocessing (Merging the translated text)

In [8]:
# Delete rows with values (desc_problema, desc_problema_translated)
desc_problema_translated = desc_problema_translated[~desc_problema_translated["desc_problema_translated"].isin(["desc_problema_translated"])]
descripcion_translated = descripcion_translated[~descripcion_translated["descripcion_translated"].isin(["descripcion_translated"])]
problema_translated = problema_translated[~problema_translated["problema_translated"].isin(["problema_translated"])]

In [9]:
desc_problema_translated.count()

desc_problema               18099
desc_problema_translated    18099
dtype: int64

In [10]:
# Merge the translated text with the text_to_translate dataframe
desc_problema_translated = text_to_translate["desc_problema"].merge(desc_problema_translated, left_on="desc_problema", right_on="desc_problema", how="left")
descripcion_translated = text_to_translate["descripcion"].merge(descripcion_translated, left_on="descripcion", right_on="descripcion", how="left")
problema_translated = text_to_translate["problema"].merge(problema_translated, left_on="problema", right_on="problema", how="left")

In [11]:
# Fill NA with the original texts
desc_problema_translated.fillna({"desc_problema_translated": desc_problema_translated["desc_problema"]}, inplace=True)
descripcion_translated.fillna({"descripcion_translated": descripcion_translated["descripcion"]}, inplace=True)
problema_translated.fillna({"problema_translated": problema_translated["problema"]}, inplace=True)

In [12]:
desc_problema_translated.head(5)

Unnamed: 0,desc_problema,desc_problema_lg,desc_problema_translated
0,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...,es,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...
1,NO FUNCIONA,es,NO FUNCIONA
2,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...,es,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...
3,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...,es,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...
4,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR",es,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR"


In [13]:
# Merge the translated text with the original dataset
clean_dataset = clean_dataset.merge(desc_problema_translated, left_on="desc_problema", right_on="desc_problema", how="left")
clean_dataset = clean_dataset.merge(descripcion_translated, left_on="descripcion", right_on="descripcion", how="left")
clean_dataset = clean_dataset.merge(problema_translated, left_on="problema", right_on="problema", how="left")

In [14]:
clean_dataset[["desc_problema"]].head(100)

Unnamed: 0,desc_problema
0,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...
1,NO FUNCIONA
2,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...
3,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...
4,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR"
...,...
95,"por favor sacar este material a mi nombre, es ..."
96,"por favor sacar este material a mi nombre, es ..."
97,Pasarela Daikin: unos 3 minutos despues de arr...
98,Rejilla no funciona. Enviar una nueva retirar ...


In [15]:
clean_dataset.describe(include='all')

Unnamed: 0,id,web_id,codigo,creation_date,modification_date,company_id,user_id,ref_cliente,portes_airzone,devaluacion,...,titulo_en_tipo,titulo_fr_tipo,titulo_it_tipo,titulo_pt_tipo,desc_problema_lg,desc_problema_translated,descripcion_lg,descripcion_translated,problema_lg,problema_translated
count,44921.0,44921.0,44921,44921,44921,44921.0,44921.0,44899,44921.0,44921.0,...,44921,44921,44921,0.0,42132,42132,35132,33316,44847,43031
unique,,,34755,34712,28509,,,29872,,,...,1,1,1,0.0,33,28048,33,11382,33,29737
top,,,MPMAMZZN0E,2023-11-19 14:06:28,0000-00-00 00:00:00,,,RESO MATERIALE ASSISTENZE,,,...,guarantee,garantie,garanzia,,es,NO FUNCIONA,en,TERMOSTATO,es,NO FUNCIONA
freq,,,38,38,7736,,,278,,,...,44921,44921,44921,,15028,538,8609,735,14395,952
mean,31104.991274,1.63654,,,,1342.788785,2711.198704,,0.999265,0.000445,...,,,,,,,,,,
min,5.0,1.0,,,,0.0,1.0,,0.0,0.0,...,,,,,,,,,,
25%,17440.0,1.0,,,,242.0,469.0,,1.0,0.0,...,,,,,,,,,,
50%,32141.0,2.0,,,,492.0,1965.0,,1.0,0.0,...,,,,,,,,,,
75%,45202.0,2.0,,,,1955.0,4241.0,,1.0,0.0,...,,,,,,,,,,
max,58847.0,5.0,,,,7667.0,10289.0,,1.0,20.0,...,,,,,,,,,,


In [16]:
for column in clean_dataset.columns:
    print(f"Column: {column}")

Column: id
Column: web_id
Column: codigo
Column: creation_date
Column: modification_date
Column: company_id
Column: user_id
Column: ref_cliente
Column: portes_airzone
Column: devaluacion
Column: pedido_sage
Column: abono_sage
Column: pedido_a3
Column: abono_a3
Column: tipo
Column: estado
Column: personaaz
Column: dire_envio_id
Column: dire_recogida_id
Column: peso3
Column: volumen3
Column: estadofr
Column: c_mail
Column: c_tel
Column: c_obs
Column: accepted_client
Column: desc_problema
Column: codigo_incidencia
Column: id_pieza
Column: user_id_pieza
Column: cod_articulo
Column: descripcion
Column: num_serie
Column: factura_albaran
Column: problema
Column: is_replacement
Column: creation_date_pieza
Column: modification_date_pieza
Column: id_estado
Column: ref
Column: color
Column: valor
Column: titulo_es
Column: titulo_en
Column: titulo_fr
Column: titulo_it
Column: titulo_pt
Column: id_tipo
Column: titulo_es_tipo
Column: titulo_en_tipo
Column: titulo_fr_tipo
Column: titulo_it_tipo
Colum

In [17]:
# Get only the columns with the fields of interest
text_to_analyse = clean_dataset[['desc_problema_translated','descripcion_translated','problema_translated','cod_articulo']]
# Fill NA with empty string
text_to_analyse.fillna("", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.fillna("", inplace=True)


In [18]:
text_to_analyse.loc[:, 'text_to_analyse'] = text_to_analyse['desc_problema_translated'] + ' ' + text_to_analyse['descripcion_translated'] + ' ' + text_to_analyse['problema_translated'] + ' ' + text_to_analyse['cod_articulo']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, 'text_to_analyse'] = text_to_analyse['desc_problema_translated'] + ' ' + text_to_analyse['descripcion_translated'] + ' ' + text_to_analyse['problema_translated'] + ' ' + text_to_analyse['cod_articulo']


In [19]:
text_to_analyse.head(10)

Unnamed: 0,desc_problema_translated,descripcion_translated,problema_translated,cod_articulo,text_to_analyse
0,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...,MODULO ALARMAS TECNICAS,INDICAN SE HA ESTROPEADO EN LA INSTALACION Y ...,MATS,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...
1,NO FUNCIONA,,NO FUNCIONA,AZC3TACTOCSB,NO FUNCIONA NO FUNCIONA AZC3TACTOCSB
2,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...,,TACTO BLOQUEADO. NO ENCIENDE-APAGA NI DEJA CAM...,AZATACTORSB,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...
3,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...,MODELO 2 PERSIANAS,UNA DE LAS 2 PERSIANAS NO FUNCIONA CON PULSADO...,PER2,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...
4,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR",,Rafael el viernes llego mi pedido nº 23349 pa...,RINT040015BKMTE,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR Rafa..."
5,MODULO DE FANCOIL DA PROBLEMAS LAS COMUNICACI...,AZAMFANCOILC,MODULO DE FANCOIL DE ZONA 32Z,AZAMFANCOILC,MODULO DE FANCOIL DA PROBLEMAS LAS COMUNICACI...
6,MODULO DE FANCOIL NO SACA 7V POR EL BUS EN VEZ...,MODULO DE FANCOIL DE ZONA 32Z,"SE DETECTA QUE EL BLUEFACE AL ENCENDERLO, SE Q...",AZAMFANCOILC,MODULO DE FANCOIL NO SACA 7V POR EL BUS EN VEZ...
7,"Error al pedir la pasarela, a última hora hubo...",Pasarela Mitsubishi Heavy,"Error comercial, por cambio de última hora en ...",AZXEQADAPMHI,"Error al pedir la pasarela, a última hora hubo..."
8,SE HAN CAMBIADO LAS TAPAS TRASERAS VARIAS VECE...,,SE ROMPEN TAPAS TRASERAS. CAMBIAR TERMOSTATO.,AZC3BLUEFECOSB,SE HAN CAMBIADO LAS TAPAS TRASERAS VARIAS VECE...
9,SE HAN CAMBIADO LAS TAPAS TRASERAS VARIAS VECE...,,5 UNIDADES. SE ROMPEN TAPAS TRASERAS.,AZC3TACTOCSB,SE HAN CAMBIADO LAS TAPAS TRASERAS VARIAS VECE...


In [20]:
import multiprocessing
from collections import OrderedDict
import gensim.models.doc2vec
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [21]:
class CommentedDoc2Vec(Doc2Vec):
    def __init__(self, comment="", **kwargs):
        super().__init__(**kwargs)
        self.comment = comment

## Train DocVec and save the model

In [47]:
common_kwargs = dict(
    vector_size=100, epochs=20, min_count=2,
    sample=0, workers=multiprocessing.cpu_count(), negative=5, hs=0,
)

# Create models
simple_models = [
    # PV-DBOW plain
    CommentedDoc2Vec(dm=0, comment="PV-DBOW plain",**common_kwargs),
    # PV-DM w/ default averaging; a higher starting alpha may improve CBOW/PV-DM modes
    CommentedDoc2Vec(dm=1, window=10, alpha=0.05, comment="PV-DM averaging", **common_kwargs),
]

# Create TaggedDocument objects
tagged_data = [TaggedDocument(words=doc.split(), tags=[i]) for i, doc in enumerate(text_to_analyse['text_to_analyse'])]

# Build the vocabulary
for model in simple_models:
    model.build_vocab(tagged_data)
    print("%s vocabulary scanned & state initialized" % model)

CommentedDoc2Vec<"PV-DBOW plain",dbow,d100,n5,mc2,t8> vocabulary scanned & state initialized
CommentedDoc2Vec<"PV-DM w/ averaging",dm/m,d100,n5,w10,mc2,t8> vocabulary scanned & state initialized


In [48]:
# Train the models
for model in simple_models:
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    print("%s training completed" % model)

CommentedDoc2Vec<"PV-DBOW plain",dbow,d100,n5,mc2,t8> training completed
CommentedDoc2Vec<"PV-DM w/ averaging",dm/m,d100,n5,w10,mc2,t8> training completed


In [53]:
for model in simple_models:
    print(model.comment)

PV-DBOW plain
PV-DM averaging


In [54]:
from datetime import date
# Save the models
today_date = date.today().isoformat()
base_path = f"../MODELS/{today_date}"
os.makedirs(base_path, exist_ok=True)
for model in simple_models:
    model_name = f"{base_path}/{type(model).__name__}_{model.comment.replace(' ', '_')}.model"
    model.save(model_name)
    print(f"Model saved at {model_name}")

Model saved at ../MODELS/2024-05-08/CommentedDoc2Vec_PV-DBOW_plain.model
Model saved at ../MODELS/2024-05-08/CommentedDoc2Vec_PV-DM_averaging.model


## Load the models

In [22]:
# Load the models
pv_dbow = f"../MODELS/2024-05-08/CommentedDoc2Vec_PV-DBOW_plain.model"
pv_dm = f"../MODELS/2024-05-08/CommentedDoc2Vec_PV-DM_averaging.model"
loaded_models = []
for model in [pv_dbow, pv_dm]:
    loaded_model = CommentedDoc2Vec.load(model)
    loaded_models.append(loaded_model)
    print(f"Model loaded from {model}")

Model loaded from ../MODELS/2024-05-08/CommentedDoc2Vec_PV-DBOW_plain.model
Model loaded from ../MODELS/2024-05-08/CommentedDoc2Vec_PV-DM_averaging.model


In [23]:
# Get the vectors
vectors = [model.dv.vectors for model in loaded_models]

In [24]:
# Ensure text_to_analyse and vectors have the same length
assert len(text_to_analyse) == len(vectors[0]), "Mismatched document counts between models"

# Add vectors to the text_to_analyse dataframe
text_to_analyse.loc[:, 'PV-DBOW'] = list(vectors[0])
text_to_analyse.loc[:, 'PV-DM'] = list(vectors[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, 'PV-DBOW'] = list(vectors[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, 'PV-DM'] = list(vectors[1])


In [25]:
# Convert the vectors into a 2D array for PCA
vectors_pv_dbow = np.vstack(text_to_analyse['PV-DBOW'])
vectors_pv_dm = np.vstack(text_to_analyse['PV-DM'])

# Combine both sets of vectors
combined_vectors = np.hstack([vectors_pv_dbow, vectors_pv_dm])

In [32]:
# Create a sklearn pipeline to apply a clustering algorithm
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('kmeans', KMeans())
])

# Create grid search parameters
parameters = {
    'kmeans__n_clusters': [200, 220, 240, 260, 280, 300, 320, 340, 360, 380, 400]
}

grid_search = GridSearchCV(pipeline, parameters, cv=2, n_jobs=-1, verbose=1)
grid_search.fit(combined_vectors)

Fitting 2 folds for each of 11 candidates, totalling 22 fits


In [33]:
# Check the best parameters
grid_search.best_params_

{'kmeans__n_clusters': 360}

In [34]:
# Add the cluster to the dataset
text_to_analyse['cluster'] = grid_search.best_estimator_.predict(combined_vectors)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse['cluster'] = grid_search.best_estimator_.predict(combined_vectors)


In [35]:
# Plot PCA of the vectors in 3 dimensions
%matplotlib qt
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
vectors_df_pca = pca.fit_transform(vectors_pv_dbow)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(vectors_df_pca[:, 0], vectors_df_pca[:, 1], vectors_df_pca[:, 2], c=text_to_analyse['cluster'])
plt.title("PCA of the vectors")
plt.show()

In [38]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=3, verbose=1, perplexity=40)
tsne_vector = tsne.fit_transform(combined_vectors)
tsne_vector = pd.DataFrame(tsne_vector, columns=['TSNE1', 'TSNE2', 'TSNE3'])

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 44921 samples in 0.005s...
[t-SNE] Computed neighbors for 44921 samples in 5.612s...
[t-SNE] Computed conditional probabilities for sample 1000 / 44921
[t-SNE] Computed conditional probabilities for sample 2000 / 44921
[t-SNE] Computed conditional probabilities for sample 3000 / 44921
[t-SNE] Computed conditional probabilities for sample 4000 / 44921
[t-SNE] Computed conditional probabilities for sample 5000 / 44921
[t-SNE] Computed conditional probabilities for sample 6000 / 44921
[t-SNE] Computed conditional probabilities for sample 7000 / 44921
[t-SNE] Computed conditional probabilities for sample 8000 / 44921
[t-SNE] Computed conditional probabilities for sample 9000 / 44921
[t-SNE] Computed conditional probabilities for sample 10000 / 44921
[t-SNE] Computed conditional probabilities for sample 11000 / 44921
[t-SNE] Computed conditional probabilities for sample 12000 / 44921
[t-SNE] Computed conditional probabilities for sa

In [39]:
# Plot TSNE of the vectors in 3 dimensions
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(tsne_vector['TSNE1'], tsne_vector['TSNE2'], tsne_vector['TSNE3'], c=text_to_analyse['cluster'])
plt.title("TSNE of the vectors")
plt.show()

## Tests with actual standard error text

In [42]:
# Get the most similar texts
def get_similar_texts(text, model, topn=5):
    similar_texts = model.dv.most_similar([model.infer_vector(text.split())], topn=topn)
    return similar_texts

In [166]:
test_texts = [
    "Fallo de comunicaciones con la central El dispositivo intenta comunicar con la central, pero no la detecta. Los leds sí que parpadean.",
    "Fallo de comunicaciones con la máquina, el dispositivo intenta comunicar con la maquina, pero no la detecta. Los leds sí que parpadean.",
    "Unidad no arranca, las comunicaciones y los leds son correctos, pero la máquina no arranca.",
    "Error apertura/cierre. No muestra error en el sistema. La compuerta o rejilla no abre ni cierra, pero no se muesrta ningún error en el sistema."
]

#test_text = test_texts[random.randint(0, len(test_texts)-1)]
test_text = test_texts[3]

print(f'TARGET TEXT: {test_text} \n')

for model in loaded_models:
    similar_texts = get_similar_texts(test_text, model)
    print(f"Model: {model.comment}")
    for i, (index, similarity) in enumerate(similar_texts):
        print(f"Similar text {i+1}: {text_to_analyse['text_to_analyse'][index]} with similarity {similarity}\n")
    break

TARGET TEXT: Error apertura/cierre. No muestra error en el sistema. La compuerta o rejilla no abre ni cierra, pero no se muesrta ningún error en el sistema. 

Model: PV-DBOW plain
Similar text 1: La compuerta falla al abrir/cerrar, no funciona correctamente Compuerta motorizada de 300x150 La compuerta motorizada no funciona correctamente, se bloquea Compuerta motori with similarity 0.8232545256614685

Similar text 2: error cuando da corriente no carga BLUFACE error cuando da corriente no carga AZDI6BLUFACECB with similarity 0.8144838213920593

Similar text 3: No abre la rejilla Rejilla motorizada La rejilla no se abre CR300X150 with similarity 0.8122820854187012

Similar text 4:   No se abre CPM200ION with similarity 0.8110448718070984

Similar text 5: motor compuerta no funciona compuerta motorizada motor en mal estado no funciona cr350x150 with similarity 0.8095138072967529



## Calculate the similarity between the texts

In [162]:
# Read list of errors
errors = pd.read_csv("../DATA/errors.csv", sep=';')

In [471]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_mean_cosine_score(vector, text, model, n=5):
    cosine_scores = []
    for i in range(n):
        cosine_scores.append(
            cosine_similarity(vector.reshape(1, -1), model.infer_vector(text.split()).reshape(1, -1))
        )
    return np.mean(cosine_scores)

In [None]:
# Calculate the cosine similarity with all text_for_analyse for each of the errors descriptions
for i, id_error in enumerate(errors['ID_ERROR']):
    error_description = errors[errors['ID_ERROR'] == id_error]['DESCRIPCION'].values[0]
    text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))
    print(f"Error {i+1} of {len(errors)} calculated")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 1 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 2 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 3 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 4 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 5 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 6 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 7 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 8 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 9 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 10 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 11 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 12 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 13 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 14 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 15 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 16 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 17 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 18 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 19 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 20 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 21 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 22 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 23 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 24 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 25 of 75 calculated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_to_analyse.loc[:, f'cosine_similarity_{id_error}'] = text_to_analyse['PV-DBOW'].apply(lambda x: calculate_mean_cosine_score(x, error_description, loaded_models[0]))


Error 26 of 75 calculated


In [None]:
# Save text_to_analyse to disk
# text_to_analyse.to_csv("../DATA/text_to_analyse.csv", sep='¬', encoding='utf-8-sig', index=False)

In [3]:
# Load text_to_analyse from disk
# text_to_analyse = pd.read_csv("../DATA/text_to_analyse.csv", sep='¬', encoding='utf-8-sig')

  text_to_analyse = pd.read_csv("../DATA/text_to_analyse.csv", sep='¬', encoding='utf-8-sig')


In [479]:
text_to_analyse.loc[:,'cosine_similarity'] = text_to_analyse['PV-DBOW'] \
    .apply(lambda x: calculate_mean_cosine_score(x, test_text, loaded_models[0]))

In [478]:
text_to_analyse[['text_to_analyse','cosine_similarity']] \
    .sort_values(by='cosine_similarity', ascending=False) \
    .head(10)

Unnamed: 0,text_to_analyse,cosine_similarity
10493,No abre la rejilla Rejilla motorizada La rejilla no se abre CR300X150,0.822334
7878,"La compuerta falla al abrir/cerrar, no funciona correctamente Compuerta motorizada de 300x150 La compuerta motorizada no funciona correctamente, se bloquea Compuerta motori",0.819627
15293,error cuando da corriente no carga BLUFACE error cuando da corriente no carga AZDI6BLUFACECB,0.818195
13662,La persiana de 1 compuerta motorizada está rota en el eje Zona de aire plenaria La persiana de una compuerta motorizada está rota en el eje AZEZ6MELBS01M6,0.815587
38475,motor compuerta no funciona compuerta motorizada motor en mal estado no funciona cr350x150,0.813199
32401,La rejilla no funciona y vuelve loco al resto del sistema Rejilla inteligente La rejilla no funciona y vuelve loco al resto del sistema Rint 300x150MTE,0.806855
27432,Rejilla P/N: RINT060015BKMRE - S/N: 000C4YF1. La rejilla no se abre ni se cierra. Solicitamos el recambio del motor MOTOR REJILLA La rejilla no se abre ni se cierra. Solicitamos el recambio del motor RINT060015BKMRE,0.805893
35564,No se abre CPM200ION,0.804834
1504,"Apertura de la rejilla al instalarla en el techo. Los clips no sujetan, se abre solo. REJILLA 800x600 RRFR No se puede cerrar la puerta. RRFR080060BTX",0.803322
6925,La rejilla inteligente no se abre. Rejilla inteligente 300x150 Rejilla inteligente no se abre. RINT030015BKMTE,0.800364


In [18]:
cosine_columns = [col for col in text_to_analyse.columns if 'cosine_similarity_' in col]
text_to_analyse.loc[:, 'highest_score'] = text_to_analyse[cosine_columns].max(axis=1)
text_to_analyse.loc[:, 'highest_score_error'] = text_to_analyse[cosine_columns].idxmax(axis=1).apply(lambda x: x.split('_')[-1])

In [19]:
text_to_analyse.head(10)

Unnamed: 0,desc_problema_translated,descripcion_translated,problema_translated,cod_articulo,text_to_analyse,PV-DBOW,PV-DM,cluster,cosine_similarity,cosine_similarity_1.1,...,cosine_similarity_5.2,cosine_similarity_5.3,cosine_similarity_6.1,cosine_similarity_6.2,cosine_similarity_7.1,cosine_similarity_7.2,cosine_similarity_7.3,cosine_similarity_7.4,highest_score,highest_score_error
0,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...,MODULO ALARMAS TECNICAS,INDICAN SE HA ESTROPEADO EN LA INSTALACION Y ...,MATS,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...,[ 0.02292479 -0.21365319 -0.19756609 0.305030...,[ 2.7738678e-01 1.9454908e+00 -1.0628538e+00 ...,348,0.289285,0.353848,...,0.375369,0.402441,-0.007065,0.587309,0.475844,0.323344,0.38508,0.335893,0.587309,6.2
1,NO FUNCIONA,,NO FUNCIONA,AZC3TACTOCSB,NO FUNCIONA NO FUNCIONA AZC3TACTOCSB,[-0.05360986 -0.3043402 -0.07687344 -0.023034...,[-0.13926362 -0.00577123 0.00343576 -0.034550...,128,0.482504,0.491677,...,0.524526,0.526625,-0.023983,0.717873,0.56666,0.486015,0.574202,0.580762,0.763922,3.61
2,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...,,TACTO BLOQUEADO. NO ENCIENDE-APAGA NI DEJA CAM...,AZATACTORSB,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...,[-1.01469740e-01 -3.59431654e-01 -3.63641046e-...,[-0.40210313 0.33901432 -0.12698448 0.062674...,310,0.342568,0.315643,...,0.429952,0.357951,-0.054398,0.592114,0.45775,0.293253,0.368031,0.405849,0.661779,3.21
3,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...,MODELO 2 PERSIANAS,UNA DE LAS 2 PERSIANAS NO FUNCIONA CON PULSADO...,PER2,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...,[-0.22625495 -0.27601463 -0.14876926 0.216895...,[ 0.11031126 0.12480639 0.8863437 0.130832...,230,0.217849,0.374128,...,0.388664,0.367513,-0.082243,0.553798,0.437993,0.242106,0.368806,0.317617,0.553798,6.2
4,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR",,Rafael el viernes llego mi pedido nº 23349 pa...,RINT040015BKMTE,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR Rafa...",[-0.05795009 -0.30249217 -0.01555946 -0.156779...,[-2.64278650e-01 -1.75095573e-01 6.62560701e-...,91,0.486409,0.50568,...,0.431077,0.522106,0.17049,0.496958,0.5512,0.433052,0.579487,0.489847,0.589575,3.29
5,MODULO DE FANCOIL DA PROBLEMAS LAS COMUNICACI...,AZAMFANCOILC,MODULO DE FANCOIL DE ZONA 32Z,AZAMFANCOILC,MODULO DE FANCOIL DA PROBLEMAS LAS COMUNICACI...,[ 0.15441857 -0.63806075 0.02751571 0.398508...,[-0.4844401 0.09746525 0.689053 0.443628...,118,0.299898,0.311907,...,0.335267,0.394561,0.004328,0.557293,0.444822,0.295023,0.37267,0.308548,0.557293,6.2
6,MODULO DE FANCOIL NO SACA 7V POR EL BUS EN VEZ...,MODULO DE FANCOIL DE ZONA 32Z,"SE DETECTA QUE EL BLUEFACE AL ENCENDERLO, SE Q...",AZAMFANCOILC,MODULO DE FANCOIL NO SACA 7V POR EL BUS EN VEZ...,[ 0.11285342 -0.56048167 -0.09949385 0.374581...,[-1.2038625 -0.0567566 -0.1218907 -0.382796...,209,0.350422,0.320102,...,0.2806,0.344231,-0.116912,0.547864,0.457761,0.304932,0.403356,0.293523,0.547864,6.2
7,"Error al pedir la pasarela, a última hora hubo...",Pasarela Mitsubishi Heavy,"Error comercial, por cambio de última hora en ...",AZXEQADAPMHI,"Error al pedir la pasarela, a última hora hubo...",[ 0.14691335 -0.47134426 0.07787238 0.030501...,[ 5.1617138e-02 1.3685622e+00 4.3865690e-01 ...,304,0.60004,0.709797,...,0.644453,0.623907,0.12557,0.593102,0.548408,0.658222,0.659335,0.690858,0.734234,1.2
8,SE HAN CAMBIADO LAS TAPAS TRASERAS VARIAS VECE...,,SE ROMPEN TAPAS TRASERAS. CAMBIAR TERMOSTATO.,AZC3BLUEFECOSB,SE HAN CAMBIADO LAS TAPAS TRASERAS VARIAS VECE...,[ 0.0945986 -0.44374052 -0.02086731 0.254979...,[-0.51640475 0.6481779 0.16102794 -0.514311...,214,0.262732,0.409268,...,0.401019,0.384124,-0.032623,0.604275,0.540758,0.333147,0.366548,0.342346,0.604275,6.2
9,SE HAN CAMBIADO LAS TAPAS TRASERAS VARIAS VECE...,,5 UNIDADES. SE ROMPEN TAPAS TRASERAS.,AZC3TACTOCSB,SE HAN CAMBIADO LAS TAPAS TRASERAS VARIAS VECE...,[ 0.11505522 -0.42948332 -0.03061972 0.228920...,[-0.6939791 0.54678166 0.26473445 -0.338831...,214,0.256015,0.387654,...,0.390971,0.386645,-0.013545,0.602241,0.535466,0.314,0.369151,0.329442,0.602241,6.2


In [26]:
top10_per_error = text_to_analyse[['text_to_analyse', 'highest_score', 'highest_score_error']] \
    .groupby('highest_score_error', group_keys=False) \
    .apply(lambda x: x.nlargest(10, 'highest_score')) \
    .reset_index(drop=True)

top10_per_error.head(500)

  .apply(lambda x: x.nlargest(10, 'highest_score')) \


Unnamed: 0,text_to_analyse,highest_score,highest_score_error
0,Problema en el mazo de cables en una caja regi...,0.904523,1.1
1,Faltan conectores en el dispositivo. AZPV0P1...,0.899310,1.1
2,Falta en el kit AZPV6WIRE2514,0.888198,1.1
3,dañado durante el transporte traznsport dañad...,0.879552,1.1
4,dañado durante el transporte transporte dañad...,0.879184,1.1
...,...,...,...
495,problema del servomotor servo motor AZ PV 0 M...,0.878018,4.04
496,motor HS Prueba HS del motor realizada azpv6m...,0.875286,4.04
497,motor del amortiguador defectuoso motor defec...,0.873244,4.04
498,Eje del motor y cubierta del gabinete. Panel c...,0.870889,4.04
