# After-sales text clustering using Doc2Vec
## Enhanced corpus with more data

In [7]:
import pandas as pd
import numpy as np
import spacy

import multiprocessing
from datetime import date
import matplotlib.pyplot as plt
import os
import gensim.models.doc2vec

assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
# NOTE(review): this also hides genuine assignments to temporary copies — confirm it is still needed.
pd.options.mode.chained_assignment = None

# Each pipeline stage is addressed by its own dated version tag.
data_version = "2024-05-14"
corpus_version = "2024-05-15"
model_version = "2024-05-15"
# No trailing slash: consumers compose paths as f"{data_base_path}/<version>/<file>".
data_base_path = "../DATA/processed"
# Models live under their own version tag (previously keyed by data_version, which left
# model_version unused and pointed at a different directory than the one models are loaded from).
model_base_path = f"../MODELS/{model_version}"

# spaCy pipeline used by preprocess_text to tokenise texts and flag punctuation, stop words and whitespace.
nlp = spacy.load("es_core_news_sm")

In [8]:
# Function to preprocess the text
def preprocess_text(docs):
    """Turn raw texts into space-separated, lower-cased token strings.

    Tokens flagged by the spaCy pipeline as punctuation, stop words or
    whitespace are dropped; the remaining token texts are lower-cased and
    joined with single spaces.

    Args:
        docs: pandas Series of raw texts, or any iterable that ``pd.Series``
            accepts. Missing values are treated as empty strings and every
            other value is cast to ``str``.

    Returns:
        list[str]: one processed string per input, in input order. An input
        with no surviving tokens yields an empty string.
    """
    # Ensure all entries are strings
    texts = pd.Series(docs).fillna("").astype(str)

    processed_texts = []
    # Consume nlp.pipe lazily: only the resulting strings are kept, so the parsed
    # documents are never all held in memory at the same time.
    for doc in nlp.pipe(texts, disable=["ner", "parser"]):
        tokens = [
            token.text.lower()
            for token in doc
            if not (token.is_punct or token.is_stop or token.is_space)
        ]
        processed_texts.append(" ".join(tokens))
    return processed_texts

In [9]:
# Doc2Vec subclass whose constructor accepts a free-text comment describing the model
class CommentedDoc2Vec(Doc2Vec):
    """Doc2Vec model that carries a human-readable ``comment``.

    The notebook uses the comment in its progress messages and to build the
    file name under which the model is saved.
    """

    def __init__(self, comment: str = "", **kwargs) -> None:
        """Create the model from ``kwargs`` and attach ``comment`` to it.

        Args:
            comment: Free-text label stored on the instance as ``self.comment``.
            **kwargs: Forwarded unchanged to ``Doc2Vec.__init__``.
        """
        # The attribute is assigned after the parent constructor has run, so this value
        # is the one left on the instance.
        # NOTE(review): gensim's Doc2Vec seems to handle a `comment` attribute itself
        # (it appears in the model repr) — confirm whether this subclass is still needed.
        super().__init__(**kwargs)
        self.comment = comment

In [10]:
# Load train corpus from disk
# The location is derived from the configuration instead of repeating the version in a literal.
corpus_path = os.path.join(data_base_path, corpus_version, "corpus_spanish.csv")
# "¬" is longer than one byte in UTF-8, which the default C parser cannot use as a separator;
# requesting the Python engine explicitly avoids the fallback warning on every load.
corpus = pd.read_csv(corpus_path, sep="¬", engine="python")
corpus["text_to_analyse"] = (
    corpus["text_to_analyse"].fillna("").astype(str)
)  # Ensure all values are strings
corpus["processed_text"] = preprocess_text(corpus["text_to_analyse"])
# Rows whose text is empty after preprocessing carry no tokens to train on.
corpus = corpus[corpus["processed_text"] != ""]
# corpus = pd.read_csv(f'../DATA/processed/{corpus_version}/corpus_processed.csv', sep='¬')
corpus.sample(10)

  corpus = pd.read_csv("../DATA/processed/2024-05-15/corpus_spanish.csv", sep="¬")


Unnamed: 0,text_to_analyse,language,processed_text
114788,Coloque o acabamento \nverticalmentePlacez la ...,es,coloque acabamento verticalmenteplacez garnitu...
38446,El sistema traba ja de modo \nnorma l \nBorne...,es,sistema traba ja norma l bornes circuito abier...
105686,Certifique-se de isolar o encaixe de ligação. \n,pt,certifique-se isolar encaixe ligação
106083,Permite visualizar información \nacerca de:\n-...,es,permite visualizar información acerca zona fir...
75780,Communication protocol \n Protocolo de comunic...,pt,communication protocol protocolo comunicaçãomo...
168185,una programación\nSelecciona el día de la sema...,es,programación selecciona semana unidad programa...
171936,"En caso de que realice un parpadeo rojo, \nind...",es,caso realice parpadeo rojo indicará zona ocupa...
113845,"Para fixação na parede, execute os passos a \n...",pt,fixação na parede execute passos seguir separe...
51888,Programa por defecto.,es,programa defecto
161083,"- Blueface Zona: Muestra los iconos de Modo, E...",es,blueface zona muestra iconos eco- adapt veloci...


In [11]:
# Keyword arguments forwarded to the Doc2Vec constructor; several of them are also
# echoed in the model comment so the saved file name records how it was trained.
# NOTE(review): gensim documents that a fixed seed only gives repeatable runs with a
# single worker thread — confirm whether exact reproducibility matters here.
common_kwargs = {
    "vector_size": 200,  # length of each inferred document vector
    "epochs": 20,
    "min_count": 2,
    "sample": 0,
    "workers": multiprocessing.cpu_count(),  # one worker per CPU reported by the OS
    "negative": 5,
    "hs": 0,
    "seed": 0,
}

In [12]:
# PV-DBOW plain
# Label shown in the comment -> key in common_kwargs, in the order they appear in the comment.
comment_fields = {
    "v_size": "vector_size",
    "epochs": "epochs",
    "hs": "hs",
    "sample": "sample",
    "negative": "negative",
    "min_count": "min_count",
}
model_comment = "-".join(
    [
        "PV-DBOW",
        *(f"{label} {common_kwargs[key]}" for label, key in comment_fields.items()),
        f"corpus {corpus_version}",
    ]
)
# dm=0 is passed alongside the shared hyper-parameters.
model = CommentedDoc2Vec(dm=0, comment=model_comment, **common_kwargs)

In [13]:
# Create TaggedDocument objects
# Each document is tagged with its position in the filtered corpus (0, 1, 2, ...),
# not with the DataFrame index label.
token_lists = corpus["processed_text"].str.split()
tagged_data = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(token_lists)
]

In [14]:
# Build the vocabulary
# The tagged corpus is scanned once here; training happens in a later cell.
model.build_vocab(tagged_data)
vocab_ready_message = "Model: %s : vocabulary scanned & state initialized" % model.comment
print(vocab_ready_message)

Model: PV-DBOW-v_size 200-epochs 20-hs 0-sample 0-negative 5-min_count 2-corpus 2024-05-15 : vocabulary scanned & state initialized


In [13]:
# Train the model using the Corpus
# Example count and epoch count are read back from the model itself.
training_args = {"total_examples": model.corpus_count, "epochs": model.epochs}
model.train(tagged_data, **training_args)
print("%s training completed" % model.comment)

In [29]:
# Save the model
os.makedirs(model_base_path, exist_ok=True)
# File name = class name + model comment with spaces turned into underscores.
model_file = f"{type(model).__name__}_{model.comment.replace(' ', '_')}.model"
model_name = f"{model_base_path}/{model_file}"
model.save(model_name)
print(f"Model saved at {model_name}")

## Load the model and the data and infer vectors

In [15]:
# Load the model
# The directory follows model_version instead of repeating the date in a literal.
# The file name has no "-corpus_<version>" suffix, so this is a previously trained
# model and not the file written by the save cell above.
model_name = (
    f"../MODELS/{model_version}/"
    "CommentedDoc2Vec_PV-DBOW-v_size_200-epochs_20-hs_0-sample_0-negative_5-min_count_2.model"
)
model = CommentedDoc2Vec.load(model_name)
print(f"Model {model} loaded")

Model CommentedDoc2Vec<"PV-DBOW-v_size 200-epochs 20-hs 0-sample 0-negative 5-min_count 2",dbow,d200,n5,mc2,t8> loaded


In [16]:
# Load the data to analyse
# os.path.join copes with data_base_path having a trailing slash or not.
text_to_analyse_path = os.path.join(
    data_base_path, data_version, "text_to_analyse_clean.csv"
)
# "¬" is longer than one byte in UTF-8, which the default C parser cannot use as a separator;
# requesting the Python engine explicitly avoids the fallback warning.
text_to_analyse_clean = pd.read_csv(text_to_analyse_path, sep="¬", engine="python")

  text_to_analyse_clean = pd.read_csv(


In [17]:
# Infer vectors for the text_to_analyse
text_to_analyse_clean["processed_text"] = preprocess_text(
    text_to_analyse_clean["text_to_analyse"]
)
# Split each processed text on whitespace, then let the loaded model embed the token list.
text_tokens = text_to_analyse_clean["processed_text"].map(str.split)
text_to_analyse_clean["vector"] = text_tokens.map(model.infer_vector)

In [18]:
# Spot-check ten random rows, now including the processed_text and vector columns.
text_to_analyse_clean.sample(10)

Unnamed: 0,codigo,id_pieza,desc_problema_translated,descripcion_translated,problema_translated,cod_articulo,text_to_analyse,CODART_A3,Fuzzy_Score,CODART,...,CAR1,CAR2,CAR3,CAR4,DESCCAR1,DESCCAR2,DESCCAR3,DESCCAR4,processed_text,vector
14327,AWDPBWHU20,45308.0,AZCE6LITERB NO FUNCIONA,TERMOSTATO LITE RB,AZCE6LITERB NO FUNCIONA.,AZCE6LITERB,AZCE6LITERB NO FUNCIONA TERMOSTATO LITE RB AZC...,AZCE6LITERB,100.0,AZCE6LITERB,...,1.0,250.0,91.0,3.0,SISTEMAS DE ZONAS,FLEXA (CE6),TERMOSTATOS,LITE,azce6literb funciona termostato lite rb azce6l...,"[-0.08257783, 0.09332839, -0.13394175, -0.0189..."
23454,LJIYMJDL04,81132.0,EL TERMOSTATO NO CONSIGUE REINICIARSE,TERMOSTATO SMART CABLE AIRZONE BLUEFACE BL 3,ENVIAR UN TERMOSTATO NUEVO,AZDI6BLUEFACECB,EL TERMOSTATO NO CONSIGUE REINICIARSE TERMOSTA...,AZDI6BLUEFACECB,100.0,AZDI6BLUEFACECB,...,1.0,251.0,91.0,1.0,SISTEMAS DE ZONAS,ACUAZONE (DI6),TERMOSTATOS,BLUEFACE,termostato reiniciarse termostato smart cable ...,"[-0.08034343, 0.03509668, -0.03916998, -0.1248..."
286,A2LKCZ5NBA,907.0,PROBLEMA RECURRENTE DESDE HACE MÁS DE 2 AÑOS\r...,TARJETA IP6 SERIE 3,NO FUNCIONA,AZC3FIBPRO6,PROBLEMA RECURRENTE DESDE HACE MÁS DE 2 AÑOS\r...,AZC3FIBPRO6,100.0,AZC3FIBPRO6,...,1.0,250.0,90.0,,SISTEMAS DE ZONAS,FLEXA (CE6),CENTRALES,,problema recurrente 2 años intervención planif...,"[-0.042891763, 0.12884247, -0.16951881, -0.046..."
7104,AWZQZWZP61,21630.0,CAMBIAR 2 GRADOS LA ZONA NO SE RESTABLECE\r\n1...,TERMOSTATO INTELIGENTE AIRZON COLOR NEGRO IBPRO6,COMPENSACIÓN DE TEMPERATURA EL TERMOSTATO NO E...,AZCE6BLUEFACECN,CAMBIAR 2 GRADOS LA ZONA NO SE RESTABLECE\r\n1...,AZCE6BLUEFACECN,100.0,AZCE6BLUEFACECN,...,1.0,250.0,91.0,1.0,SISTEMAS DE ZONAS,FLEXA (CE6),TERMOSTATOS,BLUEFACE,cambiar 2 grados zona restablece 1 termostato ...,"[-0.055040207, 0.019157363, -0.13818328, 0.004..."
19877,Y2PMZ2ITAA,65204.0,PROBLEMA CON EL TERMOSTATO THINK. EL CLIENTE S...,TERMOSTATO,SUSTITUCION DE MANDO,AZCE6THINKRB,PROBLEMA CON EL TERMOSTATO THINK. EL CLIENTE S...,AZCE6THINKRB,100.0,AZCE6THINKRB,...,1.0,250.0,91.0,2.0,SISTEMAS DE ZONAS,FLEXA (CE6),TERMOSTATOS,THINK,problema termostato think cliente solicita cam...,"[-0.034480777, 0.08338468, -0.12941803, -0.135..."
25576,N2FXZGLJEE,92686.0,PB TH RADIO + PLATINO CENTRAL,TERMOSTATO THINK RADIO BLANCO,PB TH PENSAR RADIO,AZCE6THINKRB,PB TH RADIO + PLATINO CENTRAL TERMOSTATO THINK...,AZCE6THINKRB,100.0,AZCE6THINKRB,...,1.0,250.0,91.0,2.0,SISTEMAS DE ZONAS,FLEXA (CE6),TERMOSTATOS,THINK,pb th radio + platino central termostato think...,"[0.06715114, 0.2336269, 0.015827589, -0.061567..."
13811,AWXMZMXTD8,43796.0,pantalla hs,termostato,SIN CONEXIÓN EN PANTALLA,ACCEBLUEFACECN,pantalla hs termostato SIN CONEXIÓN EN PANTALL...,AZCE6BLUEFACECN,90.0,AZCE6BLUEFACECN,...,1.0,250.0,91.0,1.0,SISTEMAS DE ZONAS,FLEXA (CE6),TERMOSTATOS,BLUEFACE,pantalla hs termostato conexión pantalla azce6...,"[-0.13532853, 0.05922585, -0.24502893, -0.1278..."
24683,LMPPAWPJ99,87716.0,"No sabemos el problema que han tenido, lo han ...",PENSAR RADIO BLANCO ACUAZONE,"No sabemos el problema que han tenido, lo han ...",AZDI6THINKRB,"No sabemos el problema que han tenido, lo han ...",AZDI6THINKRB,100.0,AZDI6THINKRB,...,1.0,251.0,91.0,2.0,SISTEMAS DE ZONAS,ACUAZONE (DI6),TERMOSTATOS,THINK,problema tratado directamente pensar radio bla...,"[-0.00014233572, 0.021523885, -0.062336545, -0..."
5761,M26YLPPM87,16711.0,AIRZONE TERMOSTATO INTELIG. BLUEFACE 32Z (BL) ...,TERMOSTATO INTELIGENTE AIRZONE CARA AZUL 32Z (BL),AIRZONE TERMOSTATO INTELIG. BLUEFACE 32Z (BL) ...,AZDI6BLUEFACECB,AIRZONE TERMOSTATO INTELIG. BLUEFACE 32Z (BL) ...,AZDI6BLUEFACECB,100.0,AZDI6BLUEFACECB,...,1.0,251.0,91.0,1.0,SISTEMAS DE ZONAS,ACUAZONE (DI6),TERMOSTATOS,BLUEFACE,airzone termostato intelig blueface 32z bl blo...,"[-0.19169681, -0.0061185006, 0.014725132, -0.2..."
13582,Z2LNAGVV75,43153.0,TERMOSTATO BLUEFACE SE REINICIA,TERMOSTATO,"EL TERMOSTATO SE REINICIA CONTINUAMENTE, TRAS ...",AZCE6BLUEFACECB,TERMOSTATO BLUEFACE SE REINICIA TERMOSTATO EL ...,AZCE6BLUEFACECB,100.0,AZCE6BLUEFACECB,...,1.0,250.0,91.0,1.0,SISTEMAS DE ZONAS,FLEXA (CE6),TERMOSTATOS,BLUEFACE,termostato blueface reinicia termostato termos...,"[-0.079074375, 0.032197904, -0.022248238, -0.2..."


In [20]:
# Sanity check: shape of the first inferred vector.
text_to_analyse_clean['vector'].iloc[0].shape

(200,)

## Calculate the similarity between the texts

In [17]:
# Read list of errors
# Source column -> name used in the rest of the notebook. Renaming by name (rather than by
# position) keeps the mapping correct if the selection order ever changes.
error_columns = {
    "Código": "ID_ERROR",
    "CODCAR3": "CODCAR3",
    "CODCAR2": "CODCAR2",
    "DESCFAM": "DESCFAM",
    "Motivo General": "MOTIVO",
    "DESCRICION": "DESCRIPCION",
}
errors = pd.read_csv("../DATA/TablaTipoErrorPostventa.csv", sep=";", header=0)[
    list(error_columns)
].rename(columns=error_columns)

# Concatenate MOTIVO and DESCRIPCION. Missing parts are treated as empty text: with a plain
# "+", a NaN on either side would turn the whole description into NaN.
errors["DESCRIPCION"] = (
    errors["MOTIVO"].fillna("").astype(str)
    + " "
    + errors["DESCRIPCION"].fillna("").astype(str)
).str.strip()

# Clean CODCAR2: "-" becomes 0, which the similarity loop treats as "do not filter on CAR2".
# astype(str) keeps this working if the column was parsed as numbers; regex=False makes the
# literal replacement independent of the pandas version's default.
errors["CODCAR2"] = (
    errors["CODCAR2"].astype(str).str.replace("-", "0", regex=False).astype(int)
)

In [18]:
# Infer vector for errors
# Error descriptions go through the same preprocessing as the texts they are compared with.
errors["description_processed"] = preprocess_text(errors["DESCRIPCION"])
description_tokens = errors["description_processed"].str.split()
errors["vector"] = description_tokens.map(model.infer_vector)

In [19]:
# Display the error table, including the description_processed and vector columns added above.
errors

In [20]:
from sklearn.metrics.pairwise import cosine_similarity


def calculate_cosine_score(vector, vector_error):
    """Return the cosine similarity between two vectors as a scalar.

    Args:
        vector: NumPy array holding the first vector.
        vector_error: NumPy array holding the second vector.

    Returns:
        The single entry of the 1x1 similarity matrix computed by
        ``cosine_similarity``.
    """
    # cosine_similarity works on 2-D inputs, so each vector becomes a one-row matrix.
    document_row = vector.reshape(1, -1)
    error_row = vector_error.reshape(1, -1)
    similarity_matrix = cosine_similarity(document_row, error_row)
    return similarity_matrix[0][0]


def calculate_mean_cosine_score(vector, vector_error, n=5):
    """Return the cosine similarity between a document vector and an error vector.

    The similarity of two fixed vectors is deterministic, so it is computed
    once. The previous implementation evaluated the same expression ``n``
    times and averaged the identical values, costing ``n`` times the work
    for the same score.

    Args:
        vector: NumPy array with the document vector.
        vector_error: NumPy array with the error vector.
        n: Kept for backward compatibility with existing callers. Values
            below 1 yield NaN, as averaging zero samples did before; any
            other value no longer affects the result or the cost.

    Returns:
        NaN when either array is empty or ``n`` is below 1, otherwise the
        value of ``calculate_cosine_score(vector, vector_error)``.
    """
    if vector.size == 0 or vector_error.size == 0 or n < 1:
        return np.nan  # Nothing to compare
    return calculate_cosine_score(vector, vector_error)

In [21]:
# Calculate the cosine similarity between the text_to_analyse and the errors
# Each error is scored only against the texts it can apply to; all other rows keep NaN in
# that error's column.
for _, error_row in errors.iterrows():
    # Texts must belong to the error's CAR3 code ...
    applicable = text_to_analyse_clean["CAR3"] == error_row["CODCAR3"]
    # ... and to its CAR2 code, unless CODCAR2 is 0 (falsy), in which case CAR2 is not filtered.
    if error_row["CODCAR2"]:
        applicable &= text_to_analyse_clean["CAR2"] == error_row["CODCAR2"]

    if applicable.any():
        # Score all applicable texts with one matrix call instead of one Python-level
        # call per row: stack their vectors into a (rows, dims) matrix and compare it
        # with the single error vector.
        document_matrix = np.vstack(
            text_to_analyse_clean.loc[applicable, "vector"].to_list()
        )
        similarities = cosine_similarity(
            document_matrix, error_row["vector"].reshape(1, -1)
        )[:, 0]
        text_to_analyse_clean.loc[
            applicable, f"cosine_similarity_{error_row['ID_ERROR']}"
        ] = similarities

    print(f"Error {error_row['ID_ERROR']} calculated")

In [22]:
# Spot-check ten random rows after the cosine_similarity_<ID_ERROR> columns were added.
text_to_analyse_clean.sample(10)

In [None]:
# Save text_to_analyse to disk
# text_to_analyse_clean.to_csv(f"{data_base_path}/text_to_analyse_with_errors.csv", sep='¬', encoding='utf-8-sig', index=False)

In [None]:
# Load text_to_analyse from disk
# text_to_analyse_clean = pd.read_csv(f"{data_base_path}/text_to_analyse_with_errors.csv", sep='¬', encoding='utf-8-sig')

In [23]:
# Pick, for every text, the applicable error with the highest cosine similarity.
score_prefix = "cosine_similarity_"
cosine_columns = [
    col for col in text_to_analyse_clean.columns if col.startswith(score_prefix)
]
similarity_scores = text_to_analyse_clean[cosine_columns]

# NaN in a score column means "this error does not apply to this row". The NaNs are kept
# while ranking: filling them with 0 first would let a non-applicable error beat an
# applicable one whose similarity is negative, and would label rows with no applicable
# error at all with whichever column happens to come first.
has_applicable_error = similarity_scores.notna().any(axis=1)

# Rows without any applicable error keep a score of 0 and get no error label (NaN).
text_to_analyse_clean.loc[:, "highest_score"] = similarity_scores.max(axis=1).fillna(0)

if has_applicable_error.any():
    best_column = similarity_scores.loc[has_applicable_error].idxmax(axis=1)
    # Strip the column prefix instead of splitting on "_", so IDs containing "_" survive.
    best_error = best_column.str[len(score_prefix):]
else:
    best_error = pd.Series(dtype=object)
# Assignment aligns on the index; rows missing from best_error become NaN.
text_to_analyse_clean["highest_score_error"] = best_error

In [24]:
# First ten rows, now with the highest_score and highest_score_error columns.
text_to_analyse_clean.head(10)

In [25]:
# For each assigned error, keep the ten texts with the highest score.
report_columns = ["codigo", "text_to_analyse", "highest_score", "highest_score_error"]


def _ten_best(group):
    """Return the ten rows of ``group`` with the largest ``highest_score``."""
    return group.nlargest(10, "highest_score")


texts_by_error = text_to_analyse_clean[report_columns].groupby(
    "highest_score_error", group_keys=False
)
top10_per_error = texts_by_error.apply(_ten_best).reset_index(drop=True)

top10_per_error.head(500)

In [26]:
# Inspect the score and assigned error for one specific case, selected by its codigo.
text_to_analyse_clean[text_to_analyse_clean["codigo"] == "MMHSNG1V2C"][
    ["codigo", "text_to_analyse", "highest_score", "highest_score_error"]
]

## Visualize the results

In [45]:
def _as_array(value):
    """Parse a bracketed, space-separated string into an array; pass anything else through."""
    if isinstance(value, str):
        return np.fromstring(value.strip("[]"), sep=" ")
    return value


# Convert string representations of lists to actual numpy arrays
results = text_to_analyse_clean[["vector", "highest_score_error"]].assign(
    vector=lambda frame: frame["vector"].apply(_as_array)
)

In [46]:
# Expand each vector into its own column
# Stacking the arrays into one matrix avoids building a separate pandas Series per row,
# which is what `.apply(pd.Series)` does.
vector_matrix = np.vstack(results["vector"].to_list())
expanded_vectors = pd.DataFrame(
    vector_matrix,
    index=results.index,  # keep the row labels so the concat below aligns row by row
    columns=[f"vector_{i}" for i in range(vector_matrix.shape[1])],
)
# Start from the two base columns so re-running this cell does not append the vector_i
# columns a second time.
results = pd.concat(
    [results[["vector", "highest_score_error"]], expanded_vectors], axis=1
)

In [47]:
# Features: every column except the raw vector and the label.
non_feature_columns = ["vector", "highest_score_error"]
X = results.drop(columns=non_feature_columns)
# Labels: error IDs as numbers; anything that cannot be parsed as a number becomes NaN.
y = pd.to_numeric(results["highest_score_error"], errors="coerce")

In [48]:
from sklearn.manifold import TSNE

# Project the feature matrix X down to two dimensions with t-SNE; random_state is fixed at 0.
tsne = TSNE(n_components=2, random_state=0)
X_tsne = tsne.fit_transform(X)

In [50]:
# Plot the results
# One point per text, positioned by its 2-D t-SNE coordinates and coloured by y.
fig, ax = plt.subplots(figsize=(20, 10))
points = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y, cmap="tab20")
fig.colorbar(points, ax=ax)
plt.show()

In [51]:
# Create TSNE with 3 components
# Same feature matrix X and random_state as the 2-D projection; `tsne` is rebound to the new estimator.
tsne = TSNE(n_components=3, random_state=0)
X_tsne_3d = tsne.fit_transform(X)

In [52]:
# Plot data in 3D
%matplotlib qt6

fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection="3d")
scatter = ax.scatter(
    X_tsne_3d[:, 0], X_tsne_3d[:, 1], X_tsne_3d[:, 2], c=y, cmap="tab20"
)
# Add legend
plt.legend(*scatter.legend_elements(num=10), title="Classes")
plt.show()