# After-sales text clustering using Doc2Vec

## Data preprocessing (Merging the translated text)

In [1]:
import os
import pandas as pd

pd.options.mode.chained_assignment = None
import numpy as np
import matplotlib.pyplot as plt
from src.db.connections import MySQLConnector

## Load the data

In [2]:
# Load the data
# Create the connector used by the queries below. The password is read from
# the MYSQL_PASSWORD environment variable (None when it is unset); user, host
# and port are hard-coded here.
# NOTE(review): port is passed as the string "3306" — confirm MySQLConnector
# expects a string rather than an int.
conn = MySQLConnector(
    user="readmyzone",
    password=os.environ.get("MYSQL_PASSWORD"),
    host="192.168.2.7",
    port="3306",
)

In [3]:
# Fetch every row of the four sav_* tables from the "myzone" database, in the
# order they are listed, and expose each one under its table name.
_sav_tables = {
    table_name: conn.query_data(
        query=f"SELECT * FROM {table_name}", database="myzone"
    )
    for table_name in (
        "sav_incidencias",
        "sav_piezas",
        "sav_estados",
        "sav_incidencias_tipo",
    )
}
sav_incidencias = _sav_tables["sav_incidencias"]
sav_piezas = _sav_tables["sav_piezas"]
sav_estados = _sav_tables["sav_estados"]
sav_incidencias_tipo = _sav_tables["sav_incidencias_tipo"]

In [4]:
# Enrich sav_incidencias with its pieces, its state and its incident type.
# Every join is a left join, so no row of the left-hand table is dropped, and
# right-hand columns whose names clash receive the "_pieza" / "_estado" /
# "_tipo" suffix while the left-hand names stay untouched.
dataset = (
    sav_incidencias.merge(
        sav_piezas,
        how="left",
        left_on="codigo",
        right_on="codigo_incidencia",
        suffixes=(None, "_pieza"),
    )
    .merge(
        sav_estados,
        how="left",
        left_on="estado",
        right_on="id",
        suffixes=(None, "_estado"),
    )
    .merge(
        sav_incidencias_tipo,
        how="left",
        left_on="tipo",
        right_on="id",
        suffixes=(None, "_tipo"),
    )
)

In [5]:
# Keep only the rows whose tipo is 1 and whose estado is 2 or 6.
_is_tipo_1 = dataset["tipo"].eq(1)
_is_estado_2_or_6 = dataset["estado"].isin([2, 6])
clean_dataset = dataset[_is_tipo_1 & _is_estado_2_or_6]

In [6]:
# Load from disk the texts that were sent to translation, one CSV per field,
# keyed by field name.
# engine="python" is requested explicitly, exactly like the reads of the
# *_translated.csv files: "¬" is a single character but more than one byte in
# UTF-8, which pandas' default C parser does not support (it would fall back
# to the python engine and emit a ParserWarning).
fields_to_translate = ["desc_problema", "problema", "descripcion"]
text_to_translate = {
    field: pd.read_csv(
        f"../DATA/{field}.csv",
        sep="¬",
        encoding="utf-8-sig",
        engine="python",
    )
    for field in fields_to_translate
}

In [7]:
def _read_translated_csv(field):
    """Read ``../DATA/<field>_translated.csv`` ("¬"-separated, utf-8-sig)."""
    return pd.read_csv(
        f"../DATA/{field}_translated.csv",
        sep="¬",
        encoding="utf-8-sig",
        engine="python",
    )


# Load the translated counterpart of each text field
desc_problema_translated = _read_translated_csv("desc_problema")
descripcion_translated = _read_translated_csv("descripcion")
problema_translated = _read_translated_csv("problema")
# Next: data preprocessing (merging the translated text)

In [8]:
# Drop the rows whose translated column contains the column name itself
# (presumably header lines repeated inside the CSV — confirm with whoever
# produced the files).
desc_problema_translated = desc_problema_translated[
    desc_problema_translated["desc_problema_translated"]
    != "desc_problema_translated"
]
descripcion_translated = descripcion_translated[
    descripcion_translated["descripcion_translated"] != "descripcion_translated"
]
problema_translated = problema_translated[
    problema_translated["problema_translated"] != "problema_translated"
]

In [9]:
desc_problema_translated.count()

In [10]:
# Attach the translation to every text that was sent to translation.
# The left join keeps texts without a translation (their translated column is
# NaN afterwards).
desc_problema_translated = text_to_translate["desc_problema"].merge(
    desc_problema_translated, on="desc_problema", how="left"
)
descripcion_translated = text_to_translate["descripcion"].merge(
    descripcion_translated, on="descripcion", how="left"
)
problema_translated = text_to_translate["problema"].merge(
    problema_translated, on="problema", how="left"
)

In [11]:
# Where no translation is available, fall back to the original text
desc_problema_translated["desc_problema_translated"] = desc_problema_translated[
    "desc_problema_translated"
].fillna(desc_problema_translated["desc_problema"])
descripcion_translated["descripcion_translated"] = descripcion_translated[
    "descripcion_translated"
].fillna(descripcion_translated["descripcion"])
problema_translated["problema_translated"] = problema_translated[
    "problema_translated"
].fillna(problema_translated["problema"])

In [12]:
desc_problema_translated.head(5)

In [13]:
# Bring the translated columns into the filtered dataset, one text field at a
# time and always with a left join on the original text.
# NOTE(review): if a lookup table contains the same original text more than
# once, the left join repeats the matching clean_dataset rows — verify the
# keys are unique.
for _translated, _key in (
    (desc_problema_translated, "desc_problema"),
    (descripcion_translated, "descripcion"),
    (problema_translated, "problema"),
):
    clean_dataset = clean_dataset.merge(
        _translated, left_on=_key, right_on=_key, how="left"
    )

In [14]:
clean_dataset[["desc_problema"]].head(100)

In [15]:
clean_dataset.describe(include="all")

In [16]:
for column in clean_dataset.columns:
    print(f"Column: {column}")

In [17]:
# Keep the three translated text fields plus the article code, with every
# missing value replaced by the empty string.
_columns_of_interest = [
    "desc_problema_translated",
    "descripcion_translated",
    "problema_translated",
    "cod_articulo",
]
text_to_analyse = clean_dataset[_columns_of_interest].fillna("")

In [18]:
# Build the document text: the four fields joined left to right with a single
# space between consecutive fields.
_text_fields = (
    "desc_problema_translated",
    "descripcion_translated",
    "problema_translated",
    "cod_articulo",
)
_joined_text = text_to_analyse[_text_fields[0]]
for _field in _text_fields[1:]:
    _joined_text = _joined_text + " " + text_to_analyse[_field]
text_to_analyse.loc[:, "text_to_analyse"] = _joined_text

In [19]:
text_to_analyse.head(10)

In [20]:
import multiprocessing
from collections import OrderedDict
import gensim.models.doc2vec

assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [21]:
class CommentedDoc2Vec(Doc2Vec):
    """Doc2Vec subclass that stores a free-text ``comment`` on the instance.

    ``comment`` is consumed here; every other keyword argument is forwarded
    unchanged to ``Doc2Vec.__init__``.
    """

    def __init__(self, comment: str = "", **kwargs) -> None:
        super().__init__(**kwargs)
        # Assigned after the parent initialiser has run, so this is the value
        # left on the instance.
        self.comment = comment

## Train Doc2Vec and save the models

In [47]:
# Hyper-parameters shared by both Doc2Vec variants
common_kwargs = {
    "vector_size": 100,
    "epochs": 20,
    "min_count": 2,
    "sample": 0,
    "workers": multiprocessing.cpu_count(),
    "negative": 5,
    "hs": 0,
}

# One tagged document per row of the text column: the words are the
# whitespace-separated tokens and the single tag is the row's position.
tagged_data = [
    TaggedDocument(words=text.split(), tags=[position])
    for position, text in enumerate(text_to_analyse["text_to_analyse"])
]

# The two model variants, PV-DBOW first and PV-DM second
simple_models = [
    # PV-DBOW plain
    CommentedDoc2Vec(comment="PV-DBOW plain", dm=0, **common_kwargs),
    # PV-DM w/ default averaging; a higher starting alpha may improve CBOW/PV-DM modes
    CommentedDoc2Vec(
        comment="PV-DM averaging", dm=1, window=10, alpha=0.05, **common_kwargs
    ),
]

# Let every model scan the corpus and build its vocabulary
for model in simple_models:
    model.build_vocab(tagged_data)
    print(f"{model} vocabulary scanned & state initialized")

In [48]:
# Train each model on the tagged corpus, using the corpus size and number of
# epochs the model itself reports.
for model in simple_models:
    model.train(
        tagged_data,
        total_examples=model.corpus_count,
        epochs=model.epochs,
    )
    print(f"{model} training completed")

In [53]:
for model in simple_models:
    print(model.comment)

In [54]:
from datetime import date

# Persist every model under ../MODELS/<today's ISO date>/ in a file named
# after its class and its comment (spaces replaced by underscores).
today_date = date.today().isoformat()
base_path = f"../MODELS/{today_date}"
os.makedirs(base_path, exist_ok=True)
for model in simple_models:
    class_name = type(model).__name__
    comment_slug = model.comment.replace(" ", "_")
    model_name = f"{base_path}/{class_name}_{comment_slug}.model"
    model.save(model_name)
    print(f"Model saved at {model_name}")

## Load the models

In [22]:
# Load the two models stored under the 2024-05-08 directory, PV-DBOW first
# and PV-DM second.
pv_dbow = "../MODELS/2024-05-08/CommentedDoc2Vec_PV-DBOW_plain.model"
pv_dm = "../MODELS/2024-05-08/CommentedDoc2Vec_PV-DM_averaging.model"
loaded_models = []
for model_path in (pv_dbow, pv_dm):
    loaded_models.append(CommentedDoc2Vec.load(model_path))
    print(f"Model loaded from {model_path}")

In [23]:
# Get the vectors
# One entry per loaded model, in the same order as loaded_models; each entry
# is that model's dv.vectors array.
vectors = [model.dv.vectors for model in loaded_models]

In [24]:
# Every loaded model must hold exactly one document vector per row of
# text_to_analyse; otherwise the vectors cannot be attached row by row
# (e.g. when the models were trained on a different snapshot of the data).
for _model, _model_vectors in zip(loaded_models, vectors):
    if len(_model_vectors) != len(text_to_analyse):
        raise ValueError(
            f"Model '{_model.comment}' holds {len(_model_vectors)} document "
            f"vectors but text_to_analyse has {len(text_to_analyse)} rows"
        )

# Add vectors to the text_to_analyse dataframe (first model -> "PV-DBOW",
# second model -> "PV-DM"); each cell holds one vector.
text_to_analyse.loc[:, "PV-DBOW"] = list(vectors[0])
text_to_analyse.loc[:, "PV-DM"] = list(vectors[1])

In [25]:
# Stack the per-row vectors of each model into a 2D array (one row per
# document) for PCA
vectors_pv_dbow = np.vstack(text_to_analyse["PV-DBOW"])
vectors_pv_dm = np.vstack(text_to_analyse["PV-DM"])

# Place both representations side by side: PV-DBOW columns first, PV-DM after
combined_vectors = np.concatenate((vectors_pv_dbow, vectors_pv_dm), axis=1)

In [32]:
# Grid-search the number of KMeans clusters on the combined vectors
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV

# NOTE(review): KMeans() is created without random_state, so repeated runs
# may not give the same clusters — confirm whether that is acceptable.
pipeline = Pipeline(steps=[("kmeans", KMeans())])

# Candidate cluster counts: 200, 220, ..., 400
parameters = {"kmeans__n_clusters": list(range(200, 401, 20))}

# NOTE(review): no `scoring` is given, so the search presumably ranks
# candidates with the estimator's own score; check that this does not simply
# favour the largest n_clusters.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=2,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(combined_vectors)

In [33]:
# Check the best parameters
grid_search.best_params_

In [34]:
# Add the cluster to the dataset
# The labels come from the grid search's best estimator, predicted on the
# same combined_vectors array, so they line up row by row with
# text_to_analyse.
text_to_analyse["cluster"] = grid_search.best_estimator_.predict(combined_vectors)

In [35]:
# Plot PCA of the vectors in 3 dimensions
%matplotlib qt
from sklearn.decomposition import PCA

# Project the PV-DBOW vectors onto their first three principal components
pca = PCA(n_components=3)
vectors_df_pca = pca.fit_transform(vectors_pv_dbow)
_pc1, _pc2, _pc3 = vectors_df_pca.T

# 3D scatter, one point per document, coloured by its cluster label
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.scatter(_pc1, _pc2, _pc3, c=text_to_analyse["cluster"])
plt.title("PCA of the vectors")
plt.show()

In [38]:
from sklearn.manifold import TSNE

# Embed the combined vectors in three dimensions with t-SNE and keep the
# result as a DataFrame with one named column per dimension.
tsne = TSNE(n_components=3, verbose=1, perplexity=40)
tsne_vector = pd.DataFrame(
    tsne.fit_transform(combined_vectors),
    columns=["TSNE1", "TSNE2", "TSNE3"],
)

In [39]:
# 3D scatter of the t-SNE embedding, one point per document, coloured by its
# cluster label
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.scatter(
    xs=tsne_vector["TSNE1"],
    ys=tsne_vector["TSNE2"],
    zs=tsne_vector["TSNE3"],
    c=text_to_analyse["cluster"],
)
plt.title("TSNE of the vectors")
plt.show()

## Tests with actual standard error text

In [42]:
# Get the most similar texts
def get_similar_texts(text, model, topn=5):
    """Return the ``topn`` stored documents most similar to ``text``.

    ``text`` is split on whitespace, a vector is inferred for the resulting
    tokens with ``model.infer_vector`` and that vector is handed to
    ``model.dv.most_similar``; its result is returned unchanged.
    """
    inferred_vector = model.infer_vector(text.split())
    return model.dv.most_similar([inferred_vector], topn=topn)

In [166]:
# Hand-written error descriptions used to probe the models
test_texts = [
    "Fallo de comunicaciones con la central El dispositivo intenta comunicar con la central, pero no la detecta. Los leds sí que parpadean.",
    "Fallo de comunicaciones con la máquina, el dispositivo intenta comunicar con la maquina, pero no la detecta. Los leds sí que parpadean.",
    "Unidad no arranca, las comunicaciones y los leds son correctos, pero la máquina no arranca.",
    "Error apertura/cierre. No muestra error en el sistema. La compuerta o rejilla no abre ni cierra, pero no se muesrta ningún error en el sistema.",
]

# test_text = test_texts[random.randint(0, len(test_texts)-1)]
test_text = test_texts[3]

print(f"TARGET TEXT: {test_text} \n")

for model in loaded_models:
    similar_texts = get_similar_texts(test_text, model)
    print(f"Model: {model.comment}")
    for i, (index, similarity) in enumerate(similar_texts):
        # Training tagged each document with its enumerate() position, so the
        # tag is looked up positionally; a label lookup would only agree while
        # text_to_analyse keeps a default 0..n-1 index.
        matched_text = text_to_analyse["text_to_analyse"].iloc[index]
        print(f"Similar text {i+1}: {matched_text} with similarity {similarity}\n")
    # Only the first loaded model is inspected.
    # NOTE(review): confirm the early exit is intentional.
    break

## Calculate the similarity between the texts

In [162]:
# Read list of errors
# Semicolon-separated file; later cells read its ID_ERROR and DESCRIPCION
# columns.
errors = pd.read_csv("../DATA/errors.csv", sep=";")

In [471]:
from sklearn.metrics.pairwise import cosine_similarity


def calculate_mean_cosine_score(vector, text, model, n=5):
    """Return the mean cosine similarity between ``vector`` and ``text``.

    ``text`` is split on whitespace and a vector is inferred for it ``n``
    times with ``model.infer_vector``; the similarity of ``vector`` to each
    inferred vector is computed and the ``n`` similarities are averaged.

    Args:
        vector: One-dimensional document vector to compare against.
        text: Raw text whose vector is inferred.
        model: Object exposing ``infer_vector(tokens)``.
        n: Number of inferences to average over; must be at least 1.

    Returns:
        The mean of the ``n`` cosine similarities.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")
    # Tokenise once and stack the n inferred vectors so that a single pairwise
    # call yields all n similarities (a 1 x n matrix).
    tokens = text.split()
    inferred_vectors = np.vstack([model.infer_vector(tokens) for _ in range(n)])
    cosine_scores = cosine_similarity(
        np.asarray(vector).reshape(1, -1), inferred_vectors
    )
    return np.mean(cosine_scores)

In [None]:
# Calculate the cosine similarity between every row of text_to_analyse and
# each of the error descriptions.
#
# The inferred vector of an error description does not depend on the document
# it is compared with, so it is inferred a fixed number of times per error
# (and the similarities averaged) instead of being re-inferred for every
# document row. All documents are then scored in one vectorised call.
_n_inferences = 5  # same number of repetitions calculate_mean_cosine_score defaults to
_document_vectors = np.vstack(text_to_analyse["PV-DBOW"])
for i, id_error in enumerate(errors["ID_ERROR"]):
    # First description registered for this error id
    error_description = errors.loc[
        errors["ID_ERROR"] == id_error, "DESCRIPCION"
    ].iloc[0]
    _error_tokens = error_description.split()
    _error_vectors = np.vstack(
        [loaded_models[0].infer_vector(_error_tokens) for _ in range(_n_inferences)]
    )
    # (n_documents x _n_inferences) similarities, averaged per document
    text_to_analyse.loc[:, f"cosine_similarity_{id_error}"] = cosine_similarity(
        _document_vectors, _error_vectors
    ).mean(axis=1)
    print(f"Error {i+1} of {len(errors)} calculated")

In [None]:
# Save text_to_analyse to disk
# text_to_analyse.to_csv("../DATA/text_to_analyse.csv", sep='¬', encoding='utf-8-sig', index=False)

In [3]:
# Load text_to_analyse from disk
# text_to_analyse = pd.read_csv("../DATA/text_to_analyse.csv", sep='¬', encoding='utf-8-sig')

In [479]:
# Score every PV-DBOW document vector against the currently selected
# test_text, using the first loaded model for inference.
text_to_analyse["cosine_similarity"] = text_to_analyse["PV-DBOW"].apply(
    lambda doc_vector: calculate_mean_cosine_score(
        doc_vector, test_text, loaded_models[0]
    )
)

In [478]:
text_to_analyse[["text_to_analyse", "cosine_similarity"]].sort_values(
    by="cosine_similarity", ascending=False
).head(10)

In [18]:
# Per-error score columns are named "cosine_similarity_<error id>"; the plain
# "cosine_similarity" column does not carry the trailing underscore and is
# therefore excluded.
_COSINE_PREFIX = "cosine_similarity_"
cosine_columns = [
    col for col in text_to_analyse.columns if col.startswith(_COSINE_PREFIX)
]

# Best score of each row across all errors
text_to_analyse.loc[:, "highest_score"] = text_to_analyse[cosine_columns].max(axis=1)

# Error id owning that best score: strip the prefix rather than splitting on
# "_", so ids that themselves contain an underscore are kept whole.
text_to_analyse.loc[:, "highest_score_error"] = (
    text_to_analyse[cosine_columns].idxmax(axis=1).str[len(_COSINE_PREFIX):]
)

In [19]:
text_to_analyse.head(10)

In [26]:
# For every value of highest_score_error, keep the 10 rows with the largest
# highest_score. group_keys=False keeps the group label out of the resulting
# index, which is then replaced by a fresh 0..n-1 index.
top10_per_error = (
    text_to_analyse[["text_to_analyse", "highest_score", "highest_score_error"]]
    .groupby("highest_score_error", group_keys=False)
    .apply(lambda x: x.nlargest(10, "highest_score"))
    .reset_index(drop=True)
)

top10_per_error.head(500)