# After-sales text clustering using Doc2Vec
## Adding CODART and Characteristics from A3ERP and improving the text preprocessing

## Data preprocessing (Merging the translated text)

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date

pd.options.mode.chained_assignment = None
from src.preprocessing.incidencias import Incidencias
from src.preprocessing.articulos import Articulos
from src.preprocessing.dataset import Dataset

# Run configuration: the input data snapshot is pinned to a fixed version,
# while models are written to a folder named after the execution date.
data_version = "2024-05-14"
today_date = str(date.today())
data_base_path = "../DATA/processed/" + data_version
model_base_path = "../MODELS/" + today_date

## Load the data

In [None]:
# Get articulos
# (project loader; `.data` is presumably a pandas DataFrame — TODO confirm)
articulos = Articulos().get_articulos().data
# Load incidencias
# Chained project loader: get the incidents, then load the best-match file
# given below (presumably fuzzy-match results, judging by the file name),
# and finally take the resulting `.data`.
incidencias = (
    Incidencias()
    .get_incidencias()
    .load_best_match("../DATA/fuzzy_matches_w_scores.csv")
    .data
)

# Build the dataset used by the rest of the notebook from both inputs
clean_dataset = Dataset(incidencias, articulos).generate_dataset().data

In [16]:
# Get only the columns with the fields of interest.
# An explicit copy is taken so that the fill below, and any column added to
# `text_to_analyse` afterwards, modifies an independent frame instead of a
# slice of `clean_dataset` (the situation pandas warns about with
# SettingWithCopyWarning).
text_to_analyse = clean_dataset[
    [
        "desc_problema_translated",
        "descripcion_translated",
        "problema_translated",
        "cod_articulo",
        "text_to_analyse",
    ]
].copy()
# Fill NA with empty string (re-binding instead of mutating in place)
text_to_analyse = text_to_analyse.fillna("")

In [27]:
# Preview 10 randomly sampled rows (notebook display only)
text_to_analyse.sample(10)

## Train Doc2Vec and save the model

In [256]:
import spacy

# spaCy pipeline used by the text preprocessing in this notebook
nlp = spacy.load("es_core_news_sm")
# Alternative pipelines that were tried (kept for reference):
# nlp = spacy.load('es_core_news_md')
# nlp = spacy.load('es_core_news_sm')
import multiprocessing
from collections import OrderedDict

import gensim.models.doc2vec

# Stop early when gensim reports a negative FAST_VERSION; per the assertion
# message, training would be painfully slow in that case.
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [257]:
# Text cleaning helper built on the spaCy pipeline `nlp`
def preprocess_text(docs):
    """Clean an iterable of raw texts for Doc2Vec.

    Every text is run through the module-level spaCy pipeline ``nlp`` with the
    ``ner`` and ``parser`` components disabled. Punctuation, stop words and
    whitespace tokens are dropped; the remaining tokens are lower-cased and
    joined with single spaces.

    Args:
        docs: Iterable of strings accepted by ``nlp.pipe``.

    Returns:
        A list with one cleaned string per input text, in the same order.
    """

    def _is_kept(token):
        # Equivalent to: not punct and not stop word and not whitespace
        return not (token.is_punct or token.is_stop or token.is_space)

    parsed_docs = list(nlp.pipe(docs, disable=["ner", "parser"]))
    return [
        " ".join(token.text.lower() for token in parsed if _is_kept(token))
        for parsed in parsed_docs
    ]

In [258]:
# Class Model with comments
class CommentedDoc2Vec(Doc2Vec):
    """Doc2Vec model that carries a free-text ``comment`` label.

    Elsewhere in this notebook the label describes the hyper-parameters of a
    run and is used to build the file name the model is saved under.
    """

    def __init__(self, comment: str = "", **kwargs):
        """Initialise the base Doc2Vec model and attach the label.

        Args:
            comment: Label stored on the instance as ``self.comment``.
            **kwargs: Forwarded unchanged to ``Doc2Vec.__init__``.
        """
        super().__init__(**kwargs)
        # Assigned after the base initialiser runs, so this value is the one kept.
        # NOTE(review): gensim's Doc2Vec appears to accept its own ``comment``
        # argument — confirm whether this subclass is still required.
        self.comment = comment

In [50]:
# Load train corpus from disk
# NOTE(review): the path is pinned to the 2024-05-13 snapshot and does not use
# `data_base_path` (which points at 2024-05-14) — confirm this is intentional.
corpus = pd.read_csv("../DATA/processed/2024-05-13/corpus.csv")
# Clean the raw text column with `preprocess_text`
corpus["processed_text"] = preprocess_text(corpus["text_to_analyse"])
corpus.head()

In [51]:
# Keyword arguments handed to the Doc2Vec model constructor.
# `workers` uses every CPU core reported by the operating system.
common_kwargs = {
    "vector_size": 200,
    "epochs": 20,
    "min_count": 2,
    "sample": 0,
    "workers": multiprocessing.cpu_count(),
    "negative": 5,
    "hs": 0,
    "seed": 0,
}

In [None]:
# PV-DBOW plain (dm=0). The comment label spells out the hyper-parameters as
# "<label> <value>" pairs joined by "-", so it can double as a file name later.
model = CommentedDoc2Vec(
    dm=0,
    comment="PV-DBOW-"
    + "-".join(
        f"{label} {common_kwargs[key]}"
        for label, key in (
            ("v_size", "vector_size"),
            ("epochs", "epochs"),
            ("hs", "hs"),
            ("sample", "sample"),
            ("negative", "negative"),
            ("min_count", "min_count"),
        )
    ),
    **common_kwargs,
)

In [None]:
# Create TaggedDocument objects
# Each processed text is split on whitespace into its word list and tagged
# with its position in the corpus.
tagged_data = [
    TaggedDocument(words=doc.split(), tags=[i])
    for i, doc in enumerate(corpus["processed_text"])
]

In [None]:
# Scan the tagged corpus so the model can build its vocabulary
model.build_vocab(tagged_data)
print(f"Model: {model.comment} : vocabulary scanned & state initialized")

In [None]:
# Fit the model on the tagged corpus, reusing the document count and the
# number of epochs already stored on the model
model.train(
    tagged_data,
    epochs=model.epochs,
    total_examples=model.corpus_count,
)
print(f"{model.comment} training completed")

In [None]:
# Persist the trained model. The file name combines the model class name with
# the comment label, with spaces replaced by underscores.
os.makedirs(model_base_path, exist_ok=True)
model_name = "{}/{}_{}.model".format(
    model_base_path,
    type(model).__name__,
    model.comment.replace(" ", "_"),
)
model.save(model_name)
print("Model saved at {}".format(model_name))

## Load the model and infer vectors

In [259]:
# Load the model
# NOTE(review): the path is pinned to the 2024-05-13 model directory instead of
# `model_base_path` — confirm it points at the intended model.
model_name = "../MODELS/2024-05-13/CommentedDoc2Vec_PV-DBOW-v_size_200-epochs_20-hs_0-sample_0-negative_5-min_count_2.model"
model = CommentedDoc2Vec.load(model_name)
print(f"Model {model} loaded")

In [260]:
# Infer vectors for the text_to_analyse
# Step 1: clean the raw text with `preprocess_text`.
text_to_analyse["processed_text"] = preprocess_text(text_to_analyse["text_to_analyse"])
# Step 2: infer one vector per row from the whitespace-split tokens.
# NOTE(review): `infer_vector` may not return identical vectors across runs —
# confirm whether reproducibility matters for the scores computed later.
text_to_analyse["vector"] = text_to_analyse["processed_text"].apply(
    lambda x: model.infer_vector(x.split())
)

In [261]:
# Preview 10 random rows, now including the processed text and vector columns
text_to_analyse.sample(10)

## Calculate the similarity between the texts

In [262]:
# Read the error table (semicolon separated, header on the second line) and
# keep only the columns used in this notebook
errors = pd.read_csv("../DATA/TablaTipoErrorPostventa.csv", sep=";", header=1)
errors = errors[["Código", "CODCAR3", "CODCAR2", "DESCFAM", "Motivo General"]]
# Rename the two columns whose names change; the other three keep theirs
errors = errors.rename(
    columns={"Código": "ID_ERROR", "Motivo General": "DESCRIPCION"}
)
# Clean CODCAR2: every "-" is replaced by "0" before converting to integers
errors["CODCAR2"] = (
    errors["CODCAR2"].str.replace("-", "0").astype(int)
)

In [263]:
# Infer vector for errors
# The error descriptions go through the same cleaning and inference steps as
# the texts, so both sets of vectors come from the same model.
errors["description_processed"] = preprocess_text(errors["DESCRIPCION"])
errors["vector"] = errors["description_processed"].apply(
    lambda x: model.infer_vector(x.split())
)

In [264]:
# Display the error table with its processed description and vector
errors

In [265]:
from sklearn.metrics.pairwise import cosine_similarity


def calculate_cosine_score(vector, vector_error):
    """Return the cosine similarity between two vectors as a scalar.

    Args:
        vector: NumPy array for the text being scored.
        vector_error: NumPy array for the error description.

    Returns:
        The single entry of the 1x1 similarity matrix.
    """
    # scikit-learn expects 2-D inputs, so each vector becomes a one-row matrix
    text_row = vector.reshape(1, -1)
    error_row = vector_error.reshape(1, -1)
    similarity_matrix = cosine_similarity(text_row, error_row)
    return similarity_matrix[0][0]


def calculate_mean_cosine_score(vector, vector_error, n=5):
    """Return the cosine similarity between ``vector`` and ``vector_error``.

    The previous implementation evaluated the same deterministic score ``n``
    times and averaged the identical values; the score is now computed once.

    Args:
        vector: NumPy array for the text being scored.
        vector_error: NumPy array for the error description.
        n: Kept for backward compatibility. It no longer affects the score;
            only ``n < 1`` matters, which still yields NaN as before (the mean
            of zero evaluations).

    Returns:
        The cosine similarity, or ``np.nan`` when either vector is empty or
        ``n < 1``.
    """
    if vector.size == 0 or vector_error.size == 0:
        return np.nan  # Return NaN if there's no vector to compare
    if n < 1:
        return np.nan  # No evaluation requested: same result as averaging nothing
    return calculate_cosine_score(vector, vector_error)

In [266]:
# Calculate the cosine similarity between the text_to_analyse and the errors
# For every error, only the rows whose characteristic codes match that error
# are scored; the result goes into a `cosine_similarity_<ID_ERROR>` column.
# NOTE(review): "CAR3"/"CAR2" are not among the columns selected when
# `text_to_analyse` is built in this notebook — confirm they are added
# elsewhere, otherwise these lookups raise a KeyError.
for index, row in errors.iterrows():
    # Create a condition for filtering
    condition = text_to_analyse["CAR3"] == row["CODCAR3"]
    # A CODCAR2 of 0 is falsy, so such errors are matched on CAR3 alone
    if row["CODCAR2"]:
        condition &= text_to_analyse["CAR2"] == row["CODCAR2"]

    # Only write the column when at least one row matches; rows outside the
    # condition are left as NaN in that column
    if not text_to_analyse.loc[condition, "vector"].empty:
        text_to_analyse.loc[condition, f'cosine_similarity_{row["ID_ERROR"]}'] = (
            text_to_analyse.loc[condition, "vector"].apply(
                lambda x: calculate_mean_cosine_score(x, row["vector"])
            )
        )

    print(f"Error {row['ID_ERROR']} calculated")

In [267]:
# Preview 10 random rows, now including the per-error similarity columns
text_to_analyse.sample(10)

In [269]:
# Save text_to_analyse to disk
# text_to_analyse_clean.to_csv(f"{data_base_path}/text_to_analyse_with_errors.csv", sep='¬', encoding='utf-8-sig', index=False)

In [4]:
# Load text_to_analyse from disk
# The "¬" separator takes more than one byte in UTF-8, which pandas' default C
# parser does not support. Requesting the Python engine explicitly selects the
# parser pandas would fall back to anyway and avoids the ParserWarning.
text_to_analyse_clean = pd.read_csv(
    f"{data_base_path}/text_to_analyse_with_errors.csv",
    sep="¬",
    encoding="utf-8-sig",
    engine="python",
)

In [5]:
# Collect the per-error similarity columns by their shared name fragment
cosine_columns = [
    name for name in text_to_analyse_clean.columns if "cosine_similarity_" in name
]
# Missing similarities become 0.
# NOTE(review): a row with no computed similarity then gets a highest score of
# 0 and is attributed to the first similarity column — confirm this is wanted.
text_to_analyse_clean[cosine_columns] = text_to_analyse_clean[cosine_columns].fillna(0)
# Best similarity per row
text_to_analyse_clean.loc[:, "highest_score"] = text_to_analyse_clean[
    cosine_columns
].max(axis=1)
# Error id of the best column: the text after the last underscore of its name
text_to_analyse_clean.loc[:, "highest_score_error"] = (
    text_to_analyse_clean[cosine_columns]
    .idxmax(axis=1)
    .str.rsplit("_", n=1)
    .str[-1]
)

In [6]:
# Show the first 10 rows with the new highest-score columns
text_to_analyse_clean.head(10)

In [7]:
# For every assigned error, keep the 10 rows with the highest score
top10_per_error = (
    text_to_analyse_clean[
        ["codigo", "text_to_analyse", "highest_score", "highest_score_error"]
    ]
    # group_keys=False keeps the group label out of the result index
    .groupby("highest_score_error", group_keys=False)
    .apply(lambda x: x.nlargest(10, "highest_score"))
    # Discard the original row labels in favour of a fresh 0..n-1 index
    .reset_index(drop=True)
)

# Display up to 500 rows of the result
top10_per_error.head(500)

## Visualize the results

In [8]:
# Keep the vector and its assigned error. An explicit copy is taken so the
# column assignment below writes to an independent frame rather than to a
# slice of `text_to_analyse_clean`.
results = text_to_analyse_clean[["vector", "highest_score_error"]].copy()
# Convert string representations of lists to actual numpy arrays: the
# surrounding brackets are stripped and the whitespace-separated numbers are
# parsed; values that are not strings are left untouched.
results["vector"] = results["vector"].apply(
    lambda x: np.fromstring(x.strip("[]"), sep=" ") if isinstance(x, str) else x
)

In [9]:
# Expand each vector into its own column
# (`apply(pd.Series)` turns every vector into one row of a wide frame)
expanded_vectors = results["vector"].apply(pd.Series)
# Name the new columns vector_0, vector_1, ...
expanded_vectors.columns = [f"vector_{i}" for i in range(expanded_vectors.shape[1])]
# Place the expanded columns next to the original ones
results = pd.concat([results, expanded_vectors], axis=1)

In [333]:
# Features: every column except the raw vector and the assigned error
X = results.drop(columns=["vector", "highest_score_error"])
# Labels: the assigned error as a number; values that cannot be parsed become NaN
y = pd.to_numeric(results["highest_score_error"], errors="coerce")

In [334]:
from sklearn.manifold import TSNE

# Project the feature matrix onto 2 dimensions; random_state is fixed to 0
tsne = TSNE(n_components=2, random_state=0)
X_tsne = tsne.fit_transform(X)

In [342]:
# Plot the results
# 2-D scatter of the t-SNE projection; points are coloured by `y` using the
# "tab20" colormap and a colorbar is added as the colour key.
plt.figure(figsize=(20, 10))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y, cmap="tab20")
plt.colorbar()
plt.show()

In [343]:
# Create TSNE with 3 components
# Same feature matrix and random_state as the 2-D projection, one more dimension
tsne = TSNE(n_components=3, random_state=0)
X_tsne_3d = tsne.fit_transform(X)

In [350]:
# Plot data in 3D
# IPython magic selecting the qt6 matplotlib backend (only valid inside IPython/Jupyter)
%matplotlib qt6

fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection="3d")
# Points are coloured by `y` using the "tab20" colormap
scatter = ax.scatter(
    X_tsne_3d[:, 0], X_tsne_3d[:, 1], X_tsne_3d[:, 2], c=y, cmap="tab20"
)
# Add legend
# (built from the scatter's own legend elements, requesting about 10 entries)
plt.legend(*scatter.legend_elements(num=10), title="Classes")
plt.show()

In [1]:
from src.preprocessing.utils import pre_process_text_spacy

In [2]:
# Extra stop words passed to the text preprocessing on top of the standard
# ones. Each word is listed exactly once; the earlier list repeated "falla",
# "falló" and "fallado".
custom_stopwords = [
    "funciona",
    "funcionar",
    "averiado",
    "averiada",
    "averiados",
    "averiadas",
    "falla",
    "fallar",
    "fallado",
    "fallada",
    "fallados",
    "falladas",
    "falló",
    "fallaron",
    "defecto",
    "defectos",
    "defectuoso",
    "defectuosa",
    "error",
    "errores",
    "problema",
    "problemas",
    "termostato",
    "tto",
]

In [3]:
# Two sample texts used to try out the preprocessing: a short sentence with
# punctuation, a number and a product-like code, and an upper-case text with
# repeated phrases and abbreviations
test_str = [
    "El producto no funciona? 1341 Alo alo alo. AZCE6BLUECB",
    "TTO. TACTO: NO RESPONDE CORRECTAMENTE EL TACTIL TTO. TACTO CABLE SUPERF. BL. TTO. TACTO: NO RESPONDE CORRECTAMENTE EL TACTIL",
]

# Run the project helper with the stop_words and alpha options enabled and the
# custom stop-word list; the notebook displays the returned value
pre_process_text_spacy(test_str, stop_words=True, alpha=True, custom_stopwords=custom_stopwords)

['producto alo alo alo',
 'tacto responde correctamente tactil tacto cable superf bl tacto responde correctamente tactil']