# After-sales text clustering using TF-IDF and KMeans

## Data preprocessing (Merging the translated text)

In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine

In [3]:
def query_data(query):
    """
    Function to query data from the database using sqlalchemy
    :param query: SQL statement handed to pandas.read_sql
    :return: pd.DataFrame with the query result, or None when the password is
        not configured or the query fails (the error is printed)

    Connection parameters:
    user = readmyzone
    password = (get from environment variable MYSQL_PASSWORD)
    host = 192.168.2.7
    port = 3306
    db = myzone
    """
    from urllib.parse import quote_plus

    user = "readmyzone"
    password = os.environ.get("MYSQL_PASSWORD")
    host = "192.168.2.7"
    port = "3306"
    db = "myzone"

    if password is None:
        # Without this guard the literal text "None" would be sent as the
        # password and the failure would surface as an opaque login error.
        print("Environment variable MYSQL_PASSWORD is not set")
        return None

    # Create the connection string. The password is URL-escaped so characters
    # such as "@", "/" or ":" cannot be mistaken for URL delimiters.
    connection_string = (
        f"mysql+pymysql://{user}:{quote_plus(password)}@{host}:{port}/{db}"
    )

    # Create the engine
    engine = create_engine(connection_string)

    try:
        # Query the data
        data = pd.read_sql(query, engine)
    except Exception as e:
        # Best-effort contract kept from the original: report and return None.
        print(e)
        data = None
    finally:
        # A new engine is created on every call; release its pooled
        # connections instead of leaving them to the garbage collector.
        engine.dispose()

    return data

# Load the data

In [4]:
# Fetch every row of the four sav_* tables, in this order; query_data gives
# back None for a query that fails.
sav_incidencias, sav_piezas, sav_estados, sav_incidencias_tipo = [
    query_data(statement)
    for statement in (
        "SELECT * FROM sav_incidencias",
        "SELECT * FROM sav_piezas",
        "SELECT * FROM sav_estados",
        "SELECT * FROM sav_incidencias_tipo",
    )
]

In [5]:
# Left-join sav_incidencias with sav_piezas, sav_estados and
# sav_incidencias_tipo in one chain. suffixes=(None, "_x") leaves the
# left-hand column names untouched and renames only clashing right-hand ones.
dataset = (
    sav_incidencias.merge(
        sav_piezas,
        how="left",
        left_on="codigo",
        right_on="codigo_incidencia",
        suffixes=(None, "_pieza"),
    )
    .merge(
        sav_estados,
        how="left",
        left_on="estado",
        right_on="id",
        suffixes=(None, "_estado"),
    )
    .merge(
        sav_incidencias_tipo,
        how="left",
        left_on="tipo",
        right_on="id",
        suffixes=(None, "_tipo"),
    )
)

In [6]:
# Keep only the rows whose tipo is 1 and whose estado is 2 or 6.
has_wanted_tipo = dataset["tipo"] == 1
has_wanted_estado = dataset["estado"].isin([2, 6])
clean_dataset = dataset[has_wanted_tipo & has_wanted_estado]

In [7]:
# Load from disk the text to translate dictionary: one CSV per field, keyed
# by the field name.
fields_to_translate = ["desc_problema", "problema", "descripcion"]
# "¬" is one character but more than one byte once encoded (e.g. in UTF-8),
# which the C parser does not support. Ask for the python engine explicitly,
# as the reads of the *_translated.csv files do, instead of relying on pandas
# falling back to it with a ParserWarning.
text_to_translate = {
    field: pd.read_csv(
        f"../DATA/{field}.csv", sep="¬", encoding="utf-8-sig", engine="python"
    )
    for field in fields_to_translate
}

In [8]:
# Read the three translated-text CSVs with one shared set of parser options.
translated_csv_options = {"sep": "¬", "encoding": "utf-8-sig", "engine": "python"}
desc_problema_translated = pd.read_csv(
    "../DATA/desc_problema_translated.csv", **translated_csv_options
)
descripcion_translated = pd.read_csv(
    "../DATA/descripcion_translated.csv", **translated_csv_options
)
problema_translated = pd.read_csv(
    "../DATA/problema_translated.csv", **translated_csv_options
)

In [9]:
# Drop the rows whose translated column just repeats the column's own name
# (presumably header lines that ended up among the data; TODO confirm).
stray_header_rows = desc_problema_translated["desc_problema_translated"].isin(
    ["desc_problema_translated"]
)
desc_problema_translated = desc_problema_translated[~stray_header_rows]

stray_header_rows = descripcion_translated["descripcion_translated"].isin(
    ["descripcion_translated"]
)
descripcion_translated = descripcion_translated[~stray_header_rows]

stray_header_rows = problema_translated["problema_translated"].isin(
    ["problema_translated"]
)
problema_translated = problema_translated[~stray_header_rows]

In [10]:
# Count the non-null values in each column
desc_problema_translated.count()

In [11]:
# Left-join every source-text frame with its translations on the shared
# original-text column, so each source text keeps a row even when no
# translation matches it.
desc_problema_translated = text_to_translate["desc_problema"].merge(
    desc_problema_translated, how="left", on="desc_problema"
)
descripcion_translated = text_to_translate["descripcion"].merge(
    descripcion_translated, how="left", on="descripcion"
)
problema_translated = text_to_translate["problema"].merge(
    problema_translated, how="left", on="problema"
)

In [12]:
# Where a translation is missing, fall back to the original text of that row.
desc_problema_translated["desc_problema_translated"] = desc_problema_translated[
    "desc_problema_translated"
].fillna(desc_problema_translated["desc_problema"])
descripcion_translated["descripcion_translated"] = descripcion_translated[
    "descripcion_translated"
].fillna(descripcion_translated["descripcion"])
problema_translated["problema_translated"] = problema_translated[
    "problema_translated"
].fillna(problema_translated["problema"])

In [13]:
# Preview the first five rows
desc_problema_translated.head(5)

In [14]:
# Merge the translated text with the original dataset.
# Each lookup frame is first reduced to one row per original text: a left
# merge emits one output row per matching right-hand row, so a text listed
# twice in a lookup would silently duplicate rows of clean_dataset.
clean_dataset = clean_dataset.merge(
    desc_problema_translated.drop_duplicates(subset="desc_problema"),
    on="desc_problema",
    how="left",
)
clean_dataset = clean_dataset.merge(
    descripcion_translated.drop_duplicates(subset="descripcion"),
    on="descripcion",
    how="left",
)
clean_dataset = clean_dataset.merge(
    problema_translated.drop_duplicates(subset="problema"),
    on="problema",
    how="left",
)

In [15]:
# Show the desc_problema column for the first 100 rows
clean_dataset[["desc_problema"]].head(100)

In [16]:
# Summary statistics for every column, whatever its dtype
clean_dataset.describe(include="all")

In [17]:
# Print the name of every column in clean_dataset
for column in clean_dataset.columns:
    print(f"Column: {column}")

In [58]:
# Get only the columns with the fields of interest
text_columns = [
    "desc_problema_translated",
    "descripcion_translated",
    "problema_translated",
    "cod_articulo",
]
# Fill NA with empty string. fillna returns a new, independent frame; filling
# in place on a column selection of clean_dataset can raise pandas'
# SettingWithCopyWarning here and on later column assignments.
text_to_analyse = clean_dataset[text_columns].fillna("")

In [59]:
# Build the single text column that gets vectorised: the three translated
# texts and the article code, separated by spaces.
# assign() returns a new frame rather than writing through .loc into a frame
# that pandas may still track as a selection of clean_dataset, which can raise
# SettingWithCopyWarning.
# NOTE(review): the "+" concatenation assumes cod_articulo holds strings; confirm.
text_to_analyse = text_to_analyse.assign(
    text_to_analyse=(
        text_to_analyse["desc_problema_translated"]
        + " "
        + text_to_analyse["descripcion_translated"]
        + " "
        + text_to_analyse["problema_translated"]
        + " "
        + text_to_analyse["cod_articulo"]
    )
)

In [60]:
# Preview the first ten rows
text_to_analyse.head(10)

## Apply TF-IDF and KMeans clustering

In [61]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import string

# Fetch the NLTK resources used below: the stop-word lists and the punkt
# tokenizer models.
for nltk_resource in ("stopwords", "punkt"):
    nltk.download(nltk_resource)


# Filled on first use so the stop-word list is read and the stemmer is built
# once, not once per document.
_SPANISH_RESOURCES = {}


def _spanish_resources():
    """Return the cached (stop-word set, stemmer) pair for Spanish."""
    if not _SPANISH_RESOURCES:
        _SPANISH_RESOURCES["stop_words"] = frozenset(stopwords.words("spanish"))
        _SPANISH_RESOURCES["stemmer"] = SnowballStemmer("spanish")
    return _SPANISH_RESOURCES["stop_words"], _SPANISH_RESOURCES["stemmer"]


def _is_punctuation(token):
    """Return True when every character of token is in string.punctuation."""
    return all(char in string.punctuation for char in token)


def pre_process_text(text):
    """
    Function to preprocess the text
    Use nltk library to preprocess the text and return the preprocessed text
    Tokenize the text, remove stopwords, remove punctuation, lowercase the
    text and stem every remaining word
    :param text: text to preprocess (any object; it is converted with str())
    :return: preprocessed text, the surviving stems joined by single spaces
    """
    try:
        text = str(text)  # Convert all entries to string
    except Exception as e:
        print(f"Error occurred during text conversion: {e}")
        return ""

    stop_words, stemmer = _spanish_resources()

    tokens = []
    # Tokenize with the Spanish punkt model, matching the Spanish stop words
    # and stemmer used for the remaining steps.
    for word in word_tokenize(text, language="spanish"):
        lowered = word.lower()
        if lowered in stop_words:
            continue
        # Tokens made only of punctuation are dropped. Checking character by
        # character also catches multi-character tokens such as "..." or "''",
        # which a substring test against string.punctuation lets through.
        if _is_punctuation(lowered):
            continue
        tokens.append(stemmer.stem(lowered))
    return " ".join(tokens)

In [62]:
# Quick check of pre_process_text on a Spanish sample sentence
example_text = "Hola, esto es un ejemplo de texto que vamos a preprocesar para testear"
pre_process_text(example_text)

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the combined text. pre_process_text already lowercases, so the
# vectorizer's own lowercasing is switched off. Terms found in fewer than 1%
# or in more than 99% of the documents are discarded.
vectorizer = TfidfVectorizer(
    preprocessor=pre_process_text, lowercase=False, max_df=0.99, min_df=0.01
)
tfidf_matrix = vectorizer.fit_transform(text_to_analyse["text_to_analyse"])
# Dense frame with one column per retained term
vector = pd.DataFrame(
    tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out()
)

In [64]:
# Total TF-IDF weight per term, highest first
vector.sum().sort_values(ascending=False)

In [66]:
# Draw a word cloud in which each term is sized by its summed TF-IDF weight
from wordcloud import WordCloud

term_weights = vector.sum().to_dict()
wordcloud = WordCloud(background_color="white", height=400, width=800)
wordcloud = wordcloud.generate_from_frequencies(term_weights)

plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [67]:
# Project the TF-IDF matrix onto its first three principal components
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
pca_vector = pd.DataFrame(pca.fit_transform(vector), columns=["PC1", "PC2", "PC3"])

# 2D view: PC1 against PC2, with PC3 shown as the colour
plt.figure(figsize=(20, 10))
plt.scatter(pca_vector["PC1"], pca_vector["PC2"], c=pca_vector["PC3"], cmap="viridis")
plt.gca().set(xlabel="PC1", ylabel="PC2")
plt.title("PCA of the text data")
plt.show()

In [68]:
# Plotting in 3D
%matplotlib qt
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(
    pca_vector["PC1"],
    pca_vector["PC2"],
    pca_vector["PC3"],
    c=pca_vector["PC3"],
    cmap="viridis",
)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
plt.title("PCA of the text data")
plt.show()

In [69]:
# Switch matplotlib back to inline rendering
%matplotlib inline

In [70]:
# Second dimensionality reduction: a three-component t-SNE embedding
from sklearn.manifold import TSNE

tsne = TSNE(n_components=3)
tsne_embedding = tsne.fit_transform(vector)
tsne_vector = pd.DataFrame(tsne_embedding, columns=["TSNE1", "TSNE2", "TSNE3"])

In [71]:
%matplotlib qt
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(
    tsne_vector["TSNE1"],
    tsne_vector["TSNE2"],
    tsne_vector["TSNE3"],
    c=tsne_vector["TSNE3"],
    cmap="viridis",
)
ax.set_xlabel("TSNE1")
ax.set_ylabel("TSNE2")
ax.set_zlabel("TSNE3")
plt.title("TSNE of the text data")
plt.show()

In [38]:
# Switch matplotlib back to inline rendering
%matplotlib inline

In [78]:
# Create a sklearn pipeline to apply a clustering algorithm
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV


def _silhouette_scorer(estimator, X, y=None):
    """Score a fitted clusterer by the silhouette of its labels on X.

    Higher is better. Returns -1.0, the lowest possible silhouette, when the
    predicted labels do not allow a silhouette to be computed.
    """
    labels = estimator.predict(X)
    n_labels = len(set(labels))
    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1
    if not 2 <= n_labels <= len(labels) - 1:
        return -1.0
    return silhouette_score(X, labels)


# Fixed seed so the cluster ids inspected in later cells are reproducible.
pipeline = Pipeline([("kmeans", KMeans(random_state=42))])

# Create grid search parameters
parameters = {
    "kmeans__n_clusters": [
        75,
        80,
        95,
        100,
        105,
        110,
        115,
        120,
        130,
        140,
        150,
        160,
        170,
        180,
        190,
        200,
    ]
}

# Without an explicit scoring, GridSearchCV falls back to KMeans.score, the
# negative k-means objective, which tends to improve as n_clusters grows and
# would therefore favour the largest candidate. The silhouette coefficient
# compares cohesion against separation instead.
grid_search = GridSearchCV(
    pipeline,
    parameters,
    scoring=_silhouette_scorer,
    cv=2,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(vector)

In [79]:
# Check the best parameters found by the grid search (the chosen n_clusters)
grid_search.best_params_

In [80]:
# Label every document with the cluster predicted by the refitted best model
cluster_labels = grid_search.best_estimator_.predict(vector)
text_to_analyse["cluster"] = cluster_labels

In [75]:
# Plot the clusters
%matplotlib qt

# Plotting code
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(
    pca_vector["PC1"],
    pca_vector["PC2"],
    pca_vector["PC3"],
    c=text_to_analyse["cluster"],
    cmap="viridis",
)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
plt.title("PCA of the text data")
plt.show()

In [81]:
# Plot the clusters
%matplotlib qt

# Plotting code
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(
    tsne_vector["TSNE1"],
    tsne_vector["TSNE2"],
    tsne_vector["TSNE3"],
    c=text_to_analyse["cluster"],
    cmap="viridis",
)
ax.set_xlabel("TSNE1")
ax.set_ylabel("TSNE2")
ax.set_zlabel("TSNE3")
plt.title("After sales text clustering")
plt.show()

In [52]:
# Switch matplotlib back to inline rendering
%matplotlib inline

In [82]:
# Preview the first ten rows
text_to_analyse.head(10)

In [83]:
# Show the problema_translated texts of the documents assigned to cluster 1
text_to_analyse.loc[text_to_analyse["cluster"] == 1, "problema_translated"]

In [84]:
# Number the rows inside each cluster (0, 1, 2, ...) so that each
# (temp_index, cluster) pair is unique, as pivot requires
text_to_analyse["temp_index"] = text_to_analyse.groupby("cluster").cumcount()

# One column per cluster, one row per position within the cluster
texts_by_cluster = text_to_analyse.pivot(
    values="text_to_analyse", index="temp_index", columns="cluster"
)

# Plain "Cluster_<id>" labels replace the pivot's "cluster" column index, and
# the temp_index row labels are discarded in favour of a default index
pivoted_df = texts_by_cluster.set_axis(
    [f"Cluster_{int(cluster_id)}" for cluster_id in texts_by_cluster.columns],
    axis=1,
).reset_index(drop=True)

In [85]:
# Preview the first 50 rows of the per-cluster table
pivoted_df.head(50)

In [57]:
# Preview the first five rows of clean_dataset
clean_dataset.head()

In [56]:
# Number of rows per cod_articulo, largest group first
clean_dataset.groupby(["cod_articulo"]).size().sort_values(ascending=False)