In [38]:
import numpy as np
from gensim.models import Word2Vec
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import unidecode
import numpy as np
from scipy import spatial
import numpy as np
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import os
from gensim.models import KeyedVectors
import numpy as np

# !MARK: - Constants

file_paths = {
    "Auditorias": "../chunks/auditorias.txt",
    "Ausentismo": "../chunks/ausentismo.txt",
    "Estructura Organizativa": "../chunks/estructura organizativa.txt",
    "Riesgos": "../chunks/riesgos.txt",
    "Información": "../chunks/información.txt",
    "Emergencia": "../chunks/emergencia.txt"
}

questions = [
    "¿Cómo se diferencia la ubicación física de la operativa?",  # Manual Estructura Organizativa pág. 60 y 73.
    "¿Qué información se incluye en el informe de la evaluación de riesgos que ofrece la plataforma y cuál es su utilidad para la gestión de riesgos en la empresa?",  # Manual Identificación y Evaluación de Riesgos (IER); pág. 24 - 26.
    "¿Hay indicadores relativos al cumplimiento de normas?",  # Manual Auditorías pág. 16 17 30 y 31.
    "¿Qué papel juega la estructura organizativa en la funcionalidad general de la plataforma y cómo interactúa con otros módulos?",  # Manual Estructura Organizativa
    "¿Qué permite realizar la evaluación de riesgos?",  # Manual Identificación y Evaluación de Riesgos (IER)
    "¿Qué tipo de documentos se pueden almacenar y compartir en el espacio denominado 'Documentos' y cuál es su importancia dentro del contexto de la gestión empresarial en la plataforma?",  # Manual del Repositorio documental
    "¿Se pueden llevar a cabo auditorías internas?",  # Manual Auditorías
    "¿Cuál es la tasa de adopción de esta plataforma en el mercado?",
    "¿Cómo se compara esta plataforma con otras soluciones de gestión de SST?",
    "¿Qué día hará mañana?"
]

# !MARK: - Classes

class TextUtils:
    @staticmethod
    def clean_and_tokenize(text, language):
        text = unidecode.unidecode(text.lower())
        stop_words = set(stopwords.words(language))
        text = re.sub(r'[^a-zA-Z\s]', '', text, re.I | re.A)
        tokens = word_tokenize(text, language=language)
        return [token for token in tokens if token not in stop_words]

class Model:
    def __init__(self, original_paragraphs, preprocessed_paragraphs, language='spanish'):
        self.original_paragraphs = original_paragraphs
        self.preprocessed_paragraphs = preprocessed_paragraphs
        self.language = language

    def predict(self, query, similarity_threshold):
        raise NotImplementedError("Subclasses should implement this method")
    
    @staticmethod
    def _cosine_similarity(vec1, vec2):
        if np.all(vec1 == 0) or np.all(vec2 == 0):
            return -1
        return 1 - spatial.distance.cosine(vec1, vec2)

class Word2VecModel(Model):
    def __init__(self, original_paragraphs, language='spanish'):
        super().__init__(original_paragraphs, [], language)
        self.embeddings_path = 'w2v_embeddings.npy'  # Path to save embeddings
        self.model = self._train_model()

    def _train_model(self):
        sentences = [TextUtils.clean_and_tokenize(paragraph, self.language) for paragraph in self.original_paragraphs]
        model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
        model.save("word2vec.model")  # Save the entire model
        return model

    def _get_sentence_embedding(self, sentence):
        words = TextUtils.clean_and_tokenize(sentence, self.language)
        word_vectors = [self.model.wv[word] for word in words if word in self.model.wv]
        if len(word_vectors) == 0:
            return np.zeros(self.model.vector_size)
        sentence_embedding = np.mean(word_vectors, axis=0)
        np.save(self.embeddings_path, sentence_embedding)  # Save the sentence embedding
        return sentence_embedding

    def predict(self, query, similarity_threshold):
        query_embedding = self._get_sentence_embedding(query)
        similarities = [(self._cosine_similarity(query_embedding, self._get_sentence_embedding(' '.join(TextUtils.clean_and_tokenize(para, self.language)))), para) for para in self.original_paragraphs]
        filtered_and_sorted = sorted([sim for sim in similarities if sim[0] > similarity_threshold], key=lambda x: x[0], reverse=True)
        return filtered_and_sorted, query_embedding

# !MARK: - File Contents

file_contents = {}

for key, path in file_paths.items():
    with open(path, 'r') as file:
        file_contents[key] = file.read().split('\n\n')


# !MARK: - Model Initialization

models = {}

for key, path in file_paths.items():
    with open(path, 'r') as file:
        file_contents[key] = file.read().split('\n\n')
    models[key] = Word2VecModel(file_contents[key])


In [39]:
# Collecting sentence embeddings for each model
model_embeddings = []
model_labels = []
models_list = []

for key, model in models.items():
    for paragraph in model.original_paragraphs:
        embedding = model._get_sentence_embedding(paragraph)
        model_embeddings.append(embedding)
        model_labels.append(key)
    models_list.append(key)

# Collecting embeddings for questions and finding the closest model for each question
question_embeddings = []
question_closest_models = []
question_similarities = []
question_closest_paragraphs = []

for question in questions:
    query_embedding = model._get_sentence_embedding(question)
    question_embeddings.append(query_embedding)
    
    # Find the closest model
    closest_model = None
    highest_similarity = -1
    closest_paragraph = ""
    for model_name, model in models.items():
        similarities = [
            (model._cosine_similarity(query_embedding, model._get_sentence_embedding(paragraph)), paragraph)
            for paragraph in model.original_paragraphs
        ]
        max_similarity, best_paragraph = max(similarities, key=lambda x: x[0])
        if max_similarity > highest_similarity:
            highest_similarity = max_similarity
            closest_model = model_name
            closest_paragraph = best_paragraph
    
    question_closest_models.append(closest_model)
    question_similarities.append(highest_similarity)
    question_closest_paragraphs.append(closest_paragraph)

# Printing questions and their related data based on similarity
for question, closest_model, similarity in zip(questions, question_closest_models, question_similarities):
    if similarity > 0.75:
        print(f"Question: {question}")
        print(f"Closest Model: {closest_model}")
        print(f"Similarity: {similarity:.2f}")
        print()
    else:
        print(f"'{question}' does not have enough similarity")

# Combining model embeddings and question embeddings
all_embeddings = np.array(model_embeddings + question_embeddings)
labels = model_labels + ["Question"] * len(questions)
hover_texts = model_labels + [
    f"Question: {question}<br>Closest Model: {closest_model}<br>Similarity: {similarity:.2f}<br>Chunk: {closest_paragraph[:100]}..."
    for question, closest_model, similarity, closest_paragraph in zip(questions, question_closest_models, question_similarities, question_closest_paragraphs)
]

# Reducing dimensionality using PCA for 2D and 3D plots
pca_2d = PCA(n_components=2)
pca_2d_result = pca_2d.fit_transform(all_embeddings)

pca_3d = PCA(n_components=3)
pca_3d_result = pca_3d.fit_transform(all_embeddings)

# Reducing dimensionality using t-SNE for 2D and 3D plots
tsne_2d = TSNE(n_components=2, random_state=42)
tsne_2d_result = tsne_2d.fit_transform(all_embeddings)

tsne_3d = TSNE(n_components=3, random_state=42)
tsne_3d_result = tsne_3d.fit_transform(all_embeddings)

# Plotting with 2D PCA
fig_pca = px.scatter(
    x=pca_2d_result[:, 0], 
    y=pca_2d_result[:, 1], 
    color=labels, 
    hover_data={'label': hover_texts},
    labels={'x': 'PCA Component 1', 'y': 'PCA Component 2'},
    title="Embeddings Visualization with 2D PCA"
)

# Adding annotations for questions in 2D PCA plot
for i, question in enumerate(questions):
    fig_pca.add_annotation(
        x=pca_2d_result[len(model_embeddings) + i, 0], 
        y=pca_2d_result[len(model_embeddings) + i, 1], 
        text=f"Q{i+1}", 
        showarrow=True,
        arrowhead=2
    )

# Plotting with 2D t-SNE
fig_tsne = px.scatter(
    x=tsne_2d_result[:, 0], 
    y=tsne_2d_result[:, 1], 
    color=labels, 
    hover_data={'label': hover_texts},
    labels={'x': 't-SNE Component 1', 'y': 't-SNE Component 2'},
    title="Embeddings Visualization with 2D t-SNE"
)

# Adding annotations for questions in 2D t-SNE plot
for i, question in enumerate(questions):
    fig_tsne.add_annotation(
        x=tsne_2d_result[len(model_embeddings) + i, 0], 
        y=tsne_2d_result[len(model_embeddings) + i, 1], 
        text=f"Q{i+1}", 
        showarrow=True,
        arrowhead=2
    )

# Plotting with 3D PCA
fig_pca_3d = px.scatter_3d(
    x=pca_3d_result[:, 0], 
    y=pca_3d_result[:, 1], 
    z=pca_3d_result[:, 2], 
    color=labels, 
    hover_data={'label': hover_texts},
    labels={'x': 'PCA Component 1', 'y': 'PCA Component 2', 'z': 'PCA Component 3'},
    title="Embeddings Visualization with 3D PCA"
)

# Adding annotations for questions in 3D PCA plot
for i, question in enumerate(questions):
    fig_pca_3d.add_trace(go.Scatter3d(
        x=[pca_3d_result[len(model_embeddings) + i, 0]],
        y=[pca_3d_result[len(model_embeddings) + i, 1]],
        z=[pca_3d_result[len(model_embeddings) + i, 2]],
        text=f"Q{i+1}",
        mode='markers+text',
        textposition='top center',
        showlegend=False
    ))

# Plotting with 3D t-SNE
fig_tsne_3d = px.scatter_3d(
    x=tsne_3d_result[:, 0], 
    y=tsne_3d_result[:, 1], 
    z=tsne_3d_result[:, 2], 
    color=labels, 
    hover_data={'label': hover_texts},
    labels={'x': 't-SNE Component 1', 'y': 't-SNE Component 2', 'z': 't-SNE Component 3'},
    title="Embeddings Visualization with 3D t-SNE"
)

# Adding annotations for questions in 3D t-SNE plot
for i, question in enumerate(questions):
    fig_tsne_3d.add_trace(go.Scatter3d(
        x=[tsne_3d_result[len(model_embeddings) + i, 0]],
        y=[tsne_3d_result[len(model_embeddings) + i, 1]],
        z=[tsne_3d_result[len(model_embeddings) + i, 2]],
        text=f"Q{i+1}",
        mode='markers+text',
        textposition='top center',
        showlegend=False
    ))

# Show plots
fig_pca.show()
fig_tsne.show()
fig_pca_3d.show()
fig_tsne_3d.show()


'¿Cómo se diferencia la ubicación física de la operativa?' does not have enough similarity
Question: ¿Qué información se incluye en el informe de la evaluación de riesgos que ofrece la plataforma y cuál es su utilidad para la gestión de riesgos en la empresa?
Closest Model: Emergencia
Similarity: 0.93

'¿Hay indicadores relativos al cumplimiento de normas?' does not have enough similarity
'¿Qué papel juega la estructura organizativa en la funcionalidad general de la plataforma y cómo interactúa con otros módulos?' does not have enough similarity
Question: ¿Qué permite realizar la evaluación de riesgos?
Closest Model: Emergencia
Similarity: 0.96

Question: ¿Qué tipo de documentos se pueden almacenar y compartir en el espacio denominado 'Documentos' y cuál es su importancia dentro del contexto de la gestión empresarial en la plataforma?
Closest Model: Emergencia
Similarity: 0.90

Question: ¿Se pueden llevar a cabo auditorías internas?
Closest Model: Emergencia
Similarity: 0.91

'¿Cuál es

# Conclusiones de la visualización:

Las visualizaciones utilizando tanto PCA como t-SNE muestran que los clusters de las categorías están bastante cerca uno del otro. A pesar de que las preguntas parecen tener buenos valores de similitud, al analizar las respuestas obtenidas, observamos que no tienen mucho sentido en relación a las preguntas planteadas.

In [40]:
# !MARK: - Classes

class PretrainedModel(Model):
    def __init__(self, original_paragraphs, language='spanish', model_path='sbw_vectors.bin'):
        super().__init__(original_paragraphs, [], language)
        self.model = self._load_pretrained_model(model_path)

    def _load_pretrained_model(self, model_path):
        if os.path.exists(model_path):
            return KeyedVectors.load_word2vec_format(model_path, binary=True)
        else:
            raise FileNotFoundError(f"El modelo preentrenado no se encuentra en la ruta: {model_path}")

    def predict(self, query, similarity_threshold):
        query_embedding = self._get_sentence_embedding_pretrained(query)
        similarities = [(self._cosine_similarity(query_embedding, self._get_sentence_embedding_pretrained(' '.join(TextUtils.clean_and_tokenize(para, self.language)))), para) for para in self.original_paragraphs]
        filtered_and_sorted = sorted([sim for sim in similarities if sim[0] > similarity_threshold], key=lambda x: x[0], reverse=True)
        return filtered_and_sorted

    def _get_sentence_embedding_pretrained(self, sentence):
        words = TextUtils.clean_and_tokenize(sentence, self.language)
        word_vectors = [self.model[word] for word in words if word in self.model]
        if len(word_vectors) == 0:
            return np.zeros(self.model.vector_size)
        return np.mean(word_vectors, axis=0)

# !MARK: - Model Initialization

models = {}

for key, path in file_paths.items():
    with open(path, 'r') as file:
        file_contents[key] = file.read().split('\n\n')
    models[key] = PretrainedModel(file_contents[key])

In [45]:
# Collecting sentence embeddings for each model
model_embeddings = []
model_labels = []
models_list = []

for key, model in models.items():
    for paragraph in model.original_paragraphs:
        embedding = model._get_sentence_embedding_pretrained(paragraph)
        model_embeddings.append(embedding)
        model_labels.append(key)
    models_list.append(key)

# Collecting embeddings for questions and finding the closest model for each question
question_embeddings = []
question_closest_models = []
question_similarities = []
question_closest_paragraphs = []

for question in questions:
    query_embedding = model._get_sentence_embedding_pretrained(question)
    question_embeddings.append(query_embedding)
    
    # Find the closest model
    closest_model = None
    highest_similarity = -1
    closest_paragraph = ""
    for model_name, model in models.items():
        similarities = [
            (model._cosine_similarity(query_embedding, model._get_sentence_embedding_pretrained(paragraph)), paragraph)
            for paragraph in model.original_paragraphs
        ]
        max_similarity, best_paragraph = max(similarities, key=lambda x: x[0])
        if max_similarity > highest_similarity:
            highest_similarity = max_similarity
            closest_model = model_name
            closest_paragraph = best_paragraph
    
    question_closest_models.append(closest_model)
    question_similarities.append(highest_similarity)
    question_closest_paragraphs.append(closest_paragraph)

# Printing questions and their related data based on similarity
for question, closest_model, similarity in zip(questions, question_closest_models, question_similarities):
    if similarity > 0.75:
        print(f"Question: {question}")
        print(f"Closest Model: {closest_model}")
        print(f"Similarity: {similarity:.2f}")
        print()
    else:
        print(f"'{question}' does not have enough similarity")

# Combining model embeddings and question embeddings
all_embeddings = np.array(model_embeddings + question_embeddings)
labels = model_labels + ["Question"] * len(questions)
hover_texts = model_labels + [
    f"Question: {question}<br>Closest Model: {closest_model}<br>Similarity: {similarity:.2f}<br>Chunk: {closest_paragraph[:100]}..."
    for question, closest_model, similarity, closest_paragraph in zip(questions, question_closest_models, question_similarities, question_closest_paragraphs)
]

# Reducing dimensionality using PCA for 2D and 3D plots
pca_2d = PCA(n_components=2)
pca_2d_result = pca_2d.fit_transform(all_embeddings)

pca_3d = PCA(n_components=3)
pca_3d_result = pca_3d.fit_transform(all_embeddings)

# Reducing dimensionality using t-SNE for 2D and 3D plots
tsne_2d = TSNE(n_components=2, random_state=42)
tsne_2d_result = tsne_2d.fit_transform(all_embeddings)

tsne_3d = TSNE(n_components=3, random_state=42)
tsne_3d_result = tsne_3d.fit_transform(all_embeddings)

# Plotting with 2D PCA
fig_pca = px.scatter(
    x=pca_2d_result[:, 0], 
    y=pca_2d_result[:, 1], 
    color=labels, 
    hover_data={'label': hover_texts},
    labels={'x': 'PCA Component 1', 'y': 'PCA Component 2'},
    title="Embeddings Visualization with 2D PCA"
)

# Adding annotations for questions in 2D PCA plot
for i, question in enumerate(questions):
    fig_pca.add_annotation(
        x=pca_2d_result[len(model_embeddings) + i, 0], 
        y=pca_2d_result[len(model_embeddings) + i, 1], 
        text=f"Q{i+1}", 
        showarrow=True,
        arrowhead=2
    )

# Plotting with 2D t-SNE
fig_tsne = px.scatter(
    x=tsne_2d_result[:, 0], 
    y=tsne_2d_result[:, 1], 
    color=labels, 
    hover_data={'label': hover_texts},
    labels={'x': 't-SNE Component 1', 'y': 't-SNE Component 2'},
    title="Embeddings Visualization with 2D t-SNE"
)

# Adding annotations for questions in 2D t-SNE plot
for i, question in enumerate(questions):
    fig_tsne.add_annotation(
        x=tsne_2d_result[len(model_embeddings) + i, 0], 
        y=tsne_2d_result[len(model_embeddings) + i, 1], 
        text=f"Q{i+1}", 
        showarrow=True,
        arrowhead=2
    )

# Plotting with 3D PCA
fig_pca_3d = px.scatter_3d(
    x=pca_3d_result[:, 0], 
    y=pca_3d_result[:, 1], 
    z=pca_3d_result[:, 2], 
    color=labels, 
    hover_data={'label': hover_texts},
    labels={'x': 'PCA Component 1', 'y': 'PCA Component 2', 'z': 'PCA Component 3'},
    title="Embeddings Visualization with 3D PCA"
)

# Adding annotations for questions in 3D PCA plot
for i, question in enumerate(questions):
    fig_pca_3d.add_trace(go.Scatter3d(
        x=[pca_3d_result[len(model_embeddings) + i, 0]],
        y=[pca_3d_result[len(model_embeddings) + i, 1]],
        z=[pca_3d_result[len(model_embeddings) + i, 2]],
        text=f"Q{i+1}",
        mode='markers+text',
        textposition='top center',
        showlegend=False
    ))

# Plotting with 3D t-SNE
fig_tsne_3d = px.scatter_3d(
    x=tsne_3d_result[:, 0], 
    y=tsne_3d_result[:, 1], 
    z=tsne_3d_result[:, 2], 
    color=labels, 
    hover_data={'label': hover_texts},
    labels={'x': 't-SNE Component 1', 'y': 't-SNE Component 2', 'z': 't-SNE Component 3'},
    title="Embeddings Visualization with 3D t-SNE"
)

# Adding annotations for questions in 3D t-SNE plot
for i, question in enumerate(questions):
    fig_tsne_3d.add_trace(go.Scatter3d(
        x=[tsne_3d_result[len(model_embeddings) + i, 0]],
        y=[tsne_3d_result[len(model_embeddings) + i, 1]],
        z=[tsne_3d_result[len(model_embeddings) + i, 2]],
        text=f"Q{i+1}",
        mode='markers+text',
        textposition='top center',
        showlegend=False
    ))

# Show plots
fig_pca.show()
fig_tsne.show()
fig_pca_3d.show()
fig_tsne_3d.show()


'¿Cómo se diferencia la ubicación física de la operativa?' does not have enough similarity
Question: ¿Qué información se incluye en el informe de la evaluación de riesgos que ofrece la plataforma y cuál es su utilidad para la gestión de riesgos en la empresa?
Closest Model: Riesgos
Similarity: 0.88

'¿Hay indicadores relativos al cumplimiento de normas?' does not have enough similarity
Question: ¿Qué papel juega la estructura organizativa en la funcionalidad general de la plataforma y cómo interactúa con otros módulos?
Closest Model: Estructura Organizativa
Similarity: 0.86

Question: ¿Qué permite realizar la evaluación de riesgos?
Closest Model: Riesgos
Similarity: 0.86

Question: ¿Qué tipo de documentos se pueden almacenar y compartir en el espacio denominado 'Documentos' y cuál es su importancia dentro del contexto de la gestión empresarial en la plataforma?
Closest Model: Información
Similarity: 0.87

Question: ¿Se pueden llevar a cabo auditorías internas?
Closest Model: Emergencia

# Conclusiones de la visualización:

Ocurre el mismo problema que con el modelo anterior, los valores parecen estar en todos lados, no hay clusters bien definidos y eso se refleja en las respuestas obtenidas, que en su mayoria no tienen mucha relacion con la pregunta.