In [1]:
import os
import spacy
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
# Load spaCy's small English pipeline (en_core_web_sm)
nlp = spacy.load('en_core_web_sm')

In [2]:
# Advanced preprocessing function
def advanced_preprocess(text):
    """Return the lemmas of `text` after lowercasing it and parsing it with `nlp`.

    Tokens flagged as punctuation, whitespace or stop words are left out.
    """
    lemmas = []
    for tok in nlp(text.lower()):
        if tok.is_punct or tok.is_space or tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return lemmas


In [3]:
# Directory that holds the source .txt files
directory = "ArchivosWiki"

# Maximum number of characters handed to spaCy in a single call
max_chunk_size = 100000


def _split_into_chunks(text, max_size):
    """Yield consecutive pieces of `text`, each at most `max_size` characters long.

    A piece that is not the last one ends right after the last newline found in
    its window, or after the last space when the window has no newline, so a
    word is not cut in half at a fixed offset. When the window holds neither,
    the cut falls at `max_size`. Joining the pieces gives back `text` unchanged.
    """
    start = 0
    while start < len(text):
        end = min(start + max_size, len(text))
        if end < len(text):
            cut = text.rfind("\n", start, end)
            if cut <= start:
                cut = text.rfind(" ", start, end)
            if cut > start:
                end = cut + 1
        yield text[start:end]
        start = end


# Prepare the data
sentences = []  # one list of lemmas per chunk
contexts = []   # one stripped string per sentence detected by spaCy
# os.listdir() returns names in arbitrary order; sorting keeps the row order of
# `contexts` the same on every run.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".txt"):
        continue
    filepath = os.path.join(directory, filename)
    with open(filepath, "r", encoding='utf-8') as file:
        content = file.read()

    for chunk in _split_into_chunks(content, max_chunk_size):
        # Lemmatized tokens of the chunk
        sentences.append(advanced_preprocess(chunk))

        # Split the chunk into sentences with spaCy, skipping the ones that
        # are empty once stripped
        doc = nlp(chunk)
        for sent in doc.sents:
            sentence_text = sent.text.strip()
            if sentence_text:
                contexts.append(sentence_text)


In [4]:
# Wrap the collected contexts in a one-column DataFrame
df = pd.DataFrame(data={"text": contexts})


In [5]:
# Fit TF-IDF on the contexts; the learned IDF values are used as per-word
# weights for the embeddings.
# The tokenizer is passed directly instead of through a lambda wrapper, and
# token_pattern=None states that the default pattern is intentionally unused,
# which avoids scikit-learn's "token_pattern will not be used" warning.
vectorizer = TfidfVectorizer(tokenizer=advanced_preprocess, lowercase=True, token_pattern=None)
tfidf_matrix = vectorizer.fit_transform(df['text'])
tfidf_feature_names = vectorizer.get_feature_names_out()

# Build a word -> IDF dictionary
idf_dict = dict(zip(tfidf_feature_names, vectorizer.idf_))




Paso 1: Cargar los Embeddings de GloVe
Primero, necesitamos cargar los embeddings desde el archivo glove.6B.300d.txt:

In [6]:
# Function that loads GloVe embeddings from a text file
def load_glove_embeddings(file_path, vector_size=None):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each usable line has the form ``word v1 v2 ... vN`` separated by whitespace.
    A line is skipped when it is blank, has no values after the word, or
    contains a value that cannot be parsed as a float.

    Args:
        file_path: Path of the UTF-8 encoded embeddings file.
        vector_size: Optional expected number of values per word. When given,
            lines with a different number of values are skipped as well.
            ``None`` (the default) accepts any length.

    Returns:
        dict mapping each word to a 1-D ``float32`` numpy array.
    """
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Blank lines and lines with only a word carry no vector
            if len(values) < 2:
                continue
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Non-numeric field, e.g. a token that itself contains a space
                continue
            if vector_size is not None and coefs.shape[0] != vector_size:
                continue
            embeddings_index[values[0]] = coefs
    return embeddings_index

# Load the embeddings from the glove.6B.300d text file
glove_path = './ModelosEmbeddings/Glove/glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(glove_path)


Paso 2: Obtener Embeddings de Oraciones Ponderados por IDF
La función get_weighted_sentence_embedding promedia los vectores GloVe de los tokens de una oración, ponderando cada vector por el valor IDF de su palabra:

In [7]:
def get_weighted_sentence_embedding(sentence, embeddings_index, idf_dict, vector_size=300):
    """Return the IDF-weighted average of the word vectors of `sentence`.

    The sentence is tokenized with `advanced_preprocess`. Only tokens present
    in both `embeddings_index` and `idf_dict` contribute. If none qualifies,
    a zero vector of length `vector_size` is returned.
    """
    usable = [
        (embeddings_index[tok], idf_dict[tok])
        for tok in advanced_preprocess(sentence)
        if tok in embeddings_index and tok in idf_dict
    ]
    if not usable:
        return np.zeros(vector_size)

    vectors, idf_weights = zip(*usable)
    vectors = np.array(vectors)
    # Column shape so each weight scales one whole row of `vectors`
    idf_weights = np.array(idf_weights).reshape(-1, 1)
    return np.sum(vectors * idf_weights, axis=0) / np.sum(idf_weights)



In [8]:
# Compute the weighted GloVe embedding of every context
df['embeddings'] = df['text'].apply(
    get_weighted_sentence_embedding,
    args=(glove_embeddings, idf_dict),
    vector_size=300,
)



In [9]:
def find_most_relevant_contexts(question, df, embeddings_index, idf_dict, top_n=3, vector_size=300):
    """Return the texts of the `top_n` contexts most similar to `question`.

    Args:
        question: Query string.
        df: DataFrame with a 'text' column and an 'embeddings' column holding
            one vector per row.
        embeddings_index: Mapping word -> vector.
        idf_dict: Mapping word -> IDF weight.
        top_n: Maximum number of contexts to return.
        vector_size: Length of the zero vector used when the question has no
            usable token.

    Returns:
        List of context strings ordered from most to least similar. The list
        is empty when `df` has no rows, when `top_n` is not positive, or when
        the question embedding is all zeros.

    Side effects:
        Writes the similarity of every row into ``df['similarity']``.
    """
    if len(df) == 0:
        # np.vstack would raise on an empty sequence
        return []

    question_embedding = get_weighted_sentence_embedding(question, embeddings_index, idf_dict, vector_size)
    context_embeddings = np.vstack(df['embeddings'].values)
    similarities = cosine_similarity([question_embedding], context_embeddings)[0]
    # Kept so that code reading df['similarity'] after the call still works
    df['similarity'] = similarities

    # top_n == 0 used to return every row because [-0:] is the full slice.
    # With an all-zero question vector a ranking would be arbitrary, so
    # nothing is reported.
    if top_n <= 0 or not np.any(question_embedding):
        return []

    top_indices = np.argsort(similarities)[::-1][:top_n]
    return df.iloc[top_indices]['text'].tolist()


In [10]:
# Example usage
# NOTE(review): this question is in Spanish, while the notebook loads an English
# spaCy model and the glove.6B vectors. Its tokens are probably missing from the
# vocabulary, which would make the question embedding all zeros and the ranking
# arbitrary — consider asking in English; confirm against the printed results.
question = "¿Qué es la fotosíntesis?"

# Find the most relevant contexts
most_relevant_contexts = find_most_relevant_contexts(question, df, glove_embeddings, idf_dict, top_n=3, vector_size=300)

# Print the most relevant contexts
print("Pregunta:", question)
print("Contextos más relevantes:")
for idx, context in enumerate(most_relevant_contexts, 1):
    print(f"{idx}. {context}")


Pregunta: ¿Qué es la fotosíntesis?
Contextos más relevantes:
1. ==See also==

===Structure-related concepts===

(For example, in )

==Footnotes==

===Works cited===

==Further reading==

==External links==
2. John Potts of Potts, Oliver and Potts used a copper-engraved master to produce rollers to transfer the inks.
3. The production volume for printed cloth in Lancashire in 1750 was estimated at 50,000 pieces of 30; in 1850, it was 20,000,000 pieces.


In [11]:
import pickle

# Save the whole DataFrame (all current columns) to a pickle file
with open('context_embeddings.pkl', 'wb') as f:
    pickle.dump(df, f)


In [12]:
# Stack the per-row vectors of the 'embeddings' column into one numpy matrix
embeddings_matrix = np.vstack(df['embeddings'].values)

# Save the matrix to a .npy file
np.save('embeddingsGlove.npy', embeddings_matrix)


In [13]:
# Save every column except 'embeddings' to CSV, without the index.
# That is 'text' plus, if find_most_relevant_contexts has been run on df,
# the 'similarity' column that function writes.
df.drop('embeddings', axis=1).to_csv('context_dataGlove.csv', index=False)


In [14]:
# Save idf_dict (word -> IDF value) to a pickle file
with open('idf_dict.pkl', 'wb') as f:
    pickle.dump(idf_dict, f)


In [15]:
# Save the loaded GloVe dictionary (word -> vector) to a pickle file
with open('glove_embeddings.pkl', 'wb') as f:
    pickle.dump(glove_embeddings, f)


#  LECTURA DE LOS ARCHIVOS GENERADOS

In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import spacy

# Load the spaCy language pipeline (en_core_web_sm)
nlp = spacy.load('en_core_web_sm')

# Preprocessing function
def advanced_preprocess(text):
    """Lowercase `text`, parse it with `nlp`, and return the lemmas of the kept tokens.

    Punctuation, whitespace and stop-word tokens are discarded.
    """
    def _keep(token):
        return not (token.is_punct or token.is_space or token.is_stop)

    return [token.lemma_ for token in filter(_keep, nlp(text.lower()))]

# Function that computes the weighted sentence embedding
def get_weighted_sentence_embedding(sentence, embeddings_index, idf_dict, vector_size=300):
    """Average the word vectors of `sentence`, weighting each by its IDF value.

    Tokens come from `advanced_preprocess`. A token is used only when it is a
    key of both `embeddings_index` and `idf_dict`. With no usable token the
    result is a zero vector of length `vector_size`.
    """
    known_words = [
        word
        for word in advanced_preprocess(sentence)
        if word in embeddings_index and word in idf_dict
    ]
    if not known_words:
        return np.zeros(vector_size)

    vectors = np.array([embeddings_index[word] for word in known_words])
    # One weight per row, shaped as a column so it broadcasts across the vector
    idf_column = np.array([idf_dict[word] for word in known_words]).reshape(-1, 1)
    return np.sum(vectors * idf_column, axis=0) / np.sum(idf_column)

# Function that finds the most relevant contexts
def find_most_relevant_contexts(question, df, embeddings_index, idf_dict, top_n=3, vector_size=300):
    """Return the `top_n` contexts most similar to `question`, with their scores.

    Args:
        question: Query string.
        df: DataFrame with a 'text' column and an 'embeddings' column holding
            one vector per row.
        embeddings_index: Mapping word -> vector.
        idf_dict: Mapping word -> IDF weight.
        top_n: Maximum number of contexts to return.
        vector_size: Length of the zero vector used when the question has no
            usable token.

    Returns:
        List of ``{'text': ..., 'similarity': ...}`` dicts ordered from most to
        least similar. The list is empty when `df` has no rows, when `top_n`
        is not positive, or when the question embedding is all zeros.

    Side effects:
        Writes the similarity of every row into ``df['similarity']``.
    """
    if len(df) == 0:
        # np.vstack would raise on an empty sequence
        return []

    question_embedding = get_weighted_sentence_embedding(question, embeddings_index, idf_dict, vector_size)
    context_embeddings = np.vstack(df['embeddings'].values)
    similarities = cosine_similarity([question_embedding], context_embeddings)[0]
    # The column is read back below to build the result records
    df['similarity'] = similarities

    # top_n == 0 used to return every row because [-0:] is the full slice.
    # With an all-zero question vector a ranking would be arbitrary, so
    # nothing is reported.
    if top_n <= 0 or not np.any(question_embedding):
        return []

    top_indices = np.argsort(similarities)[::-1][:top_n]
    return df.iloc[top_indices][['text', 'similarity']].to_dict(orient='records')

# Load the saved data
# NOTE(review): pickle.load and pd.read_pickle can execute arbitrary code while
# unpickling — only load files written by this notebook or another trusted source.
with open('idf_dict.pkl', 'rb') as f:
    idf_dict = pickle.load(f)

with open('glove_embeddings.pkl', 'rb') as f:
    glove_embeddings = pickle.load(f)

df = pd.read_pickle('context_embeddings.pkl')

# Function that answers a question
def answer_question(question, top_n=3):
    """Print the `top_n` best-matching contexts for `question` with their similarity.

    Uses the module-level `df`, `glove_embeddings` and `idf_dict`.
    """
    matches = find_most_relevant_contexts(question, df, glove_embeddings, idf_dict, top_n=top_n, vector_size=300)
    print("\nPregunta:", question)
    print("Contextos más relevantes:")
    idx = 0
    for result in matches:
        idx += 1
        print(f"{idx}. {result['text']} (Similitud: {result['similarity']:.2f})")

# Example usage
question = "What is the capital of France?"
answer_question(question, top_n=3)
