In [1]:
import nltk
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Fetch the NLTK data packages this notebook relies on, one after another.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\devch\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\devch\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\devch\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def load_glove_embeddings(file_path):
    """Load word vectors from a whitespace-separated text file.

    Each useful line has the form ``<word> <v1> <v2> ... <vn>``.

    Args:
        file_path: Path to the embeddings file (read as UTF-8).

    Returns:
        dict mapping each word (str) to a 1-D ``float32`` numpy array.
        If a word occurs more than once, the last occurrence wins.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    word_vectors = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            # Skip blank lines (previously an IndexError) and lines that carry
            # a token but no numbers (previously stored as an empty vector,
            # which breaks any later averaging of vectors).
            if len(values) < 2:
                continue
            word_vectors[values[0]] = np.asarray(values[1:], dtype='float32')
    return word_vectors

In [3]:
def tokenize_text(text):
    """Split ``text`` into sentences and word-tokenize each one in lowercase.

    Returns:
        A list with one entry per sentence; each entry is the list of word
        tokens produced from the lowercased sentence.
    """
    tokens_per_sentence = []
    for raw_sentence in nltk.sent_tokenize(text):
        lowered = raw_sentence.lower()
        tokens_per_sentence.append(nltk.word_tokenize(lowered))
    return tokens_per_sentence

---

In [3]:
#Read from Text file
def read_textfile(file_path):
    """Return the entire contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()
    

In [5]:
# Notebook-level paths. Both are relative, so they resolve against the
# kernel's working directory.
glove_file_path = 'glove.42B.300d.txt'  # Path to GloVe embeddings file
# NOTE(review): text_file_path is not referenced by any cell visible here;
# the later cells define their own `file_path` instead.
text_file_path = 'Data/alice_wonderland_chapter1.txt'

# Read the embeddings into the global `glove_embeddings`, which the
# ask_question_* functions below use directly.
print("Loading GloVe embeddings...")
glove_embeddings = load_glove_embeddings(glove_file_path)
print("GloVe embeddings loaded.")

Loading GloVe embeddings...


GloVe embeddings loaded.


In [6]:
# Function that returns processed text and original sentences in a list (2 outputs)

def preprocess_text(text):
    """Split text into sentences and build a cleaned version of each one.

    Cleaning lowercases the sentence, word-tokenizes it, and keeps only
    tokens that are purely alphanumeric (``str.isalnum``) and are not
    English stop words. Any token containing punctuation, such as a hyphen,
    is therefore dropped as a whole.

    Args:
        text: Raw text containing one or more sentences.

    Returns:
        A 2-tuple ``(preprocessed_sentences, sentences)``:
        ``sentences`` is the list of original sentences and
        ``preprocessed_sentences`` is the parallel list of cleaned sentences,
        each a single space-joined string (possibly empty).
    """
    sentences = sent_tokenize(text)
    # Build the stop-word set once: it is the same for every sentence, so
    # rebuilding it inside the loop only repeats the corpus lookup.
    stop_words = set(stopwords.words('english'))
    preprocessed_sentences = []
    for sentence in sentences:
        tokens = word_tokenize(sentence.lower())
        filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
        preprocessed_sentences.append(' '.join(filtered_tokens))
    return preprocessed_sentences, sentences

In [7]:
def top_n_sentences_origs(text_sentences,processed_text,processed_query,embeddings, top_n):
    """Rank document sentences against a query with TF-IDF and GloVe scores.

    The score of a sentence is the equal-weight average of
    (a) the cosine similarity between TF-IDF vectors and
    (b) the cosine similarity between mean word-embedding vectors.

    Args:
        text_sentences: Original sentences, index-aligned with
            ``processed_text``; these are what gets returned.
        processed_text: List of cleaned sentences (space-joined tokens).
        processed_query: Cleaned query, either a list of cleaned sentences
            or a single string. A multi-sentence query is treated as one
            document.
        embeddings: Mapping from word to its embedding vector.
        top_n: Number of sentences to return.

    Returns:
        Up to ``top_n`` entries of ``text_sentences``, most similar first.
        Empty when ``top_n <= 0`` or there are no sentences.

    Raises:
        ValueError: propagated from ``TfidfVectorizer`` when the document
            yields an empty vocabulary.
    """
    if isinstance(processed_query, str):
        processed_query = [processed_query]
    # Guard explicitly: `scores[-0:]` would select every sentence, not none.
    if top_n <= 0 or len(processed_text) == 0:
        return []

    # Collapse the query to one document so it yields exactly one row/vector.
    query_text = ' '.join(processed_query)

    # TF-IDF similarity. The query is transformed with the vectorizer fitted
    # on the document so that it is weighted by the document's IDF values.
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(processed_text)
    query_tfidf_vector = tfidf_vectorizer.transform([query_text])
    tfidf_similarities = cosine_similarity(query_tfidf_vector, tfidf_matrix).flatten()

    # Embedding dimensionality, needed for the zero-vector fallback below.
    embedding_dim = len(next(iter(embeddings.values()))) if embeddings else 1

    def mean_embedding(sentence):
        # Average the vectors of known words. A sentence with no known word
        # gets a zero vector (cosine similarity 0) instead of being skipped,
        # which keeps every score aligned with its sentence index.
        vectors = [embeddings[word] for word in sentence.split() if word in embeddings]
        if vectors:
            return np.mean(vectors, axis=0)
        return np.zeros(embedding_dim, dtype='float32')

    glove_sentence_vectors = np.vstack([mean_embedding(sentence) for sentence in processed_text])
    query_glove_vector = mean_embedding(query_text)
    glove_similarities = cosine_similarity([query_glove_vector], glove_sentence_vectors)[0]

    # Combine TF-IDF and GloVe similarity with equal weight.
    final_cosine_similarities = 0.5 * tfidf_similarities + 0.5 * glove_similarities

    # Indices of the top_n highest scores, best first.
    top_indices = final_cosine_similarities.argsort()[-top_n:][::-1]
    return [text_sentences[i] for i in top_indices]


In [8]:
def top_n_sentences_origs_alice(text_sentences,processed_text,processed_query,embeddings, top_n):
    """Rank document sentences against a query with TF-IDF and GloVe scores.

    The score of a sentence is the equal-weight average of the TF-IDF cosine
    similarity and the cosine similarity of mean word-embedding vectors.

    Sentences with no known embedding receive a zero vector so that both
    score arrays always have one entry per sentence. This replaces the
    earlier approach of dropping the last TF-IDF score, which only lined the
    shapes up when exactly one sentence lacked embeddings and shifted scores
    onto the wrong sentences unless that sentence was the last one.

    Args:
        text_sentences: Original sentences, index-aligned with
            ``processed_text``; these are what gets returned.
        processed_text: List of cleaned sentences (space-joined tokens).
        processed_query: Cleaned query, either a list of cleaned sentences
            or a single string. A multi-sentence query is treated as one
            document.
        embeddings: Mapping from word to its embedding vector.
        top_n: Number of sentences to return.

    Returns:
        Up to ``top_n`` entries of ``text_sentences``, most similar first.
        Empty when ``top_n <= 0`` or there are no sentences.

    Raises:
        ValueError: propagated from ``TfidfVectorizer`` when the document
            yields an empty vocabulary.
    """
    if isinstance(processed_query, str):
        processed_query = [processed_query]
    # Guard explicitly: `scores[-0:]` would select every sentence, not none.
    if top_n <= 0 or len(processed_text) == 0:
        return []

    # Collapse the query to one document so it yields exactly one row/vector.
    query_text = ' '.join(processed_query)

    # TF-IDF similarity. The query is transformed with the vectorizer fitted
    # on the document so that it is weighted by the document's IDF values.
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(processed_text)
    query_tfidf_vector = tfidf_vectorizer.transform([query_text])
    tfidf_similarities = cosine_similarity(query_tfidf_vector, tfidf_matrix).flatten()

    # Embedding dimensionality, needed for the zero-vector fallback below.
    embedding_dim = len(next(iter(embeddings.values()))) if embeddings else 1

    def mean_embedding(sentence):
        # Average the vectors of known words; fall back to a zero vector
        # (cosine similarity 0) when the sentence has none.
        vectors = [embeddings[word] for word in sentence.split() if word in embeddings]
        if vectors:
            return np.mean(vectors, axis=0)
        return np.zeros(embedding_dim, dtype='float32')

    glove_sentence_vectors = np.vstack([mean_embedding(sentence) for sentence in processed_text])
    query_glove_vector = mean_embedding(query_text)
    glove_similarities = cosine_similarity([query_glove_vector], glove_sentence_vectors)[0]

    # Combine TF-IDF and GloVe similarity with equal weight.
    final_cosine_similarities = 0.5 * tfidf_similarities + 0.5 * glove_similarities

    # Indices of the top_n highest scores, best first.
    top_indices = final_cosine_similarities.argsort()[-top_n:][::-1]
    return [text_sentences[i] for i in top_indices]


---

### Alice in Wonderland

In [9]:
def ask_question_alice(file_path, query, top_n=3):
    """Print the sentences of a text file that best match ``query``.

    Reads the file, preprocesses both the text and the query, ranks the
    sentences with ``top_n_sentences_origs_alice`` using the notebook-level
    ``glove_embeddings``, and prints the results.

    Args:
        file_path: Path of the text file to search.
        query: Natural-language question.
        top_n: Number of sentences to request (default 3, the value that was
            previously hard-coded).

    Returns:
        None; the output is printed.
    """
    text = read_textfile(file_path)
    processed_text, orig_sentences = preprocess_text(text)
    processed_query, _ = preprocess_text(query)
    print("Query: ", query)
    top_sentences = top_n_sentences_origs_alice(orig_sentences, processed_text, processed_query, glove_embeddings, top_n)
    # Report the number actually requested; the banner used to say "top 5"
    # although only 3 sentences were asked for.
    print("top", top_n, "sentences are: \n")
    for sentence in top_sentences:
        print(sentence)
    print("------------------\n")


#### Try the function on chapter 1 of Alice in Wonderland

In [10]:
# Text to search; the path is relative to the kernel's working directory.
file_path = "Data/alice_wonderland_chapter1.txt"

# Questions to run against the text, one ask_question_alice call per entry.
queries = ["What was the three-legged table made of?",
           "What strange thing did Alice notice about the key she found?",
           "What did Alice think when tumbling down stairs?"]

# Each call re-reads and re-preprocesses the file before printing its results.
for query in queries:
    ask_question_alice(file_path, query)

Query:  What was the three-legged table made of?
top 5 sentences are: 

Suddenly she came upon a little three-legged table, all made of solid glass; there was nothing on it except a tiny golden key, and Alice’s first thought was that it might belong to one of the doors of the hall; but, alas!
when she got to the door, she found she had forgotten the little golden key, and when she went back to the table for it, she found she could not possibly reach it: she could see it quite plainly through the glass, and she tried her best to climb up one of the legs of the table, but it was too slippery; and when she had tired herself out with trying, the poor little thing sat down and cried.
Why, there’s hardly enough of me left to make one respectable person!”

Soon her eye fell on a little glass box that was lying under the table: she opened it, and found in it a very small cake, on which the words “EAT ME” were beautifully marked in currants.
------------------

Query:  What strange thing did Al

---

### Psychology Textbook

In [11]:
def ask_question_psych(file_path, query, top_n=3):
    """Print the sentences of a text file that best match ``query``.

    Reads the file, preprocesses both the text and the query, ranks the
    sentences with ``top_n_sentences_origs`` using the notebook-level
    ``glove_embeddings``, and prints each result followed by a blank line.

    Args:
        file_path: Path of the text file to search.
        query: Natural-language question.
        top_n: Number of sentences to request (default 3, the value that was
            previously hard-coded).

    Returns:
        None; the output is printed.
    """
    text = read_textfile(file_path)
    processed_text, orig_sentences = preprocess_text(text)
    processed_query, _ = preprocess_text(query)
    print("Query: ", query)
    # Use the ranking function defined in this notebook. The previously
    # referenced name `top_n_sentences_wo_minus1_origs` is not defined in any
    # visible cell, so it fails with NameError on a fresh kernel.
    top_sentences = top_n_sentences_origs(orig_sentences, processed_text, processed_query, glove_embeddings, top_n)
    print("top n sentences are: \n")
    for sentence in top_sentences:
        print(sentence,"\n")
    print("----------\n")

In [12]:
# Text to search; the path is relative to the kernel's working directory.
# NOTE: this rebinds the `file_path` global set in the Alice cell above.
file_path = "Data/psych_text.txt"

# Questions to run against the text, one ask_question_psych call per entry.
# This also rebinds the `queries` global used by the Alice cell.
queries = ["how are ideas traditionally perceived in contrast to the physical world?",
           "According to Descartes, what is the nature of the mind or soul?",
           "What distinction is made between reality and truth in the passage?",
           "How does the passage explain the relationship between sensation and image?",
           "What is emphasized regarding the continuity between perception and ideation?"
           ]

# Each call re-reads and re-preprocesses the file before printing its results.
for query in queries:
    ask_question_psych(file_path, query)

Query:  how are ideas traditionally perceived in contrast to the physical world?


top n sentences are: 

So long as one does not carefully analyse the value of ideas, one remains under the impression that ideas form a world apart, which is sharply distinguished from the physical world, and behaves towards it as an antithesis. 

We can now consider the world [87]of ideas as a physical world; but it is one of a peculiar nature, which is not, like the other, accessible to all, and is subject to its own laws, which are laws of association. 

By these very different characteristics, it separates itself so sharply from the outer world that all endeavour to bring the two together seems shocking; and it is very easy to understand that many minds should wish to remain faithful to the conception that ideas form a mental or moral world. 

----------

Query:  According to Descartes, what is the nature of the mind or soul?
top n sentences are: 

Descartes, in his Discours de la Méthode (4th part), remarking that he may pretend "not to have a body, and that there is no world or p