In [3]:
# Mini Search Engine

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import wordnet
from spellchecker import SpellChecker
from collections import defaultdict

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.7.2-py3-none-any.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2


In [None]:
class SearchEngine():
  def __init__(self):
    # constructor
    self.documents = {} # key is index of document, value is the actual words of the document

  def add_document(self, index, content):
    self.documents[index] = content

  '''def index_document(self, document):
    # Documents are keys, values are words contained in that doc
    words = self.getstem(document)
    for word in words:
      if word in self.index:
        self.inverted_index[word].append(document)
      else:
        self.inverted_index[word] = [document]'''

  def normalize(self, document):
    # tokenize the word
    document = document.lower()
    tokens = word_tokenize(document) # list of strings

    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    # stemmer.stem takes the tokens into its root form (i.e. jumping -> jump, dogs -> dog)

    return stemmed_tokens


  def search(self, query, numresults=3):
      query_words = self.normalize(query)
      matching_docs = defaultdict(int)
      for index, content in self.documents.items():
        doc_words = self.normalize(content)
        for word in query_words:
          if word in doc_words:
            matching_docs[index] += 1

      sorted_docs = sorted(matching_docs.items(), key=lambda pair: pair[1], reverse=True)

      sorted_docs = sorted_docs[:numresults]
      sorted_docs = [item1 for item1, item2 in sorted_docs]
      return sorted_docs



# Takes in a query, gives us terms that are synonyms to the query automatically without manually having to specify alternative words to the query.
class QueryExpander():
  def __init__(self):
    pass

  def get_syn(self, term):
    # find synonym for term
    synonyms = set()
    wordNetsynonyms = wordnet.synsets(term)
    for s in wordNetsynonyms:
      for lemma in s.lemmas():
        synonyms.add(lemma.name())

    return list(synonyms)

  def query_expand(self, query):
    query_terms = self.normalize(query)
    expanded_terms = []
    for term in query_terms:
      synonyms = self.get_syn(term)
      expanded_terms.extend(synonyms) #.extend merges two lists into one list
    return " ".join(expanded_terms)


  def normalize(self, query):
    query = query.lower()
    tokens = word_tokenize(query) # list of strings

    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
  # stemmer.stem takes the tokens into its root form (i.e. jumping -> jump, dogs -> dog)

    return stemmed_tokens

def main():
  searchengine = SearchEngine()
  searchengine.add_document("doc1", "NLP for beginners")
  searchengine.add_document("doc2", "NLP stands for Natural Language Processessing")
  searchengine.add_document("doc3", "Python Developers like PyTorch")
  searchengine.add_document("doc4", "dog cat jump")

  queryexpand = QueryExpander()
  while True:
    userinput = input("Enter your query: ")
    if userinput.lower() == "exit":
      break
    expandedQuery = queryexpand.query_expand(userinput)
    results = searchengine.search(expandedQuery)
    print(" ")
    print("expandedQuery: ", expandedQuery)
    if results:
      for i, result in enumerate(results, 1):
        print(result)
    else:
      print("No match found.")


if __name__ == "__main__":
  main()

 
expandedQuery:  natural_language_processing NLP human_language_technology
doc1
doc2
 
expandedQuery:  pawl weenie give_chase frank heel dog track click hound domestic_dog tail wiener firedog blackguard cad tag frankfurter frump wienerwurst chase_after bounder hot_dog andiron hotdog chase dog-iron trail detent go_after Canis_familiaris
doc4


In [15]:
search = SearchEngine()
search.add_document("doc1", "hello i'm troy.")
search.search("hello i'm troy")

['doc1']