In [7]:
# import modules
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import sent_tokenize , word_tokenize
import glob
import re
import os
import numpy as np
import sys

In [8]:
# to keep these words unchangable we use python set
# as our docs are in english we use english stopwords
Stopwords = set(stopwords.words('english'))

In [9]:
def get_unique_word_freq(word_list):
    '''
    takes word list and returns unique-words with their freq in dict
    '''
    unqiue_words=list()
    words_freq={}
    # store unqiue words in list
    for word in word_list:
        if word not in unqiue_words:
            unqiue_words.append(word)
    # get unique words freq from all words list using count
    for unqiue_word in unqiue_words:
        # NOTE: this same method can be used to get freq of a particular word in given document
        words_freq[unqiue_word]=word_list.count(unqiue_word) # used count method of lists
    
    return words_freq

In [10]:
class Node:
    """to init a node with docID and word_freq in that document"""

    def __init__(self,doc_id,word_freq=None):
        self.doc_id=doc_id
        self.word_freq=word_freq
        self.next=None

class LinkedList:
    '''
    init linked list class
    '''
    def __init__(self,head=None):
        self.head=head

In [11]:
test_corpora_dir='test-corpora'
english_corpora_dir='english-corpora'
porter=PorterStemmer()

In [12]:
words_global=list() # to store words from all docs
words_freq_global_dict={} # to store word:freq from all docs
indexed_files={}

for i,file in enumerate(os.listdir(test_corpora_dir)):
    with open(test_corpora_dir+'/'+file,'r') as f:
        corpora_file_str=f.read() # text str

    """remove css code lines"""
    css_regex=re.compile(r'.mw.*}')
    # substitue regex expression by ''
    corpora_file_str=css_regex.sub('',corpora_file_str)

    """remove html tag lines"""
    html_regex=re.compile(r'<.*>')
    # substitue regex expression by ''
    corpora_file_str=html_regex.sub('',corpora_file_str)

    """remove special characters and only keep a-z;A-Z;0-9;space"""
    '''
    to avoid deprecation warning used one more backslash to escape backslash
    so \s -> \\s
    '''
    special_regex=re.compile('[^a-zA-Z0-9\\s]')
    # substitue regex expression by ''
    corpora_file_str=special_regex.sub('',corpora_file_str)

    """remove digits for file"""
    digit_regex=re.compile('\d')
    # substitue regex expression by ''
    corpora_file_str=digit_regex.sub('',corpora_file_str)

    sentence_tokens=sent_tokenize(corpora_file_str)
    # print(len(sentence_tokens))
    word_tokens=word_tokenize(corpora_file_str)
    # print(len(word_tokens))

    """avoid single characters and lower them and remove stopwords"""
    # TODO: what about special cases like UP > up or PIN > pin 
    # TODO: incase of tf-idf stopwords do not matter. 
    # for this should keep them? for cases "like to be or not to be"
    word_tokens=[porter.stem(word.lower()) for word in word_tokens if len(word)>1 and word not in Stopwords]
    # print(len(word_tokens))

    words_freq_global_dict.update(get_unique_word_freq(word_tokens)) # used update method of dict

    indexed_files[i+1]=file

unique_words_global=set(words_freq_global_dict.keys()) # used set to keep words unchangable



In [13]:
"""create inverted index linked list with freq count"""
inverted_index_data={}
for word in unique_words_global:
    inverted_index_data[word]=LinkedList()
    inverted_index_data[word].head=Node(1,Node)

for i,file in enumerate(os.listdir(test_corpora_dir)):
    with open(test_corpora_dir+'/'+file,'r') as f:
        corpora_file_str=f.read() # text str

    """remove css code lines"""
    css_regex=re.compile(r'.mw.*}')
    # substitue regex expression by ''
    corpora_file_str=css_regex.sub('',corpora_file_str)

    """remove html tag lines"""
    html_regex=re.compile(r'<.*>')
    # substitue regex expression by ''
    corpora_file_str=html_regex.sub('',corpora_file_str)

    """remove special characters and only keep a-z;A-Z;0-9;space"""
    '''
    to avoid deprecation warning used one more backslash to escape backslash
    so \s -> \\s
    '''
    special_regex=re.compile('[^a-zA-Z0-9\\s]')
    # substitue regex expression by ''
    corpora_file_str=special_regex.sub('',corpora_file_str)

    """remove digits for file"""
    digit_regex=re.compile('\d')
    # substitue regex expression by ''
    corpora_file_str=digit_regex.sub('',corpora_file_str)

    sentence_tokens=sent_tokenize(corpora_file_str)
    # print(len(sentence_tokens))
    word_tokens=word_tokenize(corpora_file_str)
    # print(len(word_tokens))

    """avoid single characters and lower them and remove stopwords"""
    # TODO: what about special cases like UP > up or PIN > pin 
    # TODO: incase of tf-idf stopwords do not matter. 
    # for this should keep them? for cases "like to be or not to be"
    word_tokens=[porter.stem(word.lower()) for word in word_tokens if len(word)>1 and word not in Stopwords]
    # print(len(word_tokens))

    tmp_word_freq_of_doc=get_unique_word_freq(word_tokens)

    for word in tmp_word_freq_of_doc.keys():
        tmp_LinkedList=inverted_index_data[word].head
        while tmp_LinkedList.next is not None:
            tmp_LinkedList=tmp_LinkedList.next
        tmp_LinkedList.next=Node(i+1,tmp_word_freq_of_doc[word])


In [25]:
query_input=input('Query > ')
tokenized_query=[word.lower() for word in word_tokenize(query_input)]


"""seperate logic and search terms"""
bool_words=list()
search_words=list()

for word in tokenized_query:
    if word != "and" and word != "or" and word != "not":
        search_words.append(word)
    else:
        bool_words.append(word)


total_documents=len(indexed_files)

query_word_zero_one=list()
for word in search_words:
    if word in unique_words_global:
        tmp_zero_one=[0]*total_documents
        curr_linkedlist=inverted_index_data[word].head
        while curr_linkedlist.next is not None:
            tmp_zero_one[curr_linkedlist.next.doc_id-1]=1
            curr_linkedlist=curr_linkedlist.next
        query_word_zero_one.append(tmp_zero_one)
    else:
        print(f'word > {word} < is not found in any document')
        # sys.exit() 
"""create a merged boolean(zero-one) list using bitwise operations"""
# try:
for word in bool_words:
    zero_one_list1=query_word_zero_one[0]
    zero_one_list2=query_word_zero_one[1]
    # implement and using '&'
    if word == 'and':
        bitwise_logic=[l1 & l2 for (l1,l2) in zip(zero_one_list1,zero_one_list2)]
        query_word_zero_one.remove(zero_one_list1)
        query_word_zero_one.remove(zero_one_list2)
        query_word_zero_one.insert(0,bitwise_logic)
    # implement or using '|'
    elif word == 'or':
        bitwise_logic=[l1 | l2 for (l1,l2) in zip(zero_one_list1,zero_one_list2)]
        query_word_zero_one.remove(zero_one_list1)
        query_word_zero_one.remove(zero_one_list2)
        query_word_zero_one.insert(0,bitwise_logic)
    # implement not using 'not'
    elif word == 'not':
        bitwise_logic=[int(not l1 == True) for l1 in zero_one_list2]
        query_word_zero_one.remove(zero_one_list2)
        query_word_zero_one.remove(zero_one_list1)
        bitwise_logic=[l1 & l2 for (l1,l2) in zip(zero_one_list1,bitwise_logic)]
    query_word_zero_one.insert(0,bitwise_logic)
# except IndexError:
    # print()
        
query_files_result=list()
try:
    for i,zero_one in enumerate(query_word_zero_one[0]):
        if zero_one==1:
            query_files_result.append(indexed_files[i+1]) # recall indexed_files is a dict
    print(f'{query_input} is present in {len(query_files_result)} files and they are \n {query_files_result}')
except IndexError:
    print(f'No files for query > {query_input}')

word > sunil < is not found in any document
ram and ram and sunil is present in 24 files and they are 
 ['C00059.txt', 'C00052.txt', 'C00050.txt', 'C00027.txt', 'C00030.txt', 'C00042.txt', 'C00012.txt', 'C00078.txt', 'C00011.txt', 'C00015.txt', 'C00094.txt', 'C00095.txt', 'C00068.txt', 'C00049.txt', 'C00071.txt', 'C00016.txt', 'C00086.txt', 'C00066.txt', 'C00021.txt', 'C00006.txt', 'C00018.txt', 'C00073.txt', 'C00090.txt', 'C00053.txt']


In [15]:
len(unique_words_global)

37088

In [16]:
porter=PorterStemmer()
porter.stem('Watchmaker')

'watchmak'