In [3]:
import os
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk

In [11]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shahe\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [None]:
lmao = ['lmoa', 'bozen-bolzano']
print(lmao.index('bozen-bolzano'))

In [6]:
def get_stopwords():
    """
    This function is used to extract stopwords from 'Stopword-List.txt' file.

    It reads each line from the file, and if the line is not empty, it appends the line to the stopwords list.
    The function continues this process until it reaches the end of the file. Assumes the file is in your current working directory.

    Returns:
        stopwords (list): A list of stopwords extracted from the file.
    """

    stopwords = []
    with open('Stopword-List.txt', 'r') as f: # The 'Stopword-List.txt' file is opened in read mode
        while True:
            text = f.readline() # Each line from the file is read one by one
            if not text: # If the line read is empty (which means end of file), the loop is broken
                break
            stopwords.append(text) # else append the read line to the stopwords list

    stopwords = [c.rstrip(' \n') for c in stopwords if c != '\n'] # A new list is created from stopwords, excluding any newline characters. Newline characters are also removed from the strings.
    return stopwords

def get_docIDs():
    """
    This function is used to extract document IDs based on the names of the files in the 'ResearchPapers' directory.

    It gets the current working directory and lists all the files in the 'ResearchPapers' directory. 
    It then extracts the document IDs from the names of these files, sorts them, and returns the sorted list.
    Assumes the 'ResearchPapers' folder is in your current working directory.

    Returns:
        docID (list): A sorted list of document IDs extracted from the file names in the 'ResearchPapers' directory.
    """

    curr_dir = os.getcwd() # get the current directory
    docID = [int(c.rstrip('.txt')) for c in os.listdir(curr_dir + '\ResearchPapers')] # extract the docIDs from the names of the files in the ResearchPapers directory
    docID.sort()
    return docID

def create_positional_index(total_tokens):
    """
    This function creates a positional index from the given tokens.

    Args:
        total_tokens (list): A list of processed tokens from which the positional index is to be created.

    Returns:
        terms (dict): A dictionary representing the positional index.
    """

    terms = {} # declare an empty dictionary for the positional index
    porter_stemmer = PorterStemmer() # initialize the stemmer

    # get the stopwords. Although stopwords is not going to be inserted in the positional index, we still need them to find the correct positions of the rest of the words
    stopwords = get_stopwords()
    doc = get_docIDs() # get the docIDs

    for i, tokens in enumerate(total_tokens): # Loop through each token in total_tokens
        for j, word in enumerate(tokens): # Loop through each word in tokens
            word = porter_stemmer.stem(word) # Stem the word
            if word[-1] == "'": # If the word ends with an apostrophe, remove it
                word = word.rstrip("'")
            if word not in stopwords: # Filter the stopwords
                if word in terms: # If the word is already in the positional index, add the docID to the index
                    if (i+1) in terms[word]: # If the docID is already in the index for that word, add the position
                        terms[word][i+1].append(j+1)
                    else: # else add the docID as well as the position
                        terms[word][i+1] = [j+1]
                else: # Add the word in the index along with the docID and the position
                    terms[word] = {i+1: [j+1]}

    return terms

def create_inverted_index(total_tokens):
    """
    This function creates an inverted index from the given tokens.

    Args:
        total_tokens (list): A list of tokens from which the inverted index is to be created.

    Returns:
        terms (dict): A dictionary representing the inverted index.
    """

    terms = {} # An empty dictionary for the inverted index
    porter_stemmer = PorterStemmer() # initialize the stemmer
    stopwords = get_stopwords() # get the stopwords
    doc = get_docIDs() # get the docIDs

    for i, tokens in enumerate(total_tokens): # Loop through each token in total_tokens and remove the stopwords
        total_tokens[i] = [c for c in tokens if c not in stopwords]

    for i, tokens in enumerate(total_tokens): # Loop through each token in total_tokens again
        for word in tokens: # Loop through each word in tokens
            word = porter_stemmer.stem(word) # Stem the word
            if word[-1] == "'": # remove the apostrophe
                word = word.rstrip("'")
            if word in terms: # If the word is already in the inverted index
                if (i+1) not in terms[word]: # Append the docID if it isn't in the index
                    terms[word].append(i+1)
            else: # Add the word among with the docID
                terms[word] = [i+1]

    return terms    

def preprocessing():

    """
    This function is used to preprocess the text files in the 'ResearchPapers' directory.

    It reads each file, tokenizes the text, removes punctuation and converts the text to lowercase. 
    It also splits the tokens at '.' and '-'. Assumes the 'ResearchPapers' folder is in your current working directory.

    Returns:
        total_tokens (list): A list of preprocessed tokens from all the files.
    """

    total_tokens = [] # An empty list to store the tokens from all the files
    doc = get_docIDs() # get the docIDs

    for i in doc: # iterate through each doc
        tokens = []
        with open('ResearchPapers/' + str(i) +'.txt', 'r') as f: # open the file corresponding to the current document ID
            while True:
                text = f.readline() # read a line from the file
                if not text: # if the line is empty (which means end of file), break the loop
                    break
                tokens += word_tokenize(text) # tokenize the line and add the tokens to the list
        
        j = 0
        while j < len(tokens): # Loop through each token
            # Remove symbols and numbers from the start and end of the token and convert it to lowercase (case folding)
            tokens[j] = tokens[j].lstrip('0123456789!@#$%^&*()-_=+[{]}\|;:\'",<.>/?`~').rstrip('0123456789!@#$%^&*()-_=+[{]}\|;:\'",<.>/?`~').lower()
            if '.' in tokens[j]: # if '.' exists in a word, split the word at that point and add the splitted words at the end of the tokens list while removing the original word
                word = tokens[j].split('.')
                del tokens[j]
                tokens.extend(word)
            elif '-' in tokens[j]: # do the same for words with '-'
                word = tokens[j].split('-')
                del tokens[j]
                tokens.extend(word)
            j += 1 # move the index forward
        tokens = [c for c in tokens if c.isalpha()] # filter out any strings that contain symbols, numbers, etc.
        total_tokens.append(tokens) # add the processed tokens as a seperate list. Did this to keep track of which tokens appear in which docs (needed to construct indexes). List at index 0 indicate tokens found in doc 1 and so on.
    return total_tokens

def save_indexes():
    tokens = preprocessing()
    inverted_index = create_inverted_index(tokens)
    print(len(inverted_index))
    positional_index = create_positional_index(tokens)
    print(len(positional_index))

    with open('inverted_index.txt', 'w') as f:
        for key, value in inverted_index.items():
            f.write('{}:{}\n'.format(key, value))
    with open('positional_index.txt', 'w') as f:
        for key, value in positional_index.items():
            f.write('{}:{}\n'.format(key, value))

  docID = [int(c.rstrip('.txt')) for c in os.listdir(curr_dir + '\ResearchPapers')] # extract the docIDs from the names of the files in the ResearchPapers directory
  tokens[j] = tokens[j].lstrip('0123456789!@#$%^&*()-_=+[{]}\|;:\'",<.>/?`~').rstrip('0123456789!@#$%^&*()-_=+[{]}\|;:\'",<.>/?`~').lower()
  tokens[j] = tokens[j].lstrip('0123456789!@#$%^&*()-_=+[{]}\|;:\'",<.>/?`~').rstrip('0123456789!@#$%^&*()-_=+[{]}\|;:\'",<.>/?`~').lower()


In [7]:
save_indexes()

16278
16263


In [None]:
print(inverted_index[PorterStemmer().stem('feature')])
print(inverted_index[PorterStemmer().stem('selection')])
print(inverted_index[PorterStemmer().stem('redundency')])

In [None]:
print(inverted_index[PorterStemmer().stem('classification')])
print(inverted_index[PorterStemmer().stem('selection')])

In [None]:
print(inverted_index[PorterStemmer().stem('heart')])
print(inverted_index[PorterStemmer().stem('attack')])

In [None]:
query = 'cancer AND learning'
processed_query = query.split()
print(processed_query)

for i in processed_query:
    if i == 'AND' or i == 'OR' or i == 'NOT':
        condition.append(i)
    else:
        terms.append(i)