In [1]:
import pandas as pd
import numpy as np
import string
import os
import re
import pickle
from sortedcontainers import SortedDict, SortedList, SortedSet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from collections import Counter

### Read from files

In [2]:
def getListOfFiles(directory):
    '''
    Collect the regular files that sit directly inside a directory.

    Parameters:
        directory: type(string)
            path of the directory to scan; sub-directories are not descended into

    returns: list holding, for every directory entry that is a regular file,
             the entry name joined onto directory (order is whatever
             os.listdir yields)
    '''

    # os.listdir gives bare entry names, so join each one onto the directory first
    candidate_paths = (os.path.join(directory, entry) for entry in os.listdir(directory))

    return [path for path in candidate_paths if os.path.isfile(path)]

### Preprocessing Functions

In [3]:
def lowercase(data):
    '''
    Parameters:
        data: type(string)

    returns: copy of data with its cased characters converted to lower case
    '''

    lowered = data.lower()
    return lowered

In [4]:
def perform_word_tokenize(corpus):
    '''
    Split a text into word-level tokens.

    Parameters:
        corpus: type(string)

    returns: whatever nltk's word_tokenize produces for corpus
    '''

    tokens = word_tokenize(corpus)
    return tokens

In [5]:
# Lazily-filled cache of the NLTK English stopword list, so the corpus is
# read and hashed once instead of once per processed document.
_ENGLISH_STOPWORDS = None


def remove_stopwords_from_tokens(tokens, stopwords_set=None):
    '''
    Drop every token that is a stopword.

    Parameters:
        tokens: type(list)
        stopwords_set: type(set), optional
            words to remove. When omitted (None), the NLTK English stopword
            list is used; it is loaded on first use and cached afterwards.

    returns: list of the tokens that are not in stopwords_set, in their
             original order
    '''
    global _ENGLISH_STOPWORDS

    if stopwords_set is None:
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stopwords_set = _ENGLISH_STOPWORDS

    tokens_sans_stopwords = [token for token in tokens if token not in stopwords_set]

    return tokens_sans_stopwords

In [6]:
def remove_punctuation_from_tokens(tokens):
    '''
    Strip punctuation characters out of every token.

    Parameters:
        tokens: type(list)

    returns: list of the same length as tokens in which each token has all
             characters of string.punctuation deleted; a token made only of
             punctuation becomes the empty string
    '''
    # The deletion table does not depend on the token, so build it once
    # rather than once per token.
    punctuation_table = str.maketrans('', '', string.punctuation)

    tokens_sans_punctuation = [token.translate(punctuation_table) for token in tokens]

    return tokens_sans_punctuation

In [7]:
def remove_blank_space_tokens(tokens):
    '''
    Parameters:
        tokens: type(list)

    returns: list of the tokens that are not the empty string, order preserved
    '''
    kept_tokens = []

    for token in tokens:
        if token != '':
            kept_tokens.append(token)

    return kept_tokens

In [8]:
def preprocess(corpus):
    '''
    Turn raw text into the set of its cleaned word tokens.

    Parameters:
        corpus: type(string)

    returns: set of the tokens that remain after lower-casing, word
             tokenization, stopword removal, punctuation stripping and
             removal of empty tokens (in that order)
    '''
    # Lower-case first, then tokenize (word_tokenize also takes care of whitespace)
    tokens = perform_word_tokenize(lowercase(corpus))

    # Stopwords are removed before punctuation is stripped; stripping can leave
    # empty strings behind, which the last step discards.
    cleanup_steps = (
        remove_stopwords_from_tokens,
        remove_punctuation_from_tokens,
        remove_blank_space_tokens,
    )
    for cleanup_step in cleanup_steps:
        tokens = cleanup_step(tokens)

    return set(tokens)

### Union Function

In [9]:
def union(l1,l2):
    '''
    Parameters:
        l1, l2: iterables of hashable items

    returns: list of the distinct items found in l1 or l2 (set order, i.e. arbitrary)
    '''
    first_items = set(l1)
    second_items = set(l2)
    return list(first_items | second_items)

### Intersection Function

In [10]:
def intersection(l1,l2):
    '''
    Parameters:
        l1, l2: iterables of hashable items

    returns: list of the distinct items found in both l1 and l2 (set order, i.e. arbitrary)
    '''
    first_items = set(l1)
    second_items = set(l2)
    return list(first_items & second_items)

### Calculate Jaccard Coefficient

In [11]:
def jaccardCoefficient(query, list_of_files, scores=None):
    '''
    Score every file against the query with the Jaccard coefficient:
    size of (document tokens intersected with query tokens) divided by
    size of (document tokens united with query tokens).

    Parameters:
        query: collection of query tokens, as produced by preprocess
        list_of_files: type(list)
            paths of the files to score
        scores: type(dict), optional
            mapping that receives file path -> coefficient. When omitted
            (None), the module-level jaccard_coefficient dict is filled,
            which is what existing callers rely on; that dict must exist
            before the call.

    returns: the dict that was filled
    '''
    if scores is None:
        scores = jaccard_coefficient

    for filePath in list_of_files:
        # 'with' closes the file even if reading fails; undecodable bytes are skipped
        with open(filePath, encoding="utf8", errors="ignore") as file:
            document_text = file.read()

        document_tokens = preprocess(document_text)
        all_tokens = union(document_tokens, query)

        if not all_tokens:
            # Empty document and empty query: nothing to compare, and the
            # formula would divide by zero.
            scores[filePath] = 0.0
            continue

        shared_tokens = intersection(document_tokens, query)
        scores[filePath] = len(shared_tokens) / len(all_tokens)

    return scores

### Finding top 5 relevant documents

In [12]:
# Directory holding the documents to rank. Written as a raw string because the
# Windows path is full of backslashes that must not be read as escape sequences.
DATASET_DIR = r"E:\Sem6\IR\Assignments\Assignment 2\IR2022_A2_47\Dataset\Humor,Hist,Media,Food"

# Get List of Files in Dataset
list_of_files = getListOfFiles(DATASET_DIR)

# jaccardCoefficient fills this module-level dict: file path -> coefficient
jaccard_coefficient = {}
sentence_query = input("Enter the query: ")
query = preprocess(sentence_query)      # Query Processing
jaccardCoefficient(query, list_of_files)

# most_common(5) yields the five (file path, coefficient) pairs with the highest coefficient
print(" Top 5 relevant documents based on Jaccard Coefficient ")
for file_path, score in Counter(jaccard_coefficient).most_common(5):
    print("{} --> {}".format(score, file_path))

Enter the query: american dream
 Top 5 relevant documents based on Jaccard Coefficient 
0.016129032258064516 --> E:\Sem6\IR\Assignments\Assignment 2\IR2022_A2_47\Dataset\Humor,Hist,Media,Food\p-law.hum
0.0136986301369863 --> E:\Sem6\IR\Assignments\Assignment 2\IR2022_A2_47\Dataset\Humor,Hist,Media,Food\psalm.reagan
0.0136986301369863 --> E:\Sem6\IR\Assignments\Assignment 2\IR2022_A2_47\Dataset\Humor,Hist,Media,Food\psalm_nixon
0.0136986301369863 --> E:\Sem6\IR\Assignments\Assignment 2\IR2022_A2_47\Dataset\Humor,Hist,Media,Food\psalm_re.aga
0.013333333333333334 --> E:\Sem6\IR\Assignments\Assignment 2\IR2022_A2_47\Dataset\Humor,Hist,Media,Food\oxymoron.txt
