In [1]:
# Importing all the neccessary libraries
import os
import nltk
import string
import math
import itertools 
from nltk.stem.porter import *
from nltk.corpus import stopwords
from collections import defaultdict
from nltk.stem import snowball, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import TweetTokenizer
from collections import defaultdict

In [2]:
# Downloading the nltk libraries
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\d4r18\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\d4r18\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\d4r18\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Initialise path variable
path = "dataset/VSM"

In [4]:
def string_preprocessing(final_string):
    tokenizer = TweetTokenizer()
    token_list = tokenizer.tokenize(final_string)
    
    # Initialise list of stopwords
    stwords = set(stopwords.words('english'))

    # Initilaise stemmer
    stemmer = snowball.SnowballStemmer('english')
    
    # Initialise lemmatiser
    wordnet_lemmatizer = WordNetLemmatizer()
    
    # Remove punctuations.
    table = str.maketrans('', '', '\t')
    token_list = [word.translate(table) for word in token_list]
    punctuations = (string.punctuation).replace("'", "")
    trans_table = str.maketrans('', '', punctuations)
    stripped_words = [word.translate(trans_table) for word in token_list]
    token_list = [str for str in stripped_words if str]
 
    # Change to lowercase.
    token_list =[word.lower() for word in token_list]

    # Stemmer the words
    token_list =[stemmer.stem(word) for word in token_list]
    
    # Lemmatize the words
    token_list =[wordnet_lemmatizer.lemmatize(word) for word in token_list]
    
    # Remove the words present in the stop words
    final_list = list()
    for word in token_list:
      if word not in stwords:
        final_list.append(word)

    # Return the final list
    return final_list

# Create the inverted index

In [28]:
def create_index(path):
    # Initialize the dictionary.
    pos_index = defaultdict(list)

    # List all files in database
    file_list=os.listdir(path)

    # Make the inverted index
    for doc_id,_file in enumerate(file_list):
        f = open(os.path.join(path,_file),  encoding="utf8")
        corpus = f.read()
        
        # Send the corpus to string preprocessing 
        final_token_list = string_preprocessing(corpus)
 
        # For position and term in the tokens.
        for pos, term in enumerate(final_token_list):
                    if term not in pos_index:
                      pos_index[term] = dict()
                      pos_index[term] = {'df': 0, 'posting_list': {}}
                      # Initialize a dictionary for the posting list
                      pos_index[term]["posting_list"] = defaultdict(list)

                    # Calculating the document frequency
                    if doc_id not in pos_index[term]['posting_list']:
                      pos_index[term]['df'] = pos_index[term]['df'] + 1
                      pos_index[term]['posting_list'][doc_id] = 0
                    
                    # Calculating the term frequency
                    pos_index[term]['posting_list'][doc_id] = pos_index[term]['posting_list'][doc_id] + 1
                    
    return pos_index 

## Driver code

In [70]:
final_index = create_index(path)
print("Length of the final_index : ", len(final_index))
# print(final_index.get())


# print(final_index)

print("First 10 entries of the final index -")
out = dict(itertools.islice(final_index.items(), 5))   
print(out)

Length of the final_index :  4942
First 10 entries of the final index -
{'c3': {'df': 62, 'posting_list': defaultdict(<class 'list'>, {0: 3, 1: 1, 2: 1, 3: 1, 4: 1, 67: 1, 163: 1, 239: 1, 286: 2, 362: 1, 437: 2, 595: 1, 596: 2, 638: 1, 679: 1, 736: 1, 737: 1, 930: 1, 1115: 1, 1116: 1, 1196: 1, 1362: 1, 1365: 1, 1381: 1, 1446: 1, 1566: 2, 1616: 1, 1827: 1, 1845: 1, 1918: 2, 2187: 2, 2188: 1, 2300: 1, 2321: 1, 2333: 1, 2362: 1, 2427: 1, 2537: 1, 2974: 1, 2975: 1, 2976: 1, 3126: 1, 3154: 1, 3169: 1, 3183: 1, 3225: 1, 3280: 1, 3402: 1, 3459: 1, 3576: 1, 3577: 2, 3578: 3, 3579: 1, 3581: 1, 3600: 1, 3614: 1, 3646: 1, 3739: 2, 3750: 1, 3963: 1, 4334: 1, 4583: 1})}, '81ed': {'df': 1, 'posting_list': defaultdict(<class 'list'>, {0: 1})}, 'a1nmacgabr': {'df': 1, 'posting_list': defaultdict(<class 'list'>, {0: 1})}, 'a1in': {'df': 1, 'posting_list': defaultdict(<class 'list'>, {0: 1})}, '85land': {'df': 1, 'posting_list': defaultdict(<class 'list'>, {1: 1})}}


# Normalise documents

In [7]:
def normalise(index, path):
    dic = defaultdict(list)

    file_list=os.listdir(path)

    for doc_id,_file in enumerate(file_list):
        dic = defaultdict(list)

        f = open(os.path.join(path,_file),  encoding="utf8")
        corpus = f.read()

        tokens = string_preprocessing(corpus)
        tokens = set(tokens)

        dic[doc_id] = defaultdict(list) 
        
        sum=0

        for tok in tokens:
            count = index[tok]['posting_list'][doc_id]
            # print(count)
            count = 1+math.log10(count)

            dic[doc_id][tok] = count
            
            sum = sum + count ** 2

        sum = sum ** 0.5

        for tok in tokens:
            index[tok]['posting_list'][doc_id] = dic[doc_id][tok] / sum
            dic[doc_id][tok] = dic[doc_id][tok] / sum

    return dict

## Driver code

In [8]:
normalised_index = normalise(final_index,path)
# print(final_index)

print("First 10 entries of the final index -")
out = dict(itertools.islice(final_index.items(), 10))   
print(out)

First 10 entries of the final index -
{'c3': {'df': 62, 'posting_list': defaultdict(<class 'list'>, {0: 0.6488916263728387, 1: 0.7071067811865475, 2: 0.7071067811865475, 3: 0.7071067811865475, 4: 0.7071067811865475, 67: 0.5773502691896258, 163: 0.5773502691896258, 239: 0.5773502691896258, 286: 0.5029065709383962, 362: 0.5773502691896258, 437: 0.5452917911900825, 595: 0.5773502691896258, 596: 0.6005883219864204, 638: 0.5773502691896258, 679: 0.5773502691896258, 736: 0.5773502691896258, 737: 0.5, 930: 0.5, 1115: 0.7071067811865475, 1116: 0.7071067811865475, 1196: 0.5773502691896258, 1362: 0.5773502691896258, 1365: 0.5, 1381: 0.5773502691896258, 1446: 0.5773502691896258, 1566: 0.6005883219864204, 1616: 0.5773502691896258, 1827: 0.5773502691896258, 1845: 0.5773502691896258, 1918: 0.6005883219864204, 2187: 0.6005883219864204, 2188: 0.4472135954999579, 2300: 0.5773502691896258, 2321: 0.5773502691896258, 2333: 0.5773502691896258, 2362: 0.5773502691896258, 2427: 0.5773502691896258, 2537: 0.577

# Query Normalisation

## Create word count index

In [9]:
def get_term_freq(query):
    temp_dict = dict()
    
    # Calculate word count
    for word in query:
        if word not in temp_dict:
            temp_dict[word] = 0
        
        temp_dict[word] = temp_dict[word]+1
    
    return temp_dict

## Get tf-idf score

In [10]:
def get_tf_idf(tf_index, index, file_count):
    sum = 0

    for word in tf_index:
        tf_wt = 1 + math.log10(tf_index[word])
        
        if word in index.keys():
            idf_wt = math.log10(file_count/index[word]['df'])

            tf_index[word] = tf_wt*idf_wt
        
        sum += tf_index[word] ** 2 

    return sum ** 0.5, tf_index

## Normalise query

In [11]:
def query_normalise(query, index, path):
    # Get term frequncies
    tf_index = get_term_freq(query)

    # Count total files in dataset
    file_count = len(os.listdir(path))

    sum,tf_index = get_tf_idf(tf_index, index, file_count)
   
    # Normalise the weights
    for word in tf_index:
        tf_index[word]  = tf_index[word]/sum
        
    return tf_index

## Driver Code

In [83]:
query = "Hitler"
query =  string_preprocessing(query)

query = query_normalise(query, final_index, path)

print(query)

# Check if normalisation was successful or not
sum = 0

for word in query:
    sum = sum + query[word] ** 2

print(sum)

{'hitler': 1.0}
1.0


# Cosine similarity

In [84]:
file_count = len(os.listdir(path))

keys = range(file_count + 1)
Scores = dict.fromkeys(keys)
for key in Scores:
        Scores[key] = 0

for word in query:
        if word in final_index.keys():
                for doc in final_index[word]['posting_list']:
                        score = query[word] * final_index[word]['posting_list'][doc]
                        Scores[doc] = Scores[doc] + score

In [85]:
final_answer = sorted(Scores, key=Scores.get, reverse=True)

for i in final_answer:
    print("Document number : ", i ,", Score = ", Scores[i])


Document number :  0 , Score =  0
Document number :  1 , Score =  0
Document number :  2 , Score =  0
Document number :  3 , Score =  0
Document number :  4 , Score =  0
Document number :  5 , Score =  0
Document number :  6 , Score =  0
Document number :  7 , Score =  0
Document number :  8 , Score =  0
Document number :  9 , Score =  0
Document number :  10 , Score =  0
Document number :  11 , Score =  0
Document number :  12 , Score =  0
Document number :  13 , Score =  0
Document number :  14 , Score =  0
Document number :  15 , Score =  0
Document number :  16 , Score =  0
Document number :  17 , Score =  0
Document number :  18 , Score =  0
Document number :  19 , Score =  0
Document number :  20 , Score =  0
Document number :  21 , Score =  0
Document number :  22 , Score =  0
Document number :  23 , Score =  0
Document number :  24 , Score =  0
Document number :  25 , Score =  0
Document number :  26 , Score =  0
Document number :  27 , Score =  0
Document number :  28 , Score 