In [1]:
# Importing all the necessary libraries
import os
import nltk
import string
import math
import re
import itertools 
from nltk.stem.porter import *
from nltk.corpus import stopwords
from collections import defaultdict
from nltk.stem import snowball, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import TweetTokenizer
from collections import defaultdict

In [2]:
pip install nltk


Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Python\python.exe -m pip install --upgrade pip' command.


In [3]:
# Download the NLTK data packages 'stopwords', 'punkt' and 'wordnet'.
# The value of the last call is what the cell displays as its output.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\d4r18\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\d4r18\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\d4r18\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [42]:
# Directory holding the corpus files. It is passed to create_index,
# normalise and query_normalise, and listed again when scoring documents.
path = "dataset/VSM"

In [43]:
def string_preprocessing(final_string):
    """Turn raw text into a list of normalised tokens.

    The text is cleaned with a regular expression, tokenised with
    ``TweetTokenizer``, stripped of tabs and punctuation, lower-cased,
    stemmed with the English Snowball stemmer, lemmatised with
    ``WordNetLemmatizer`` and finally filtered against the English
    stop-word list.

    Args:
        final_string: The raw document or query text.

    Returns:
        A list of processed tokens in their original order; duplicates are
        kept so that callers can count term frequencies.
    """
    tokenizer = TweetTokenizer()

    # Replace every character that is not a letter, digit, newline or dot
    # with a space. A raw string is used because '\.' is not a valid escape
    # sequence in an ordinary string literal.
    final_string = re.sub(r'[^a-zA-Z0-9\n\.]', ' ', final_string)
    token_list = tokenizer.tokenize(final_string)

    # Initialise list of stopwords
    stwords = set(stopwords.words('english'))

    # Initialise stemmer
    stemmer = snowball.SnowballStemmer('english')

    # Initialise lemmatiser
    wordnet_lemmatizer = WordNetLemmatizer()

    # Remove tabs, then every punctuation character except the apostrophe.
    tab_table = str.maketrans('', '', '\t')
    token_list = [word.translate(tab_table) for word in token_list]
    punctuations = (string.punctuation).replace("'", "")
    punctuation_table = str.maketrans('', '', punctuations)
    stripped_words = [word.translate(punctuation_table) for word in token_list]

    # Drop tokens that became empty once their punctuation was stripped.
    token_list = [word for word in stripped_words if word]

    # Change to lowercase.
    token_list = [word.lower() for word in token_list]

    # Stem the words
    token_list = [stemmer.stem(word) for word in token_list]

    # Lemmatize the words
    token_list = [wordnet_lemmatizer.lemmatize(word) for word in token_list]

    # Remove the words present in the stop words.
    # NOTE(review): filtering happens after stemming, so a stop word whose
    # stem differs from its listed form would not be removed - confirm that
    # this ordering is intended.
    return [word for word in token_list if word not in stwords]

# Create the inverted index

In [44]:
def create_index(path):
    """Build an inverted index holding document and term frequencies.

    Every file in ``path`` is read as UTF-8, passed through
    ``string_preprocessing`` and its tokens are counted.

    Args:
        path: Directory whose files form the corpus. A document id is the
            position of the file in ``os.listdir(path)``.

    Returns:
        A ``defaultdict`` mapping each term to
        ``{'df': <number of documents containing the term>,
        'posting_list': {doc_id: <occurrences of the term in that document>}}``.
    """
    # Initialize the dictionary.
    pos_index = defaultdict(list)

    # List all files in database
    file_list = os.listdir(path)

    # Make the inverted index
    for doc_id, _file in enumerate(file_list):
        # The context manager closes each file as soon as it has been read.
        with open(os.path.join(path, _file), encoding="utf8") as f:
            corpus = f.read()

        # Send the corpus to string preprocessing
        final_token_list = string_preprocessing(corpus)

        for term in final_token_list:
            if term not in pos_index:
                # The posting list stays a defaultdict(list), as before, so
                # the structure callers receive is unchanged.
                pos_index[term] = {'df': 0, 'posting_list': defaultdict(list)}

            posting_list = pos_index[term]['posting_list']

            # First occurrence of the term in this document: bump the
            # document frequency and start the term-frequency counter.
            if doc_id not in posting_list:
                pos_index[term]['df'] += 1
                posting_list[doc_id] = 0

            # Calculating the term frequency
            posting_list[doc_id] += 1

    return pos_index

## Driver code

In [45]:
# Build the inverted index for every file under `path`.
final_index = create_index(path)
print("Length of the final_index : ", len(final_index))

print("First 10 entries of the final index -")
# Keep a ten-term sample of the index in `out`; it is not printed in this cell.
out = {term: entry for term, entry in itertools.islice(final_index.items(), 10)}

Length of the final_index :  8150
First 10 entries of the final index -


# Normalise documents

In [48]:
def normalise(index, path):
    """Replace raw term frequencies in ``index`` with length-normalised weights.

    For every document, each distinct term gets the weight
    ``1 + log10(tf)``; the document's weights are then divided by their
    Euclidean norm.

    The posting lists of ``index`` are overwritten in place with the
    normalised weights. Calling this a second time on the same index would
    treat those weights as raw counts, so it must run only once per index.

    Args:
        index: Inverted index as returned by ``create_index``.
        path: Directory whose files form the corpus; it must list the files
            in the same order that was used to build ``index``.

    Returns:
        A dict mapping each document id to ``{term: normalised weight}``.
    """
    doc_weights = {}

    file_list = os.listdir(path)

    for doc_id, _file in enumerate(file_list):
        # The context manager closes each file as soon as it has been read.
        with open(os.path.join(path, _file), encoding="utf8") as f:
            corpus = f.read()

        # Only the distinct terms matter; their counts come from the index.
        tokens = set(string_preprocessing(corpus))

        weights = {}
        squared_sum = 0

        for tok in tokens:
            weight = 1 + math.log10(index[tok]['posting_list'][doc_id])
            weights[tok] = weight
            squared_sum = squared_sum + weight ** 2

        # Euclidean length of the document's weight vector.
        norm = squared_sum ** 0.5

        for tok in tokens:
            weights[tok] = weights[tok] / norm
            index[tok]['posting_list'][doc_id] = weights[tok]

        doc_weights[doc_id] = weights

    return doc_weights

## Driver code

In [49]:
# Normalise the document weights; this also rewrites final_index in place.
normalised_index = normalise(final_index, path)

print("First 10 entries of the final index -")
# Show a ten-term sample of the index after normalisation.
out = {term: entry for term, entry in itertools.islice(final_index.items(), 10)}
print(out)

First 10 entries of the final index -
{'c3': {'df': 62, 'posting_list': defaultdict(<class 'list'>, {0: 0.4285362009734741, 1: 0.3779644730092272, 2: 0.35355339059327373, 3: 0.3779644730092272, 4: 0.2581988897471611, 67: 0.21010152342911154, 163: 0.3779644730092272, 239: 0.19611613513818404, 286: 0.2991855194014225, 362: 0.2236067977499789, 437: 0.2878901722185875, 595: 0.31622776601683794, 596: 0.3807835688862079, 638: 0.3015113445777636, 679: 0.3779644730092272, 736: 0.25, 737: 0.242535625036333, 930: 0.2357022603955159, 1115: 0.21320071635561041, 1116: 0.2182178902359924, 1196: 0.2886751345948129, 1362: 0.1924500897298752, 1365: 0.2357022603955159, 1381: 0.25, 1446: 0.22941573387056177, 1566: 0.2991855194014225, 1616: 0.25, 1827: 0.3015113445777636, 1845: 0.2650634818415706, 1918: 0.3099184133017371, 2187: 0.2835239272198676, 2188: 0.20412414523193148, 2300: 0.31622776601683794, 2321: 0.35355339059327373, 2333: 0.35355339059327373, 2362: 0.2773500981126146, 2427: 0.35355339059327373

# Query Normalisation

## Create word count index

In [50]:
def get_term_freq(query):
    """Count how often each item occurs in ``query``.

    Args:
        query: An iterable of hashable items (the processed query tokens).

    Returns:
        A dict mapping each distinct item to its number of occurrences,
        keyed in order of first appearance.
    """
    counts = {}

    for term in query:
        # Start unseen terms at zero, then add this occurrence.
        counts[term] = counts.get(term, 0) + 1

    return counts

## Get tf-idf score

In [51]:
def get_tf_idf(tf_index, index, file_count):
    """Convert query term counts into tf-idf weights and compute their norm.

    For each term present in ``index`` the count is replaced, in place, by
    ``(1 + log10(tf)) * log10(file_count / df)``. A term that is absent from
    ``index`` keeps its raw count, and that raw count still contributes to
    the norm.

    Args:
        tf_index: Dict mapping each query term to its count; modified in place.
        index: Inverted index whose entries expose the term's ``'df'``.
        file_count: Total number of documents in the corpus.

    Returns:
        A tuple ``(norm, tf_index)`` where ``norm`` is the square root of the
        sum of the squared weights.
    """
    squared_total = 0

    for term, count in tf_index.items():
        log_tf = 1 + math.log10(count)

        # Unknown terms fall through with their raw count as the weight.
        weight = count
        if term in index:
            weight = log_tf * math.log10(file_count / index[term]['df'])
            tf_index[term] = weight

        squared_total += weight ** 2

    return squared_total ** 0.5, tf_index

## Normalise query

In [52]:
def query_normalise(query, index, path):
    """Build a length-normalised tf-idf weight vector for a query.

    Args:
        query: Iterable of processed query tokens.
        index: Inverted index providing each term's document frequency.
        path: Corpus directory; the number of entries it contains is used as
            the document count for the idf computation.

    Returns:
        A dict mapping each query term to its weight divided by the
        Euclidean norm of all the weights. When that norm is zero the
        weights are returned undivided.
    """
    # Get term frequencies
    tf_index = get_term_freq(query)

    # Count total files in dataset
    file_count = len(os.listdir(path))

    norm, tf_index = get_tf_idf(tf_index, index, file_count)

    # The norm is the root of a sum of squares, so zero means every weight is
    # zero (for example a term found in every document has idf 0). Dividing
    # would raise ZeroDivisionError, so return the weights as they are.
    if norm == 0:
        return tf_index

    # Normalise the weights
    for word in tf_index:
        tf_index[word] = tf_index[word] / norm

    return tf_index

## Driver Code

In [53]:
query = "History"
query = string_preprocessing(query)

query = query_normalise(query, final_index, path)

print(query)

# Check if normalisation was successful or not: the squared weights of a
# normalised query add up to 1. The accumulator is deliberately not called
# `sum`, which would rebind the builtin for the rest of the session.
squared_norm = 0

for word in query:
    squared_norm = squared_norm + query[word] ** 2

print(squared_norm)

{'histori': 1.0}
1.0


# Cosine similarity

In [54]:
file_list = os.listdir(path)
file_count = len(file_list)

# One score slot per document id, all starting at zero.
keys = range(file_count)
Scores = dict.fromkeys(keys, 0)

# Accumulate, per document, the product of query weight and document weight
# for every query term that occurs in the index.
for word, query_weight in query.items():
    if word not in final_index:
        continue
    for doc, doc_weight in final_index[word]['posting_list'].items():
        Scores[doc] += query_weight * doc_weight

In [55]:
# Document ids ordered from the highest score to the lowest.
final_answer = sorted(Scores, key=lambda doc_id: Scores[doc_id], reverse=True)

# Titles (file names without extension) of the documents that scored above zero.
result = []
for doc_id in final_answer:
    doc_score = Scores[doc_id]
    doc_title = file_list[doc_id]
    print("Document number : ", doc_id ,", Score = ", doc_score, "Document title : ", doc_title)
    if doc_score > 0:
        result.append(os.path.splitext(doc_title)[0])

Document number :  1938 , Score =  0.408248290463863 Document title :  History.txt
Document number :  1957 , Score =  0.408248290463863 Document title :  History_of_New_Jersey.txt
Document number :  1965 , Score =  0.408248290463863 Document title :  History_of_Russia.txt
Document number :  1968 , Score =  0.408248290463863 Document title :  History_of_Singapore.txt
Document number :  1939 , Score =  0.3779644730092272 Document title :  History_of_Alaska.txt
Document number :  1941 , Score =  0.3779644730092272 Document title :  History_of_Arizona.txt
Document number :  1942 , Score =  0.3779644730092272 Document title :  History_of_Buddhism.txt
Document number :  1943 , Score =  0.3779644730092272 Document title :  History_of_Burnside.txt
Document number :  1949 , Score =  0.3779644730092272 Document title :  History_of_Earth.txt
Document number :  1950 , Score =  0.3779644730092272 Document title :  History_of_education.txt
Document number :  1951 , Score =  0.3779644730092272 Docume

In [56]:
# Show the titles of the documents whose score is above zero, best match first.
print(result)


['History', 'History_of_New_Jersey', 'History_of_Russia', 'History_of_Singapore', 'History_of_Alaska', 'History_of_Arizona', 'History_of_Buddhism', 'History_of_Burnside', 'History_of_Earth', 'History_of_education', 'History_of_Greece', 'History_of_Greenland', 'History_of_Limerick', 'History_of_painting', 'History_of_Solidarity', 'History_of_South_Africa', 'Modern_history', 'History_of_the_Netherlands', 'History_of_Central_Asia', 'History_of_Northwest_Territories_capital_cities', 'History_of_nuclear_weapons', 'History_of_Puerto_Rico', 'History_of_saffron', 'History_of_South_Carolina', 'Military_history_of_Canada', 'Military_history_of_France', 'History_of_the_Panama_Canal', 'Meteorological_history_of_Hurricane_Katrina', 'History_of_Anglo-Saxon_England', 'History_of_Louisville%2C_Kentucky', 'History_of_Miami%2C_Florida', 'History_of_the_Portuguese_Communist_Party', 'Military_history_of_Puerto_Rico', 'Military_history_of_the_Soviet_Union', 'History_of_Cape_Colony_from_1806_to_1870', 'Hist

In [57]:
# Save the matching document titles, one per line, next to the notebook.
output_path = os.path.join(".", "relevant_docs.txt")
with open(output_path, 'w') as out_file:
    out_file.writelines("%s\n" % title for title in result)