In [1]:
import nltk
import os
import numpy as np
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
from collections import Counter

In [2]:
# Extracting data
# Build a list of (full path, file name) pairs for every file found under
# the "stories" directory of the current working directory, skipping any
# file whose name contains "html" (index pages rather than stories).
# os.path.join is used instead of a hard-coded "\\" so the walk also works
# on non-Windows platforms; on Windows the resulting paths are unchanged.
dataset = []
stories_dir = os.path.join(os.getcwd(), 'stories')
for root, dirs, files in os.walk(stories_dir, topdown=True):
    for filename in files:
        if 'html' not in filename:
            dataset.append((os.path.join(root, filename), filename))

### Preprocessing

In [3]:
# Preprocessing pipeline, applied in this order:
#   1. conversion to lowercase
#   2. removal of punctuation marks
#   3. removal of stopwords
#   4. stemming (Porter stemmer; lemmatization is not used)

In [4]:
# conversion to lowercase
def convertToLower(data):
    """Return a lowercased copy of ``data`` (delegates to ``data.lower()``)."""
    lowered = data.lower()
    return lowered

In [5]:
#removal of punctuation marks
def removePunctuation(data):
    """Replace punctuation and newlines with spaces and drop apostrophes.

    Every character in ``symbols`` (ASCII punctuation, backslash, newline,
    comma) becomes a single space, apostrophes are deleted outright so that
    contractions stay one word ("don't" -> "dont"), and any run of two or
    more spaces is then collapsed to exactly one space.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text. Leading/trailing single spaces are preserved.
    """
    # The backslash is written as "\\" explicitly; the former "\]" was an
    # invalid escape sequence that only worked by accident and triggers a
    # SyntaxWarning on recent Python versions.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n,"
    table = {ord(ch): " " for ch in symbols}
    table[ord("'")] = None  # delete apostrophes instead of spacing them out
    data = data.translate(table)
    # Collapse space runs in one pass; a single "  " -> " " replace per
    # symbol could leave double spaces behind for long punctuation runs.
    return re.sub(r" {2,}", " ", data)

In [6]:
# removal of stopwords 
def removeStopWords(data):
    """Drop English stopwords and one-character tokens from ``data``.

    The text is tokenized with ``word_tokenize``; tokens that appear in
    NLTK's English stopword list or that are a single character long are
    discarded.

    Parameters
    ----------
    data : str
        Text to filter (expected to be lowercased already, since the
        comparison against the stopword list is case-sensitive).

    Returns
    -------
    str
        The surviving tokens, each followed by one space (so a non-empty
        result ends with a trailing space; an empty result is "").
    """
    # A set gives constant-time membership tests; the raw list forced a
    # linear scan of the stopword list for every token.
    stopWords = set(stopwords.words('english'))
    kept = [w for w in word_tokenize(data) if len(w) > 1 and w not in stopWords]
    return "".join(w + " " for w in kept)

In [7]:
#stemming
def stemming(data):
    """Stem every token of ``data`` with NLTK's Porter stemmer.

    ``data`` is coerced to ``str`` and tokenized with ``word_tokenize``.
    The result is the stems concatenated with a single space placed before
    each one, so a non-empty result starts with a leading space.
    """
    porter = nltk.PorterStemmer()
    stems = (porter.stem(tok) for tok in word_tokenize(str(data)))
    return "".join(" " + stem for stem in stems)

In [8]:
# preprocess data
def preprocess(data):
    """Run the full text-cleaning pipeline on ``data`` and return the result.

    Stages are applied in order: lowercase, punctuation removal, stopword
    removal, stemming. Each stage receives the previous stage's output.
    """
    stages = (convertToLower, removePunctuation, removeStopWords, stemming)
    for stage in stages:
        data = stage(data)
    return data

In [9]:
# Extracting and preprocessing data
# For each discovered file, read its text, run it through preprocess(), and
# store (list of tokens, file name). Entries are appended in the same order
# as `dataset`, so list positions line up between the two.
preprocessedData = []
for filePath, name in dataset:
    # `with` guarantees the handle is closed even if read() raises.
    # errors='ignore' silently drops bytes that are not valid UTF-8.
    with open(filePath, 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read().strip()
    preprocessedData.append((word_tokenize(str(preprocess(text))), name))

In [10]:
# Calculating Document Frequency(DF)
# DF maps each token to the number of documents that contain it at least
# once. Each document contributes at most 1 per token because its tokens
# are de-duplicated with set() before counting.
DF = {}

for tokens, _name in preprocessedData:
    for w in set(tokens):
        # dict.get replaces the former bare try/except used to detect
        # first occurrences, which would also have hidden unrelated errors.
        DF[w] = DF.get(w, 0) + 1

### TF-IDF

In [11]:
# Build the TF-IDF table.
#   tf_idf[(doc, token)] = tf * idf, where
#     tf  = occurrences of token in the document / tokens in the document
#     idf = ln(N / (df + 1)), with N the number of documents and df the
#           number of documents containing the token.
#   NOTE: because of the +1 in the denominator, a token present in every
#   document gets a slightly negative idf.
# docIndexing[doc] = (doc, file name) maps a document index back to its file.
tf_idf = {}
docIndexing = []
n_docs = len(preprocessedData)
for doc, (tokens, name) in enumerate(preprocessedData):
    docIndexing.append((doc, name))
    counter = Counter(tokens)
    words_count = len(tokens)

    # Iterating the Counter visits each distinct token once, in first-seen
    # order, so the table is filled in a reproducible order. An empty
    # document yields no iterations, so words_count is never a zero divisor.
    for token, count in counter.items():
        tf = count / words_count
        # Tokens missing from DF are treated as appearing in 0 documents.
        df = DF.get(token, 0)
        idf = np.log(n_docs / (df + 1))

        tf_idf[doc, token] = tf * idf

### Document Ranking

In [12]:
def findTopDocs(k, query):
    """Print the file names of the ``k`` documents that best match ``query``.

    The query goes through the same ``preprocess`` pipeline as the
    documents. A document's score is the sum of the stored TF-IDF weights
    of every distinct query term it contains; documents containing none of
    the terms are not scored at all. Documents are ranked by descending
    score and the top ``k`` file names are printed as a list.

    Parameters
    ----------
    k : int
        Maximum number of documents to report (used as a slice bound).
    query : str
        Free-text query.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # NOTE(review): only the label is printed; the query text itself is
    # not echoed. Kept as-is so the printed output is unchanged.
    print('query:')
    # A set gives constant-time membership while scanning the whole table.
    queryTerms = set(word_tokenize(str(preprocess(query))))

    queryWeights = {}
    for (docId, term), weight in tf_idf.items():
        if term in queryTerms:
            # dict.get replaces a bare try/except that could mask real errors.
            queryWeights[docId] = queryWeights.get(docId, 0) + weight

    ranked = sorted(queryWeights.items(), key=lambda item: item[1], reverse=True)
    topDocs = [docIndexing[docId][1] for docId, _score in ranked[:k]]
    print(topDocs)

### Queries

In [13]:
# Top 3 documents for a short exclamatory query.
findTopDocs(k=3, query="What a rotten place!")

query:
['buldream.txt', 'buldetal.txt', 'bulironb.txt']


In [14]:
# Top 4 documents for a plain declarative sentence.
findTopDocs(k=4, query="I was glad to be home.")

query:
['buggy.txt', 'bulfelis.txt', 'bulhuntr.txt', 'breaks2.asc']


In [15]:
# Top 3 documents for a multi-sentence query containing proper nouns.
findTopDocs(k=3, query="Must see you over Cadogan West. Coming at once. MYCROFT.")

query:
['bruce-p.txt', 'bulhuntr.txt', 'buldream.txt']
