In [1]:
import json
import os
import pickle
from collections import Counter, defaultdict

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [2]:
# Fetch the NLTK "stopwords" corpus that stopwords.words('english') reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/zisissour/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Reading documents from file
#
# Every entry in ./docs is parsed as a header-less CSV and column 0 supplies
# the document's tokens (presumably one token per row -- TODO confirm against
# the corpus format). The result is a list of (file name, tuple of str) pairs.

DOCS_DIR = "./docs"

# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order (and anything serialised from it) reproducible between runs.
files = sorted(os.listdir(DOCS_DIR))
documents = []

for file in files:
    # dtype=str + keep_default_na=False keep every token exactly as written.
    # With the pandas defaults, tokens such as "NA", "null" or "nan" are parsed
    # as NaN (which str() renders as "nan") and numeric-looking tokens are
    # re-formatted (e.g. "007" -> "7").
    doc_frame = pd.read_csv(
        os.path.join(DOCS_DIR, file),
        header=None,
        dtype=str,
        keep_default_na=False,
    )
    documents.append((file, tuple(map(str, doc_frame[0]))))

In [4]:
# Stop-word removal: keep only the tokens whose lower-cased form is not an
# English stop word. Each document stays a (title, token list) pair.

stop_words = set(stopwords.words('english'))

filtered_documents = [
    (title, [token for token in tokens if token.lower() not in stop_words])
    for title, tokens in documents
]

In [5]:
# Stemming: replace every remaining token by its Snowball (English) stem.
# Each document becomes a (title, tuple of stems) pair.

stemmer = SnowballStemmer(language='english')

filtered_documents = [
    (title, tuple(stemmer.stem(token) for token in tokens))
    for title, tokens in filtered_documents
]

In [6]:
# Persist the preprocessed documents to disk with pickle.
# Create the output directory first: on a fresh checkout "Output" may not
# exist, in which case open() would raise FileNotFoundError.
os.makedirs("Output", exist_ok=True)
with open("Output/preprocessed_docs", "wb") as file:
    pickle.dump(filtered_documents, file)

In [7]:
# Load the queries: column 0 of the header-less CSV holds one query per row.
queries_raw = pd.read_csv('Queries_20', header = None)

# Split every query on whitespace and lower-case each word.
queries = [
    [token.lower() for token in raw_query.split()]
    for raw_query in queries_raw[0]
]

In [8]:
# Remove English stop words from every query; each query stays a list of tokens.
filtered_queries = [
    [token for token in query_tokens if token not in stop_words]
    for query_tokens in queries
]

In [9]:
# Stem the remaining query tokens; each query becomes a tuple of stems.
filtered_queries = [
    tuple(stemmer.stem(token) for token in query_tokens)
    for query_tokens in filtered_queries
]

In [10]:
# Persist the preprocessed queries to disk with pickle.
# Create the output directory first: on a fresh checkout "Output" may not
# exist, in which case open() would raise FileNotFoundError.
os.makedirs("Output", exist_ok=True)
with open("Output/preprocessed_queries", "wb") as file:
    pickle.dump(filtered_queries,file)

In [11]:
def build_inverted_index(documents):
    """Build an inverted index mapping each term to its per-document counts.

    Parameters
    ----------
    documents : iterable of (doc_title, tokens) pairs
        ``doc_title`` identifies the document and ``tokens`` is an iterable of
        hashable terms (e.g. a tuple or list of strings).

    Returns
    -------
    defaultdict
        Nested mapping where ``index[term][doc_title]`` is the number of times
        ``term`` occurs in that document. Terms are inserted in the order they
        are first encountered. Because both levels are ``defaultdict``s,
        looking up a missing term or title inserts an empty entry / ``0``
        instead of raising ``KeyError``.
    """
    inverted_index = defaultdict(lambda: defaultdict(int))

    for doc_title, document in documents:
        # Counter tallies every term in a single pass over the document,
        # instead of re-scanning the whole document once per unique term.
        for term, frequency in Counter(document).items():
            inverted_index[term][doc_title] = frequency

    return inverted_index

In [12]:
# Build the term -> {document title: term count} index from the stemmed documents.
inverted_index = build_inverted_index(filtered_documents)


In [13]:
# Write the inverted index to disk as JSON.
# Create the output directory first: on a fresh checkout "Output" may not
# exist, in which case open() would raise FileNotFoundError.
os.makedirs("Output", exist_ok=True)
with open("Output/inverted_index","w") as file:
    json.dump(inverted_index, file)