In [90]:
import json
import os
import pickle
from collections import Counter
from collections import defaultdict

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [91]:
# Fetch the NLTK 'stopwords' corpus that the stopword-removal step reads via
# stopwords.words(); the call returns True and reports when the package is
# already up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mpallasmichael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [92]:
#Reading documents from file

# os.listdir returns entries in arbitrary order, so sort for a reproducible
# document order. Hidden entries and sub-directories (e.g. .DS_Store,
# .ipynb_checkpoints) are not documents and would break read_csv, so skip them.
files = sorted(
    name for name in os.listdir("./docs")
    if not name.startswith(".") and os.path.isfile(os.path.join("./docs", name))
)

# Each value in column 0 of a file is treated as one token of that document.
# dtype=str together with keep_default_na=False stops pandas from coercing
# tokens: without them words such as "null", "NA" or "None" are parsed as NaN
# and would end up in the corpus as the bogus token "nan".
documents = []
for name in files:
    frame = pd.read_csv(
        os.path.join("./docs", name),
        header=None,
        dtype=str,
        keep_default_na=False,
    )
    # Store each document as (file name, tuple of string tokens).
    documents.append((name, tuple(map(str, frame[0]))))

In [94]:
#Removing stopwords

# English stopword list held as a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Keep every (title, tokens) pair, dropping the tokens whose lowercase form
# is a stopword; surviving tokens keep their original casing.
filtered_documents = [
    (title, [token for token in tokens if token.lower() not in stop_words])
    for title, tokens in documents
]

In [95]:
#Stemming

stemmer = SnowballStemmer(language='english')

# Replace each document's token list by a tuple of the stemmed tokens,
# keeping the document title alongside it.
filtered_documents = [
    (title, tuple(stemmer.stem(token) for token in tokens))
    for title, tokens in filtered_documents
]

In [96]:
# Persist the preprocessed (title, stemmed tokens) pairs with pickle.
# Create the output folder first so the cell also works on a fresh checkout
# where "Output" does not exist yet (open() would raise FileNotFoundError).
os.makedirs("Output", exist_ok=True)
with open("Output/preprocessed_docs", "wb") as file:
    pickle.dump(filtered_documents, file)

In [97]:
#Reading the queries
queries_raw = pd.read_csv('Queries_20', header = None)

# Column 0 holds the query text: split each query on whitespace and
# lowercase every resulting word.
queries = [
    [word.lower() for word in query.split()]
    for query in queries_raw[0]
]

In [98]:
#Removing stopwords from queries
# Query words are already lowercase, so they are compared to the stopword set as-is.
filtered_queries = [
    [word for word in query if word not in stop_words]
    for query in queries
]

In [99]:
#Stemming the queries
# Each query becomes a tuple of its stemmed words.
filtered_queries = [
    tuple(stemmer.stem(word) for word in query)
    for query in filtered_queries
]

In [100]:
# Persist the preprocessed queries (tuples of stemmed words) with pickle.
# Create the output folder first so the cell also works on a fresh checkout
# where "Output" does not exist yet (open() would raise FileNotFoundError).
os.makedirs("Output", exist_ok=True)
with open("Output/preprocessed_queries", "wb") as file:
    pickle.dump(filtered_queries,file)

In [109]:
def build_inverted_index(documents):
    """Build an inverted index mapping each term to its per-document counts.

    Parameters
    ----------
    documents : iterable of (doc_title, tokens)
        ``tokens`` is an iterable of hashable terms belonging to the
        document identified by ``doc_title``.

    Returns
    -------
    dict
        ``{term: defaultdict(int)}`` where the inner mapping is
        ``{doc_title: number of occurrences of term in that document}``.
        Only documents that actually contain the term get an entry.
    """
    inverted_index = {}

    for doc_title, document in documents:
        # Counter tallies all terms in a single pass over the document,
        # instead of rescanning it with document.count() once per unique term.
        for term, frequency in Counter(document).items():
            if term not in inverted_index:
                inverted_index[term] = defaultdict(int)
            inverted_index[term][doc_title] = frequency

    return inverted_index

In [110]:
# Build the term -> {doc title: count} index from the stopword-filtered, stemmed documents
inverted_index = build_inverted_index(filtered_documents)


In [111]:
# Spot-check: show the per-document counts stored for the stemmed term "altern"
# (raises KeyError if the term is not in the index, since the outer mapping is a plain dict)
print(inverted_index["altern"])

defaultdict(<class 'int'>, {'00693': 1, '01188': 1, '00543': 1, '01204': 1, '01167': 1, '00482': 1, '00281': 1, '00689': 4, '00831': 1, '01112': 1, '00945': 1, '00177': 1, '00919': 2, '01086': 1, '00302': 1, '01074': 3, '00197': 1, '00812': 1, '01139': 1, '00081': 1})


In [105]:
# Persist the inverted index as JSON (the nested defaultdicts serialize as plain objects).
# Create the output folder first so the cell also works on a fresh checkout
# where "Output" does not exist yet (open() would raise FileNotFoundError).
os.makedirs("Output", exist_ok=True)
with open("Output/inverted_index","w") as file:
    json.dump(inverted_index, file)