# In [None]:
import csv
import math
import re
from collections import defaultdict

class IndexingMapper:
    """Map stage that emits one ``(word, article_id)`` pair per word token."""

    def __init__(self):
        """Create the mapper; it keeps no state."""

    def extract_words(self, line):
        """Tokenize one record into ``(word, article_id)`` pairs.

        Args:
            line: Two-item sequence ``(article_id, section_text)``.

        Returns:
            A list with one ``(word, article_id)`` tuple per word token of
            the lower-cased text, in order of appearance (repeats kept).
        """
        doc_id, text = line
        lowered = text.lower()
        return [
            (match.group(), doc_id)
            for match in re.finditer(r'\w+', lowered)
        ]

    def map(self, line):
        """Mapper entry point; delegates to :meth:`extract_words`."""
        pairs = self.extract_words(line)
        return pairs

class IndexerMapper:
    """Map stage that turns per-word document counts into TF-IDF weights.

    For a word and an article, TF is the fraction of the article's word
    tokens equal to the word and IDF is
    ``log10(total_documents / document_count)``.
    """

    def __init__(self, total_documents):
        """Store the corpus size used as the IDF numerator.

        Args:
            total_documents: Number of documents in the corpus.
        """
        self.total_documents = total_documents

    @staticmethod
    def _term_counts(section_text):
        """Return ``(counts, total)`` for the lower-cased word tokens of the text."""
        tokens = re.findall(r'\w+', section_text.lower())
        counts = defaultdict(int)
        for token in tokens:
            counts[token] += 1
        return counts, len(tokens)

    def _score(self, word, document_count, article_id, counts, total_terms):
        """Build one ``(word, (article_id, tfidf))`` result from token counts."""
        word = word.strip('"')
        document_count = int(document_count)

        # A non-positive count or corpus size has no defined IDF; score it 0
        # instead of dividing by zero or taking the log of a non-positive value.
        if document_count > 0 and self.total_documents > 0:
            idf = math.log10(self.total_documents / document_count)
        else:
            idf = 0.0

        # An article without any word token contributes a term frequency of 0.
        # ``.get`` is used so the lookup never inserts keys into the defaultdict.
        tf = counts.get(word, 0) / total_terms if total_terms else 0.0

        return word, (article_id, tf * idf)

    def compute_tfidf(self, args):
        """Compute the TF-IDF weight of one word in one article.

        Args:
            args: Four-item sequence
                ``(word, document_count, article_id, section_text)``.
                ``word`` may be wrapped in double quotes, which are stripped;
                ``document_count`` may be an int or a numeric string.

        Returns:
            ``(word, (article_id, tfidf))``. The weight is ``0.0`` when the
            text has no word tokens or when ``document_count`` or the corpus
            size is not positive.

        Raises:
            ValueError: If ``document_count`` cannot be converted to ``int``.
        """
        word, document_count, current_article_id, section_text = args
        counts, total_terms = self._term_counts(section_text)
        return self._score(word, document_count, current_article_id, counts, total_terms)

    def map(self, word_document_count, current_article):
        """Compute TF-IDF weights of many words against a single article.

        Args:
            word_document_count: Iterable of ``(word, document_count)`` pairs.
            current_article: Two-item sequence ``(article_id, section_text)``.

        Returns:
            A list of ``(word, (article_id, tfidf))`` tuples, one per input
            pair and in the same order.
        """
        current_article_id, section_text = current_article

        # Tokenize and count the article once rather than once per word.
        counts, total_terms = self._term_counts(section_text)

        return [
            self._score(word, count, current_article_id, counts, total_terms)
            for word, count in word_document_count
        ]

class QueryrMapper:
    """Map stage that scores TF-IDF entries against a free-text query.

    The query becomes a binary bag-of-words vector (weight 1 per distinct
    word); relevance is the dot product of that vector with the TF-IDF
    weights handed to :meth:`map`.
    """

    def __init__(self, query, total_documents):
        """Store the query text and the corpus size.

        Args:
            query: Free-text query string.
            total_documents: Number of documents in the corpus. It is stored
                but not read by any method of this class.
        """
        self.query = query
        self.total_documents = total_documents

    def vectorize_query(self):
        """Return ``(word, 1)`` for each distinct lower-cased query word.

        Words are listed in order of first appearance so the result is
        deterministic between runs.
        """
        words = re.findall(r'\w+', self.query.lower())
        return [(word, 1) for word in dict.fromkeys(words)]

    def map(self, tfidf_values):
        """Mapper entry point; delegates to :meth:`calculate_rele`."""
        return self.calculate_rele(tfidf_values)

    def calculate_rele(self, tfidf_values):
        """Dot product of the query vector with the supplied TF-IDF weights.

        Args:
            tfidf_values: ``(word, (article_id, tfidf))`` entries, given as a
                flat list, as a list of such lists (for example one list per
                article), or as a mapping ``word -> (article_id, tfidf)``.

        Returns:
            ``(None, score)`` where ``score`` sums ``weight * tfidf`` over
            every entry whose word occurs in the query. Entries from nested
            lists are all added into the same total.

        Raises:
            TypeError: If an item is neither an entry nor a nested container.
        """
        query_vector = dict(self.vectorize_query())

        dot_prod = sum(
            query_vector[word] * tfidf
            for word, (_, tfidf) in self._iter_entries(tfidf_values)
            if word in query_vector
        )

        return None, dot_prod

    def _iter_entries(self, tfidf_values):
        """Yield ``(word, (article_id, tfidf))`` entries from flat or nested input."""
        if isinstance(tfidf_values, dict):
            yield from tfidf_values.items()
            return

        for item in tfidf_values:
            if isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], str):
                yield item
            elif isinstance(item, (list, tuple, dict)):
                yield from self._iter_entries(item)
            else:
                # Refuse strings and other values explicitly; recursing into
                # a string would never terminate.
                raise TypeError("unsupported TF-IDF entry: {!r}".format(item))


# Correctly spelled alias; ``QueryrMapper`` is kept so existing references work.
QueryMapper = QueryrMapper

if __name__ == '__main__':

    # Load (row[0], row[3]) from every CSV row; the mappers below unpack
    # these pairs as (article_id, section_text).
    # NOTE(review): the column positions are taken as-is — confirm against the CSV schema.
    data = []
    with open('wikipedia_data.csv', 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)

        # Skip the header row; the default keeps an empty file from raising StopIteration.
        next(reader, None)

        for row in reader:
            data.append((row[0], row[3]))

    # Map step 1: emit (word, article_id) pairs for the whole corpus.
    mapper = IndexingMapper()
    word_article_pairs = [pair for article in data for pair in mapper.map(article)]

    # Reduce step: count the distinct article ids each word was emitted with.
    articles_by_word = defaultdict(set)
    for word, article_id in word_article_pairs:
        articles_by_word[word].add(article_id)
    document_count = [(word, len(ids)) for word, ids in articles_by_word.items()]

    # Map step 2: TF-IDF weight of every vocabulary word against every article.
    mapper = IndexerMapper(len(data))
    tfidf_values = [mapper.map(document_count, article) for article in data]

    # Map step 3: score the indexed weights against the query.
    query = "map function"
    mapper = QueryrMapper(query, len(data))
    relevance_scores = mapper.map(tfidf_values)

    # NOTE(review): the printed ids are hard-coded and do not depend on
    # relevance_scores — confirm whether they should be derived from it.
    relevant_documents = ['3', '7', '5']

    for doc_id in relevant_documents:
        print(doc_id)