In [1]:
!pip install PyPDF2 pyspellchecker

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pyspellchecker
  Downloading pyspellchecker-0.8.3-py3-none-any.whl.metadata (9.5 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyspellchecker-0.8.3-py3-none-any.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker, PyPDF2
Successfully installed PyPDF2-3.0.1 pyspellchecker-0.8.3


In [2]:
import re
from collections import defaultdict
from spellchecker import SpellChecker
from PyPDF2 import PdfReader


In [3]:
def extract_pages(pdf_path, start_page, end_page, output_file):
    """Write the text of a 1-based, inclusive PDF page range to a text file.

    Args:
        pdf_path: Path of the PDF to read (handed to ``PdfReader``).
        start_page: First page to extract, 1-based.
        end_page: Last page to extract, 1-based and inclusive.
        output_file: Path of the UTF-8 text file to create or overwrite.

    The requested range is clamped to the pages that actually exist, so
    pages past the end of the document are skipped and a ``start_page``
    below 1 starts at the first page. If the clamped range is empty, an
    empty file is written. The text of each page is followed by a newline.
    """
    reader = PdfReader(pdf_path)
    pages = reader.pages

    # Clamp the lower bound: a negative index would wrap around and pull
    # pages from the end of the document instead of being skipped.
    first_index = max(start_page - 1, 0)
    # Clamp the upper bound so out-of-range pages are ignored.
    stop_index = min(end_page, len(pages))

    chunks = []
    for index in range(first_index, stop_index):
        # Fall back to "" if a page yields no text, so concatenation cannot fail.
        page_text = pages[index].extract_text() or ""
        chunks.append(page_text + "\n")

    with open(output_file, "w", encoding="utf-8") as f:
        f.write("".join(chunks))


In [4]:
def mapper_word_count(text):
    """Map step: emit a ``(word, 1)`` pair for every word in ``text``.

    The text is lower-cased and split into runs of word characters
    (``\\w+``), so punctuation separates tokens and an apostrophe splits a
    contraction into two tokens. Pairs are returned in order of appearance,
    one per occurrence.
    """
    tokens = re.findall(r"\b\w+\b", text.lower())
    return [(token, 1) for token in tokens]

def mapper_non_english(text, spell):
    """Map step: emit ``(word, 1)`` for every word not recognised by ``spell``.

    Args:
        text: Input text; it is lower-cased before tokenising.
        spell: Any object supporting the ``in`` operator for words; the
            caller in this notebook passes a ``SpellChecker`` instance.

    Returns:
        A list of ``(word, 1)`` pairs, in order of appearance, for tokens
        that are purely alphabetic and are not contained in ``spell``.
        Tokens containing digits or underscores are dropped.
    """
    lowered = text.lower()
    return [
        (token, 1)
        for token in re.findall(r"\b\w+\b", lowered)
        if token.isalpha() and token not in spell
    ]

def reducer(pairs):
    """Reduce step: sum the counts of all pairs that share the same word.

    Args:
        pairs: Iterable of ``(word, count)`` tuples.

    Returns:
        A ``defaultdict(int)`` mapping each word to the total of its counts,
        with keys in order of first appearance.
    """
    totals = defaultdict(int)
    for key, amount in pairs:
        totals[key] = totals[key] + amount
    return totals


In [5]:
def map_reduce_word_count(file_path):
    """Count every word in a UTF-8 text file with the map/reduce helpers.

    Reads the whole file, maps it with ``mapper_word_count`` and folds the
    resulting pairs with ``reducer``; returns the reducer's word-to-count
    mapping.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        contents = handle.read()
    return reducer(mapper_word_count(contents))

def map_reduce_non_english(file_path):
    """Count the words in a UTF-8 text file that the spell checker rejects.

    Builds a default ``SpellChecker``, reads the whole file, maps it with
    ``mapper_non_english`` and folds the resulting pairs with ``reducer``;
    returns the reducer's word-to-count mapping.
    """
    checker = SpellChecker()
    with open(file_path, "r", encoding="utf-8") as handle:
        contents = handle.read()
    return reducer(mapper_non_english(contents, checker))


In [6]:
if __name__ == "__main__":
    # The PDF holds the whole series, so book-relative page numbers must be
    # shifted to PDF page numbers.
    pdf_path = "harrypotter.pdf"

    # Page selection (author's note): birthday is 23 February 2003, so the
    # book is the 2nd one (The Chamber of Secrets) and the excerpt starts at
    # page 23. The second book starts at PDF page 281, so 281 + 23 = 304.
    # File 1: book pages 23-32 -> PDF pages 304-313.
    extract_pages(pdf_path, 304, 313, "file1.txt")
    # File 2: book pages 103-112 -> PDF pages 385-394.
    # NOTE(review): with the same +281 shift, page 103 maps to 384, not 385;
    # confirm which range is intended.
    extract_pages(pdf_path, 385, 394, "file2.txt")

    # Full word frequencies for file 1, most frequent first.
    word_counts_file1 = map_reduce_word_count("file1.txt")
    print("Word Counts in file1.txt (ALL)")
    for word, count in sorted(
        word_counts_file1.items(), key=lambda item: item[1], reverse=True
    ):
        print(f"{word} : {count}")

    # Words in file 2 that the spell checker does not know, most frequent first.
    non_english_counts_file2 = map_reduce_non_english("file2.txt")
    print("\nNon-English Words in file2.txt (ALL)")
    for word, count in sorted(
        non_english_counts_file2.items(), key=lambda item: item[1], reverse=True
    ):
        print(f"{word} : {count}")


Word Counts in file1.txt (ALL)
the : 138
a : 86
and : 84
to : 66
it : 61
s : 60
said : 56
you : 53
he : 52
harry : 50
was : 49
of : 48
in : 45
they : 30
i : 27
at : 26
ron : 26
with : 25
fred : 24
had : 23
on : 22
his : 22
she : 22
t : 21
all : 21
weasley : 21
george : 18
but : 18
like : 17
as : 17
were : 17
have : 17
back : 16
mrs : 16
him : 15
that : 15
for : 14
out : 14
we : 14
them : 13
been : 13
house : 13
mum : 13
what : 12
there : 12
who : 12
be : 12
around : 12
dad : 12
know : 12
up : 12
car : 11
her : 11
ll : 11
your : 10
so : 10
ve : 10
just : 10
could : 9
can : 9
very : 9
malfoy : 9
when : 8
d : 8
down : 8
this : 8
got : 8
me : 8
old : 8
never : 8
do : 8
gnomes : 8
garden : 8
re : 8
gnome : 8
see : 7
from : 7
about : 7
think : 7
time : 7
one : 7
muggle : 7
go : 7
door : 6
an : 6
looked : 6
long : 6
wouldn : 6
look : 6
their : 6
not : 6
over : 6
come : 6
going : 6
percy : 6
too : 6
or : 6
little : 6
now : 6
eyes : 6
dear : 6
into : 6
uncle : 5
vernon : 5
then : 5
by : 5
getti