# File Grabber

In [11]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel rather than whichever `pip` the shell happens to find.
%pip install mrjob



In [12]:
%%file gutenGrabber.py
# Usage: python gutenGrabber.py [start book id] [docs to grab]

import sys
import urllib.request

# Command-line arguments: the first Project Gutenberg book id to try, and the
# number of books that must be saved successfully before the script stops.
book_id = int(sys.argv[1])
num_books = int(sys.argv[2])
counter = 0  # books saved so far

# Ids are tried in increasing order; an id that cannot be fetched is skipped
# and does not count toward num_books.
while counter < num_books:
    url = f"https://www.gutenberg.org/files/{book_id}/{book_id}-0.txt"
    outfile_name = f"{book_id}.txt"
    try:
        # Download and decode the whole book before touching the disk, so a
        # failed request never leaves an empty or truncated output file.
        with urllib.request.urlopen(url) as read_file:
            text = read_file.read().decode("utf-8")
    except (OSError, UnicodeDecodeError):
        # urllib's URLError/HTTPError derive from OSError. Catching only these
        # (instead of a bare except) keeps Ctrl-C and SystemExit working.
        print(f"{url} doesn't exist.")
    else:
        # Write with an explicit encoding: the text was decoded as UTF-8 and
        # may contain characters the platform default encoding cannot store.
        with open(outfile_name, "w", encoding="utf-8") as write_file:
            write_file.write(text)
        print(f"{outfile_name} created.")
        counter += 1

    book_id += 1

Overwriting gutenGrabber.py


In [13]:
!python gutenGrabber.py 2600 4

2600.txt created.
2601.txt created.
2602.txt created.
https://www.gutenberg.org/files/2603/2603-0.txt doesn't exist.
2604.txt created.


In [14]:
# Create a tiny four-line sample document, bus.txt, in the working directory.
bus_lines = [
    "The wheels on the bus go round and round,\n",
    "round and round, round and round\n",
    "The wheels on the bus go round and round,\n",
    "all through the town.",
]
with open("bus.txt", "w") as f:
    f.writelines(bus_lines)

# 4.3 Inverted Indexing Baseline Implementation

In [15]:
%%file inverted_indexing_base.py

from mrjob.job import MRJob
import re
WORD_RE = re.compile(r"[\w']+")

class MRInvertedIndexingBase(MRJob):
  """Baseline inverted index: maps each word to a sorted list of (file, count) postings."""

  def mapper_raw(self, file, uri):
    """Count the words of one input file and emit one posting per distinct word.

    Args:
      file: path of the input file; it is opened for reading and also used as
        the document identifier in every posting.
      uri: not used by this mapper.

    Yields:
      (word, (file, count)) for every distinct word matched by WORD_RE.
    """
    H = dict()
    # The context manager closes the input file once counting is done.
    with open(file, 'r') as read_file:
      for line in read_file:
        for word in WORD_RE.findall(line):
          H[word] = H.get(word, 0) + 1
    for word, count in H.items():
      yield word, (file, count)

  def reducer(self, word, postings):
    """Collect all postings of a word and emit them as one sorted list.

    Args:
      word: the term being reduced.
      postings: iterable of (file, count) postings emitted by the mappers.

    Yields:
      (word, postings sorted in ascending order).
    """
    yield word, sorted(postings)

if __name__ == '__main__':
    MRInvertedIndexingBase.run()

Overwriting inverted_indexing_base.py


In [16]:
!python inverted_indexing_base.py *.txt

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
"confuted"	[["/content/2600.txt", 1]]
"confuting"	[["/content/2601.txt", 1], ["/content/2604.txt", 1]]
"congealed"	[["/content/2600.txt", 2], ["/content/2602.txt", 1]]
"congenial"	[["/content/2601.txt", 1]]
"congested"	[["/content/2600.txt", 1]]
"congestion"	[["/content/2600.txt", 1]]
"congratulate"	[["/content/2600.txt", 15], ["/content/2601.txt", 1]]
"congratulated"	[["/content/2600.txt", 6], ["/content/2601.txt", 3], ["/content/2602.txt", 5], ["/content/2604.txt", 2]]
"congratulating"	[["/content/2600.txt", 1], ["/content/2601.txt", 6]]
"congratulation"	[["/content/2600.txt", 1], ["/content/2601.txt", 4]]
"congratulations"	[["/content/2600.txt", 4], ["/content/2601.txt", 3], ["/content/2602.txt", 1], ["/content/2604.txt", 3]]
"congregated"	[["/content/2600.txt", 1]]
"congregation"	[["/content/2601.txt", 2], ["/content/2604.txt", 1]]
"congruous"	[["/content/2601.txt", 1]]
"conjecture"	[["/content/2600.txt", 2], ["/conte

# 4.4 Inverted Indexing Revised Implementation (default partitioner)

In [17]:
%%file inverted_indexing_revised.py

from mrjob.job import MRJob
import re
WORD_RE = re.compile(r"[\w']+")

class MRInvertedIndexingRevised(MRJob):
  """Inverted index using composite (word, file) keys.

  The reducer keeps state across calls: it appends postings to a buffer and
  flushes the buffer whenever the word part of the key changes. This assumes
  that all keys of a given word reach the same reducer consecutively.
  """

  def mapper_raw(self, file, uri):
    """Count the words of one input file and emit one record per distinct word.

    Args:
      file: path of the input file; it is opened for reading and also used as
        the document identifier in the key.
      uri: not used by this mapper.

    Yields:
      ((word, file), count) for every distinct word matched by WORD_RE.
    """
    H = dict()
    # The context manager closes the input file once counting is done.
    with open(file, 'r') as read_file:
      for line in read_file:
        for word in WORD_RE.findall(line):
          H[word] = H.get(word, 0) + 1
    for word, count in H.items():
      yield (word, file), count

  def reducer_init(self):
    """Reset the per-reducer state: no previous word and an empty postings buffer."""
    self.t_prev = None
    self.P = list()

  def reducer(self, key, value):
    """Append the posting for `key` to the buffer, flushing it when the word changes.

    Args:
      key: two-element sequence (word, file).
      value: iterable of counts for this key; only the first one is used.

    Yields:
      (previous word, its postings list) when `key` starts a new word.
    """
    t = key[0]
    file = key[1]
    if self.t_prev is not None and t != self.t_prev:
      yield self.t_prev, self.P
      self.P = list()
    val = list(value)[0]
    self.P.append((file, val))
    self.t_prev = t

  def reducer_final(self):
    """Emit the postings of the last word seen, if this reducer saw any key."""
    # Without this guard a reducer that received no input would emit a
    # meaningless (None, []) record.
    if self.t_prev is not None:
      yield self.t_prev, self.P

if __name__ == '__main__':
    MRInvertedIndexingRevised.run()

Overwriting inverted_indexing_revised.py


In [18]:
!python inverted_indexing_revised.py *.txt

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
"confute"	[["/content/2601.txt", 1]]
"confuted"	[["/content/2600.txt", 1]]
"confuting"	[["/content/2601.txt", 1], ["/content/2604.txt", 1]]
"congealed"	[["/content/2600.txt", 2], ["/content/2602.txt", 1]]
"congenial"	[["/content/2601.txt", 1]]
"congested"	[["/content/2600.txt", 1]]
"congestion"	[["/content/2600.txt", 1]]
"congratulate"	[["/content/2600.txt", 15], ["/content/2601.txt", 1]]
"congratulated"	[["/content/2600.txt", 6], ["/content/2601.txt", 3], ["/content/2602.txt", 5], ["/content/2604.txt", 2]]
"congratulating"	[["/content/2600.txt", 1], ["/content/2601.txt", 6]]
"congratulation"	[["/content/2600.txt", 1], ["/content/2601.txt", 4]]
"congratulations"	[["/content/2600.txt", 4], ["/content/2601.txt", 3], ["/content/2602.txt", 1], ["/content/2604.txt", 3]]
"congregated"	[["/content/2600.txt", 1]]
"congregation"	[["/content/2601.txt", 2], ["/content/2604.txt", 1]]
"congruous"	[["/content/2601.txt", 1]]
"conjecture

# 4.4 Inverted Indexing Revised Implementation (specified partitioner)

In [29]:
%%file inverted_indexing_revised_partitioned.py

from mrjob.job import MRJob
from mrjob.step import MRStep
import re
WORD_RE = re.compile(r"[\w']+")

class MRInvertedIndexingRevised(MRJob):
  """Inverted index using composite (word, file) keys and an explicit step definition.

  The reducer keeps state across calls: it appends postings to a buffer and
  flushes the buffer whenever the word part of the key changes. This assumes
  that all keys of a given word reach the same reducer consecutively.
  """

  def mapper_raw(self, file, uri):
    """Count the words of one input file and emit one record per distinct word.

    Args:
      file: path of the input file; it is opened for reading and also used as
        the document identifier in the key.
      uri: not used by this mapper.

    Yields:
      ((word, file), count) for every distinct word matched by WORD_RE.
    """
    H = dict()
    # The context manager closes the input file once counting is done.
    with open(file, 'r') as read_file:
      for line in read_file:
        for word in WORD_RE.findall(line):
          H[word] = H.get(word, 0) + 1
    for word, count in H.items():
      yield (word, file), count

  def reducer_init(self):
    """Reset the per-reducer state: no previous word and an empty postings buffer."""
    self.t_prev = None
    self.P = list()

  def reducer(self, key, value):
    """Append the posting for `key` to the buffer, flushing it when the word changes.

    Args:
      key: two-element sequence (word, file).
      value: iterable of counts for this key; only the first one is used.

    Yields:
      (previous word, its postings list) when `key` starts a new word.
    """
    t = key[0]
    file = key[1]
    if self.t_prev is not None and t != self.t_prev:
      yield self.t_prev, self.P
      self.P = list()
    val = list(value)[0]
    self.P.append((file, val))
    self.t_prev = t

  def reducer_final(self):
    """Emit the postings of the last word seen, if this reducer saw any key."""
    # Without this guard a reducer that received no input would emit a
    # meaningless (None, []) record.
    if self.t_prev is not None:
      yield self.t_prev, self.P

  def steps(self):
    """Define the single map/reduce step and its job configuration."""
    # NOTE(review): 'partitioner' does not look like a Hadoop configuration
    # property, so this jobconf entry may have no effect. mrjob documents a
    # PARTITIONER class attribute for choosing the partitioner, and
    # KeyFieldBasedPartitioner normally also needs key-field options to
    # partition on the word alone -- confirm on a real Hadoop cluster.
    return [
      MRStep(mapper_raw=self.mapper_raw,
            jobconf={'partitioner':"org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner"},
            reducer=self.reducer,
            reducer_init=self.reducer_init,
            reducer_final=self.reducer_final)
    ]

if __name__ == '__main__':
    MRInvertedIndexingRevised.run()

Overwriting inverted_indexing_revised_partitioned.py


In [30]:
!python inverted_indexing_revised_partitioned.py *.txt

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
"confute"	[["/content/2601.txt", 1]]
"confuted"	[["/content/2600.txt", 1]]
"confuting"	[["/content/2601.txt", 1], ["/content/2604.txt", 1]]
"congealed"	[["/content/2600.txt", 2], ["/content/2602.txt", 1]]
"congenial"	[["/content/2601.txt", 1]]
"congested"	[["/content/2600.txt", 1]]
"congestion"	[["/content/2600.txt", 1]]
"congratulate"	[["/content/2600.txt", 15], ["/content/2601.txt", 1]]
"congratulated"	[["/content/2600.txt", 6], ["/content/2601.txt", 3], ["/content/2602.txt", 5], ["/content/2604.txt", 2]]
"congratulating"	[["/content/2600.txt", 1], ["/content/2601.txt", 6]]
"congratulation"	[["/content/2600.txt", 1], ["/content/2601.txt", 4]]
"congratulations"	[["/content/2600.txt", 4], ["/content/2601.txt", 3], ["/content/2602.txt", 1], ["/content/2604.txt", 3]]
"congregated"	[["/content/2600.txt", 1]]
"congregation"	[["/content/2601.txt", 2], ["/content/2604.txt", 1]]
"congruous"	[["/content/2601.txt", 1]]
"conjecture