In [2]:
# Import libraries 

from PIL import Image 
import pytesseract 
import sys 
from pdf2image import convert_from_path 
import os 
import re
from flair.data import Sentence
from flair.models import SequenceTagger


### Part A: Reading the pdf file

In [3]:
PDF_file = "Problem Statement 2.pdf"

# Store all the pages of the PDF in a variable 

pages = convert_from_path(PDF_file, poppler_path = r"C:\Program Files\poppler-0.68.0\bin")

# Counter to store images of each page of PDF to image 
image_counter = 1

#iterate through all the pages stored above
for page in pages:
    filename = "page_" + str(image_counter) + ".jpg"
    page.save(filename,'JPEG')
    image_counter += 1
    
filelimit = image_counter - 1

### Part B: Recognizing text from the images using OCR

In [4]:
#creating a text file to write the output
outfile = "converted_pdf.txt"

f = open(outfile,'a')
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\tesseract_ocr\tesseract.exe"

for i in range(1, filelimit + 1):
    filename = "page_" + str(i) + ".jpg"
    text = str(((pytesseract.image_to_string(Image.open(filename))))) 
    text = text.replace('-\n', '')
    f.write(text)
    
f.close()

### 1. Counting number of paragraphs


In [5]:
#function for counting number of paragraphs
def para_counter(file):
    para = 0
    with open(file) as f:
        for lines in f:
            if lines == "\n":
                para += 1
    print("Number of paragraphs in the document are:", para)
    return 
    
para_counter("converted_pdf.txt")

Number of paragraphs in the document are: 78


## Input Paragraph and initialising Flair NER

In [6]:
# input paragraph here
paragraphInput = Sentence("""
The new assembly hall, Dasaratha’s latest pride, was crowded all 
day with visiting dignitaries, royal emissaries, and citizens coming in with 
representations or appeals for justice. The King was always accessible, 
and fulfilled his duties as the ruler of Kosala without grudging the hours 
spent in public service.
On a certain afternoon, messengers at the gate came running in to 
announce, “Sage Viswamithra.” When the message was relayed to the 
King, he got up and hurried forward to receive the visitor. Viswamithra, 
once a king, a conqueror, and a dreaded name until he renounced his 
kingly role and chose to become a sage (which he accomplished through 
severe austerities), combined in himself the sage’s eminence and the 
king’s authority and was quick tempered and positive. Dasaratha led him 
to a proper seat and said, “This is a day of glory for us; your gracious 
presence is most welcome. You must have come from afar. Would you 
first rest?”
“No need,” the sage replied simply. He had complete mastery over 
his bodily needs through inner discipline and austerities, and was above 
the effects of heat, cold, hunger, fatigue, and even decrepitude. The King 
later asked politely, “Is there anything I can do?” Viswamithra looked 
steadily at the King and answered, “Yes. I am here to ask of you a favour. 
I wish to perform, before the next full moon, a yagna at Sidhasrama. 
Doubtless you know where it is?”
“I have passed that sacred ground beyond the Ganges many times.”
The sage interrupted. “But there are creatures hovering about 
waiting to disturb every holy undertaking there, who must be overcome in 
the same manner as one has to conquer the fivefold evils4 within before 
one can realize holiness. Those evil creatures are endowed with 
immeasurable powers of destruction. But it is our duty to pursue our aims 
undeterred. The yagna I propose to perform will strengthen the beneficial 
forces of this world, and please the gods above.”
“It is my duty to protect your sublime effort. Tell me when, and I will 
be there.”
The sage said, “No need to disturb your august self. Send your son 
Rama with me, and he will help me. He can.”
“Rama!” cried the King, surprised, “When I am here to serve you.”
Viswamithra’s temper was already stirring. “I know your greatness,” he 
said, cutting the King short. “But I want Rama to go with me. If you are not 
willing, you may say so.”
The air became suddenly tense. The assembly, the ministers and 
officials, watched in solemn silence. The King looked miserable. “Rama is 
still a child, still learning the art sand practising the use of arms.” His 
sentences never seemed to conclude, but trailed away as he tried to 
explain. “He is a boy, a child, he is too young and tender to contend with 
demons.”
""")

# load the NER tagger
tagger = SequenceTagger.load('ner')

# run NER over sentence
tagger.predict(paragraphInput)

2022-05-15 22:40:47,047 loading file C:\Users\donjo\.flair\models\ner-english\4f4cdab26f24cb98b732b389e6cebc646c36f54cfd6e0b7d3b90b25656e4262f.8baa8ae8795f4df80b28e7f7b61d788ecbb057d1dc85aacb316f1bd02837a4a4
2022-05-15 22:40:49,484 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


### 2. Function to list all names mentioned in input paragraph

In [7]:
def find_names(paragraph):
    print('The following names are found in the given paragraph:')
    names = []
    # iterate over entities and print each
    for entity in paragraph.get_spans('ner'):
        #print(entity)
        if entity.get_label("ner").value == "PER":
            names.append(entity.text)
            #print(entity.get_label("ner").value)
    uniqueNames = set(names)
    print(uniqueNames)
    return

find_names(paragraphInput)    

The following names are found in the given paragraph:
{'Rama', 'Sage Viswamithra', 'Viswamithra', 'King', 'Dasaratha'}


### 3. Function to list all location mentioned in input paragraph

In [8]:

def find_places(paragraph):
    print('The following places are found in the given paragraph:')
    places = []
    # iterate over entities and print each
    for entity in paragraph.get_spans('ner'):
        #print(entity)
        if entity.get_label("ner").value == "LOC":
            places.append(entity.text)
            #print(entity.get_label("ner").value)
    uniquePlaces = set(places)
    print(uniquePlaces)
    return 
   
find_places(paragraphInput)


The following places are found in the given paragraph:
{'Ganges', 'Sidhasrama', 'Kosala'}


### Bonus Questions

In [9]:
    
#Counting the number of words in the paragraph

def word_count(paragraph):
    # using regex findall()
    result = len(re.findall(r'\w+', paragraph.text))

    print("There are " + str(result) + " words.")
    return

#finding heading from the document

def headings():
    text = open('converted_pdf.txt').read()
    pattern = re.compile("(.*.[A-Z'S]+[ A-Z])+\n")
    headings = []

    for match in pattern.finditer(text):
        s=match.start()
        e = match.end()
        headings.append(text[s:e].replace('\n',''))  
    print("Headings from the given document are:")
    print(headings)


word_count(paragraphInput)
headings()

There are 478 words.
Headings from the given document are:
['RAMA’S INITIATION', 'THATAKA’S STORY', 'MAHABALI’S STORY', 'GANGA’S STORY', 'AHALYA’S STORY']
