<a href="https://colab.research.google.com/github/sumanyurosha/practice/blob/master/Stride_Final_Submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Please run below cell to install all the required libraries**
# **requirements.txt**

In [1]:
! apt install libtesseract-dev
!sudo apt install tesseract-ocr
! pip install pytesseract

!pip install pdf2image
!sudo apt-get install poppler-utils

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libleptonica-dev
The following NEW packages will be installed:
  libleptonica-dev libtesseract-dev
0 upgraded, 2 newly installed, 0 to remove and 40 not upgraded.
Need to get 2,755 kB of archives.
After this operation, 13.8 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libleptonica-dev amd64 1.75.3-3 [1,308 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libtesseract-dev amd64 4.00~git2288-10f4998a-2 [1,447 kB]
Fetched 2,755 kB in 0s (18.2 MB/s)
Selecting previously unselected package libleptonica-dev.
(Reading database ... 160837 files and directories currently installed.)
Preparing to unpack .../libleptonica-dev_1.75.3-3_amd64.deb ...
Unpacking libleptonica-dev (1.75.3-3) ...
Selecting previously unselected package libtesseract-dev.
Preparing to unpack .../libtes

In [33]:
import sys
from pdf2image import convert_from_path

"""
Import other libraries/packages here.
"""
import pytesseract
import spacy
import re

In [34]:
def get_paragraph_dict(file_path: str) -> (dict, dict, dict):
    """
    This function takes file path of pdf, reads and processes
    the pdf and returns a dictionary with paragraph_id as key
    and paragraph_text as values. Optionally, it also returns
    a similar dictionary of headings.

    Parameters:
    file_path (str): Relative path of the pdf file with respect
    to current directory.

    Returns:
    dict: {key: paragraph_id, value: paragraph_text}
    dict: {key: heading_id, value: heading_text}
    dict: {key: paragraph_id, value: word_count in the paragraph}

    Example:
    >>> paras, headings, _ = get_paragraph_dict(../RAMA.pdf)
    >>> paras[0]
        'The new assembly hall, Dasaratha’s latest pride, was
        crowded all day with visiting dignitaries, royal emissaries,
        and citizens coming in with representations or appeals for
        justice. The King was always accessible, and fulfilled his
        duties as the ruler of Kosala without grudging the hours
        spent in public service.'
    >>> headings[0]
        'RAMA’S INITIATION'

    """
    paragraph_text_dict, heading_text_dict, para_word_count = {}, {}, {}
    pdf_images = convert_from_path(file_path)

    # Add your code here
    # Note that heading_text_dict and para_word_count are optional
    paragraph_text_dict = extract_paras_from_image(pdf_images)
    paragraph_text_dict, heading_text_dict = extract_paragraphs_and_headings(paragraph_text_dict)
    para_word_count = count_words_in_para(paragraph_text_dict)

    return paragraph_text_dict, heading_text_dict, para_word_count

In [35]:
def extract_paras_from_image(images: list) -> dict:
    """
    This function takes a list of PIL image objects and returns
    a dictionary with paragraph_id as key and paragraph_text as
    values. It processes them one by one and extracts the text
    from the image using Pyteserract library's image_to_string
    method. After extraction it splits the text of a complete
    page into paragraphs on the basis of the continous occurence
    of two new line characters signifying a line break.

    Parameters:
    images (list): A list of PIL.PpmImagePlugin.PpmImageFile objects
    as returned by pdf2image library's convert_from_path() method.

    Returns:
    dict: {key: paragraph_id, value: paragraph_text}

    Example:
    >>> paragraph_text_dict = extract_paras_from_image(pdf_images)
    >>> paragraph_text_dict[1]
        'The new assembly hall, Dasaratha’s latest pride, was crowded
        all\nday with visiting dignitaries, royal emissaries, and
        citizens coming in with\nrepresentations or appeals for justice.
        The King was always accessible,\nand fulfilled his duties as
        the ruler of Kosala without grudging the hours\nspent in public
        service.'
    """


    paragraphs = []
    for image in images:
        extracted_text = pytesseract.image_to_string(image)
        paragraphs.extend(extracted_text.split(sep='\n\n'))

    paragraph_dict = {k: v for k, v in enumerate(paragraphs)}
    return paragraph_dict


In [36]:
def extract_paragraphs_and_headings(para_dict: dict) -> (dict, dict):
    """
    This function takes a dictionary of text paragraphs and seperates
    paragraphs and headings from it and returns two dictionaries with
    paragraph_id, heading_id as key and paragraph_text, heading_text
    as values respectively.

    Parameters:
    para_dict (dict): A dictionary containing paragraph_id as key and
    paragraph_text as values.

    Returns:
    dict: {key: paragraph_id, value: paragraph_text}
    dict: {key: heading_id, value: heading_text}

    Example:
    >>> paragraphs, headings = extract_paragraphs_and_headings(paragraph_text_dict)
    >>> paragraphs[0]
        'The new assembly hall, Dasaratha’s latest pride, was crowded
        all day with visiting dignitaries, royal emissaries, and
        citizens coming in with representations or appeals for justice.
        The King was always accessible, and fulfilled his duties as
        the ruler of Kosala without grudging the hours spent in public
        service.'
    >>> headings[0]
        'RAMA’S INITIATION'
    """


    paragraph_text_dict = {}
    heading_text_dict = {}
    paragraph_id, heading_id = 0, 0
    for para in para_dict.values():
        x = re.findall("^[A-Z’\s]{2,}$", para)
        if x:
            heading_text_dict.update({heading_id: para})
            heading_id += 1
        else:
            paragraph_text_dict.update({paragraph_id: clean_text(para)})
            paragraph_id += 1

    return paragraph_text_dict, heading_text_dict

In [37]:
def count_words_in_para(para_dict: dict) -> dict:
    """
    A function that takes a dictionary with paragraph_id as key and
    paragraph_text as value and counts the number of words in
    paragraph_text and returns a dictionary with paragraph_id as key
    and words_count as key.

    Parameters:
    para_dict (dict): A dictionary containing paragraph_id as key and
    paragraph_text as values.

    Returns:
    dict: {key: paragraph_id, value: words_count}

    Example:
    >>> para_word_count = count_words_in_para(paragraph_text_dict)
    >>> para_word_count[0]
        44
    """
    return {i: len(para.split(sep=' ')) for i, para in para_dict.items()}

In [38]:
def get_persons_name(paragraph: str) -> list:
    """
    This function takes a paragraph text as the input, processes
    it and returns the list of all names of persons that exist in
    the paragraph. If same name occurs multiple times, it returns
    only one instance of it.

    Parameters:
    paragraph (str): Text of the paragraph. Note that it may contain
    some special characters from this set: [. “ ” ; : ? ’]. Other
    special characters are not present in the text.

    Returns:
    list: Unique names of persons present in the paragraph.

    Example:
    >>> paragraph = 'The new assembly hall, Dasaratha’s latest pride,
        was crowded all day with visiting dignitaries, royal emissaries,
        and citizens coming in with representations or appeals for
        justice. The King was always accessible, and fulfilled his
        duties as the ruler of Kosala without grudging the hours
        spent in public service.'
    >>> names = get_persons_name(paragraph)
    >>> names
        [Dasaratha]

    """
    person_names = []

    # Write your code here
    person_names = perform_ner(paragraph, 'PERSON')

    return list(set(person_names))

In [39]:
def clean_text(text: str) -> str:
    """
    This function takes text as input, replaces some tokens from it
    and returns the cleaned text.

    Parameters:
    text (str): Text of the paragraph which contains some unnecessary
    tokens and special characters like '|'

    Returns:
    str: cleaned text of paragraph

	>>> paragraph = 'The new assembly hall, Dasaratha’s latest pride, was
		crowded all\nday with visiting dignitaries, royal emissaries, and
		citizens coming in with\nrepresentations or appeals for justice.
		The King was always accessible,\nand fulfilled his duties as
		the ruler of Kosala without grudging the hours\nspent in public
		service.'
    >>> cleaned_text = clean_text(paragraph)
    >>> cleaned_text
        'The new assembly hall, Dasaratha’s latest pride, was crowded
		all day with visiting dignitaries, royal emissaries, and
		citizens coming in with representations or appeals for justice.
		The King was always accessible, and fulfilled his duties as
		the ruler of Kosala without grudging the hours spent in public
		service.'
    """
    text = text.replace("\n", " ")
    text = text.replace("|", "I")
    text = text.replace("\x0c", "")
    return text


In [40]:
def perform_ner(text: str, label: str) -> list:
    """
	A function that takes text and label and performs Named Entity
	Recogniton on that text and returns a list of entities found in
	the given text matching given label.

	Parameters:
	text (str): Text of a paragraph from the document.
	label (str): Label which we want to search in the given text.
	Possible values contain 'PERSON', 'GPE' and 'LOC'

	Returns:
	list: A list of entities found in the text. Empty list is returned
	if no matching entity is found.

	Example:
	>>> paragraph = 'The new assembly hall, Dasaratha’s latest pride,
        was crowded all day with visiting dignitaries, royal emissaries,
        and citizens coming in with representations or appeals for
        justice. The King was always accessible, and fulfilled his
        duties as the ruler of Kosala without grudging the hours
        spent in public service.'
	>>> perform_ner(paragraph, 'PERSON')
	>>> ['Dasaratha']
    """
    NER = spacy.load('en_core_web_sm')
    ner_text = NER(text)

    entity = []
    for word in ner_text.ents:
        if word.label_ == label:
            entity.append(word.text)

    return entity


In [41]:
def get_places_name(paragraph: str) -> list:
    """
    This function takes a paragraph text as the input, processes
    it and returns the list of all names of places that exist in
    the paragraph. If same name occurs multiple times, it returns
    only one instance of it.

    Parameters:
    paragraph (str): Text of the paragraph. Note that it may contain
    some special characters from this set: [. “ ” ; : ? ’]. Other
    special characters are not present in the text.

    Returns:
    list: Unique names of places present in the paragraph.

    Example:
    >>> paragraph = 'The new assembly hall, Dasaratha’s latest pride,
        was crowded all day with visiting dignitaries, royal emissaries,
        and citizens coming in with representations or appeals for
        justice. The King was always accessible, and fulfilled his
        duties as the ruler of Kosala without grudging the hours
        spent in public service.'
    >>> places = get_places_name(paragraph)
    >>> places
        [Kosala]

    """
    places_names = []

    # Write your code here
    places_names = perform_ner(paragraph, 'GPE') + perform_ner(paragraph, 'LOC')

    return list(set(places_names))



In [42]:
paras, headings, word_counts = get_paragraph_dict('/content/RAMA.pdf')

In [43]:
paras

{0: 'The new assembly hall, Dasaratha’s latest pride, was crowded all day with visiting dignitaries, royal emissaries, and citizens coming in with representations or appeals for justice. The King was always accessible, and fulfilled his duties as the ruler of Kosala without grudging the hours spent in public service.',
 1: 'On a certain afternoon, messengers at the gate came running in to announce, “Sage Viswamithra.” When the message was relayed to the King, he got up and hurried forward to receive the visitor. Viswamithra, once a king, a conqueror, and a dreaded name until he renounced his kingly role and chose to become a sage (which he accomplished through severe austerities), combined in himself the sage’s eminence and the king’s authority and was quick tempered and positive. Dasaratha led him to a proper seat and said, “This is a day of glory for us; your gracious presence is most welcome. You must have come from afar. Would you first rest?”',
 2: '“No need,” the sage replied sim

In [44]:
headings

{0: 'RAMA’S INITIATION',
 1: 'THATAKA’S STORY',
 2: 'MAHABALI’S STORY',
 3: 'GANGA’S STORY',
 4: 'AHALYA’S STORY'}

In [45]:
word_counts

{0: 48,
 1: 110,
 2: 80,
 3: 11,
 4: 77,
 5: 17,
 6: 23,
 7: 44,
 8: 36,
 9: 29,
 10: 11,
 11: 47,
 12: 115,
 13: 36,
 14: 9,
 15: 31,
 16: 29,
 17: 24,
 18: 25,
 19: 13,
 20: 7,
 21: 5,
 22: 19,
 23: 48,
 24: 8,
 25: 88,
 26: 9,
 27: 20,
 28: 59,
 29: 23,
 30: 125,
 31: 168,
 32: 96,
 33: 29,
 34: 43,
 35: 281,
 36: 41,
 37: 15,
 38: 74,
 39: 166,
 40: 8,
 41: 144,
 42: 14,
 43: 150,
 44: 71,
 45: 66,
 46: 11,
 47: 18,
 48: 10,
 49: 6,
 50: 26,
 51: 1,
 52: 39,
 53: 61,
 54: 105,
 55: 68,
 56: 47,
 57: 25,
 58: 90,
 59: 16,
 60: 89,
 61: 32,
 62: 4,
 63: 96,
 64: 62,
 65: 246,
 66: 42,
 67: 177,
 68: 25,
 69: 127,
 70: 169,
 71: 123,
 72: 116,
 73: 93,
 74: 107,
 75: 163,
 76: 109,
 77: 131,
 78: 70,
 79: 80,
 80: 26,
 81: 22,
 82: 97,
 83: 47,
 84: 30,
 85: 68}

In [46]:
get_persons_name(paras[0])

['Dasaratha']

In [47]:
get_places_name(paras[0])

['Kosala']

In [48]:
paras[0]

'The new assembly hall, Dasaratha’s latest pride, was crowded all day with visiting dignitaries, royal emissaries, and citizens coming in with representations or appeals for justice. The King was always accessible, and fulfilled his duties as the ruler of Kosala without grudging the hours spent in public service.'

# **Complete script in one cell**

In [8]:
import sys
from pdf2image import convert_from_path

"""
Import other libraries/packages here.
"""
import pytesseract
import spacy
import re


def get_paragraph_dict(file_path: str) -> (dict, dict, dict):
    """
    This function takes file path of pdf, reads and processes
    the pdf and returns a dictionary with paragraph_id as key
    and paragraph_text as values. Optionally, it also returns
    a similar dictionary of headings.

    Parameters:
    file_path (str): Relative path of the pdf file with respect
    to current directory.

    Returns:
    dict: {key: paragraph_id, value: paragraph_text}
    dict: {key: heading_id, value: heading_text}
    dict: {key: paragraph_id, value: word_count in the paragraph}

    Example:
    >>> paras, headings, _ = get_paragraph_dict(../RAMA.pdf)
    >>> paras[0]
        'The new assembly hall, Dasaratha’s latest pride, was
        crowded all day with visiting dignitaries, royal emissaries,
        and citizens coming in with representations or appeals for
        justice. The King was always accessible, and fulfilled his
        duties as the ruler of Kosala without grudging the hours
        spent in public service.'
    >>> headings[0]
        'RAMA’S INITIATION'

    """
    paragraph_text_dict, heading_text_dict, para_word_count = {}, {}, {}
    pdf_images = convert_from_path(file_path)

    # Add your code here
    # Note that heading_text_dict and para_word_count are optional
    paragraph_text_dict = extract_paras_from_image(pdf_images)
    paragraph_text_dict, heading_text_dict = extract_paragraphs_and_headings(paragraph_text_dict)
    para_word_count = count_words_in_para(paragraph_text_dict)

    return paragraph_text_dict, heading_text_dict, para_word_count


def extract_paras_from_image(images: list) -> dict:
    """
    This function takes a list of PIL image objects and returns
    a dictionary with paragraph_id as key and paragraph_text as
    values. It processes them one by one and extracts the text
    from the image using Pyteserract library's image_to_string
    method. After extraction it splits the text of a complete
    page into paragraphs on the basis of the continous occurence
    of two new line characters signifying a line break.

    Parameters:
    images (list): A list of PIL.PpmImagePlugin.PpmImageFile objects
    as returned by pdf2image library's convert_from_path() method.

    Returns:
    dict: {key: paragraph_id, value: paragraph_text}

    Example:
    >>> paragraph_text_dict = extract_paras_from_image(pdf_images)
    >>> paragraph_text_dict[1]
        'The new assembly hall, Dasaratha’s latest pride, was crowded
        all\nday with visiting dignitaries, royal emissaries, and
        citizens coming in with\nrepresentations or appeals for justice.
        The King was always accessible,\nand fulfilled his duties as
        the ruler of Kosala without grudging the hours\nspent in public
        service.'
    """


    paragraphs = []
    for image in images:
        extracted_text = pytesseract.image_to_string(image)
        paragraphs.extend(extracted_text.split(sep='\n\n'))

    paragraph_dict = {k: v for k, v in enumerate(paragraphs)}
    return paragraph_dict


def extract_paragraphs_and_headings(para_dict: dict) -> (dict, dict):
    """
    This function takes a dictionary of text paragraphs and seperates
    paragraphs and headings from it and returns two dictionaries with
    paragraph_id, heading_id as key and paragraph_text, heading_text
    as values respectively.

    Parameters:
    para_dict (dict): A dictionary containing paragraph_id as key and
    paragraph_text as values.

    Returns:
    dict: {key: paragraph_id, value: paragraph_text}
    dict: {key: heading_id, value: heading_text}

    Example:
    >>> paragraphs, headings = extract_paragraphs_and_headings(paragraph_text_dict)
    >>> paragraphs[0]
        'The new assembly hall, Dasaratha’s latest pride, was crowded
        all day with visiting dignitaries, royal emissaries, and
        citizens coming in with representations or appeals for justice.
        The King was always accessible, and fulfilled his duties as
        the ruler of Kosala without grudging the hours spent in public
        service.'
    >>> headings[0]
        'RAMA’S INITIATION'
    """


    paragraph_text_dict = {}
    heading_text_dict = {}
    paragraph_id, heading_id = 0, 0
    for para in para_dict.values():
        x = re.findall("^[A-Z’\s]{2,}$", para)
        if x:
            heading_text_dict.update({heading_id: para})
            heading_id += 1
        else:
            paragraph_text_dict.update({paragraph_id: clean_text(para)})
            paragraph_id += 1

    return paragraph_text_dict, heading_text_dict


def count_words_in_para(para_dict: dict) -> dict:
    """
    A function that takes a dictionary with paragraph_id as key and
    paragraph_text as value and counts the number of words in
    paragraph_text and returns a dictionary with paragraph_id as key
    and words_count as key.

    Parameters:
    para_dict (dict): A dictionary containing paragraph_id as key and
    paragraph_text as values.

    Returns:
    dict: {key: paragraph_id, value: words_count}

    Example:
    >>> para_word_count = count_words_in_para(paragraph_text_dict)
    >>> para_word_count[0]
        44
    """
    return {i: len(para.split(sep=' ')) for i, para in para_dict.items()}


def get_persons_name(paragraph: str) -> list:
    """
    This function takes a paragraph text as the input, processes
    it and returns the list of all names of persons that exist in
    the paragraph. If same name occurs multiple times, it returns
    only one instance of it.

    Parameters:
    paragraph (str): Text of the paragraph. Note that it may contain
    some special characters from this set: [. “ ” ; : ? ’]. Other
    special characters are not present in the text.

    Returns:
    list: Unique names of persons present in the paragraph.

    Example:
    >>> paragraph = 'The new assembly hall, Dasaratha’s latest pride,
        was crowded all day with visiting dignitaries, royal emissaries,
        and citizens coming in with representations or appeals for
        justice. The King was always accessible, and fulfilled his
        duties as the ruler of Kosala without grudging the hours
        spent in public service.'
    >>> names = get_persons_name(paragraph)
    >>> names
        [Dasaratha]

    """
    person_names = []

    # Write your code here
    person_names = perform_ner(paragraph, 'PERSON')

    return list(set(person_names))


def clean_text(text: str) -> str:
    """
    This function takes text as input, replaces some tokens from it
    and returns the cleaned text.

    Parameters:
    text (str): Text of the paragraph which contains some unnecessary
    tokens and special characters like '|'

    Returns:
    str: cleaned text of paragraph

	>>> paragraph = 'The new assembly hall, Dasaratha’s latest pride, was
		crowded all\nday with visiting dignitaries, royal emissaries, and
		citizens coming in with\nrepresentations or appeals for justice.
		The King was always accessible,\nand fulfilled his duties as
		the ruler of Kosala without grudging the hours\nspent in public
		service.'
    >>> cleaned_text = clean_text(paragraph)
    >>> cleaned_text
        'The new assembly hall, Dasaratha’s latest pride, was crowded
		all day with visiting dignitaries, royal emissaries, and
		citizens coming in with representations or appeals for justice.
		The King was always accessible, and fulfilled his duties as
		the ruler of Kosala without grudging the hours spent in public
		service.'
    """
    text = text.replace("\n", " ")
    text = text.replace("|", "I")
    text = text.replace("\x0c", "")
    return text


def perform_ner(text: str, label: str) -> list:
    """
	A function that takes text and label and performs Named Entity
	Recogniton on that text and returns a list of entities found in
	the given text matching given label.

	Parameters:
	text (str): Text of a paragraph from the document.
	label (str): Label which we want to search in the given text.
	Possible values contain 'PERSON', 'GPE' and 'LOC'

	Returns:
	list: A list of entities found in the text. Empty list is returned
	if no matching entity is found.

	Example:
	>>> paragraph = 'The new assembly hall, Dasaratha’s latest pride,
        was crowded all day with visiting dignitaries, royal emissaries,
        and citizens coming in with representations or appeals for
        justice. The King was always accessible, and fulfilled his
        duties as the ruler of Kosala without grudging the hours
        spent in public service.'
	>>> perform_ner(paragraph, 'PERSON')
	>>> ['Dasaratha']
    """
    NER = spacy.load('en_core_web_sm')
    ner_text = NER(text)

    entity = []
    for word in ner_text.ents:
        if word.label_ == label:
            entity.append(word.text)

    return entity


def get_places_name(paragraph: str) -> list:
    """
    This function takes a paragraph text as the input, processes
    it and returns the list of all names of places that exist in
    the paragraph. If same name occurs multiple times, it returns
    only one instance of it.

    Parameters:
    paragraph (str): Text of the paragraph. Note that it may contain
    some special characters from this set: [. “ ” ; : ? ’]. Other
    special characters are not present in the text.

    Returns:
    list: Unique names of places present in the paragraph.

    Example:
    >>> paragraph = 'The new assembly hall, Dasaratha’s latest pride,
        was crowded all day with visiting dignitaries, royal emissaries,
        and citizens coming in with representations or appeals for
        justice. The King was always accessible, and fulfilled his
        duties as the ruler of Kosala without grudging the hours
        spent in public service.'
    >>> places = get_places_name(paragraph)
    >>> places
        [Kosala]

    """
    places_names = []

    # Write your code here
    places_names = perform_ner(paragraph, 'GPE') + perform_ner(paragraph, 'LOC')

    return list(set(places_names))

