## The Problems of NLP in Under-Resourced Languages

## Getting the Data from PDF to Text

In [1]:
# Source for this section: another TAP Institute course by Hannah Jacobs => https://hub.binder.constellate.org/user/hlj24-tapi_2021_ocr-ncsnihj6/notebooks/01-WhatIsOCR.ipynb

In [2]:
# Install the Tesseract OCR engine into the Binder environment from conda-forge.
# The leading exclamation mark runs the line as a shell command instead of Python;
# -y answers the confirmation prompt automatically so the cell does not block.
# This may take 1-2 minutes.
# Source: Nathan Kelber & JStor Labs Constellate team.
!conda install -c conda-forge -y tesseract

In [3]:
!wget https://github.com/tesseract-ocr/tessdata/raw/master/eng.traineddata
!mv eng.traineddata /srv/conda/envs/notebook/share/tessdata/eng.traineddata

In [4]:
# # Import the Image module from the Pillow Library, which will help us access the image.
# from PIL import Image

# # Import the pytesseract library, which will run the OCR process.
import pytesseract

# # Open a specific image file, convert the text in the image to computer-readable text (OCR),
# # and then print the results for us to see here.
# print(pytesseract.image_to_string(Image.open("data/sample_mgh.JPG"), lang="lat"))

In [5]:
import cv2

# cv2.imread does not raise when the file is missing or unreadable; it returns
# None. Fail here with a clear message instead of an AttributeError on .copy().
image = cv2.imread("data/sample_mgh.JPG")
if image is None:
    raise FileNotFoundError("Could not read image: data/sample_mgh.JPG")
# Keep a separate copy of the page so crops can be taken from it later.
base_image = image.copy()

In [6]:
def find_footnote_line(image, base_image, max_height=25, min_width=250):
    """Locate thin, wide contours (candidate footnote rules) on a page image.

    The image is converted to grayscale, blurred with a (3, 3) Gaussian,
    binarised with an inverted Otsu threshold and dilated once with a
    (3, 10) rectangular kernel. The bounding boxes of the external contours
    are then filtered by size.

    Args:
        image: BGR image to analyse. It is also written, as-is, to
            "data/sample_boxes.png".
        base_image: Accepted for backward compatibility; not used.
        max_height: A box is kept only if its height is strictly below this.
        min_width: A box is kept only if its width is strictly above this.

    Returns:
        A list of [x, y, w, h] boxes ordered by their y coordinate
        (ascending). The list is empty when nothing matches.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 10))
    dilated = cv2.dilate(thresh, kernel, iterations=1)

    # findContours returns either a 2-tuple or a 3-tuple depending on the
    # OpenCV version; the contour list is element 0 or 1 respectively.
    found = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = found[0] if len(found) == 2 else found[1]

    # Compute each bounding box once, then order the boxes top-to-bottom.
    boxes = sorted((cv2.boundingRect(c) for c in contours), key=lambda box: box[1])
    main_line = [
        [x, y, w, h]
        for x, y, w, h in boxes
        if h < max_height and w > min_width
    ]

    # Nothing is drawn on `image` in this function, so this writes it unchanged.
    cv2.imwrite("data/sample_boxes.png", image)
    return main_line

In [7]:
main_line = find_footnote_line(image, base_image)
# Fail with a clear message instead of a bare IndexError when no candidate
# rule was detected on the page.
if not main_line:
    raise ValueError("No footnote separator line was detected in the page image")
# Boxes come back ordered by y, so the first entry is the topmost candidate.
x, y, w, h = main_line[0]
# Keep rows 25..y (everything above the rule) and drop the last 25 columns.
new = base_image[25:y, 0:-25]
cv2.imwrite("data/extraction.png", new)

True

In [8]:
# NOTE(review): "data/final.jpg" is only written further down, by
# cv2.imwrite("data/final.jpg", final), so on a fresh run this read depends on a
# file left over from an earlier session (cv2.imread yields None if it is missing).
# The value assigned to `image` here is not passed to any of the calls that
# follow (they use `new`, `final` and `base_image`) - confirm whether this line
# is still needed.
image = cv2.imread("data/final.jpg")
def find_body(image, base_image, min_height=200, min_width=250):
    """Crop the large text block (page body) detected in `image`.

    The image is converted to grayscale, blurred with a (5, 5) Gaussian,
    binarised with an inverted Otsu threshold and dilated once with a
    (3, 50) rectangular kernel. External contours whose bounding box is
    taller than `min_height` and wider than `min_width` are candidates.
    When several qualify, the one with the largest y coordinate wins.

    NOTE(review): box coordinates are measured on `image` but the crop is
    taken from `base_image`. The two only line up if both share the same
    origin - confirm against the caller.

    Args:
        image: BGR image to analyse. It is also written, as-is, to
            "data/body_text.png".
        base_image: Image the returned region is sliced from.
        min_height: A box qualifies only if its height is strictly above this.
        min_width: A box qualifies only if its width is strictly above this.

    Returns:
        The slice base_image[y:y+h, x:x+w] for the selected box.

    Raises:
        ValueError: If no contour satisfies the size limits. Previously this
            surfaced as an UnboundLocalError on the return statement.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (5, 5), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 50))
    dilated = cv2.dilate(thresh, kernel, iterations=1)

    # findContours returns either a 2-tuple or a 3-tuple depending on the
    # OpenCV version; the contour list is element 0 or 1 respectively.
    found = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = found[0] if len(found) == 2 else found[1]

    # Order the boxes top-to-bottom and keep the ones large enough to be body text.
    boxes = sorted((cv2.boundingRect(c) for c in contours), key=lambda box: box[1])
    candidates = [box for box in boxes if box[3] > min_height and box[2] > min_width]

    # Nothing is drawn on `image` in this function, so this writes it unchanged.
    cv2.imwrite("data/body_text.png", image)

    if not candidates:
        raise ValueError(
            "No body region found: no contour is taller than "
            f"{min_height}px and wider than {min_width}px"
        )

    # Same choice as the original loop: the last qualifying box wins.
    x, y, w, h = candidates[-1]
    return base_image[y:y + h, x:x + w]

In [9]:
# Detect the body text block on the cropped page (`new`) and save the result.
# NOTE(review): `new` was sliced as base_image[25:y, 0:-25], so box coordinates
# found on `new` are offset by 25 rows relative to `base_image`, which is the
# image find_body crops from - confirm this shift is intended.
final = find_body(new, base_image)
cv2.imwrite("data/final.jpg", final)

True

In [10]:
def find_paras(image, base_image, min_height=200, min_width=600):
    """Crop the large paragraph region detected in `image`.

    The image is converted to grayscale, blurred with a (3, 3) Gaussian,
    binarised with an inverted Otsu threshold and dilated ten times with a
    (4, 4) rectangular kernel. External contours whose bounding box is
    taller than `min_height` and wider than `min_width` are candidates.
    When several qualify, the one with the largest y coordinate wins.

    Side effect: a rectangle is drawn on `image` (the caller's array) around
    every qualifying box. The returned crop comes from a copy taken before
    any drawing, so it contains no rectangles.

    Args:
        image: BGR image to analyse; modified in place as described above.
        base_image: Accepted for backward compatibility; not used. The crop
            is always taken from a copy of `image`.
        min_height: A box qualifies only if its height is strictly above this.
        min_width: A box qualifies only if its width is strictly above this.

    Returns:
        The slice [y:y+h, x:x+w] of the unmarked copy of `image`.

    Raises:
        ValueError: If no contour satisfies the size limits. Previously this
            surfaced as an UnboundLocalError on the return statement.
    """
    # Snapshot the input before any rectangles are drawn on it.
    clean = image.copy()

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (4, 4))
    dilated = cv2.dilate(thresh, kernel, iterations=10)

    # findContours returns either a 2-tuple or a 3-tuple depending on the
    # OpenCV version; the contour list is element 0 or 1 respectively.
    found = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = found[0] if len(found) == 2 else found[1]
    boxes = sorted((cv2.boundingRect(c) for c in contours), key=lambda box: box[1])

    roi = None
    for x, y, w, h in boxes:
        if h > min_height and w > min_width:
            roi = clean[y:y + h, x:x + w]
            cv2.rectangle(image, (x, y), (x + w, y + h), (36, 255, 12), 2)

    if roi is None:
        raise ValueError(
            "No paragraph region found: no contour is taller than "
            f"{min_height}px and wider than {min_width}px"
        )
    return roi

In [11]:
# Crop the paragraph region out of the body image and write it to data/last.jpg.
# find_paras slices from its own copy of `final`; the base_image argument passed
# here is not what the crop is taken from.
last = find_paras(final, base_image)
cv2.imwrite("data/last.jpg", last)

True

In [19]:
# Run Tesseract on the cropped paragraph image with the "lat" language model.
# This requires lat.traineddata to be present in Tesseract's tessdata directory.
ocr_result = pytesseract.image_to_string(last, lang="lat")
print (ocr_result)

DILECTISSIMO* AMICO TOTIUS PROSPERITATIS PRAESENTIS ET AETERNAE
BEATITUDINIS PERPETUAM SALUTEM.

Magna mihi laetitia est de bona voluntate vestra, quam audivi a fratre nostro
Benedicto! in vobis esse. Opto atque Deum deprecor, ut citius cum omni convenien-
tia perficiatur. Seriptum est enim: 'Ne tardes converti ad dominum Deum; quia
nescis, quid ventura pariat dies, Erue te de harum carcere tribulationum, quae in
hoe mundo fidelium animos torquere solent"; sicut scriptum est: *Multae tribulationes
iustorum; ut, quod sequitur, tibi evenire merearis: 'Sed de his omnibus liberavit eos
Dominus. Et cave diligentissime, ne qua te, aratrum Domini tenentem, iniustitia
retro revocet. Nemo miles sarcinis alienis onustus ad bella bene procedit, nisi armis
tantummodo victrieibus, vel ad defensionem sui vel ad laesionem adversarii.

Omnia quae vobis demandare necessaria videbantur mihi fidelissimo fratri Bene-
dieto dixi: loca, adiutorium et animi constantiam.

Sed scire debes, quod in omni loco, u

In [20]:
# Split the OCR text into sections at every blank line (two consecutive
# newlines) and report how many sections were found.
sections = ocr_result.split("\n\n")
print (len(sections))

4


In [27]:
def clean_section(sec):
    """Normalise one OCR section into a single line of running text.

    Steps, in order:
      1. Re-join words hyphenated across a line break ("-\\n" is removed).
      2. Turn the remaining newlines into spaces.
      3. Replace asterisks with a space and double quotes with single quotes.
      4. Collapse runs of spaces into a single space.
      5. Remove the space in front of commas, full stops and semicolons.

    Step 5 runs last on purpose. Replacing "*" introduces new spaces, so
    fixing punctuation earlier leaves "word*," as "word ," and "word  ," as
    "word ,".

    Args:
        sec: Raw text of one section.

    Returns:
        The cleaned section as a string.
    """
    sec = sec.replace("-\n", "").replace("\n", " ")
    sec = sec.replace("*", " ").replace("\"", "\'")
    while "  " in sec:
        sec = sec.replace("  ", " ")
    sec = sec.replace(" ,", ",").replace(" .", ".").replace(" ;", ";")
    return sec


final_sections = [clean_section(sec) for sec in sections]
# Sections stay separated by a blank line, as they were in the OCR output.
cleaned_text = "\n\n".join(final_sections)
print(cleaned_text)

DILECTISSIMO AMICO TOTIUS PROSPERITATIS PRAESENTIS ET AETERNAE BEATITUDINIS PERPETUAM SALUTEM.

Magna mihi laetitia est de bona voluntate vestra, quam audivi a fratre nostro Benedicto! in vobis esse. Opto atque Deum deprecor, ut citius cum omni convenientia perficiatur. Seriptum est enim: 'Ne tardes converti ad dominum Deum; quia nescis, quid ventura pariat dies, Erue te de harum carcere tribulationum, quae in hoe mundo fidelium animos torquere solent'; sicut scriptum est: Multae tribulationes iustorum; ut, quod sequitur, tibi evenire merearis: 'Sed de his omnibus liberavit eos Dominus. Et cave diligentissime, ne qua te, aratrum Domini tenentem, iniustitia retro revocet. Nemo miles sarcinis alienis onustus ad bella bene procedit, nisi armis tantummodo victrieibus, vel ad defensionem sui vel ad laesionem adversarii.

Omnia quae vobis demandare necessaria videbantur mihi fidelissimo fratri Benedieto dixi: loca, adiutorium et animi constantiam.

Sed scire debes, quod in omni loco, ubi hom

## Introduction to Word Embeddings

In [13]:
import fasttext

In [14]:
# Train an unsupervised fastText model on the text file data/100.txt.
# No other arguments are passed, so the library's defaults apply.
model = fasttext.train_unsupervised("data/100.txt")

In [15]:
def find_matches(search_word):
    """Return the nearest neighbours of `search_word` from the global `model`.

    The value is whatever `model.get_nearest_neighbors` returns, unmodified.
    """
    return model.get_nearest_neighbors(search_word)

In [16]:
# Show the model's nearest neighbours for the word "Carolus".
find_matches("Carolus")

In [17]:
def find_relationships(base_word, is_to, second_word):
    """Query the global `model` for analogies over three words.

    The arguments are forwarded to `model.get_analogies` positionally, in the
    order (base_word, is_to, second_word), and its result is returned as-is.
    """
    return model.get_analogies(base_word, is_to, second_word)

In [18]:
# Show the model's analogies for the word triple ("Carolus", "rex", "abbas").
find_relationships("Carolus", "rex", "abbas")