<a href="https://colab.research.google.com/github/yoshighosh/OCR-Testing/blob/main/OCR_Prototypes_(PDF_Version).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Testing Tesseract OCR Models

**Steps:**
- Load in image file
- Preprocess using OpenCV
- Run Tesseract OCR on each page and analyze word frequencies in the extracted text

# Convert PDF to image

In [None]:
pip install pdf2image

In [None]:
!apt-get install poppler-utils 

In [None]:
pip install python-poppler-qt5

In [None]:
from pdf2image import convert_from_path
from IPython.display import Image

# PDFs available for testing.
# NOTE(review): these paths assume Google Drive is already mounted at
# /content/drive; no mount cell is visible in this notebook.
pdf_paths = [
    "/content/drive/MyDrive/Aroshi_highSchool/SEAP vounteer/pdf files/Depth-First Search and Linear Graph Algorithms - Tarjan (1972).pdf",
    "/content/drive/MyDrive/Aroshi_highSchool/SEAP vounteer/pdf files/Efficient String Matching - An Aid to Bibliographic Search - Aho-Corasick (1975).pdf",
]

# Only the first PDF is processed by the rest of the notebook.
file = pdf_paths[0]

# Render every page of the PDF as an image at 500 DPI.
pages = convert_from_path(file, dpi=500)



In [None]:
# Write each rendered page to the working directory as page1.jpg, page2.jpg, ...
# and remember the file names so the next section can load them with OpenCV.
page_paths = []
pagenumber = 1

for page in pages:
    filename = f"page{pagenumber}.jpg"
    page_paths.append(filename)
    pagenumber += 1
    page.save(filename, format='JPEG')
    # display(page)  # uncomment to preview each page inline

# Loading in the Image file

Use OpenCV to load each saved page image.


In [None]:
import cv2
import numpy as np
from google.colab.patches import cv2_imshow 

#paths = ['/content/drive/MyDrive/Aroshi_highSchool/SEAP vounteer/image files/t1.tif', '/content/drive/MyDrive/Aroshi_highSchool/SEAP vounteer/image files/t2.tif', '/content/drive/MyDrive/Aroshi_highSchool/SEAP vounteer/image files/t5.tif', '/content/drive/MyDrive/Aroshi_highSchool/SEAP vounteer/image files/New Image.jpg', '/content/drive/MyDrive/Aroshi_highSchool/SEAP vounteer/image files/Picture_003.jpg', '/content/drive/MyDrive/Aroshi_highSchool/SEAP vounteer/image files/Picture_010.tif', '/content/drive/MyDrive/Aroshi_highSchool/SEAP vounteer/image files/Picture_025.tif', '/content/drive/MyDrive/Aroshi_highSchool/SEAP vounteer/image files/Picture_029.tif']


#img = cv2.imread('/content/drive/MyDrive/Aroshi_highSchool/SEAP vounteer/image files/Picture_003.jpg')

# Load every saved page image with OpenCV.
images = []

for page_path in page_paths:
  page_image = cv2.imread(page_path)
  # cv2.imread reports an unreadable or missing file by returning None rather
  # than raising; fail here with the offending path instead of letting a None
  # entry break a later preprocessing or OCR cell.
  if page_image is None:
    raise FileNotFoundError(f"OpenCV could not read image file: {page_path}")
  images.append(page_image)
  



# TODO: consider supporting other input types: PDF, PPTX, Word documents


In [None]:
# display each page using CV

def displayImages(page_images=None):
  """Display images inline, one after another, using ``cv2_imshow``.

  Args:
    page_images: Optional iterable of images to show. When omitted, the
      notebook-level ``images`` list is used, which keeps the original
      zero-argument call working.

  Returns:
    None.
  """
  if page_images is None:
    page_images = images
  for image in page_images:
    cv2_imshow(image)

# Different Preprocessing functions

Methods for preprocessing the image before running it through the OCR

In [None]:
# get grayscale image, good for getting rid of color issues
def get_grayscale(image):
    """Convert a BGR image to a single-channel grayscale image.

    Args:
        image: Image array. A 2-D array is treated as already grayscale.

    Returns:
        The grayscale image. A 2-D input is returned unchanged (same object),
        so re-running a cell on already-converted pages does not raise.
    """
    # cvtColor with COLOR_BGR2GRAY needs a multi-channel input; a 2-D array
    # has no channel axis left to convert.
    if np.ndim(image) == 2:
        return image
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# noise removal, makes image clearer
def remove_noise(image):
    """Smooth the image with a median blur using an aperture size of 5.

    Args:
        image: Image to denoise.

    Returns:
        The blurred image produced by ``cv2.medianBlur``.
    """
    aperture_size = 5
    denoised = cv2.medianBlur(image, aperture_size)
    return denoised
 
# thresholding, removes extra marks (lines, smudges, etc)
def thresholding(image):
    """Binarize the image using Otsu's automatically chosen threshold.

    Args:
        image: Image to binarize.
            NOTE(review): Otsu thresholding expects an 8-bit single-channel
            image; confirm callers pass grayscale input.

    Returns:
        The thresholded image, i.e. the second element of the tuple returned
        by ``cv2.threshold``; the output maximum value is 255.
    """
    # The threshold argument (0) is a placeholder: with THRESH_OTSU the
    # cut-off is computed by OpenCV.
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, binary = cv2.threshold(image, 0, 255, mode)
    return binary

# dilation, bolds text to make it easier to read
def dilate(image):
    """Thicken dark strokes using a 5x5 kernel.

    This deliberately calls ``cv2.erode``: OpenCV's erosion keeps the local
    minimum under the kernel, so bright regions shrink and dark regions grow.
    NOTE(review): the function name presumably describes the effect on dark
    text over a light page; on white-on-black input it thins the strokes.

    Args:
        image: Image to process.

    Returns:
        The result of one ``cv2.erode`` iteration.
    """
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    bolder = cv2.erode(image, structuring_element, iterations=1)
    return bolder
    
    
# erosion, shrinks the edges to make text thinner
def erode(image):
    """Thin dark strokes using a 5x5 kernel.

    This deliberately calls ``cv2.dilate``: OpenCV's dilation keeps the local
    maximum under the kernel, so bright regions grow and dark regions shrink.
    NOTE(review): the function name presumably describes the effect on dark
    text over a light page; on white-on-black input it thickens the strokes.

    Args:
        image: Image to process.

    Returns:
        The result of one ``cv2.dilate`` iteration.
    """
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    thinner = cv2.dilate(image, structuring_element, iterations=1)
    return thinner

# opening - erosion followed by dilation
def opening(image):
    """Apply a morphological opening with a 5x5 kernel.

    "Erosion" and "dilation" here are meant in OpenCV's sense (``cv2.erode``
    then ``cv2.dilate``), not in the sense of this notebook's ``erode`` and
    ``dilate`` helpers, which wrap the opposite OpenCV calls.

    Args:
        image: Image to process.

    Returns:
        The result of ``cv2.morphologyEx`` with ``cv2.MORPH_OPEN``.
    """
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    opened = cv2.morphologyEx(image, cv2.MORPH_OPEN, structuring_element)
    return opened

# canny edge detection, reverses color layout, not that useful
def canny(image):
    """Run Canny edge detection with thresholds 100 and 200.

    Args:
        image: Image to process.

    Returns:
        The edge map returned by ``cv2.Canny``.
    """
    lower_threshold, upper_threshold = 100, 200
    edges = cv2.Canny(image, lower_threshold, upper_threshold)
    return edges

# skew correction, corrects angled writing, good for handwriting??
def deskew(image):
    """Rotate the image by the angle of the minimum-area rectangle around its
    non-zero pixels.

    Args:
        image: Image to straighten.
            NOTE(review): every pixel > 0 counts as foreground, so this
            presumably expects a single-channel image with bright text on a
            zero background (e.g. an inverted, thresholded page); on a normal
            white page almost every pixel qualifies. Confirm against callers.

    Returns:
        The rotated image, same width and height as the input. An image with
        no non-zero pixels is returned unchanged.
    """
    # np.where yields one index array per axis; stacking them gives one
    # (row, col) point per foreground pixel.
    coords = np.column_stack(np.where(image > 0))
    if coords.size == 0:
        # Blank image: there is no point set to fit a rectangle to, so there
        # is no skew to estimate.
        return image
    angle = cv2.minAreaRect(coords)[-1]
    # NOTE(review): this normalization matches OpenCV releases that report
    # the rectangle angle in [-90, 0); newer releases are reported to use a
    # different range -- verify against the installed OpenCV version.
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    # BORDER_REPLICATE fills the corners exposed by the rotation with the
    # nearest edge pixels instead of black.
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

# template matching, useful for finding specific parts of a document (if looking for specific symbol or marker)
def match_template(image, template):
    """Compare ``template`` against ``image`` with ``cv2.TM_CCOEFF_NORMED``.

    Args:
        image: Image to search in.
        template: Patch to look for.

    Returns:
        The result returned by ``cv2.matchTemplate``.
    """
    method = cv2.TM_CCOEFF_NORMED
    scores = cv2.matchTemplate(image, template, method)
    return scores

# Testing the preprocessing methods

Sample test of the preprocessing methods; this section can be skipped on later runs.

In [None]:
# Convert every page to grayscale. Assigning to the loop variable
# (`for image in images: image = ...`) only rebinds a local name and leaves the
# list untouched, so the converted pages are collected in a new list instead.
# `images` itself is left as loaded, so this optional section can be skipped.
gray_images = [get_grayscale(image) for image in images]

# Show the converted pages (not the original colour pages).
for gray_image in gray_images:
  cv2_imshow(gray_image)
#threshold = thresholding(grayscale)
#dilated = dilate(grayscale)
#eroded = erode(grayscale)
#dil_ero = opening(grayscale)
#skew = deskew(grayscale)
 

#cv2_imshow(grayscale)
#cv2_imshow(threshold)
#cv2_imshow(dil_ero)
#cv2_imshow(dilated)
#cv2_imshow(eroded)
#cv2_imshow(skew)

# Installing tesseract

Install and import tesseract libraries

In [None]:
!sudo apt install tesseract-ocr
!pip install pytesseract

In [None]:
import pytesseract
from pytesseract import Output

# Boxing Words

Test Tesseract's ability to locate words by drawing a bounding box around each recognized word.

In [None]:
# Draw a green box around every word Tesseract recognizes with confidence > 60.
for img in images:
  d = pytesseract.image_to_data(img, output_type=Output.DICT)
  # print(d.keys()) # see keys to reference info about the boxes

  # Draw on a copy: cv2.rectangle modifies its input in place, and `images`
  # is read again by the text-extraction cell, which would otherwise OCR
  # pages with boxes drawn over the words.
  boxed = img.copy()

  n_boxes = len(d['text'])
  for i in range(n_boxes):
    # float() accepts ints, floats and strings such as '96.5'; int() raises
    # ValueError on a string that contains a decimal point.
    if float(d['conf'][i]) > 60:
      (x, y, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])
      cv2.rectangle(boxed, (x, y), (x + w, y + h), (0, 255, 0), 2)

  # cv2_imshow renders inline, so no cv2.waitKey call is needed (that is for
  # HighGUI windows).
  cv2_imshow(boxed)

# Get string output



In [None]:
# Tesseract options: English language data, page segmentation mode 6
# (treat the page as a single uniform block of text).
custom_config = r'-l eng --psm 6'

# OCR every page and concatenate the results in page order.
page_texts = [
  pytesseract.image_to_string(page_image, config=custom_config)
  for page_image in images
]
text = "".join(page_texts)

print(text)

# Checking word frequencies
- Use NLTK's `word_tokenize` to split the OCR text into tokens


In [None]:
import nltk
from nltk import word_tokenize
from nltk import ngrams, FreqDist

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Tokenizer data used by word_tokenize. Newer NLTK releases load it from the
# 'punkt_tab' resource instead of 'punkt', so request both to keep this cell
# working across versions. On a release that does not know a resource name,
# nltk.download is expected to report that and return False rather than
# raise -- confirm if pinning a version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Split the OCR output into tokens (words and punctuation marks).
words = word_tokenize(text)
token_count = len(words)
print(token_count, "words:", words)

7759 words: ['SIAM', 'UJ', ']', 'Comput', '!', 'Vol', '1', 'No', ')', '2', ']', 'June', '1972', 'ROBERT', 'TARJANt', 'Abstract', ',', 'The', 'value', 'of', 'depth-first', 'search', 'on', '[', '``', 'backtracking/', '’', 'as', 'a', 'technique', 'for', 'solving', 'problems', 'is', 'illustrated', 'by|', 'two', 'examples', ']', 'An', 'improved', 'version', 'ofl', 'an', 'algorithm', 'for', 'finding', 'the', 'strongly', 'connected', 'components', 'ofl', 'a', 'directed', 'graph|', 'and', 'an', 'algorithm', 'for', 'finding', 'the', 'biconnected', 'components', 'of', 'an', 'un=', 'direct', 'graph', 'are', 'presented', ']', 'The', 'space', 'and', 'time', 'requirements', 'of', ')', 'both', 'algorithms', 'are', 'bounded', 'by', 'k', ',', 'V', '+', 'k', ',', 'B', '#', 'k', ';', 'for', 'some', 'constants', 'k', ',', ',k', ',', ',and', 'k3', ',', 'where', 'Vis', 'the', 'number', 'of', 'vertices', 'and', 'Eis', 'the', 'number', 'ofl', 'edges', 'ofthe', 'graph', 'being', 'examined', '!', '1', '!', 'Int

In [None]:
# compute the frequency distribution 
# Count how often each raw token occurs (case-sensitive, punctuation included).
mp_freqdist = FreqDist(words)
# The 25 most frequent (token, count) pairs are shown as the cell output.
top_tokens = mp_freqdist.most_common(25)
top_tokens

[(')', 352),
 ('the', 277),
 (']', 209),
 ('(', 198),
 ('is', 187),
 ('a', 177),
 (',', 172),
 ('!', 138),
 ('of', 130),
 ('and', 129),
 ('w', 110),
 ('v', 103),
 ('in', 101),
 ('G', 90),
 ('[', 80),
 (';', 70),
 ('be', 68),
 ('graph', 67),
 ('to', 67),
 ('on', 60),
 ('}', 59),
 ('edges', 54),
 ('connected', 50),
 ('search', 49),
 ('The', 48)]

In [None]:
# Next steps: normalize capitalization, strip punctuation, extract bigrams and trigrams, and support new input data types.
# Research approximate string matching.
# Filter by dates as well.


# Clean data before calculating Frequencies

In [None]:
# remove capitalization

# Lower-case every token so that e.g. 'The' and 'the' are counted together.
updated_text = list(map(str.lower, words))
print(updated_text[:100])

['siam', 'uj', ']', 'comput', '!', 'vol', '1', 'no', ')', '2', ']', 'june', '1972', 'robert', 'tarjant', 'abstract', ',', 'the', 'value', 'of', 'depth-first', 'search', 'on', '[', '``', 'backtracking/', '’', 'as', 'a', 'technique', 'for', 'solving', 'problems', 'is', 'illustrated', 'by|', 'two', 'examples', ']', 'an', 'improved', 'version', 'ofl', 'an', 'algorithm', 'for', 'finding', 'the', 'strongly', 'connected', 'components', 'ofl', 'a', 'directed', 'graph|', 'and', 'an', 'algorithm', 'for', 'finding', 'the', 'biconnected', 'components', 'of', 'an', 'un=', 'direct', 'graph', 'are', 'presented', ']', 'the', 'space', 'and', 'time', 'requirements', 'of', ')', 'both', 'algorithms', 'are', 'bounded', 'by', 'k', ',', 'v', '+', 'k', ',', 'b', '#', 'k', ';', 'for', 'some', 'constants', 'k', ',', ',k', ',']


In [None]:
# remove punctuation

# Keep only purely alphabetic tokens. Besides punctuation, this also drops
# numbers and tokens that contain any non-letter character (e.g. '1972' and
# 'depth-first' appear in the lower-cased list but not in this one).
final_text = list(filter(str.isalpha, updated_text))

print(final_text[:100])


['siam', 'uj', 'comput', 'vol', 'no', 'june', 'robert', 'tarjant', 'abstract', 'the', 'value', 'of', 'search', 'on', 'as', 'a', 'technique', 'for', 'solving', 'problems', 'is', 'illustrated', 'two', 'examples', 'an', 'improved', 'version', 'ofl', 'an', 'algorithm', 'for', 'finding', 'the', 'strongly', 'connected', 'components', 'ofl', 'a', 'directed', 'and', 'an', 'algorithm', 'for', 'finding', 'the', 'biconnected', 'components', 'of', 'an', 'direct', 'graph', 'are', 'presented', 'the', 'space', 'and', 'time', 'requirements', 'of', 'both', 'algorithms', 'are', 'bounded', 'by', 'k', 'v', 'k', 'b', 'k', 'for', 'some', 'constants', 'k', 'where', 'vis', 'the', 'number', 'of', 'vertices', 'and', 'eis', 'the', 'number', 'ofl', 'edges', 'ofthe', 'graph', 'being', 'examined', 'introduction', 'consider', 'a', 'graph', 'g', 'consisting', 'a', 'sef', 'ofivertices', 'vw', 'and']


In [None]:
# Check frequencies now

# Recount on the lower-cased, alphabetic-only tokens.
mp_freqdist = FreqDist(final_text)
# The 25 most frequent (token, count) pairs are shown as the cell output.
top_tokens = mp_freqdist.most_common(25)
top_tokens

[('the', 325),
 ('a', 187),
 ('is', 187),
 ('and', 133),
 ('of', 130),
 ('v', 114),
 ('w', 114),
 ('in', 105),
 ('g', 91),
 ('graph', 68),
 ('be', 68),
 ('to', 67),
 ('on', 61),
 ('search', 60),
 ('an', 54),
 ('edges', 54),
 ('connected', 51),
 ('vertex', 50),
 ('are', 48),
 ('by', 46),
 ('for', 43),
 ('algorithm', 43),
 ('vertices', 43),
 ('edge', 43),
 ('p', 43)]

# Remove stopwords

In [None]:
# Fetch NLTK's stopword corpus and load the English list.
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

# Show the words that will be removed as stopwords.
print(len(stop_words), "stopwords:", stop_words)

# Size and first tokens of the text before stopword removal, for comparison.
print('Length of original text:', len(final_text))
print(final_text[:100])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
179 stopwords: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'an

In [None]:
# remove the stopwords from the text
final_text = [word for word in final_text if word not in stop_words] #another regex
print(len(final_text), "without stopwords")

print(final_text[1:100])

3358 without stopwords
['uj', 'comput', 'vol', 'june', 'robert', 'tarjant', 'abstract', 'value', 'search', 'technique', 'solving', 'problems', 'illustrated', 'two', 'examples', 'improved', 'version', 'ofl', 'algorithm', 'finding', 'strongly', 'connected', 'components', 'ofl', 'directed', 'algorithm', 'finding', 'biconnected', 'components', 'direct', 'graph', 'presented', 'space', 'time', 'requirements', 'algorithms', 'bounded', 'k', 'v', 'k', 'b', 'k', 'constants', 'k', 'vis', 'number', 'vertices', 'eis', 'number', 'ofl', 'edges', 'ofthe', 'graph', 'examined', 'introduction', 'consider', 'graph', 'g', 'consisting', 'sef', 'ofivertices', 'vw', 'set', 'ofl', 'edges', 'graphl', 'may', 'directed', 'edges', 'ordered', 'pairs', 'w', 'ofivertices', 'j', 'v', 'tail', 'w', 'head', 'edge', 'undirected', 'edges', 'unordered', 'pairs', 'vertices', 'also', 'fepresented', 'v', 'w', 'graphs', 'form', 'suitable', 'many', 'areas', 'chemistry', 'sociology', 'example', 'thus', 'important', 'economical']


In [None]:
# Check frequencies now

# Recount after stopword removal.
mp_freqdist = FreqDist(final_text)
# The 25 most frequent (token, count) pairs are shown as the cell output.
top_tokens = mp_freqdist.most_common(25)
top_tokens

[('v', 114),
 ('w', 114),
 ('g', 91),
 ('graph', 68),
 ('search', 60),
 ('edges', 54),
 ('connected', 51),
 ('vertex', 50),
 ('algorithm', 43),
 ('vertices', 43),
 ('edge', 43),
 ('p', 43),
 ('tree', 43),
 ('let', 43),
 ('component', 39),
 ('directed', 31),
 ('strongly', 30),
 ('number', 30),
 ('u', 28),
 ('stack', 28),
 ('one', 27),
 ('components', 24),
 ('point', 24),
 ('numbered', 23),
 ('suppose', 22)]