<a href="https://colab.research.google.com/github/yoshighosh/OCR-Testing/blob/main/Final_OCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Preliminary Downloads

Tesseract Download

In [None]:
!sudo apt install tesseract-ocr
!pip install pytesseract

PDF file downloads

In [None]:
!pip install pdf2image
!apt-get install poppler-utils
!pip install python-poppler-qt5

PPTX file downloads

In [None]:
!pip install python-pptx

DOCX file downloads

In [None]:
!pip install python-docx

#Import Required Libraries

In [None]:
import cv2
import numpy as np
from google.colab.patches import cv2_imshow 
import pytesseract
from pytesseract import Output
import nltk
from nltk import word_tokenize
nltk.download('punkt')
from nltk import ngrams, FreqDist
import pandas as pd
import matplotlib.pyplot as plt
nltk.download('stopwords') 
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from pdf2image import convert_from_path
from IPython.display import Image
from pptx import Presentation
import glob
import docx

#Setting up OCR for each file type

Grayscale Function (Image and PDF)

In [None]:
def get_grayscale(image):
  return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

Image Files

In [None]:
def getImageText(filename):
  img = cv2.imread(filename)
  grayscale = get_grayscale(img)
  custom_config = r'-l eng --psm 6'
  text = pytesseract.image_to_string(grayscale, config=custom_config)
  return text

PDF Files

In [None]:
def getPDFText(filename):
  pages = convert_from_path(filename, 500)
  pagenumber = 1
  page_paths = []
  for page in pages:
    filename = "page"+ str(pagenumber) + ".jpg"
    pagenumber += 1
    page_paths.append(filename)
    page.save(filename, 'JPEG')
  images = []
  for page in page_paths:
    images.append(cv2.imread(page))
  for image in images:
    image = get_grayscale(image)
  custom_config = r'-l eng --psm 6'
  text = ""
  for grayscale in images:
    text += pytesseract.image_to_string(grayscale, config=custom_config)
  return text


PPTX Files

In [None]:
def getPPTXText(filename):
  for eachfile in glob.glob(filename):
      prs = Presentation(eachfile)
      fullText = []
      for slide in prs.slides:
          for shape in slide.shapes:
              if hasattr(shape, "text"):
                  fullText.append(shape.text)
  return '\n'.join(fullText)

DOCX Text

In [None]:
def getDOCXText(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return '\n'.join(fullText)

# Detecting file type

In [None]:
def detectFileType(filename):
  extension = filename[filename.rfind(".")+1:]
  return extension

def getFileName(filepath):
  filename = filepath[filepath.rfind("/")+1:]
  return filename


In [None]:
filepaths = ["/content/drive/MyDrive/Aroshi_highSchool/SEAP vounteer/pptx files/Leaf Disk Lab Poster (1).pptx", '/content/drive/MyDrive/Aroshi_highSchool/SEAP vounteer/image files/New Image.jpg', '/content/drive/MyDrive/Aroshi_highSchool/SEAP vounteer/image files/Picture_003.jpg', '/content/drive/MyDrive/Aroshi_highSchool/SEAP vounteer/image files/Picture_025.tif', '/content/drive/MyDrive/Aroshi_highSchool/SEAP vounteer/image files/Picture_029.tif', '/content/drive/MyDrive/Aroshi_highSchool/SEAP vounteer/pdf files/Depth-First Search and Linear Graph Algorithms - Tarjan (1972).pdf', '/content/drive/MyDrive/Aroshi_highSchool/SEAP vounteer/pdf files/Efficient String Matching - An Aid to Bibliographic Search - Aho-Corasick (1975).pdf', '/content/drive/MyDrive/Aroshi_highSchool/SEAP vounteer/docx files/SampleResume.docx']

text = {}

for filepath in filepaths:
  file_type = detectFileType(filepath)
  filename = getFileName(filepath)
  string_text = ""
  if file_type == "jpg":
    string_text = getImageText(filepath)
  elif file_type == "pdf":
    string_text = getPDFText(filepath)
  elif file_type == "pptx":
    string_text = getPPTXText(filepath)
  elif file_type == "docx":
    string_text = getDOCXText(filepath)
  text[filename] = string_text


In [51]:
for file in text:
  print(file)

Leaf Disk Lab Poster (1).pptx
New Image.jpg
Picture_003.jpg
Picture_025.tif
Picture_029.tif
Depth-First Search and Linear Graph Algorithms - Tarjan (1972).pdf
Efficient String Matching - An Aid to Bibliographic Search - Aho-Corasick (1975).pdf
SampleResume.docx


#Preprocessing text

In [52]:
def preprocessing(text):
  words = word_tokenize(text)
  updated_text = [word.lower() for word in words]
  final_text = [x for x in updated_text if x.isalpha()]
  stop_words = stopwords.words('english')
  final_text = [word for word in final_text if word not in stop_words]
  lemmatizer = WordNetLemmatizer()
  for word in final_text:
    word = lemmatizer.lemmatize(word)
  final_string = " ".join(final_text)
  return final_string

In [53]:
for file in text:
  text[file] = preprocessing(text[file])

In [54]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([text[file] for file in text])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)

display(df)

Unnamed: 0,ab,abdomen,abilities,able,absence,abstract,abstraction,ac,academic,acceplinge,accept,access,accomplishment,accuracy,accurate,achieved,ackerman,ackley,acknowledgements,acm,acomponent,acquainted,across,action,actions,active,activities,actly,actual,ad,add,added,adding,addition,additional,additionally,address,addressed,addressee,addresses,...,worked,working,works,workshop,workshops,world,worse,worst,would,write,writing,wrote,ws,wt,wu,wv,ww,wy,xn,xy,yamada,year,yearly,years,yet,yi,yis,yk,yn,yo,york,yorktown,young,youth,yr,yw,yy,zero,zo,zoom
0,0.0,0.0,0.0,0.015358,0.0,0.026697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021236,0.0,0.0,0.0,0.0,0.023037,0.030716,0.017798,0.035595,0.0,0.021236,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015358,0.0,0.021236
1,0.0,0.0,0.0,0.026886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062313,0.037176,0.037176,0.0,...,0.031157,0.0,0.0,0.0,0.0,0.031157,0.0,0.0,0.047146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.028629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023993,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.007995,0.00477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011992,0.00477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003997,0.0,0.013798,0.003449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.023849,0.0,0.0,0.0,0.0,0.0,0.003024,0.0,0.0,0.0,0.0,0.00477,0.003997,0.007995,0.00477,0.0,0.00477,0.00477,0.0,0.0,0.0,0.0,0.023985,0.0,0.0,0.0,0.0,0.0,0.006899,0.00477,0.0,0.0,0.0,0.00477,0.0,0.006899,0.00477,0.0
6,0.0,0.0,0.0,0.006222,0.004302,0.0,0.0,0.004302,0.0,0.004302,0.004302,0.004302,0.0,0.0,0.0,0.004302,0.004302,0.004302,0.004302,0.05408,0.0,0.004302,0.0,0.0,0.004302,0.0,0.0,0.004302,0.003605,0.004302,0.015555,0.015555,0.018027,0.003605,0.003605,0.0,0.0,0.0,0.0,0.004302,...,0.0,0.0,0.0,0.0,0.0,0.0,0.004302,0.004302,0.027277,0.0,0.0,0.0,0.004302,0.0,0.018027,0.003605,0.0,0.004302,0.0,0.0,0.004302,0.0,0.0,0.003605,0.003605,0.008604,0.008604,0.004302,0.004302,0.004302,0.003111,0.0,0.0,0.0,0.004302,0.0,0.008604,0.006222,0.0,0.0
7,0.022591,0.0,0.022591,0.0,0.0,0.0,0.0,0.0,0.045182,0.0,0.0,0.0,0.022591,0.022591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022591,0.067773,0.0,0.0,0.045182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018933,0.0,0.0,0.0,0.0,0.0,...,0.056799,0.045182,0.0,0.067773,0.090364,0.094665,0.0,0.0,0.0,0.022591,0.067773,0.067773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022591,0.0,0.018933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022591,0.180728,0.0,0.0,0.0,0.0,0.0,0.0
