In [None]:
# import the necessary packages
from transform import four_point_transform
from skimage.filters import threshold_local
import numpy as np
import argparse
import cv2
import imutils
import matplotlib.pyplot as plt
import pytesseract

### Edge Detection

In [None]:
# load the source photograph from disk
image = cv2.imread("page4.jpg")
# cv2.imread reports an unreadable/missing file by returning None rather than
# raising, so fail here with a clear message instead of on a later `image.shape`
if image is None:
    raise FileNotFoundError("could not read image file 'page4.jpg'")

In [None]:
#plt.figure(figsize=(10, 20))
#imgplot = plt.imshow(image)

In [None]:
# keep a full-resolution copy, record how many times taller it is than the
# 1000px working height, then shrink the working image to that height
orig = image.copy()
ratio = orig.shape[0] / 1000
image = imutils.resize(orig, height=1000)

In [None]:
# grayscale -> 5x5 Gaussian blur -> Canny edge map (thresholds 75 and 200)
gray = cv2.GaussianBlur(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), (5, 5), 0)
edged = cv2.Canny(gray, 75, 200)

In [None]:
#plt.figure(figsize=(10, 20))
#imgplot = plt.imshow(edged)

In [None]:
# show the resized working image and its edge map in two OpenCV windows
cv2.imshow("Image", image)
cv2.imshow("Edged", edged)
# wait for a key press (0 = no timeout), then close every window
cv2.waitKey(0)
cv2.destroyAllWindows()

### Find edges of pages (contours)

In [None]:
# collect every contour in the edge map and keep the five with the
# largest area, biggest first
cnts = imutils.grab_contours(
    cv2.findContours(edged.copy(), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
)
cnts = sorted(cnts, key=cv2.contourArea, reverse=True)[:5]

In [None]:
# look for the page outline among the candidate contours (largest first)
# start from None so a failed search cannot silently reuse a stale contour
screenCnt = None
for c in cnts:
    # approximate the contour with a tolerance of 2% of its perimeter
    peri = cv2.arcLength(c, True)
    approx = cv2.approxPolyDP(c, 0.02 * peri, True)

    # if our approximated contour has 4 points, then we can assume we have found the page
    if len(approx) == 4:
        screenCnt = approx
        break

# without this check the failure only surfaces later as a NameError
if screenCnt is None:
    raise RuntimeError("no four-point contour found; cannot locate the page outline")

In [None]:
# show the contour (outline) of the piece of paper
# drawContours paints directly into the array it is given, so draw on a copy:
# the working image stays clean for later cells and this cell can be re-run
outlined = image.copy()
cv2.drawContours(outlined, [screenCnt], -1, (0, 255, 0), 2)
cv2.imshow("Outline", outlined)
cv2.waitKey(0)
cv2.destroyAllWindows()

### Threshold

In [None]:
# apply the perspective transform to get a top-down view of the page.
# The contour was found on the resized image, so scale its four corners back
# up by `ratio` and warp the full-resolution original.
# NOTE(review): assumes transform.four_point_transform(image, pts) takes a
# (4, 2) array of corner points — confirm against the local module.
warped = four_point_transform(orig, screenCnt.reshape(4, 2) * ratio)

# convert the warped image to grayscale, then threshold it
# to give it that 'black and white' paper effect
warped = cv2.cvtColor(warped, cv2.COLOR_BGR2GRAY)
T = threshold_local(warped, 11, offset = 10, method = "gaussian")
warped = (warped > T).astype("uint8") * 255

In [None]:
# show the original and scanned images
print("STEP 3: Apply perspective transform")
cv2.imshow("Original", imutils.resize(orig, height = 650))
cv2.imshow("Scanned", imutils.resize(warped, height = 650))
cv2.waitKey(0)
# close the windows after the key press, as the other display cells do
cv2.destroyAllWindows()

### PyTesseract

In [None]:
# get grayscale image
def get_grayscale(image):
    """Convert a BGR image to grayscale and return the result."""
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return grayscale

# noise removal
def remove_noise(image):
    """Return ``image`` after a median blur with aperture size 5."""
    aperture = 5
    return cv2.medianBlur(image, aperture)
 
#thresholding
def thresholding(image):
    """Binarise ``image`` with Otsu thresholding and return only the image.

    ``cv2.threshold`` returns a pair; its second element (the thresholded
    image) is what this helper hands back.
    """
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, binary = cv2.threshold(image, 0, 255, mode)
    return binary

#dilation
def dilate(image):
    """Dilate ``image`` once with a 5x5 all-ones uint8 kernel."""
    structuring_element = np.full((5, 5), 1, dtype=np.uint8)
    return cv2.dilate(image, structuring_element, iterations=1)
    
#erosion
def erode(image):
    """Erode ``image`` once with a 5x5 all-ones uint8 kernel."""
    structuring_element = np.full((5, 5), 1, dtype=np.uint8)
    return cv2.erode(image, structuring_element, iterations=1)

#opening - erosion followed by dilation
def opening(image):
    """Apply a morphological opening to ``image`` with a 5x5 all-ones uint8 kernel."""
    structuring_element = np.full((5, 5), 1, dtype=np.uint8)
    opened = cv2.morphologyEx(image, cv2.MORPH_OPEN, structuring_element)
    return opened

#canny edge detection
def canny(image):
    """Return the Canny edge map of ``image`` using thresholds 100 and 200."""
    lower, upper = 100, 200
    return cv2.Canny(image, lower, upper)

#skew correction
def deskew(image):
    """Rotate ``image`` about its centre to undo the skew of its non-zero pixels.

    The skew is read from the angle of the minimum-area rectangle enclosing
    every pixel whose value is greater than zero.

    Returns an image of the same width and height, rotated with cubic
    interpolation and replicated borders. When ``image`` has no non-zero
    pixels there is nothing to measure and an unrotated copy is returned.
    """
    coords = np.column_stack(np.where(image > 0))
    if coords.size == 0:
        # minAreaRect cannot be computed for an empty point set
        return image.copy()
    angle = cv2.minAreaRect(coords)[-1]
    # Older OpenCV releases report this angle in [-90, 0); newer ones
    # (reportedly 4.5.1+) report it in (0, 90]. Fold both conventions into the
    # same small correction so an upright page is not turned by 90 degrees.
    if angle < -45:
        angle = -(90 + angle)
    elif angle >= 45:
        angle = 90 - angle
    else:
        angle = -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

#template matching
def match_template(image, template):
    """Match ``template`` against ``image`` with ``cv2.TM_CCOEFF_NORMED`` and return the result."""
    method = cv2.TM_CCOEFF_NORMED
    scores = cv2.matchTemplate(image, template, method)
    return scores

In [None]:
img = cv2.imread('page6.jpg')
# cv2.imread returns None for a missing/unreadable file instead of raising
if img is None:
    raise FileNotFoundError("could not read image file 'page6.jpg'")

# Adding custom options
custom_config = r'--oem 3 --psm 6'

# preprocessing chain: grayscale -> median blur -> erode -> dilate -> deskew -> Otsu threshold
gray = get_grayscale(img)
no_noise = remove_noise(gray)
eroded = erode(no_noise)
dilated = dilate(eroded)
# NOTE(review): deskew() measures pixels > 0, but here it receives a grayscale
# image that has not been thresholded yet — confirm this ordering is intended
deskewed = deskew(dilated)
thresh = thresholding(deskewed)

In [None]:
# run Tesseract on the thresholded page; the returned value is shown as the cell output
pytesseract.image_to_string(thresh, config=custom_config)

In [None]:
# write the thresholded page to disk as a PNG
filename = "test.png"
cv2.imwrite(filename, thresh)

In [None]:
from BYOD import extractyourown

In [None]:
# run the extractyourown helper imported from BYOD on the same page
# NOTE(review): presumably returns the OCR text as a str (it is .split() below) — confirm
text = extractyourown("page6.jpg")

In [None]:
# display the value returned by extractyourown
text

In [None]:
# number of whitespace-separated tokens in the extracted text
len(text.split())

***

### Building the API

In [51]:
# import the necessary packages
import cv2
import numpy as np
import pytesseract
import re
import requests
import io
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

In [52]:
# get grayscale image
def get_grayscale(image):
    """Convert a BGR image to grayscale and return the result."""
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return grayscale

# noise removal
def remove_noise(image):
    """Return ``image`` after a median blur with aperture size 5."""
    aperture = 5
    return cv2.medianBlur(image, aperture)
 
#thresholding
def thresholding(image):
    """Binarise ``image`` with Otsu thresholding and return only the image.

    ``cv2.threshold`` returns a pair; its second element (the thresholded
    image) is what this helper hands back.
    """
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, binary = cv2.threshold(image, 0, 255, mode)
    return binary

#dilation
def dilate(image):
    """Dilate ``image`` once with a 5x5 all-ones uint8 kernel."""
    structuring_element = np.full((5, 5), 1, dtype=np.uint8)
    return cv2.dilate(image, structuring_element, iterations=1)
    
#erosion
def erode(image):
    """Erode ``image`` once with a 5x5 all-ones uint8 kernel."""
    structuring_element = np.full((5, 5), 1, dtype=np.uint8)
    return cv2.erode(image, structuring_element, iterations=1)

#skew correction
def deskew(image):
    """Rotate ``image`` about its centre to undo the skew of its non-zero pixels.

    The skew is read from the angle of the minimum-area rectangle enclosing
    every pixel whose value is greater than zero.

    Returns an image of the same width and height, rotated with cubic
    interpolation and replicated borders. When ``image`` has no non-zero
    pixels there is nothing to measure and an unrotated copy is returned.
    """
    coords = np.column_stack(np.where(image > 0))
    if coords.size == 0:
        # minAreaRect cannot be computed for an empty point set
        return image.copy()
    angle = cv2.minAreaRect(coords)[-1]
    # Older OpenCV releases report this angle in [-90, 0); newer ones
    # (reportedly 4.5.1+) report it in (0, 90]. Fold both conventions into the
    # same small correction so an upright page is not turned by 90 degrees.
    if angle < -45:
        angle = -(90 + angle)
    elif angle >= 45:
        angle = 90 - angle
    else:
        angle = -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

In [53]:
# path of the page image that the cells below read and OCR
file = "page6.jpg"

In [54]:
# read file
img = cv2.imread(file)
# cv2.imread returns None for a missing/unreadable file instead of raising
if img is None:
    raise FileNotFoundError(f"could not read image file {file!r}")
# Adding custom options
custom_config = r'--oem 3 --psm 6'
# process image: grayscale -> median blur -> erode -> dilate -> deskew -> Otsu threshold
gray = get_grayscale(img)
no_noise = remove_noise(gray)
eroded = erode(no_noise)
dilated = dilate(eroded)
# NOTE(review): deskew() measures pixels > 0, but here it receives a grayscale
# image that has not been thresholded yet — confirm this ordering is intended
deskewed = deskew(dilated)
thresh = thresholding(deskewed)

In [62]:
# run OCR on the thresholded page
text = pytesseract.image_to_string(thresh, config=custom_config)
# flatten to a single paragraph: blank lines become one space, words
# hyphenated across a line break are re-joined, remaining newlines become spaces
text = text.replace("\n\n", " ").replace("-\n", "").replace("\n", " ")
print(text)

finer is one of the fastest-growing companies of all time. Its name comes from “group coupons,” an ingenious idea that has spawned an entire industry of social commerce imitators. However, it didnt start out successful. When customers took Groupon up on its first deal, a whopping twenty people bought two-for-one pizza in a restaurant on the first floor of the company’s Chicago offices—hardly a world-changing event. In fact, Groupon wasn't originally meant to be about commerce at all. The founder, Andrew Mason, intended his company to become a “collective activism platform” called The Point. Its goal was to bring people together to solve problems they couldn’t solve on their own, such as fund-raising for a cause or boycotting a certain retailer. The Point's early results were disappointing, however, and at the end of 2008 the founders decided to try something new. Although they still had grand ambitions, they were determined to keep the new product simple. They built a minimum viable pr

In [64]:
from wisdomaiengine import summarisepdfdocument

In [65]:
# pass the cleaned OCR text to summarisepdfdocument (imported from wisdomaiengine)
summary = summarisepdfdocument(text)

In [66]:
# display the value returned by summarisepdfdocument
summary

['When customers took Groupon up on its first deal, a whopping twenty people bought two-for-one pizza in a restaurant on the first floor of the company’s Chicago offices—hardly a world-changing event.',
 'The founder, Andrew Mason, intended his company to become a “collective activism platform” called The Point.',
 'Its goal was to bring people together to solve problems they couldn’t solve on their own, such as fund-raising for a cause or boycotting a certain retailer.',
 "The Point's early results were disappointing, however, and at the end of 2008 the founders decided to try something new.",
 'Mason tells the story: We took a WordPress Blog and we skinned it to say Groupon and then every day we would do a new post.']

In [55]:
# write the thresholded page to disk as a PNG; the trailing semicolon
# suppresses the cell output of imwrite's return value
filename = "test.png"
cv2.imwrite(filename, thresh);

In [56]:
import os

import img2pdf

# swap the image extension for ".pdf"; splitext strips only the final
# extension, so dots elsewhere in the path or file name are left alone
pdf_name = os.path.splitext(filename)[0] + ".pdf"
with open(pdf_name, "wb") as f:
    f.write(img2pdf.convert(filename))

In [57]:
# extract whatever text pdfminer can find in the PDF written by the previous cell
rsrcmgr = PDFResourceManager()
codec = 'utf-8'
laparams = LAParams()
caching = True

retstr = io.StringIO()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
try:
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # read the file that was actually generated (pdf_name), and let the
    # context manager close it even if a page fails to parse
    with open(pdf_name, 'rb') as fp:
        for page in PDFPage.get_pages(fp, caching=caching, check_extractable=True):
            interpreter.process_page(page)

    text = retstr.getvalue()
finally:
    # release the converter and the buffer on both success and failure
    device.close()
    retstr.close()

In [58]:
# display the text pdfminer extracted; in the recorded run this was only '\x0c'
# NOTE(review): presumably because the PDF wraps a raster image with no text layer — confirm
text

'\x0c'

In [60]:
from tika import parser

In [61]:
# try the same PDF with tika's parser
# NOTE: in the recorded run this call raised "RuntimeError: Unable to start Tika server."
raw = parser.from_file('test.pdf')

2020-01-30 12:09:54,028 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.23/tika-server-1.23.jar.md5 to /var/folders/f8/nj2cglwj7_1938gn9wydpwzh0000gn/T/tika-server.jar.md5.
INFO:tika.tika:Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.23/tika-server-1.23.jar.md5 to /var/folders/f8/nj2cglwj7_1938gn9wydpwzh0000gn/T/tika-server.jar.md5.
2020-01-30 12:09:54,407 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.23/tika-server-1.23.jar to /var/folders/f8/nj2cglwj7_1938gn9wydpwzh0000gn/T/tika-server.jar.
INFO:tika.tika:Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.23/tika-server-1.23.jar to /var/folders/f8/nj2cglwj7_1938gn9wydpwzh0000gn/T/tika-server.jar.
2020-01-30 12:10:12,235 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2020-01-30 12:10:17,243 [MainThread  ] 

RuntimeError: Unable to start Tika server.

In [44]:
# from fpdf import FPDF
# pdf = FPDF()
# imagelist = [filename]
# # imagelist is the list with all image filenames
# for image in imagelist:
#     pdf.add_page()
#     pdf.image(image,x,y,w,h)
# pdf.output("yourfile.pdf", "F")