In [None]:
# import the necessary packages
#from transform import four_point_transform
from skimage.filters import threshold_local
import numpy as np
import argparse
import cv2
#import imutils
import matplotlib.pyplot as plt
import pytesseract

### Edge Detection

In [None]:
# load the page photograph used by the edge-detection cells below
image = cv2.imread("page4.jpg")
# cv2.imread reports a missing or unreadable file by returning None instead of
# raising, so stop here with a clear message rather than on the first use of `image`
if image is None:
    raise FileNotFoundError("could not read image file: page4.jpg")

In [None]:
#plt.figure(figsize=(10, 20))
#imgplot = plt.imshow(image)

In [None]:
# remember how much the working image is shrunk relative to the original and keep
# a full-resolution copy before resizing
ratio = image.shape[0] / 1000
orig = image.copy()
# `imutils` is not imported in this notebook (its import is commented out), so the
# aspect-preserving resize to a height of 1000 px is done with cv2 directly
new_width = int(image.shape[1] * (1000 / float(image.shape[0])))
image = cv2.resize(image, (new_width, 1000), interpolation=cv2.INTER_AREA)

In [None]:
# grayscale the working image and smooth it with a 5x5 Gaussian kernel
gray = cv2.GaussianBlur(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), (5, 5), 0)
# Canny edge map using thresholds 75 and 200
edged = cv2.Canny(gray, 75, 200)

In [None]:
#plt.figure(figsize=(10, 20))
#imgplot = plt.imshow(edged)

In [None]:
# open one window per image (resized input and its edge map), wait for a key
# press, then close every window
for window_title, window_image in (("Image", image), ("Edged", edged)):
    cv2.imshow(window_title, window_image)
cv2.waitKey(0)
cv2.destroyAllWindows()

### Find edges of pages (contours)

In [None]:
# find the contours in the edge map and keep only the five with the largest area
cnts = cv2.findContours(edged.copy(), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
# `imutils` is not imported in this notebook, so pick the contour list out of the
# return value by hand: findContours returns either (contours, hierarchy) or
# (image, contours, hierarchy) depending on the OpenCV version
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
cnts = sorted(cnts, key=cv2.contourArea, reverse=True)[:5]

In [None]:
# loop over the candidate contours looking for the page outline
# reset first so a value left over from an earlier run is never reused
screenCnt = None
for c in cnts:
    # approximate the contour by a polygon; tolerance is 2% of its perimeter
    peri = cv2.arcLength(c, True)
    approx = cv2.approxPolyDP(c, 0.02 * peri, True)

    # if the approximated contour has 4 points, assume we have found the page
    if len(approx) == 4:
        screenCnt = approx
        break
else:
    # the loop finished without a break: none of the candidates was a quadrilateral
    print("No 4-point contour found; screenCnt is None")

In [None]:
# draw the detected outline onto the working image (colour (0, 255, 0), 2 px thick)
# and display the result until a key is pressed
cv2.drawContours(image, contours=[screenCnt], contourIdx=-1, color=(0, 255, 0), thickness=2)
cv2.imshow("Outline", image)
cv2.waitKey(0)
cv2.destroyAllWindows()

### Threshold

In [None]:
# grayscale the working image, then binarise it with a local (adaptive) threshold
# to give it a 'black and white' paper look
# NOTE: no perspective transform is applied in this cell; `warped` is derived
# directly from `image`
warped = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# per-pixel threshold surface: Gaussian method, block size 11, offset 10
T = threshold_local(warped, 11, method="gaussian", offset=10)
# pixels above their local threshold become 255, all others 0
warped = np.where(warped > T, 255, 0).astype("uint8")

In [None]:
# show the original and scanned images
def _resize_to_height(img, height):
    """Return `img` scaled to `height` pixels tall, keeping its aspect ratio.

    Stands in for `imutils.resize`, which is not imported in this notebook.
    """
    scale = height / float(img.shape[0])
    return cv2.resize(img, (int(img.shape[1] * scale), height), interpolation=cv2.INTER_AREA)


print("STEP 3: Apply perspective transform")
cv2.imshow("Original", _resize_to_height(orig, 650))
cv2.imshow("Scanned", _resize_to_height(warped, 650))
cv2.waitKey(0)
# close the windows again, as the other display cells in this notebook do
cv2.destroyAllWindows()

### PyTesseract

In [None]:
# get grayscale image
def get_grayscale(image):
    """Convert a BGR `image` to grayscale with cv2.cvtColor and return it."""
    grey = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return grey

# noise removal
def remove_noise(image):
    """Return `image` smoothed by a median blur with aperture size 5."""
    aperture = 5
    return cv2.medianBlur(image, aperture)
 
#thresholding
def thresholding(image):
    """Binarise `image` with cv2.threshold using the BINARY + OTSU flags.

    Only the thresholded image is returned; the threshold value that
    cv2.threshold also reports is discarded.
    """
    _level, binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    return binary

#dilation
def dilate(image):
    """Dilate `image` once with a 5x5 all-ones uint8 kernel."""
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    return cv2.dilate(image, structuring_element, iterations=1)
    
#erosion
def erode(image):
    """Erode `image` once with a 5x5 all-ones uint8 kernel."""
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    return cv2.erode(image, structuring_element, iterations=1)

#opening - erosion followed by dilation
def opening(image):
    """Apply a morphological opening (cv2.MORPH_OPEN) with a 5x5 all-ones kernel."""
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    opened = cv2.morphologyEx(image, cv2.MORPH_OPEN, structuring_element)
    return opened

#canny edge detection
def canny(image):
    """Return the Canny edge map of `image` using thresholds 100 and 200."""
    low_threshold, high_threshold = 100, 200
    return cv2.Canny(image, low_threshold, high_threshold)

#skew correction
def deskew(image):
    """Rotate `image` about its centre by the skew angle of its non-zero pixels.

    The angle is taken from the minimum-area rectangle that cv2.minAreaRect fits
    around the coordinates of every pixel with a value > 0.

    Returns a new array of the same width and height as `image`. An image with
    no non-zero pixel is returned as an unrotated copy.
    """
    coords = np.column_stack(np.where(image > 0))
    # nothing to measure on an all-zero image: return an unrotated copy instead of
    # asking cv2.minAreaRect for the rectangle of an empty point set
    if coords.shape[0] == 0:
        return image.copy()
    angle = cv2.minAreaRect(coords)[-1]
    # NOTE(review): this normalisation treats the angle as lying in [-90, 0);
    # confirm that the installed OpenCV version reports angles in that range
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    # cubic interpolation; pixels exposed by the rotation replicate the border
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

#template matching
def match_template(image, template):
    """Return the cv2.matchTemplate result for `template` in `image` (TM_CCOEFF_NORMED)."""
    scores = cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED)
    return scores

In [None]:
img = cv2.imread('page6.jpg')
# cv2.imread reports a missing or unreadable file by returning None instead of
# raising, so stop here with a clear message rather than inside the pipeline
if img is None:
    raise FileNotFoundError("could not read image file: page6.jpg")

# Adding custom options
custom_config = r'--oem 3 --psm 6'

# preprocessing chain; each step feeds the next
gray = get_grayscale(img)
no_noise = remove_noise(gray)
eroded = erode(no_noise)
dilated = dilate(eroded)
deskewed = deskew(dilated)
thresh = thresholding(deskewed)

In [None]:
# run Tesseract on the thresholded page with the options in `custom_config`;
# the recognised text is shown as the cell output
pytesseract.image_to_string(thresh, config=custom_config)

In [None]:
# write the current `thresh` image to test.png
filename = "test.png"
cv2.imwrite(filename, thresh)

In [None]:
from BYOD import extractyourown

In [None]:
# run extractyourown() from the BYOD module on the page image
# (presumably returns the extracted text as a string; it is split into words below -- confirm)
text = extractyourown("page6.jpg")

In [None]:
# display the value returned by extractyourown()
text

In [None]:
# number of whitespace-separated tokens in `text`
len(text.split())

***

### Building the API

In [None]:
# import the necessary packages
import cv2
import numpy as np
import re
import requests
import io
from easy_ocr import ocr_image
import numpy as np
import argparse
import cv2
#import imutils
import pytesseract
from pytesseract import Output

In [None]:
# get grayscale image
def get_grayscale(image):
    """Convert a BGR `image` to grayscale with cv2.cvtColor and return it."""
    grey = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return grey

# noise removal
def remove_noise(image):
    """Return `image` smoothed by a median blur with aperture size 5."""
    aperture = 5
    return cv2.medianBlur(image, aperture)
 
#thresholding
def thresholding(image):
    """Binarise `image` with cv2.threshold using the BINARY + OTSU flags.

    Only the thresholded image is returned; the threshold value that
    cv2.threshold also reports is discarded.
    """
    _level, binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    return binary

#dilation
def dilate(image):
    """Dilate `image` once with a 5x5 all-ones uint8 kernel."""
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    return cv2.dilate(image, structuring_element, iterations=1)
    
#erosion
def erode(image):
    """Erode `image` once with a 5x5 all-ones uint8 kernel."""
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    return cv2.erode(image, structuring_element, iterations=1)

#skew correction
def deskew(image):
    """Rotate `image` about its centre by the skew angle of its non-zero pixels.

    The angle is taken from the minimum-area rectangle that cv2.minAreaRect fits
    around the coordinates of every pixel with a value > 0.

    Returns a new array of the same width and height as `image`. An image with
    no non-zero pixel is returned as an unrotated copy.
    """
    coords = np.column_stack(np.where(image > 0))
    # nothing to measure on an all-zero image: return an unrotated copy instead of
    # asking cv2.minAreaRect for the rectangle of an empty point set
    if coords.shape[0] == 0:
        return image.copy()
    angle = cv2.minAreaRect(coords)[-1]
    # NOTE(review): this normalisation treats the angle as lying in [-90, 0);
    # confirm that the installed OpenCV version reports angles in that range
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    # cubic interpolation; pixels exposed by the rotation replicate the border
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

In [None]:
# path of the page image processed by the cells below
file = "page2.jpg"

In [None]:
# read file
img = cv2.imread(file)
# cv2.imread reports a missing or unreadable file by returning None instead of
# raising, so stop here with a clear message rather than inside the pipeline
if img is None:
    raise FileNotFoundError("could not read image file: " + str(file))
# Adding custom options
custom_config = r'--oem 3 --psm 6'
# process image; each step feeds the next
gray = get_grayscale(img)
no_noise = remove_noise(gray)
eroded = erode(no_noise)
dilated = dilate(eroded)
deskewed = deskew(dilated)
thresh = thresholding(deskewed)

In [None]:
# write the thresholded image to test.png (the trailing ';' hides the cell output)
cv2.imwrite("test.png", thresh);

In [None]:
# show the array dimensions of the thresholded image
thresh.shape

In [None]:
# Invert the binary image in place (0 <-> 255).
# One arithmetic pass replaces the three-step swap through the sentinel value 128,
# which would also have turned any pixel already equal to 128 into 0.
thresh[...] = 255 - thresh

In [None]:
# overwrite test.png with the current state of `thresh`
cv2.imwrite("test.png", thresh);

In [None]:
# image dimensions and the (rounded) coordinates of the image centre
height, width = thresh.shape[0], thresh.shape[1]
middle_column, middle_row = round(width / 2), round(height / 2)

In [None]:
# left
# Walk column by column from the middle of the image towards the left edge.
# In each column, `gap` counts consecutive zero-valued pixels from the middle row
# downwards; the first non-zero pixel copies that count into `tracker` and resets
# `gap`. The scan stops at the start of the first column visited after `tracker`
# has exceeded 1000, and the column index reached is printed.
left = middle_column
gap = 0
cnt = 0  # never modified below, so the loop condition is effectively `0 < left`
tracker = 0
exit = False  # NOTE(review): shadows the builtin `exit` for the rest of the session
while cnt < left:
    for i in range(middle_row, height):
        if tracker > 1000:
            exit = True
            break
        elif thresh[i][left] == 0:
            gap += 1
        else:
            tracker = gap
            gap = 0
            break
    # NOTE(review): when a column is zero all the way down, the for loop ends without
    # resetting `gap`, so the count carries over into the next column -- confirm intended
    if exit == True:
        break
    else:
        #print(left, " - ", tracker)
        left -= 1
print(left)

In [None]:
# right
# Walk column by column from the middle of the image towards the right edge.
# In each column, `gap` counts consecutive zero-valued pixels from the middle row
# downwards; the first non-zero pixel copies that count into `tracker` and resets
# `gap`. The scan stops at the start of the first column visited after `tracker`
# has exceeded 1000 (or when `right` reaches `width`), and the index is printed.
right = middle_column
gap = 0
tracker = 0
exit = False  # NOTE(review): shadows the builtin `exit` for the rest of the session
while right < width:
    for i in range(middle_row, height):
        if tracker > 1000:
            exit = True
            break
        elif thresh[i][right] == 0:
            gap += 1
        else:
            tracker = gap
            gap = 0
            break
    # NOTE(review): when a column is zero all the way down, the for loop ends without
    # resetting `gap`, so the count carries over into the next column -- confirm intended
    if exit == True:
        break
    else:
        #print(right, " - ", tracker)
        right += 1
print(right)

In [None]:
# widen the detected column range by 50 px on each side where there is room,
# then crop those columns out of the full-height image and save the result
left = left - 50 if left > 50 else left
right = right + 50 if right < width - 50 else right
crop_img = thresh[:height, left:right]
cv2.imwrite("test.png", crop_img);

In [None]:
# top
# Walk upwards from the middle row of `crop_img`, keeping recent row sums in `buffer`;
# stop at the first row where the buffer average divided by the crop width is below 0.1.
# NOTE(review): the `top` found here appears to be superseded by the following cell,
# which assigns `top` again before it is used -- confirm whether this cell is still needed.
top = middle_row
exit = False  # NOTE(review): shadows the builtin `exit` for the rest of the session
buffer = []
buffer_size = 20
maxi = crop_img.shape[1]  # number of columns in the crop
avg = 0
while top > 0:
    if len(buffer) != buffer_size:
        # still filling: no average is evaluated until `buffer_size` rows are collected
        # NOTE(review): builtin sum() adds the pixels of a row one by one; if the row
        # dtype is uint8 this may wrap around on newer NumPy versions -- verify
        buffer.append(sum(crop_img[top]))
    else:
        # shift every entry one slot towards the end (the last entry is dropped) and
        # store the newest row sum at index 0
        idx = buffer_size-1
        while idx > 0:
            buffer[idx] = buffer[idx-1]
            idx -= 1
        buffer[0] = sum(crop_img[top])
        
        avg = sum(buffer)/buffer_size
        #print(avg)
        if avg/maxi < 0.1:
            #print("done")
            #print(top, " - ", avg, maxi, avg/maxi)
            exit = True
    if exit == True:
        break
    else:
        #print(top, " - ", avg)
        top -= 1
print(top)

In [None]:
# top
# Find the first all-zero row of `crop_img`, ignoring row 0 and the last row;
# its index becomes the upper crop line.
end = crop_img.shape[0]
# Sum each row with NumPy rather than the builtin sum(): adding uint8 pixels one at
# a time can wrap around (NumPy >= 2 keeps the uint8 dtype), which could make a
# non-empty row look empty. ndarray.sum() accumulates in a wider integer type.
row_sums = crop_img[:end - 1].sum(axis=1)
empty_rows = np.flatnonzero(row_sums == 0)
# row 0 was never a candidate in the original bookkeeping; keep that behaviour
empty_rows = empty_rows[empty_rows > 0]
# fall back to 0 (no cropping at the top) instead of raising when no empty row exists
top = int(empty_rows[0]) if empty_rows.size else 0
print(top)

In [None]:
# crop `thresh` to rows top..height and columns left..right, then save it to test.png
crop_img = thresh[top:height, left:right]
cv2.imwrite("test.png", crop_img);

In [None]:
# bottom
# Find the last all-zero row of `crop_img` (the final row is not examined);
# its index becomes the lower crop line.
end = crop_img.shape[0]
# Sum each row with NumPy rather than the builtin sum(): adding uint8 pixels one at
# a time can wrap around (NumPy >= 2 keeps the uint8 dtype), which could make a
# non-empty row look empty. ndarray.sum() accumulates in a wider integer type.
row_sums = crop_img[:end - 1].sum(axis=1)
empty_rows = np.flatnonzero(row_sums == 0)
# With no usable empty row the original produced 0, which makes a `[:bottom]` crop
# empty; keep the full height instead.
if empty_rows.size and empty_rows[-1] > 0:
    bottom = int(empty_rows[-1])
else:
    bottom = end
print(bottom)

In [None]:
# keep only the rows above `bottom`, then save the crop to test.png
crop_img = crop_img[:bottom, :]
cv2.imwrite("test.png", crop_img);

In [None]:
# Invert the cropped binary image in place (0 <-> 255) and make it the new `thresh`.
# One arithmetic pass replaces the three-step swap through the sentinel value 128,
# which would also have turned any pixel already equal to 128 into 0.
crop_img[...] = 255 - crop_img
thresh = crop_img

In [None]:
# overwrite test.png with the current state of `thresh`
cv2.imwrite("test.png", thresh);

In [None]:
# OCR the current `thresh` image and flatten the result onto a single line
text = pytesseract.image_to_string(thresh).replace("\n", " ")
print(text)

In [None]:
# Bounding box of the recognised words.
# Naming note: `bottom` ends up as the smallest y and `top` as the largest y + height.
words = pytesseract.image_to_data(thresh, output_type=Output.DICT)
word_boxes = len(words['level'])
left = thresh.shape[1]
right = 0
bottom = thresh.shape[0]
top = 0
found_word = False
for i in range(word_boxes):
    # image_to_data also lists page/block/paragraph/line boxes, which carry no text;
    # the page-level box spans the whole image and would always swamp the result
    if not str(words['text'][i]).strip():
        continue
    found_word = True
    (x, y, w, h) = (words['left'][i], words['top'][i], words['width'][i], words['height'][i])
    left = min(left, x)
    right = max(right, x + w)
    bottom = min(bottom, y)
    top = max(top, y + h)
# nothing recognised: fall back to the full image extent, which is what the
# unfiltered loop always produced
if not found_word:
    left, right = 0, thresh.shape[1]
    bottom, top = 0, thresh.shape[0]

In [None]:
# `row` is not defined anywhere in this notebook, so crop to the word bounding box
# computed in the previous cell instead; there `bottom` holds the smallest y and
# `top` the largest y + height
crop_img = thresh[bottom:top, left:right]

In [None]:
# display the crop until a key is pressed, then close the window again
# (as the other display cells in this notebook do)
cv2.imshow('image', crop_img)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [None]:
# save the crop to test.png (the trailing ';' hides the cell output)
cv2.imwrite("test.png", crop_img);

In [None]:
# OCR the cropped image and flatten the result onto a single line
text = pytesseract.image_to_string(crop_img).replace("\n", " ")
print(text)

In [None]:
# OCR test.png through easy_ocr's 'youdao' service, join the returned pieces with
# spaces, and drop every "- " sequence
text = " ".join(ocr_image("test.png", service='youdao')).replace("- ", "")
text

In [1]:
from wisdomaiengine import summarisepdfdocument, bringyourowndocument

In [2]:
# run bringyourowndocument() from wisdomaiengine on a page image
file = "page6.jpg"
text = bringyourowndocument(file)

In [3]:
# display the value returned by bringyourowndocument()
text

"finer is one ol the fastest-growing companies of all time. Its name comes from “group coupons,” an ingenious idea that has spawned an entire industry of social commerce imitators. However, it didnt start out successful. When customers took Groupon up on its first deal, a whopping twenty people bought two-for-one pizza in a restaurant on the first floor of the company’s Chicago offices—hardly a world-changing event.  In fact, Groupon wasn't originally meant to be about commerce at all. The founder, Andrew Mason, intended his company to become a “collective activism platform” called The Point. Its goal was to bring people together to solve problems they couldn’t solve on their own, such as fund-raising for a cause or boycotting a certain retailer. The Point's early results were disappointing, however, and at the end of 2008 the founders decided to try something new. Although they still had grand ambitions, they were determined to keep the new product simple. They built a minimum viable 

In [10]:
# summarise the extracted text and print each summary item followed by a blank line
summary = summarisepdfdocument(text)
for i in summary:
    print(i, end=" \n\n")

• When customers took Groupon up on its first deal, a whopping twenty people bought two-for-one pizza in a restaurant on the first floor of the company’s Chicago offices—hardly a world-changing event. 

• The founder, Andrew Mason, intended his company to become a “collective activism platform” called The Point. 

• Its goal was to bring people together to solve problems they couldn’t solve on their own, such as fund-raising for a cause or boycotting a certain retailer. 

• The Point's early results were disappointing, however, and at the end of 2008 the founders decided to try something new. 

• Mason tells the story:  We took a WordPress Blog and we skinned it to say Groupon and then every day we would do a new post. 

