<a href="https://colab.research.google.com/github/vishnu-chand/word_search_solver_ocr/blob/master/word_search_solver_ocr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%%capture
# Environment setup; %%capture hides the installer output of this cell.
# Clone the repository so the sample puzzle images are available locally.
!git clone https://github.com/vishnu-chand/word_search_solver_ocr.git
# Install the tesseract-ocr binary via apt after adding the alex-p PPA.
!add-apt-repository ppa:alex-p/tesseract-ocr -y
!apt-get update -y -q
!apt install tesseract-ocr -y -q
# Python wrapper used later to call tesseract.
!pip install pytesseract

In [2]:
import os
import cv2
!tesseract -v
import numpy as np
from glob import glob
from os.path import basename
from itertools import product
from ipywidgets import interact
from IPython.display import HTML
import pytesseract as pytesseract
from collections import OrderedDict
from matplotlib import pyplot as plt
from ipywidgets import interact_manual
from PIL.Image import fromarray as Img
import matplotlib.animation as animation
from ipywidgets.widgets import IntText, Dropdown, Text

tesseract 4.1.1-rc2-21-gf4ef
 leptonica-1.75.3
  libgif 5.1.4 : libjpeg 8d (libjpeg-turbo 1.5.2) : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libwebp 0.6.1 : libopenjp2 2.3.0

 Found AVX2
 Found AVX
 Found FMA
 Found SSE
 Found libarchive 3.2.2 zlib/1.2.11 liblzma/5.2.2 bz2lib/1.0.6 liblz4/1.7.1


In [0]:
def frameFit(img, bbox):
    """
    Clip a bounding box so that it lies inside the image.

    Parameters
    ----------
    img : array whose ``shape[:2]`` is ``(height, width)``.
    bbox : ``(x, y, w, h)``; values may be floats and may extend past the image.

    Returns
    -------
    np.ndarray of four ints ``(x, y, w, h)``. Fractional values are truncated
    by the integer cast. Width/height are 0 when the box does not overlap
    the image at all.
    """
    x, y, w, h = bbox
    imH, imW = img.shape[:2]
    # Clamp the near corner into the image.
    x0, y0 = min(max(0, x), imW), min(max(0, y), imH)
    # The far corner is derived from the *original* origin so that a box
    # starting at a negative coordinate is cropped rather than shifted into
    # the image; max(...) keeps the size from going negative.
    x1, y1 = max(x0, min(x + w, imW)), max(y0, min(y + h, imH))
    return np.array((x0, y0, x1 - x0, y1 - y0), dtype=int)


def bboxScale(img, bbox, scale):
    """
    Grow or shrink a box about its centre, then fit it to the image.

    bbox is (x, y, w, h); the scaled box is passed through frameFit.
    """
    left, top, width, height = bbox
    newW = width * scale
    newH = height * scale
    # Move the origin by half of the size change so the centre stays put.
    left = left + (width - newW) / 2
    top = top + (height - newH) / 2
    return frameFit(img, (left, top, newW, newH))

def getSubImg(im1, bbox):
    '''
    Crop the region bbox = (x, y, w, h) out of im1.

    Coordinates are truncated to int. Returns the cropped slice, or None
    when the crop has no rows or no columns.
    '''
    left, top, width, height = (int(v) for v in bbox)
    crop = im1[top:top + height, left:left + width]
    if crop.shape[0] == 0 or crop.shape[1] == 0:
        return None
    return crop


def displayVideo(imgs, figSize=128):
    """
    Build a matplotlib ArtistAnimation that shows imgs one after another.

    The figure size is taken from the first frame, treating figSize pixels
    as one inch. The figure is closed before returning, so only the returned
    animation object is left to render.
    """
    height, width = imgs[0].shape[:2]
    fig = plt.figure(figsize=(width / figSize, height / figSize))
    frames = []
    for frame in imgs:
        artist = plt.imshow(frame, animated=True)
        plt.axis('off')
        plt.tight_layout()
        # ArtistAnimation expects one list of artists per frame.
        frames.append([artist])
    ani = animation.ArtistAnimation(fig, frames, interval=100, repeat_delay=1)
    plt.close()
    return ani

In [4]:
# Sorted so the dropdown order (and therefore its default selection) does not
# depend on the order in which the filesystem happens to list the files.
impaths = sorted(glob('word_search_solver_ocr/puzzle*.*'))
img, datas = None, None
def selectImg(impaths):
    """
    Load the chosen puzzle image and derive the initial puzzle/query boxes.

    Side effects: sets the globals ``img`` (image as read by cv2.imread) and
    ``datas`` (dict with keys 'puzzleBox' and 'queryBox', each a list of four
    strings x0, y0, x1, y1), then prints ``datas``.

    The file name (without extension) is split on '_'; the eight fields after
    the first one are read as px, py, pw, ph, qx, qy, qw, qh. If they are not
    exactly eight integers, both boxes default to the whole image.

    Raises OSError when the image cannot be read.
    """
    global img, datas
    loaded = cv2.imread(impaths)
    if loaded is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError('could not read image: {}'.format(impaths))
    img = loaded
    imH, imW = img.shape[:2]
    fields = os.path.splitext(basename(impaths))[0].split('_')[1:]
    try:
        px, py, pw, ph, qx, qy, qw, qh = [int(d) for d in fields]
        datas = dict(
            puzzleBox=[str(px), str(py), str(px+pw), str(py + ph)],
            queryBox =[str(qx), str(qy), str(qx+qw), str(qy + qh)])
    except ValueError:
        # Non-numeric field or wrong number of fields in the file name.
        datas = dict(
            puzzleBox=[str(0), str(0), str(imW), str(imH)],
            queryBox =[str(0), str(0), str(imW), str(imH)])
    print(datas)
interact(selectImg, impaths=impaths)

interactive(children=(Dropdown(description='impaths', options=('word_search_solver_ocr/puzzle1_35_100_715_710_…

<function __main__.selectImg>

In [5]:
# Image size at the time this cell runs; markPuzzleBox reads imW/imH to size its preview.
imH, imW = img.shape[:2]
# Dropdown that picks which box is being edited; its options are the keys of `datas`.
selectBbox = Dropdown(options=datas.keys())
# One text field per corner coordinate, kept in x0, y0, x1, y1 order.
textBoxes = OrderedDict()
for k in ['x0', 'y0', 'x1', 'y1']:
    textBoxes[k] = Text(description=k, continuous_update=False)
    

def update_slider(change):
    """Copy the coordinates stored for the newly selected box into the text fields."""
    coords = datas[change['new']]
    for name, coord in zip(textBoxes, coords):
        textBoxes[name].value = coord

# Refill the text fields whenever the dropdown's value changes ...
selectBbox.observe(update_slider, names='value')
# ... and once right now, using the first key of `datas`.
update_slider(dict(new=list(datas.keys())[0]))

def markPuzzleBox(selectBbox, imSize, x0, y0, x1, y1):
    """
    Store the edited corners for the selected box and show a preview.

    Parameters
    ----------
    selectBbox : key of ``datas`` to update.
    imSize : scale factor applied to the preview image.
    x0, y0, x1, y1 : corner coordinates as typed into the text fields.

    Side effects: on valid input, ``datas[selectBbox]`` is replaced by the
    four strings and a copy of ``img`` with the box drawn on it is displayed.
    On invalid input nothing is stored and a short message is printed.
    """
    try:
        fx0, fy0, fx1, fy1 = [float(v) for v in (x0, y0, x1, y1)]
    except ValueError:
        # An empty or half-typed field must not overwrite the last valid box,
        # otherwise later cells fail when they parse `datas`.
        print("x0, y0, x1, y1 must all be numbers")
        return
    datas[selectBbox] = x0, y0, x1, y1
    # Plain Python ints for the drawing call.
    x, y, w, h = [int(v) for v in frameFit(img, [fx0, fy0, fx1 - fx0, fy1 - fy0])]
    im = cv2.rectangle(img.copy(), (x, y), (x + w, y + h), (0, 128, 0), 5)
    size = int(imSize * imW), int(imSize * imH)
    im = cv2.resize(im, size)
    # cv2.imread yields BGR channel order while PIL interprets arrays as RGB.
    display(Img(cv2.cvtColor(im, cv2.COLOR_BGR2RGB)))


# The text fields are created with continuous_update=False; the prompt tells
# the user to press Enter after editing them.
print("Press enter after updating x0, y0, x1, y1")
interact(markPuzzleBox, imSize=(.2, 1.0), selectBbox=selectBbox, **textBoxes)

Press enter after updaing x0, y0, x1, y1


interactive(children=(Dropdown(description='selectBbox', options=('puzzleBox', 'queryBox'), value='puzzleBox')…

<function __main__.markPuzzleBox>

In [6]:
def data2bbox(data):
    """
    Turn stored corners (x0, y0, x1, y1) into an integer (x, y, w, h) box.

    The values are parsed with float() and the resulting box is fitted to
    the global ``img`` via frameFit.
    """
    left, top, right, bottom = [float(v) for v in data]
    fitted = frameFit(img, [left, top, right - left, bottom - top])
    x, y, w, h = fitted
    return np.array((x, y, w, h), int)
# Final (x, y, w, h) boxes, read from whatever is currently stored in `datas`.
puzzleBox = data2bbox(datas['puzzleBox'])
queryBox = data2bbox(datas['queryBox'])
print(puzzleBox, queryBox)

[ 26 103 740 735] [790 112 177 526]


# Image to text
1. Get sub-image (puzzle image or query image)
2. Binarize the image with otsu algorithm 
3. Perform a 5x5 closing operation.
4. * Query image: Find the contours bounding box that will extract word images
  * Puzzle image: Find the contours bounding box that will extract letter images 
5. Run tesseract on word/letter image to extract string
  * Config 1: Look only for char A-Z + 0 + p with --psm 10 (treat image as single char.)
  * Config 2: If no text is detected with config 1, run again with just --psm 10
  * Finally perform a post correction with predefined map corrections = {'PP': 'P', '0': 'O', '': 'N'}
6. Return image texts and bboxes

In [7]:
# TODO need to improve ocr
def img2data(img, puzzleBox):
    """
    OCR every blob found inside one region of the image.

    Parameters
    ----------
    img : BGR image (it is converted with cv2.COLOR_BGR2GRAY).
    puzzleBox : (x, y, w, h) region to read. The same routine is used for the
        puzzle grid and for the query word list.

    Returns
    -------
    (img, texts, bboxs) where ``img`` is the Otsu-binarized crop, ``texts`` is
    the list of upper-cased strings and ``bboxs`` the matching (x, y, w, h)
    boxes, expressed in the coordinates of the crop.
    """
    corrections = {'PP': 'P', '0': 'O', '': 'N'}
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = getSubImg(gray, puzzleBox)
    img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    # Close the inverted image so the strokes of one letter/word merge into a
    # single blob before contour extraction.
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
    roiImg = cv2.morphologyEx(cv2.bitwise_not(img), cv2.MORPH_CLOSE, kernel, iterations=3)
    # [-2] picks the contour list both when findContours returns
    # (contours, hierarchy) and when it returns (image, contours, hierarchy).
    cnts = cv2.findContours(roiImg, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]
    # NOTE(review): the reversed contour order is used as reading order; this
    # relies on the order in which findContours reports contours - confirm.
    bboxs = [bboxScale(img, cv2.boundingRect(cnt), 1.5) for cnt in cnts[::-1]]
    texts = []
    for bbox in bboxs:
        simg = getSubImg(img, bbox)
        # strip(): depending on the pytesseract version the result may carry
        # trailing whitespace / form-feed, which would defeat the empty check,
        # the corrections lookup and the later letter encoding.
        text = pytesseract.image_to_string(simg, config="-c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0p --psm 10").strip()
        if not text:
            text = pytesseract.image_to_string(simg, config="--psm 10").strip()
        text = text.upper()
        if len(text) < 3:
            text = corrections.get(text, text)
        texts.append(text)
    return img, texts, bboxs

# Run the same OCR pass on both regions; the returned boxes are relative to each crop.
puzzleImg, puzzles, pboxs = img2data(img, puzzleBox)
queryImg, queries, qboxs = img2data(img, queryBox)
# get puzzles and convert to square numpy
# NOTE(review): assumes OCR returned exactly n*n letters in row-major order;
# otherwise the reshape raises or produces a non-square grid.
puzzles = np.array(puzzles).reshape(int(len(puzzles) ** .5), -1)
# Shortest query first.
queries = sorted(queries, key=lambda x:len(x))
print(puzzles)
print(queries)

[['K' 'R' 'P' 'Y' 'R' 'O' 'C' 'L' 'A' 'S' 'T' 'I' 'C' 'O']
 ['E' 'A' 'C' 'D' 'R' 'I' 'T' 'E' 'I' 'A' 'B' 'T' 'Y' 'A']
 ['R' 'L' 'O' 'E' 'D' 'Y' 'I' 'X' 'R' 'T' 'L' 'A' 'S' 'N']
 ['F' 'U' 'N' 'G' 'I' 'T' 'C' 'I' 'F' 'N' 'A' 'O' 'F' 'A']
 ['U' 'C' 'N' 'U' 'D' 'I' 'R' 'C' 'R' 'E' 'T' 'L' 'S' 'C']
 ['F' 'A' 'I' 'S' 'A' 'C' 'I' 'O' 'O' 'C' 'H' 'Y' 'U' 'H']
 ['F' 'I' 'P' 'T' 'T' 'A' 'S' 'N' 'N' 'S' 'E' 'T' 'O' 'R']
 ['L' 'P' 'T' 'N' 'I' 'C' 'O' 'F' 'D' 'E' 'R' 'I' 'I' 'O']
 ['E' 'F' 'I' 'E' 'C' 'I' 'R' 'I' 'E' 'B' 'S' 'C' 'R' 'N']
 ['A' 'P' 'O' 'L' 'M' 'P' 'O' 'P' 'U' 'U' 'K' 'A' 'A' 'I']
 ['C' 'E' 'N' 'F' 'A' 'S' 'B' 'P' 'R' 'R' 'I' 'M' 'G' 'S']
 ['C' 'P' 'D' 'O' 'E' 'R' 'O' 'L' 'F' 'E' 'T' 'E' 'A' 'M']
 ['B' 'B' 'E' 'T' 'Y' 'E' 'R' 'E' 'S' 'X' 'E' 'T' 'V' 'P']
 ['O' 'C' 'I' 'A' 'Y' 'P' 'U' 'U' 'M' 'B' 'R' 'A' 'G' 'E']]
['DEGUST', 'FIPPLE', 'LEXICON', 'UMBRAGE', 'EMACITY', 'DIDATIC', 'UROBOROS', 'PIACULAR', 'FRONDEUR', 'VAGARIOUS', 'KERFUFFLE', 'ERUBESCENT', 'CONNIPTION', 'ANACHRONISM', 'PY

### I'm using correlation-based pattern matching to find the words
1. The reason I use correlation rather than other approaches like string search is that I like correlation :P
2. Encode puzzle and query into numbers (eg: a->0, b-> 1, .. z-> 25) 
3. Perform correlation on puzzle with query as template (left to right search)
4. If the text is not found reverse the query and search (right to left)
5. Now transpose the puzzle for top to bottom and bottom to top word search
6. Currently I'm not performing diagonal searches
7. Plot the results


In [8]:
def str2num(queries):
    """
    Encode each string as a uint8 array of alphabet positions (A -> 0 ... Z -> 25).

    Returns a list with one array per input item; a character that cannot be
    located in the upper-case alphabet raises ValueError (from str.index).
    """
    alpha = 'abcdefghijklmnopqrstuvwxyz'.upper()
    return [np.array(list(map(alpha.index, query)), 'u1') for query in queries]


def findWord(puzzles, metaPuzzles, txts, transpose):
    """
    Yield the per-letter metadata of every query word found along the rows.

    Parameters
    ----------
    puzzles : 2-D uint8 array of encoded letters (pass the transposed grid to
        search columns).
    metaPuzzles : array indexed like the *untransposed* grid; the slice that
        corresponds to a hit is what gets yielded.
    txts : iterable of 1-D uint8 arrays (encoded query words).
    transpose : True when ``puzzles`` is the transposed grid, so that hit
        coordinates are mapped back to the original orientation.

    Each word is searched left-to-right first and, if absent, reversed.
    Only the first exact occurrence of a word is reported.
    """
    def _locate(pattern):
        # (row, col) of the first exact occurrence of pattern, or None.
        n = len(pattern)
        if n == 0 or n > puzzles.shape[1]:
            return None  # the word cannot fit inside a row
        scores = cv2.matchTemplate(puzzles, np.array([pattern]), cv2.TM_CCOEFF_NORMED)
        # TM_CCOEFF_NORMED subtracts the mean and normalizes the energy, so a
        # run such as "BCD" scores 1.0 against "ABC". Correlation therefore
        # only nominates candidates; each is confirmed letter by letter.
        # NOTE(review): a word made of one repeated letter has zero variance
        # and may never reach the threshold - confirm if such words matter.
        for r, c in zip(*np.where(scores > .999)):
            if np.array_equal(puzzles[r, c:c + n], pattern):
                return r, c
        return None

    for txt in txts:
        hit = _locate(txt)
        if hit is None:  # reverse the string and search
            hit = _locate(txt[::-1])
        if hit is not None:
            r, c = hit
            stop = len(txt)
            yield metaPuzzles[c: c + stop, r] if transpose else metaPuzzles[r, c: c + stop]


def solvePuzzle(puzzles, queries):
    """
    Search the letter grid for every query word, along rows and columns.

    Both inputs are encoded with str2num. The global ``pboxs`` supplies one
    box per grid cell in row-major order. Returns the list of box slices
    yielded by findWord, row hits first and column hits after them.
    """
    # encode strings
    encodedQueries = str2num(queries)
    grid = np.array(str2num(puzzles))
    rows, cols = grid.shape
    cellBoxes = np.array([pboxs[i] for i in range(grid.size)]).reshape(rows, cols, -1)
    found = []
    for board, isTransposed in ((grid, False), (grid.T, True)):
        found.extend(findWord(board, cellBoxes, encodedQueries, transpose=isTransposed))
    return found


def displayResult(puzzleImg, queryImg, metaPuzzles):
    """
    Paint every found word onto the puzzle image and record each step.

    Parameters
    ----------
    puzzleImg : single-channel puzzle image (converted with COLOR_GRAY2BGR).
    queryImg : unused; kept so existing calls keep working.
    metaPuzzles : iterable of words, each an iterable of (x, y, w, h) letter
        boxes in puzzleImg coordinates.

    Returns
    -------
    list of BGR frames, one snapshot of the whole image per painted letter.
    """
    res = []
    puzzleImg = cv2.cvtColor(puzzleImg,cv2.COLOR_GRAY2BGR)
    # The former `plt.cm.get_cmap('tab20')` lookup was never used and fails on
    # Matplotlib releases that removed `matplotlib.cm.get_cmap`, so it is gone.
    for bboxs in metaPuzzles:
        # one random colour per word
        color = np.random.randint(4, 128, 3, 'u1')
        for bbox in bboxs:
            # getSubImg returns a slice of puzzleImg, so the edits below
            # accumulate on the full image.
            simg = getSubImg(puzzleImg, bbox)
            simg[simg==255] = 1 # encode background as 1
            simg += color
            simg[simg==color] = 255 # make text pure white
            res.append(puzzleImg.copy())
    return res

# Find every query word in the OCR'd grid, paint the hits frame by frame,
# and embed the resulting animation as an HTML5 video.
results = solvePuzzle(puzzles, queries)
res = displayResult(puzzleImg,queryImg, results)
HTML(displayVideo(res).to_html5_video())