In [1]:
import os
import random
import re

import cv2
import imutils
import matplotlib.pyplot as plt
import numpy as np
from imutils import contours
from pytesseract import image_to_string

%matplotlib inline

In [2]:
def display_img(img, mode=None):
    """Show an image on a new 12x10 matplotlib figure.

    Args:
        img: Image array to draw.
        mode: Pass ``'gray'`` to draw ``img`` as-is with the gray colormap.
            For any other value the image is converted with
            ``cv2.COLOR_BGR2RGB`` before being drawn.
    """
    canvas = plt.figure(figsize=(12, 10))
    axes = canvas.add_subplot(111)

    if mode != 'gray':
        # OpenCV channel order is swapped to RGB so matplotlib shows true colours.
        axes.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        return

    axes.imshow(img, cmap='gray')

## Reading text from passport

In [3]:
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and the indices printed while looping) reproducible between runs.
docs = sorted(os.listdir('output1'))

In [6]:
# Create the directory that receives the OCR text files.
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('raw_text', exist_ok=True)

In [None]:
# OCR every file in 'output1' and store the recognised text in
# 'raw_text/<file name without extension>.txt'.
for (i, image) in enumerate(docs):

    # splitext only removes the extension, so 'scan.v2.png' -> 'scan.v2'
    # (splitting on the first dot would collapse it to 'scan').
    name = os.path.splitext(image)[0]

    # Flag 0 loads the file as a single-channel grayscale image.
    img = cv2.imread(os.path.join('output1', image), 0)
    if img is None:
        # cv2.imread returns None instead of raising when a file cannot be
        # decoded (sub-directories, hidden files, non-image files).
        print('{}: skipped, not a readable image: {}'.format(i, image))
        continue

    # NOTE(review): a median kernel of size 1 presumably leaves the image
    # unchanged - confirm whether a larger odd size (3, 5, ...) was intended.
    blurred = cv2.medianBlur(img, 1)

    # Otsu chooses the threshold itself (the 0 is ignored); 255 is the value
    # written to pixels above it, giving a pure black/white image.
    thresh = cv2.threshold(blurred, 0, 255,
                           cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    raw_text = image_to_string(thresh, lang='rus')

    # Explicit UTF-8: the text is Cyrillic and the platform default encoding
    # is not guaranteed to be able to represent it.
    out_path = os.path.join('raw_text', '{}.txt'.format(name))
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(raw_text)

    print(i)

In [31]:
def find_person_name(image):
    """OCR the large, wide text regions in the bottom half of a passport image.

    The bottom half of ``image`` is edge-detected, the edges are merged with
    morphological operations, and the resulting contours are visited from top
    to bottom. Every contour whose bounding box is wider than 100 px, taller
    than 100 px and has a width/height ratio above 2.5 is passed to Tesseract
    (``lang='rus'``). Each recognised fragment is also printed.

    Args:
        image: Image array; only ``image.shape[:2]`` is used for cropping, so
            both colour and single-channel arrays are accepted.

    Returns:
        list: ``[names, fragments]`` where ``fragments`` is the list of OCR
        strings (newlines replaced by spaces) in top-to-bottom order, and
        ``names`` holds at most the first three fragments reduced to Cyrillic
        letters only, with fragments containing no Cyrillic letters dropped.
        Both lists are empty when nothing suitable is found.
    """
    # shape[:2] works for (h, w, channels) and for grayscale (h, w) arrays.
    img_h, img_w = image.shape[:2]
    bottom = image[img_h // 2:img_h, 0:img_w].copy()

    edged = cv2.Canny(bottom, 75, 200)

    # Wide, short kernel: closes gaps between neighbouring letters on a line.
    rectKernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 5))
    edged = cv2.morphologyEx(edged, cv2.MORPH_CLOSE, rectKernel)

    # NOTE(review): the kernel below is a bare tuple rather than an array from
    # cv2.getStructuringElement - confirm the effective kernel is what was
    # intended. Left unchanged because the size thresholds further down were
    # presumably tuned against the current behaviour.
    edged = cv2.erode(edged, (25, 25), iterations=2)
    edged = cv2.dilate(edged, (25, 25), iterations=2)

    cnts = cv2.findContours(edged.copy(), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    cnts = imutils.grab_contours(cnts)
    if len(cnts) == 0:
        # sort_contours cannot handle an empty sequence.
        return [[], []]
    cnts = contours.sort_contours(cnts, method="top-to-bottom")[0]

    full_name = []
    for c in cnts:
        (x, y, box_w, box_h) = cv2.boundingRect(c)

        if box_w > 100 and box_h > 100 and box_w / box_h > 2.5:
            # OCR is the expensive step: run it once and reuse the result.
            text = image_to_string(bottom[y:y + box_h, x:x + box_w],
                                   lang='rus').replace('\n', ' ')
            full_name.append(text)
            print(text)

    # Keep Cyrillic letters only. 'ё'/'Ё' lie outside the а-я / А-Я ranges
    # and must be listed explicitly, otherwise names such as 'ФЁДОР' lose a
    # letter. Empty results are filtered after cleaning so that fragments
    # made only of digits, punctuation or spaces do not occupy a name slot.
    names = []
    for fragment in full_name:
        cleaned = re.sub(r'[^а-яА-ЯёЁ]+', '', fragment)
        if cleaned:
            names.append(cleaned)

    return [names[:3], full_name]

In [32]:
import re
from imutils import contours

In [39]:
def read_text_from_passport(image):
    """Run OCR on a passport image and extract the fields that can be parsed.

    The image is OCR'd twice with Tesseract (``lang='rus'``): once as given,
    and once after ``imutils.rotate_bound(image, -90)``. Dates, issue code and
    gender are searched in the first text; series and number are searched in
    the text of the rotated copy. The name fields come from
    ``find_person_name``.

    Args:
        image: Passport image array accepted by ``image_to_string``,
            ``imutils.rotate_bound`` and ``find_person_name``.

    Returns:
        dict: ``{'ocr_result': {...}, 'text': ''}``. Every field that could
        not be found is left as an empty string. ``issue_authority``,
        ``birth_place`` and ``text`` are not filled in by this function.
    """
    # Keep both variants of each OCR result: patterns that rely on a space
    # need the untouched text, the others use the space-free text.
    page_text = image_to_string(image, lang='rus')
    raw_text = page_text.replace(' ', '')
    side_text = image_to_string(imutils.rotate_bound(image, -90), lang='rus')
    side = side_text.replace(' ', '')

    passport = {
        'ocr_result': {
            'doc_type': 'passport',
            'issue_authority': '',
            'issue_code': '',
            'issue_date': '',
            'birth_date': '',
            'ver1': {
                'patronymic_name': '',
                'name': '',
                'surname': '',
            },
            'gender': '',
            'birth_place': '',
            'series': '',
            'number': '',
        },
        'text': '',
    }
    fields = passport['ocr_result']

    # Looking for dates: the first match is taken as the issue date and the
    # second as the birth date. '.{1,3}' tolerates mangled separators.
    # re.findall returns a (possibly empty) list of whole-match strings, so
    # test for emptiness and store the full string, not its first character.
    dates = re.findall(r'\d{2}.{1,3}\d{2}.{1,3}\d{4}', raw_text)
    if dates:
        fields['issue_date'] = dates[0]
    if len(dates) >= 2:
        fields['birth_date'] = dates[1]

    # Looking for issue code
    code = re.search(r'\d{3}-\d{3}', raw_text)
    if code is not None:
        fields['issue_code'] = code[0]

    # Looking for passport series. The pattern contains a space, so it must
    # be applied to the text that still has its spaces.
    series = re.search(r'\d{2} \d{2}', side_text)
    if series is not None:
        fields['series'] = series[0]

    # Looking for passport number
    num = re.search(r'\d{6}', side)
    if num is not None:
        fields['number'] = num[0]

    # find_person_name returns [cleaned_names, raw_fragments]; the cleaned
    # names are ordered top to bottom.
    # NOTE(review): assumes the page lists surname, name, patronymic in that
    # order - confirm against real scans.
    names = find_person_name(image)[0]
    if len(names) == 3:
        surname, first_name, patronymic_name = names
        fields['ver1']['surname'] = surname
        fields['ver1']['name'] = first_name
        fields['ver1']['patronymic_name'] = patronymic_name

    # Looking for gender: patronymic ending first, then the gender marker as
    # a separate word in the text that still has its spaces.
    patronymic = fields['ver1']['patronymic_name']
    if patronymic.endswith('ВИЧ') \
            or re.search(r'\bМУЖ\b', page_text) is not None:
        fields['gender'] = 'male'
    elif patronymic.endswith('ВНА') \
            or re.search(r'\bЖЕН\b', page_text) is not None:
        fields['gender'] = 'female'

    return passport

In [40]:
# Try the extraction on a single sample document.
sample_path = os.path.join('output1', '75.png')
image = cv2.imread(sample_path)
if image is None:
    # cv2.imread returns None instead of raising when the file is missing or
    # cannot be decoded; fail here with a clear message.
    raise FileNotFoundError('Could not read image: {}'.format(sample_path))
data = read_text_from_passport(image)