In [2]:
import cv2
import imutils
from imutils import rotate_bound
import os
import random
import numpy as np
import matplotlib.pyplot as plt
from pytesseract import image_to_string
import re
%matplotlib inline

## Preprocessing

In [3]:
def passport_border(image):
    """
    Crop an image to the passport area inferred from the first detected face.

    The face rectangle found by the Haar cascade in 'cascade.xml' anchors a
    window reaching 6 face-heights above and 3 below the face's top edge, and
    1 face-width left and 6 right of its left edge, clamped to the image.

    :param image: np array, 3-channel BGR image
    :return: np array, the cropped window, or `image` itself when no face is found
    """
    cascade = cv2.CascadeClassifier('cascade.xml')
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    faces = cascade.detectMultiScale(gray, 1.3, 5)

    # Test emptiness by length: comparing with `is not ()` checks object
    # identity against a literal and only works by interpreter accident.
    if len(faces) == 0:
        return image

    (x, y, w, h) = faces[0]
    (H, W, _) = image.shape

    # Clamp the window to the image bounds.
    startY = max(y - 6 * h, 0)
    endY = min(y + 3 * h, H)
    startX = max(x - w, 0)
    endX = min(x + 6 * w, W)

    mask = np.zeros((H, W), dtype=np.uint8)
    mask[startY:endY, startX:endX] = 255

    # Keep only the rows and columns covered by the window.
    return get_segment_crop(image, mask=mask)

In [4]:
def rotate_passport(passport):
    """
    Rotate an image in 90-degree steps until a face is found, so the passport can be read.

    Detection runs on a grayscale copy resized to width 1000; the rotation that
    succeeds is then applied to the original, full-resolution image.

    :param passport: np array, BGR image
    :return: np array, `passport` rotated by the first multiple of 90 degrees
             (0, 90, 180, 270) at which the cascade reports a face; rotated by
             90 degrees when no orientation yields a face
    """
    cascade = cv2.CascadeClassifier('cascade.xml')
    gray = cv2.cvtColor(imutils.resize(passport, width=1000), cv2.COLOR_BGR2GRAY)

    for quarter_turns in range(4):
        faces = cascade.detectMultiScale(gray, 1.3, 5)

        # Length check instead of `is not ()`, which compares identity with a literal.
        if len(faces) > 0:
            return imutils.rotate_bound(passport, 90 * quarter_turns)

        gray = imutils.rotate_bound(gray, 90)

    print('Falsed')
    # No face at any orientation: the image is still returned, rotated by 90 degrees.
    return imutils.rotate_bound(passport, 90)

In [5]:
def get_segment_crop(img, tol=0, mask=None):
    """
    Keep only the rows and columns of `img` that contain at least one mask pixel.

    :param img: np array, 2-D (grayscale) or with extra trailing channel axes
    :param tol: threshold used to derive the mask (`img > tol`) when `mask` is None
    :param mask: optional array; non-zero entries mark pixels to keep
    :return: np array, `img` restricted to the selected rows and columns
    """
    if mask is None:
        mask = img > tol
    mask = np.asarray(mask)

    # A mask derived from a multi-channel image has one plane per channel;
    # np.ix_ needs 1-D row/column selectors, so fold the channel axes first.
    if mask.ndim > 2:
        mask = mask.any(axis=tuple(range(2, mask.ndim)))

    return img[np.ix_(mask.any(1), mask.any(0))]

In [6]:
def cut_passport(filename, output):
    """
    Find the document in an image file, deskew it, crop to it and save the result.

    Steps: Sobel gradients -> morphological opening -> rescale to 0..255 ->
    Otsu threshold -> paint a 1% border white -> opening -> invert -> estimate
    the skew angle with cv2.minAreaRect -> rotate image and mask -> crop.

    :param filename: path of the image to read
    :param output: base name (without extension) of the PNG written to 'output1'
    :return: np array, the rotated and cropped image
    :raises IOError: if the image cannot be read
    """
    image = cv2.imread(filename)
    # cv2.imread signals failure by returning None rather than raising.
    if image is None:
        raise IOError('Could not read image: {}'.format(filename))
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    sobelX = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=5)
    sobelY = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=5)
    blended = cv2.addWeighted(src1=sobelX, alpha=0.5, src2=sobelY, beta=0.5, gamma=0)

    kernel = np.ones((20, 20), dtype=np.uint8)
    opening = cv2.morphologyEx(blended, cv2.MORPH_OPEN, kernel)

    # Rescale the response to 0..255; a perfectly flat response would
    # otherwise divide by zero.
    opening = opening - np.min(opening)
    peak = np.max(opening)
    if peak > 0:
        opening = np.uint8(opening / (peak / 255))
    else:
        opening = np.zeros(opening.shape, dtype=np.uint8)

    thresh = cv2.threshold(opening, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

    # Paint a border 1% of each dimension wide white along all four edges.
    (h, w) = thresh.shape
    edgeH = int(h * 0.01)
    edgeW = int(w * 0.01)
    thresh[0:edgeH, 0:w] = 255
    thresh[h - edgeH:h, 0:w] = 255
    thresh[0:h, 0:edgeW] = 255
    thresh[0:h, w - edgeW:w] = 255

    thresh = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)
    inverse = cv2.bitwise_not(thresh)

    # Skew angle of the minimum-area rectangle around the remaining pixels.
    coords = np.column_stack(np.where(inverse > 0))
    angle = cv2.minAreaRect(coords)[-1]

    # NOTE(review): the angle convention of minAreaRect depends on the OpenCV
    # version - confirm this normalisation against the installed release.
    if angle < -45:
        angle = -(90 + angle)

    image = rotate_bound(image, angle=angle)
    inverse = rotate_bound(inverse, angle=angle)

    masked = get_segment_crop(image, mask=inverse)

    os.makedirs('output1', exist_ok=True)
    cv2.imwrite(os.path.join('output1', output + '.png'), masked)

    return masked

In [7]:
# Collect the image files of the dataset directory and draw a random subset to process.
dataset_dir = 'dataset'
docs = [os.path.join(dataset_dir, f) for f in os.listdir(dataset_dir)
            if f.endswith(('.png', '.jpg', '.jpeg'))]
# random.sample raises ValueError when asked for more items than exist, so cap the size.
docs = random.sample(docs, min(100, len(docs)))

In [8]:
# Run the preprocessing chain on every sampled document:
# cut out and deskew the page, turn it upright, then crop around the face.
for (i, image) in enumerate(docs):

    # cut_passport reads the file itself, so the image is not decoded here as well.
    mask = cut_passport(image, str(i))
    mask = rotate_passport(mask)
    mask = passport_border(mask)

    print(i)

    # Replaces the intermediate crop cut_passport saved under the same name.
    cv2.imwrite(os.path.join('output1', '{}.png'.format(i)), mask)

KeyboardInterrupt: 

In [None]:
# For every preprocessed page, crop the horizontal band expected to hold the
# holder's name, OCR it, and save both the crop and the recognised text.
cascade = cv2.CascadeClassifier('cascade.xml')  # loaded once, not once per file
os.makedirs('name', exist_ok=True)              # both outputs below are written here

for i, image in enumerate(os.listdir('output1')):
    if image.endswith('.png'):
        img = cv2.imread(os.path.join('output1', image))
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        (H, W, _) = img.shape

        # Default band: from half to three quarters of the page height.
        top, bottom = int(H / 2), int(H / 4 * 3)

        face = cascade.detectMultiScale(gray, 1.3, 5)
        if len(face) > 0:
            (x, y, w, h) = face[0]
            face_middle = int(y + h / 2)
            # End the band at the face's vertical centre, unless that lies
            # above the band start and would leave an empty crop.
            if face_middle > top:
                bottom = face_middle
        else:
            print('False')

        img = img[top:bottom, 0:W]

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        text = image_to_string(gray, lang='rus').replace('\n', ' ')
        text = re.sub('[^а-яА-Я ]+', '', text)

        stem = image.split('.')[0]
        cv2.imwrite(os.path.join('name', '{}.png'.format(stem)), img)
        # The text is Cyrillic; do not rely on the locale's default encoding.
        with open(os.path.join('name', '{}.txt'.format(stem)), 'w', encoding='utf-8') as f:
            f.write(text)
        print(i)

In [353]:
def _drop_short_words(text, allowed=()):
    """Remove words of one or two characters (unless listed in `allowed`) and re-join with trailing spaces."""
    kept = [word for word in text.split()
            if len(word) > 2 or word.lower() in allowed]
    return ''.join('{} '.format(word) for word in kept)


def read_passport(image):
    """
    OCR the regions of a passport page and return the cleaned strings.

    The page is split by height: the top half is read as the issuing
    authority, the band from 1/2 to 3/4 as the holder's name and the bottom
    quarter as the birth place. The whole page is OCR'd as well, and the top
    tenth of the page rotated by -90 degrees is read for the number.

    :param image: np array, passport page image
    :return: tuple (authority, name, birth_place, raw, number) of strings;
             the first three are filtered and each kept word is followed by a space
    """
    (h, w) = image.shape[:2]

    authority = image[0:int(h/2), 0:w]
    name = image[int(h/2):int(h/4*3), 0:w]
    birth_place = image[int(h/4*3):h, 0:w]

    authority = image_to_string(authority, lang='rus').replace('\n', ' ')
    name = image_to_string(name, lang='rus').replace('\n', ' ')
    birth_place = image_to_string(birth_place, lang='rus').replace('\n', ' ')
    raw = image_to_string(image, lang='rus').replace('\n', ' ')

    side = imutils.rotate_bound(image, angle=-90)
    (h, w) = side.shape[:2]
    side = side[0:int(h/10), 0:w]
    number = image_to_string(side, lang='rus')

    # Strip everything except Cyrillic letters and spaces (plus '-' and '.'
    # for the authority and the birth place).
    authority = re.sub(r'[^а-яА-Я- \.]+', '', authority)
    name = re.sub(r'[^а-яА-Я ]+', '', name)
    birth_place = re.sub(r'[^а-яА-Я- \.]+', '', birth_place)

    # Words of one or two characters are dropped, except the short words
    # allowed for the authority. The lists are filtered in a single pass
    # rather than deleted from while iterating, which skipped the word
    # following every deletion.
    ALLOWED_SMALL_STRINGS_AUTHORITIES = ['и', 'в']
    ALLOWED_SMALL_STRINGS_BIRTH_PLACE = []

    authority = _drop_short_words(authority, ALLOWED_SMALL_STRINGS_AUTHORITIES)
    name = _drop_short_words(name)
    birth_place = _drop_short_words(birth_place, ALLOWED_SMALL_STRINGS_BIRTH_PLACE)

    return (authority, name, birth_place, raw, number)

In [354]:
# Load one preprocessed page as a sample for the OCR cells.
image = cv2.imread('output1/55.png')
# cv2.imread returns None instead of raising when the file cannot be read.
assert image is not None, 'output1/55.png could not be read'

In [355]:
# OCR the sample page; read_passport returns the five strings unpacked here.
(authority, name, birth_place, raw, number) = read_passport(image)

In [364]:
def _tail_from_keyword(text, keywords):
    """Return `text` from the first keyword (in list order) that occurs in it, or `text` unchanged."""
    for keyword in keywords:
        found = re.search(r'({}.*)'.format(keyword), text, flags=re.I)
        if found is not None:
            return found.group(1)
    return text


def parse_passport(authority, name, birth_place, raw, number):
    """
    Build the structured passport record from OCR strings.

    :param authority: OCR text of the issuing-authority area
    :param name: OCR text of the holder's-name area
    :param birth_place: OCR text of the birth-place area
    :param raw: OCR text of the whole page
    :param number: OCR text of the strip holding series and number
    :return: dict with keys 'ocr_result' (the parsed fields, '' when not found),
             'text' (the `raw` argument) and 'FIO' (the `name` argument)
    """
    fields = {
        'doc_type': 'passport',
        'issue_authority': '',
        'issue_code': '',
        'issue_date': '',
        'surname': '',
        'name': '',
        'patronymic_name': '',
        'birth_date': '',
        'gender': '',
        'birth_place': '',
        'series': '',
        'number': '',
    }

    # Drop whatever precedes the first recognised keyword.
    AUTHORITIES = ['отделом', 'УФМС', 'МФЦ', 'ГОМ', 'УВД']
    fields['issue_authority'] = _tail_from_keyword(authority, AUTHORITIES)

    LOCALITIES = ['пос', 'гор', r'с\.']
    fields['birth_place'] = _tail_from_keyword(birth_place, LOCALITIES)

    # The patronymic is the word ending in ВИЧ/ВНА. The two words before it
    # are taken as first name and surname; these fields were never filled before.
    full_name = re.search(r'(.* (.*(ВИЧ|ВНА)))', name, flags=re.I)
    if full_name is not None:
        fields['patronymic_name'] = full_name.group(2).upper()
        preceding = name[:full_name.start(2)].split()
        if len(preceding) >= 1:
            fields['name'] = preceding[-1].upper()
        if len(preceding) >= 2:
            fields['surname'] = preceding[-2].upper()

    # Looking for dates of issue and birth
    dates = re.findall(r'(\d{2}\.\d{2}\.\d{4})', raw)
    if dates:
        fields['issue_date'] = dates[0]
    if len(dates) >= 2:
        # Of the first two dates, the one with the later year is the issue date.
        first_year = int(dates[0].split('.')[-1])
        second_year = int(dates[1].split('.')[-1])
        if first_year > second_year:
            fields['issue_date'], fields['birth_date'] = dates[0], dates[1]
        else:
            fields['issue_date'], fields['birth_date'] = dates[1], dates[0]

    # Looking for issue code
    code = re.search(r'\d{3}-\d{3}', raw)
    if code is not None:
        fields['issue_code'] = code.group(0)

    # Looking for gender: from the patronymic ending, else from the МУЖ/ЖЕН marker
    patronymic = fields['patronymic_name']
    if patronymic.endswith('ВИЧ') or re.search(r'(МУЖ|МУЖ.) ', raw) is not None:
        fields['gender'] = 'male'
    elif patronymic.endswith('ВНА') or re.search(r'(ЖЕН|ЖЕН.) ', raw) is not None:
        fields['gender'] = 'female'

    # Looking for passport series
    series = re.search(r'(\d{2} \d{2})', number)
    if series is not None:
        fields['series'] = series.group(0)

    # Looking for passport number
    num = re.search(r'(\d{6})', number)
    if num is not None:
        fields['number'] = num.group(0)

    return {
        'ocr_result': fields,
        'text': raw,
        'FIO': name,
    }

In [365]:
# Turn the OCR strings of the sample page into the structured passport dictionary.
passport = parse_passport(authority, name, birth_place, raw, number)

In [374]:
# OCR and parse every preprocessed page, writing one text file per page.
os.makedirs('text', exist_ok=True)  # open() below does not create the directory

for i, img in enumerate(os.listdir('output1')):

    if img.endswith('.png'):
        image = cv2.imread('output1/{}'.format(img))
        (authority, name, birth_place, raw, number) = read_passport(image)
        passport = parse_passport(authority, name, birth_place, raw, number)

        # The record holds Cyrillic text; do not rely on the locale's default encoding.
        with open('text/{}.txt'.format(i), 'w', encoding='utf-8') as f:
            f.write(str(passport))
        print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
