# 신장의무기록사본 OCR 서비스 구현

## 1. 훈련용 데이터 만들기

### 1-1. 라이브러리 로드

In [91]:
import cv2
import numpy as np
import pandas as pd
from glob import glob
import json
import math
import os
os.chdir('D:/kidney/source')

import tensorflow as tf
import efficientnet.tfkeras
from tensorflow.keras.models import load_model

from tensorflow.keras.callbacks import ModelCheckpoint

### 1-2. 함수

* 1-2-1. 이미지 크기 통일 (함수)

In [92]:
def border_make(image):
    row, col = image.shape[:2]
    bottom = image[row-2:row, 0:col]
    mean = cv2.mean(bottom)[0]
    
    col_bordersize = (55-col)/2
    row_bordersize = (55-row)/2
    
    border = cv2.copyMakeBorder(
        image,
        top = math.ceil(row_bordersize),
        bottom = math.floor(row_bordersize),
        left = math.ceil(col_bordersize),
        right = math.floor(col_bordersize),
        borderType = cv2.BORDER_ISOLATED, #BORDER_ISOLATED  BORDER_CONSTANT
        value = [mean, mean, mean]
    )
    return border

* 1-2-2. 훈련용 이미지 reshape  (함수)

In [93]:
def MakeX_train(json_data, image): #json_data: 라벨미해서 나온 데이터, image: 라벨미에 사용했던 이미지
    #image = cv2.imread('../image/SCAN_01.jpg', cv2.IMREAD_COLOR) #위에서 정의했음.
    img_lst =[]
    for i in range(len(json_data['shapes'])):
        # 첫 좌표
        x1 = int(json_data['shapes'][i]['points'][0][0]) #0,0 첫좌표의 행값
        y1 = int(json_data['shapes'][i]['points'][0][1]) #0,0 첫좌표의 열값
        # 마지막 좌표
        x2 = int(json_data['shapes'][i]['points'][1][0])
        y2 = int(json_data['shapes'][i]['points'][1][1])

        #크롭 이미지
        if x1 > x2:
            if y1 > y2:
                cropped_image = image[y2: y1, x2: x1].copy()    
            else:
                cropped_image = image[y1: y2, x2: x1].copy()    
        else:
            if y1 > y2:
                cropped_image = image[y2: y1, x1: x2].copy()    
            else:
                cropped_image = image[y1: y2, x1: x2].copy()    

        #이미지 사이즈 통일 (55*55) 
        img = border_make(cropped_image) # 함수 사용
        # 이미지 그레이스케일 적용
        dst = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # 이미지 트레시 홀드 적용
        ret ,img = cv2.threshold(dst,127,255,0)

        #SCAN_01.PNG의 모든 글자를 4차원의 형태로 만들기
        img_lst.append(img)
        X_train = np.array(img_lst).reshape(-1,55,55,1)    # (971, 55, 55, 1)
    return X_train


* 1-2-3. Y_train의 컬럼명(=label) 추출 (함수)

In [94]:
def MakeY_train(json_data): # json_data: 라벨미해서 나온 데이터
    # Json_shapes = json_dict['shapes']
    Y_train = []
    for i in range(len(json_data['shapes'])):
        label = json_data['shapes'][i]['label']
        Y_train.append(label)
    return Y_train

### 1-3. 데이터 셋 만들기

* 1-3-1. 이미지, labelme_json 로드


In [95]:
glob_image = glob('../image/SCAN*.jpg')
glob_json = glob('../image/*.json')

* 1-3-2. 함수 돌리기 전 리스트 만들기


In [96]:
json_lst = []
for i in range(len(glob_json)):
    with open(glob_json[i], "r", encoding='UTF8') as scan_json:
        json_data = json.load(scan_json)
        json_lst.append(json_data)

In [97]:
image_lst =[]
for i in range(len(glob_image)):
    image = cv2.imread('../image/SCAN_01.jpg')
    image_lst.append(image)

### 1-4. 함수 적용

In [98]:
Y_train = []
X_train = []
for image, json_data in zip(image_lst, json_lst):
    x_train = MakeX_train(json_data, image)
    y_train = MakeY_train(json_data)
    #with open("../image/newfile.json", "w") as new_file:
     #   json.dump(json_lst, new_file)
    X_train.append(x_train)
    Y_train += y_train
X_train = np.concatenate((X_train))

TypeError: list indices must be integers or slices, not str

In [None]:
X_train

In [None]:
Y_train

np.save('../data/X_train', img_array, allow_pickle=True, fix_imports=True)

## 2. CNN OCR 모델 만들기

### 2-1. 독립변수, 종속변수 정의

In [None]:
X_train 
Y_train = pd.get_dummies(Y_train)
print(X_train.shape, Y_train.shape)

In [None]:
X_train

In [None]:
Y_train

### 2-1-1. 예측을 위해 데이터 저장

In [None]:
df = pd.DataFrame(Y_train.columns)

In [None]:
df

df.to_csv('../data/Y_train_unique.csv', index=False, encoding='utf-8-sig')

### 2-2. CNN LeNet-5 모델 만들기

In [99]:
X = tf.keras.layers.Input(shape=[55, 55, 1])
H = tf.keras.layers.Conv2D(3, kernel_size = 5, activation = 'swish')(X)
H = tf.keras.layers.MaxPool2D()(H)
H = tf.keras.layers.Conv2D(6, kernel_size = 5, activation = 'swish')(H)
H = tf.keras.layers.MaxPool2D()(H)
H = tf.keras.layers.Flatten()(H)
H = tf.keras.layers.Dense(84, activation = 'swish')(H)
Y = tf.keras.layers.Dense(148, activation = 'softmax')(H)

model = tf.keras.models.Model(X, Y)
model.compile(loss='categorical_crossentropy', metrics=['accuracy'])

### 2-3. 모델 FIT

In [100]:
model.fit(X_train, Y_train, epochs=40) #, steps_per_epoch = 20)

ValueError: When using data tensors as input to a model, you should specify the `steps_per_epoch` argument.

### 모델 저장

model.save('../model/OCR_ver1.0.h5')

### 2-4. 모델 요약

In [14]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 55, 55, 1)]       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 51, 51, 3)         78        
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 25, 25, 3)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 21, 21, 6)         456       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 10, 10, 6)         0         
_________________________________________________________________
flatten (Flatten)            (None, 600)               0         
_________________________________________________________________
dense (Dense)                (None, 84)                50484 