# 신장의무기록사본 OCR 서비스 구현

# 1. 라이브러리 로드

In [2]:
from glob import glob
import pandas as pd
import cv2
import numpy as np
import math

from tensorflow.keras.models import load_model
import tensorflow as tf
import json

import os
os.chdir('D:/kidney/source')

In [3]:
import efficientnet.tfkeras # for swish

# 2. 모델 로드

In [4]:
# Load the trained OCR classifier from its HDF5 file; the path is relative to
# the working directory set with os.chdir earlier in the notebook.
# NOTE(review): the earlier `import efficientnet.tfkeras` is tagged "for swish",
# so presumably it must run before this call for deserialisation to succeed — confirm.
model = load_model('../model/OCR_ver1.0.h5')

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


# 3. 예측 준비 작업

In [5]:
def border_make(image, size=55):
    """Pad ``image`` to a ``size`` x ``size`` canvas, keeping it centred.

    The fill colour is the mean of the first channel over the bottom two
    pixel rows of ``image``; that single value is used for every channel.
    When the required padding is odd, the extra pixel goes to the top/left.

    Args:
        image: Image array as returned by ``cv2.imread``
            (rows x cols, optionally x channels).
        size: Edge length of the square output in pixels. Defaults to 55.

    Returns:
        The padded image produced by ``cv2.copyMakeBorder``.

    Raises:
        ValueError: If ``image`` is ``None`` or is larger than ``size`` in
            either dimension (the padding would be negative).
    """
    if image is None:
        raise ValueError('border_make() received None instead of an image')

    row, col = image.shape[:2]
    if row > size or col > size:
        raise ValueError(
            'image of {}x{} pixels does not fit in a {}x{} canvas'.format(
                row, col, size, size
            )
        )

    # Sample the background colour from the last two pixel rows.
    bottom_rows = image[row - 2:row, 0:col]
    mean = cv2.mean(bottom_rows)[0]

    # Integer split of the padding: ceil half on top/left, floor half on bottom/right.
    pad_rows = size - row
    pad_cols = size - col

    border = cv2.copyMakeBorder(
        image,
        top=(pad_rows + 1) // 2,
        bottom=pad_rows // 2,
        left=(pad_cols + 1) // 2,
        right=pad_cols // 2,
        # BORDER_ISOLATED is only a modifier flag; filling with `value` is the
        # BORDER_CONSTANT behaviour, so request it explicitly.
        borderType=cv2.BORDER_CONSTANT,
        value=[mean, mean, mean],
    )
    return border

In [6]:
# Read the labelme annotation file for the scanned page.
with open('../image/SCAN_01.json', "r", encoding='UTF8') as scan_json:
    json_dict = json.load(scan_json)

# Collect the label of every annotated shape, in file order.
data = json_dict['shapes']
Y_train_label = [shape['label'] for shape in data]

# One-hot encode the labels: one row per shape, one column per distinct label.
Y_train = pd.get_dummies(Y_train_label)

In [7]:
# Ground-truth labels (actual values) for the prediction images.
# They are compared position-by-position with the predicted labels later on,
# so the order must match the order in which the images are loaded.
Y_test = ['.','1','1','8','1','0','2','3','/','희','찬','0','자','출','명','증','본','의','력','다','합','을','음','없','림','틀','과','의','록','무','은','니','본','사','한','부','첨','~','~','~','~']

# 4. 예측 데이터 불러오기

In [8]:
# glob() makes no guarantee about the order of its results, but Y_test is
# matched to the images by position; sort so the file order is deterministic.
predict_lst = sorted(glob('D:/kidney/image/forPredict/*.png'))

In [9]:
# Load every prediction image, pad it to 55x55 and binarise it.
if not predict_lst:
    raise FileNotFoundError('no prediction images were found (predict_lst is empty)')

img_lst = []
for image_path in predict_lst:
    image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if image is None:
        # cv2.imread reports an unreadable or missing file by returning None.
        raise FileNotFoundError('could not read image: {}'.format(image_path))
    img = border_make(image)
    dst = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Fixed threshold at 127: every pixel becomes either 0 or 255.
    _, img = cv2.threshold(dst, 127, 255, cv2.THRESH_BINARY)
    img_lst.append(img)

# Build the array once after the loop instead of on every iteration.
img_array = np.array(img_lst)
# Add the trailing channel axis: (n_images, 55, 55, 1).
img_array_reshape = img_array.reshape(-1, 55, 55, 1)
X_test = img_array_reshape
X_test.shape

(41, 55, 55, 1)

# 5. 예측

In [10]:
### Show the predicted probabilities as a DataFrame

pred = model.predict(X_test) # one row of class scores per image (41 rows in this run)
pred_df = pd.DataFrame(pred).round(2) # rounded to 2 decimals for display; (41, 122) in this run
pred_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,112,113,114,115,116,117,118,119,120,121
0,0.0,0.0,0.0,0.0,0.03,0.09,0.04,0.01,0.04,0.0,...,0.01,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0
1,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.07,0.29,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.96,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 5-1. 컬럼명 변경

In [11]:
### Find the column (class index) holding the highest probability for each image.
# Take the argmax on the raw model output rather than on pred_df: pred_df is
# rounded to 2 decimals, so distinct probabilities can collapse into a tie and
# the first tied column would win regardless of the true maximum.
pred_lst = np.argmax(pred, axis=1).tolist() # one class index per image

In [24]:
####### Replace column numbers with label values
# NOTE(review): presumably this CSV holds the class labels in the model's output order — verify.
Y_train_columns = pd.read_csv('../data/Y_train_columns.csv', encoding= 'utf-8-sig')  # 122 entries in this run
Y_train_columns = Y_train_columns.loc[:, '0']  # keep only the column named '0', as a Series

In [25]:
Y_train_columns

0      %
1      (
2      )
3      ,
4      -
      ..
117    한
118    합
119    호
120    화
121    희
Name: 0, Length: 122, dtype: object

In [14]:
### Translate each predicted class index into its label.

# One label per image, looked up by position in Y_train_columns.
predict_value_lst = [Y_train_columns[class_idx] for class_idx in pred_lst]

In [15]:
### Put predictions and ground truth side by side in one DataFrame.

result_df = pd.DataFrame(
    {'predict': predict_value_lst, 'actual': Y_test},
    columns=['predict', 'actual'],
)

In [16]:
# Class labels in sorted order (iterating the one-hot frame yields its column names).
Y_train = sorted(Y_train)

# One-hot encode the predicted class of every image. Rounding the
# probabilities instead would leave a row all-zero whenever its top
# probability is below 0.5, and that image would vanish from the table.
pred_one_hot = np.zeros_like(pred)
pred_one_hot[np.arange(len(pred)), np.argmax(pred, axis=1)] = 1.0

# NOTE(review): assumes the model's output columns follow the sorted label order — confirm.
pred_df = pd.DataFrame(data=pred_one_hot, index=Y_test, columns=Y_train).reset_index()

pred_df.head() # an 'index' column (actual label) plus one column per class

Unnamed: 0,index,%,(,),",",-,.,/,0,1,...,치,태,틀,페,학,한,합,호,화,희
0,.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Sum the rows per actual label so that every label occupies a single row,
# then move the label back into a regular 'index' column.
pivot_df = pd.pivot_table(pred_df, index=['index'], aggfunc=sum).reset_index() # (33, 123) in this run

# 5-2. 오차행렬을 구하기 위해 정방향 테이블로 수정하기

In [18]:
### Labels from the full list that are absent from the prediction data must also become rows (index)

Y_train # full label list (122 entries in this run)
predict = list(set(predict_value_lst)) # distinct predicted labels (set() drops duplicates)

In [19]:
### Full label list minus the labels that already have a row in pivot_df

# Materialise the existing row labels once as a set instead of rebuilding a
# list for every membership test.
existing_labels = set(pivot_df['index'])
a_sub_b = [x for x in Y_train if x not in existing_labels]

In [20]:
Y_train

['%',
 '(',
 ')',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 'A',
 'B',
 'C',
 'D',
 'DH',
 'E',
 'F',
 'G',
 'H',
 'I',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 '[',
 ']',
 '^',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'i',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 't',
 'u',
 'x',
 '~',
 'μl',
 '검',
 '결',
 '계',
 '고',
 '과',
 '구',
 '기',
 '남',
 '니',
 '다',
 '단',
 '력',
 '록',
 '뢰',
 '림',
 '명',
 '무',
 '민',
 '반',
 '번',
 '보',
 '본',
 '부',
 '사',
 '산',
 '상',
 '수',
 '안',
 '없',
 '오',
 '원',
 '위',
 '은',
 '을',
 '음',
 '의',
 '이',
 '일',
 '자',
 '전',
 '접',
 '정',
 '증',
 '지',
 '찬',
 '참',
 '첨',
 '체',
 '출',
 '치',
 '태',
 '틀',
 '페',
 '학',
 '한',
 '합',
 '호',
 '화',
 '희']

In [21]:
### Build empty (all-NaN) rows for the labels listed in a_sub_b

add_df = pd.DataFrame(data =np.nan, index = a_sub_b, columns=Y_train).reset_index() # reset_index moves the labels into an 'index' column
add_df

Unnamed: 0,index,%,(,),",",-,.,/,0,1,...,치,태,틀,페,학,한,합,호,화,희
0,%,,,,,,,,,,...,,,,,,,,,,
1,(,,,,,,,,,,...,,,,,,,,,,
2,),,,,,,,,,,...,,,,,,,,,,
3,",",,,,,,,,,,...,,,,,,,,,,
4,-,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,태,,,,,,,,,,...,,,,,,,,,,
85,페,,,,,,,,,,...,,,,,,,,,,
86,학,,,,,,,,,,...,,,,,,,,,,
87,호,,,,,,,,,,...,,,,,,,,,,


# 5-3. 예측 테이블과 정방향용 테이블 합치기

In [22]:
### Append the empty rows to the pivoted frame so every label has a row

# Stack both frames, index by label, turn the NaN placeholders into 0
# and order the rows by label.
concat_df = (
    pd.concat([pivot_df, add_df])
    .set_index('index', drop=True)
    .fillna(0)
    .sort_index()
)
print(concat_df.shape)
concat_df.tail()

#concat_df.to_csv('../data/confusion_table.csv',  encoding = 'utf-8-sig') # index =False,

(122, 122)


Unnamed: 0_level_0,%,(,),",",-,.,/,0,1,2,...,치,태,틀,페,학,한,합,호,화,희
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
한,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
합,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
호,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
화,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
희,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 6. 모델 평가

In [23]:
# Single-label, multi-class evaluation: each image has exactly one true
# character (Y_test) and one predicted character (predict_value_lst).

### Accuracy, precision, recall and F1 under several averaging modes

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)

acc = accuracy_score(Y_test, predict_value_lst)

# average=None yields one value per class; the other modes reduce to a scalar.
precision_none = precision_score(Y_test, predict_value_lst, average=None)
precision_micro = precision_score(Y_test, predict_value_lst, average='micro')
precision_macro = precision_score(Y_test, predict_value_lst, average='macro')
precision_weighted = precision_score(Y_test, predict_value_lst, average='weighted')

recall_micro = recall_score(Y_test, predict_value_lst, average='micro')
recall_macro = recall_score(Y_test, predict_value_lst, average='macro')
recall_weighted = recall_score(Y_test, predict_value_lst, average='weighted')

f1_weighted = f1_score(Y_test, predict_value_lst, average="weighted")

print('accuracy: ', acc)

print('precision_none: ', precision_none)
print('precision_micro: ', precision_micro)
print('precision_macro: ', precision_macro)
print('precision_weighted: ', precision_weighted)

print('recall_micro: ', recall_micro)
print('recall_macro: ', recall_macro)
print('recall_weighted: ', recall_weighted)

print('f1_weighted: ', f1_weighted)

accurracy:  0.24390243902439024
precision_none:  [1.         0.5        1.         0.42857143 0.         0.5
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         1.
 0.         0.         0.         0.         0.5        0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.        ]
precision_micro:  0.24390243902439024
precision_macro:  0.10714285714285715
precision_weighted:  0.17770034843205573
recall_micro:  0.24390243902439024
recall_macro:  0.14130434782608695
recall_weighted:  0.24390243902439024
f1_weighted:  0.1983739837398374


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


# 7. ROC 커브, AUC 구하기

# 8. need to update labelme