# 신장의무기록사본 OCR 서비스 구현

# 1. 라이브러리 로드

In [27]:
from glob import glob
import pandas as pd
import cv2
import numpy as np
import math

from tensorflow.keras.models import load_model
import efficientnet.tfkeras # for swish
import tensorflow as tf
import json

import os
os.chdir('D:/kidney/source')

# 2. 모델 로드

In [28]:
model = load_model('../model/OCR_ver1.1.h5')

# 3. 예측 준비 작업

### 3-1. 함수

In [29]:
def border_make(image):
    row, col = image.shape[:2]
    bottom = image[row-2:row, 0:col]
    mean = cv2.mean(bottom)[0]
    
    col_bordersize = (55-col)/2
    row_bordersize = (55-row)/2
    
    border = cv2.copyMakeBorder(
        image,
        top = math.ceil(row_bordersize),
        bottom = math.floor(row_bordersize),
        left = math.ceil(col_bordersize),
        right = math.floor(col_bordersize),
        borderType = cv2.BORDER_ISOLATED, #BORDER_ISOLATED  BORDER_CONSTANT
        value = [mean, mean, mean]
    )
    return border

### 3-2. Y_test

In [30]:
# 예측 데이터 레이블 값 #Y_test # actual_value_lst
Y_test = pd.read_csv('../data/Y_test.csv', encoding='utf-8-sig') #Y_test = ['.','1','1','8','1','0','2','3','/','희','찬','0','자','출','명','증','본','의','력','다','합','을','음','없','림','틀','과','의','록','무','은','니','본','사','한','부','첨','~','~','~','~']
Y_test = list(Y_test['col_names'])

pd.DataFrame(data = Y_test, columns=['col_names']).to_csv('../data/Y_test.csv', index = False, encoding='utf-8-sig')

### 3-3. Y_train_unique

In [31]:
Y_train_unique = pd.read_csv('../data/Y_train_unique.csv', encoding= 'utf-8-sig').loc[:, '0']  # list # 122        # Y_train_columns = Y_train_columns.loc[:, '0']         # Y_train_columns > Y_train_unique 

# 4. 예측 데이터 불러오기

In [32]:
predict_lst = glob('D:/kidney/image/forPredict/*.png') 

In [33]:
img_lst = []
for predict_lst_num in range(len(predict_lst)):
    image = cv2.imread(predict_lst[predict_lst_num], cv2.IMREAD_COLOR)
    img = border_make(image)
    dst = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    ret ,img = cv2.threshold(dst,127,255,0)
    img_lst.append(img)
    img_array = np.array(img_lst)
img_array_reshape = img_array.reshape(-1,55,55,1)
X_test = img_array_reshape
X_test.shape

(41, 55, 55, 1)

# 5. 예측

### 5-1. 예측 확률을 데이터프레임 형태로 출력

In [34]:
pred = model.predict(X_test) # 41 
#pred_df = pd.DataFrame(pred).round(2) #(41, 122)
pred_df = pd.DataFrame(pred)#.round(2) #(41, 122)
pred_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,138,139,140,141,142,143,144,145,146,147
0,0.0,2.702334e-09,9.391016e-21,6.483938000000001e-27,1.043956e-22,6.326679e-07,0.999999,1.189521e-11,7.682648e-11,2.571087e-12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.286007e-12,0.0
1,0.0,0.1017439,2.6225550000000003e-17,5.9697499999999996e-24,6.607743e-14,3.325519e-09,2.932596e-12,9.077468e-09,9.931381e-10,0.2500152,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.658682e-15,0.0
2,0.0,9.816627e-06,9.470696000000001e-17,1.3428620000000002e-31,1.9403759999999998e-19,9.802898e-05,6.265336e-10,1.155096e-09,1.236224e-11,0.03365525,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.7198460000000003e-17,0.0
3,0.0,4.36715e-11,7.850346e-14,1.6793939999999998e-20,3.982215e-16,9.868211999999998e-19,5.640015e-17,1.140877e-13,0.9749762,1.984601e-11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.616191e-18,0.0
4,0.0,7.139599e-05,1.699484e-17,1.386335e-29,4.6323120000000005e-17,4.315253e-10,2.63647e-12,3.837961e-11,1.410989e-07,0.02181116,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.467525e-16,0.0


### 5-1-1. 실제와 예측 값 간단하게 비교

In [35]:
### 최고 확률 컬럼 알아내기

pred_lst = [] #list # 41
for row in range(len(pred_df)):
    #pred_col_num = pred_df.loc[:,pred_df.loc[row] == round(pred_df.iloc[row],2).max()].columns[0]
    pred_col_num = pred_df.loc[:,pred_df.loc[row] == pred_df.iloc[row].max()].columns[0]
    pred_lst.append(pred_col_num)

pd.DataFrame(data= pred_df.iloc[0,:]).to_csv('../data/testtesttest.csv')

In [36]:
### 가장 높게 예측된 확률을 label로 바꾸기

predict_lst = [] # list # 41
for i, column_num in enumerate(pred_lst):
    predict_lst.append(Y_train_unique[column_num])

In [37]:
### 최고 확률 컬럼 
probability_lst = []
for row in range(len(pred_df)):
    percentage = round(pred_df.iloc[row,:].max()*100,3)
    probability_lst.append(percentage)

### EDA 추가 코드

In [38]:
### 데이터 프레임 형태로 비교해 보기

result_df = pd.DataFrame(columns=['predict', 'actual', 'probability'])

result_df.actual = Y_test
result_df.predict = predict_lst
result_df.probability = probability_lst
result_df# 숫자는 나름 잘 맞추는거 같은데 한글은 의무기록지 한장으로는 통 못 맞춤

Unnamed: 0,predict,actual,probability
0,.,.,100.0
1,:,1,64.824
2,:,1,96.59
3,0,8,97.498
4,:,1,97.812
5,0,0,99.562
6,:,2,95.591
7,3,3,99.611
8,7,/,99.95
9,검,희,47.567


# 7. 모델 평가

In [39]:
# 다중 레이블 다중 분류

### TP, TN, FP, FN  //  accuracy, precision, recall, F1

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

acc = accuracy_score(Y_test, predict_lst)

precision_none = precision_score(Y_test, predict_lst, average=None)
precision_micro = precision_score(Y_test, predict_lst, average='micro')
precision_macro = precision_score(Y_test, predict_lst, average='macro') # 'micro', 'macro', 'weighted'
precision_weighted = precision_score(Y_test, predict_lst, average='weighted') # 'micro', 'macro', 'weighted'

recall_micro = recall_score(Y_test, predict_lst, average='micro')
recall_macro = recall_score(Y_test, predict_lst, average='macro') # 'micro', 'macro', 'weighted'
recall_weighted = recall_score(Y_test, predict_lst, average='weighted') # 'micro', 'macro', 'weighted'

f1_weighted = f1_score(Y_test, predict_lst, labels=None, average="weighted")

print('accurracy: ', acc)

print('precision_none: ', precision_none)
print('precision_micro: ', precision_micro)
print('precision_macro: ', precision_macro)
print('precision_weighted: ', precision_weighted)

print('recall_micro: ', recall_micro)
print('recall_macro: ', recall_macro)
print('recall_weighted: ', recall_weighted)

print('f1_weighted: ', f1_weighted)

accurracy:  0.12195121951219512
precision_none:  [0.         0.         1.         0.         0.33333333 0.
 0.         1.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         1.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 1.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.        ]
precision_micro:  0.12195121951219512
precision_macro:  0.0849673202614379
precision_weighted:  0.1138211382113821
recall_micro:  0.12195121951219512
recall_macro:  0.08823529411764706
recall_weighted:  0.12195121951219512
f1_weighted:  0.11707317073170731


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


# 8. roc커브, auc 구하기

In [40]:
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
from sklearn.metrics import roc_auc_score

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes

In [41]:
print(pred.shape)
print(type(pred))

(41, 148)
<class 'numpy.ndarray'>


실제값을 타이핑 해뒀는데 그게 아니라 어레이로 만들어져야 할듯

어떻게? 잘해봐 왕수

In [42]:
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

NameError: name 'n_classes' is not defined

In [43]:
plt.figure()
lw = 2
plt.plot(fpr[2], tpr[2], color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

KeyError: 2

<Figure size 432x288 with 0 Axes>

# 9. need to update labelme