<a href="https://colab.research.google.com/github/yasusuzuki/ReceiptOCRScanner/blob/main/ReceiptOCRScanner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 環境セットアップ

In [None]:
import platform

# Show which Python interpreter version this runtime is using.
print(f"python {platform.python_version()}")

In [None]:
!pip install --upgrade google-cloud-vision
# Library used to decode the *.HEIC images uploaded from an iPhone
# (this notebook re-encodes them as JPEG before running OCR).
!pip install pyheif



### Google Driveをマウントする

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the image folder and the credential
# file used by the later cells are addressed through this mount point.
drive.mount('/content/gdrive')

In [None]:
from google.colab import auth
from oauth2client.client import GoogleCredentials
import gspread

# Authenticate the current Colab user, then build the gspread client `gc`
# from the application-default credentials. `gc` is used later to open the
# target spreadsheet.
# NOTE(review): oauth2client is deprecated and newer gspread releases appear
# to expect google-auth credentials -- confirm against the installed version.
auth.authenticate_user()
gc = gspread.authorize(GoogleCredentials.get_application_default())

### 画像ファイルを読み込む

In [None]:
def heic_convert_to(heic_filename, output_filename,q=90):
  """Decode a HEIC/HEIF image with pyheif and save it as a JPEG file.

  Args:
    heic_filename: Path of the HEIC file to read.
    output_filename: Path the image is written to. The data is always
      JPEG-encoded, whatever extension the path carries.
    q: JPEG quality handed to Pillow's encoder. It was previously accepted
      but never used, so every file was written at Pillow's default quality.
  """
  import pyheif
  from PIL import Image
  heif = pyheif.read(heic_filename)
  image = Image.frombytes(
              heif.mode,
              heif.size,
              heif.data,
              "raw",
              heif.mode,
              heif.stride,
          )
  # JPEG cannot store an alpha channel; Pillow refuses to write e.g. RGBA
  # as JPEG, so normalise to RGB before saving.
  if image.mode != "RGB":
    image = image.convert("RGB")
  image.save(output_filename, format="jpeg", quality=q)

import glob
import os
import os.path
import shutil


IMG_FOLDER_PATH = '/content/gdrive/My Drive/programming/vision_api/image/'
!ls '/content/gdrive/My Drive/programming/vision_api/image'

orig_files = glob.glob(IMG_FOLDER_PATH + 'orig/' + '*.HEIC')
orig_files.sort()
print('HEIC files:',orig_files)
for f in orig_files:
    dest_file_name = os.path.splitext(os.path.basename(f))[0] + '.jpeg'
    heic_convert_to(f, IMG_FOLDER_PATH+'in_progress/'+dest_file_name )
    shutil.move(f, IMG_FOLDER_PATH + 'orig/trush/') 

orig_files = glob.glob(IMG_FOLDER_PATH + 'orig/' + '*.jpeg')
orig_files.sort()
print('jpeg files:',orig_files)
for f in orig_files:
    shutil.move(f, IMG_FOLDER_PATH + 'in_progress/') 
    

In [None]:
import os
import os.path
import errno

from googleapiclient import discovery
from oauth2client.client import GoogleCredentials

cred_path = '/content/gdrive/My Drive/programming/vision_api/credentials.json'

# Stop right here when the credential file is missing instead of failing
# later inside the API client.
if not os.path.exists(cred_path):
  raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), cred_path)

print(f'Found Credential file:{cred_path}')
# Expose the credential file path through the process environment.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_path

In [None]:
import cv2
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
# The name of the image file to annotate
input_files = glob.glob(IMG_FOLDER_PATH + 'in_progress/' + '*')

for f in input_files:
    img = cv2.imread(f) # input_fileは画像のパス
    plt.figure(figsize=[10,10])
    plt.axis('off')
    plt.imshow(img[:,:,::-1])

In [None]:
import io
from google.cloud import vision
# Instantiates a client
client = vision.ImageAnnotatorClient()

# Loads the image into memory
with io.open(input_files[0], 'rb') as image_file:
    content = image_file.read()

image = vision.Image(content=content)

# Text recognition (OCR). The same client also offers other detectors such
# as client.label_detection(image=image) and client.face_detection(image=image).
response = client.document_text_detection(image=image)

# A per-image failure is reported inside the response, so surface it here
# rather than continuing with an empty result.
if response.error.message:
    raise RuntimeError('Vision API error: ' + response.error.message)

texts = response.text_annotations

# Print the description of the first annotation (per the Vision API docs it
# carries the whole detected text). Guard against images without any text,
# where indexing [0] would raise IndexError.
if texts:
    print(texts[0].description)
else:
    print('No text detected in ' + input_files[0])



### テキスト抽出　－　正規表現などで特定の情報を抽出する

In [None]:
def get_sorted_lines(response, threshold=20):
    """Group the OCR symbols of a text-detection response into text lines.

    Every symbol found under ``response.full_text_annotation`` is reduced to
    ``[x, y, text, bounding_box]``, where x/y come from the first vertex of
    the symbol's bounding box. The symbols are sorted by y; a symbol whose y
    is within ``threshold`` of the previous symbol's y stays on the current
    line, otherwise a new line is started. Each line is sorted by x.

    Args:
        response: Object exposing ``full_text_annotation.pages`` with the
            nested blocks / paragraphs / words / symbols hierarchy.
        threshold: Largest y difference between consecutive symbols that is
            still treated as the same line. Tune this to adjust how
            aggressively symbols are merged into one line.

    Returns:
        A list of lines ordered top to bottom. Each line is a list of
        ``[x, y, text, bounding_box]`` entries ordered left to right. The
        list is empty when the response contains no symbols.
    """
    symbols = []
    for page in response.full_text_annotation.pages:
        for block in page.blocks:
            for paragraph in block.paragraphs:
                for word in paragraph.words:
                    for symbol in word.symbols:
                        origin = symbol.bounding_box.vertices[0]
                        symbols.append(
                            [origin.x, origin.y, symbol.text, symbol.bounding_box]
                        )
    symbols.sort(key=lambda entry: entry[1])

    lines = []
    line = []
    prev_y = None  # None (not -1) so that any coordinate value is a valid anchor
    for entry in symbols:
        y = entry[1]
        if prev_y is not None and abs(y - prev_y) > threshold:
            # Too far below the previous symbol: close the current line.
            line.sort(key=lambda item: item[0])
            lines.append(line)
            line = []
        line.append(entry)
        # Always anchor on the symbol just placed. Resetting the anchor after
        # a line break made the next symbol join the new line no matter how
        # far away it was.
        prev_y = y
    if line:
        line.sort(key=lambda item: item[0])
        lines.append(line)
    return lines

img = cv2.imread(input_files[0], cv2.IMREAD_COLOR)
lines = get_sorted_lines(response)
for line in lines:
  texts = ''.join(i[2] for i in line)
  print(texts)
  if not line:
    continue
  # One box per text line, spanned by the first and the last symbol's box.
  # It is drawn once per line; it used to be redrawn once per symbol.
  first_box = line[0][3]
  last_box = line[-1][3]
  corners = [
      (first_box.vertices[0].x, first_box.vertices[0].y),  # top left
      (last_box.vertices[1].x, last_box.vertices[1].y),    # top right
      (last_box.vertices[2].x, last_box.vertices[2].y),    # bottom right
      (first_box.vertices[3].x, first_box.vertices[3].y),  # bottom left
  ]
  # Connect each corner to the next one and the last back to the first.
  for start, end in zip(corners, corners[1:] + corners[:1]):
    cv2.line(img, start, end, (0, 255, 0), thickness=3, lineType=cv2.LINE_AA)



plt.figure(figsize=[10,10])
plt.axis('off')
plt.imshow(img[:,:,::-1])  # reverse the channel axis (BGR -> RGB) for matplotlib
plt.title("img_by_line")

In [None]:
import re

def get_matched_string(pattern, string):
    """Return the first substring of ``string`` matching ``pattern``.

    The whole match (group 0) is returned; ``False`` is returned when the
    pattern does not occur anywhere in ``string``.
    """
    found = re.search(pattern, string)
    return found.group() if found else False

# Regular expressions used to pull individual fields out of each OCR line.
pattern_dict = {}
# The two-digit day alternatives come first so that e.g. "31" is not matched
# as the single digit "3".
pattern_dict['date'] = r'[12]\d{3}[/\-年](0?[1-9]|1[0-2])[/\-月](3[01]|[12][0-9]|0?[1-9])日?'
pattern_dict['time'] = r'((0?|1)[0-9]|2[0-3])[:時][0-5][0-9]分?'
# Raw string: '\d' inside a normal string literal is an invalid escape sequence.
pattern_dict['tel'] = r'0\d{1,3}-\d{1,4}-\d{4}'
# Receipts that print the total as '合計' instead of '現計' need the label
# in this pattern changed accordingly.
pattern_dict['total_price'] = r'現計¥(0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)$'
# The yen sign in the OCR text is apparently not an ordinary backslash.
pattern_dict['item'] = r'\w+¥[0-9]+$'

# Start from empty values so that a field missing on the receipt neither
# raises NameError below nor keeps a value left over from a previous run.
date = time = tel = total_price = ''
items = []
for line in lines:
  texts = ''.join(i[2] for i in line)
  for key, pattern in pattern_dict.items():
    matched_string = get_matched_string(pattern, texts)
    if not matched_string:
      continue
    if key == 'time' and '営業時間' in texts:
      # A time on a "business hours" line is not the purchase time.
      continue
    if key == 'item':
      # Lines containing "計" (totals) or "釣り" (change) are not purchased items.
      if '計' in matched_string or '釣り' in matched_string:
        continue
      tokens = matched_string.split('¥')
      items.append([tokens[0],tokens[1]])
    elif key == 'tel':
      tel = matched_string
    elif key == 'total_price':
      total_price = matched_string
    elif key == 'date':
      date = matched_string
    elif key == 'time':
      time = matched_string
    else:
      print("Error",key, matched_string)

print(date,time,tel,items)

In [None]:
ss_id = "1w8fuyK4akK7j8zO7pNfXhPW35sA1UZCBllG8BXHNCf8"
workbook = gc.open_by_key(ss_id)
worksheet = workbook.worksheet("抽出データ")
print(items)

# First row for this receipt: source file name, date and time joined
# without a separator, and the telephone number.
receipt_row = [os.path.basename(input_files[0]), date+time, tel]
worksheet.append_row(receipt_row)

# One further row per purchased item, filled in from the fourth column on.
for item_name, item_price in items:
  worksheet.append_row(['', '', '', item_name, item_price])