<a href="https://colab.research.google.com/github/tsoonjin/lazybone/blob/master/PathOCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Google Drive Setup

In [None]:
# Mount Google Drive so the notebook can read and write the project folder.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Download and install the libssl1.1 package from the Ubuntu archive.
# NOTE(review): presumably required by one of the OCR libraries installed in a later cell — confirm.
!wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2.19_amd64.deb
!sudo dpkg -i libssl1.1_1.1.1f-1ubuntu2.19_amd64.deb
# Relative path: this only resolves when the current directory is the parent of the `drive` mount point.
%cd drive/MyDrive/projects/pathdao_experience/

In [None]:
# Print the GPU status reported by the driver for this runtime.
!nvidia-smi

In [None]:
# MMOCR
# Install the OpenMMLab installer (mim), then use it to install mmengine, mmcv and mmdet.
!pip install -U openmim
!mim install mmengine
!mim install 'mmcv>=2.0.0rc1'
!mim install 'mmdet>=3.0.0rc0'
# Clone MMOCR and install it in editable mode from the checkout.
# NOTE: the %cd leaves the notebook inside the mmocr folder for subsequent cells.
!git clone https://github.com/open-mmlab/mmocr.git
%cd mmocr
!pip install -v -e .

In [None]:
# MMOCR smoke test: run the inferencer on one sample screenshot and display the saved visualisation.
from mmocr.apis import MMOCRInferencer
import mmcv
import matplotlib.pyplot as plt
# Return to the project folder so the relative image/output paths below resolve.
%cd /content/drive/MyDrive/projects/pathdao_experience/
# Inferencer configured with det='dbnetpp' and rec='abinet'.
infer = MMOCRInferencer(det='dbnetpp', rec='abinet')
result = infer('pathdao_raw/0729_ID1/0729_19019_1 - Gloom.jpg', out_dir='mmocr_output/', return_vis=True, save_vis=True)
print(result['predictions'])

# Load the visualisation written under mmocr_output/vis/ and show it (converted from BGR to RGB for matplotlib).
predicted_img = mmcv.imread('mmocr_output/vis/0729_19019_1 - Gloom.jpg')
plt.figure(figsize=(18, 32))
plt.imshow(mmcv.bgr2rgb(predicted_img))
plt.show()

In [None]:
# Install the OCR stack used by the cells below: OpenCV, EasyOCR and PaddleOCR with PaddlePaddle.
!pip install opencv-python
!pip install easyocr
!pip install paddleocr
# NOTE(review): both the CPU wheel (paddlepaddle) and a pinned GPU wheel (paddlepaddle-gpu) are installed;
# presumably only one of them is needed — confirm which one should win.
!pip install paddlepaddle
!pip install paddlepaddle-gpu==2.5.1.post120  -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
# Pillow is pinned to 9.5.0.
# NOTE(review): presumably for compatibility with one of the OCR packages — confirm.
!pip install Pillow==9.5.0
# Run PaddlePaddle's built-in installation check.
import paddle
paddle.utils.run_check()

In [None]:
from operator import index
from paddleocr import PaddleOCR,draw_ocr
from PIL import Image
from PIL.ExifTags import TAGS
from dateutil.parser import parse
from datetime import datetime, timedelta
import easyocr
import time
from google.colab.patches import cv2_imshow
import cv2
import pandas as pd
import pytz
import numpy as np
import os
import math
import sys
import re

# 240 threshold with erosion works fine except for one scenario

# Format applied to every extracted date: zero-padded month followed by zero-padded day (e.g. "0729").
DATE_FORMAT = '%m%d'
# Reference timestamp: current Asia/Kuala_Lumpur time shifted back by two days.
# NOTE(review): presumably uploads are processed two days after they arrive — confirm the offset.
now = datetime.now(pytz.timezone('Asia/Kuala_Lumpur')) - timedelta(days=2)
date = now.strftime('%Y-%m-%d')

# Flag handed to PaddleOCR(use_gpu=...) and easyocr.Reader(gpu=...) further down.
use_gpu = False

def is_date(string, relax=False, fuzzy=False, dayfirst=True):
  """
  Try to read the last whitespace-separated token of `string` as a date.

  :param string: str, text to inspect
  :param relax: bool, when False the text must contain a year-like run of
      digits (1xxx, 2xxx or 3000) before any parsing is attempted
  :param fuzzy: bool, forwarded to dateutil's parse
  :param dayfirst: bool, forwarded to dateutil's parse
  :return: the parsed datetime on success, otherwise False
  """
  year_required = not relax
  try:
    if year_required and re.search(r'[12][0-9]{3}|3000', string) is None:
      return False

    tokens = string.split()
    if not tokens:
      return False

    # Only the right-most token is handed to the parser.
    return parse(tokens[-1], fuzzy=fuzzy, dayfirst=dayfirst)
  except (OverflowError, ValueError):
    return False

def show_image(filepath):
  """
  Display the image stored at `filepath` in the notebook output.

  :param filepath: str, path of the image to display
  :raises FileNotFoundError: if OpenCV cannot read the file
  """
  # Only the colour image is ever displayed, so the file is decoded once and
  # no thresholded/eroded intermediate is built.
  img = cv2.imread(filepath, cv2.IMREAD_COLOR)
  if img is None:
    # cv2.imread reports failure by returning None instead of raising.
    raise FileNotFoundError(f"Could not read image: {filepath}")
  cv2_imshow(img)

def extract_truth(filepath):
  """
  Recover the expected values encoded in a screenshot file name.

  Expected layout: "<date>_<voter>_<voter_part> - <player>[ - ...]".
  Extra "_" fields and extra " - " segments are ignored.

  :param filepath: str, file name to decode
  :return: dict with keys original_filepath, date, voter, voter_part and
      player; every value is 'N/A' when the name does not follow the layout
  """
  not_available = {
      "original_filepath": 'N/A',
      "date": 'N/A',
      "voter": 'N/A',
      "voter_part": 'N/A',
      "player": 'N/A'
  }

  # str.split always returns at least one element, so the name needs at least
  # two segments before the truth/player pair can be taken.
  segments = filepath.split(" - ")
  if len(segments) < 2:
    return not_available
  truth, player = segments[0], segments[1]

  fields = truth.split("_")
  if len(fields) < 3:
    return not_available
  date, voter, voter_part = fields[:3]

  return {
    "original_filepath": filepath,
    "date": date,
    "voter": voter,
    "voter_part": voter_part,
    "player": player
  }

def is_buggy(record):
  """Return True when the voter, voter_part or date entry of `record` is still 'N/A'."""
  required = ("voter", "voter_part", "date")
  return any(record[key] == 'N/A' for key in required)


def extract_voter(reader, filepath, show_img=False):
  """
  Look for the voter id in the top fifth of the screenshot.

  :param reader: object whose readtext() yields (bbox, text, score) triples
  :param filepath: str, path of the screenshot
  :param show_img: bool, print every recognised text with its score
  :return: the part after the last "game" (minus its first character) of the
      first text containing both "forge" and "game", or 'N/A'
  """
  image = cv2.imread(filepath, cv2.IMREAD_COLOR)
  height, width, _channels = image.shape
  top_strip = image[0:round(height/5), 0:width]

  for _bbox, raw_text, score in reader.readtext(top_strip, workers=2):
    candidate = raw_text.strip()
    if show_img:
      print("Extract voter", candidate, score)
    if 'forge' not in candidate or 'game' not in candidate:
      continue
    # Drop the single character that follows "game" (the URL separator).
    return candidate.split("game")[-1][1:]
  return 'N/A'

def extract_voter_part(reader, filepath, show_img=False):
  """
  Look for the "#<voter part>" label in the top-left ninth of the screenshot.

  The first channel of the image (blue in OpenCV's default BGR order) is
  binarised at 240 before being handed to the reader.

  :param reader: object whose readtext() returns a list of (bbox, text, score)
  :param filepath: str, path of the screenshot
  :param show_img: bool, print every recognised text with its score
  :return: the text following the first "#" of the highest-scoring detection
      whose remainder is longer than one character, or 'N/A'
  """
  # Decode the file once; the mask has the same height/width as the image.
  bgray = cv2.imread(filepath)[..., 0]
  _, threshed = cv2.threshold(bgray, 240, 255, cv2.THRESH_BINARY)
  # MORPH_RECT is the kernel-shape constant (same numeric value as the
  # morph-op constant MORPH_ERODE used previously, so the kernel is unchanged).
  threshed = cv2.erode(threshed, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1)))

  h, w = threshed.shape[:2]
  detections = reader.readtext(threshed[0:round(h/3), 0:round(w/3)], workers=2)
  # Most confident detections first.
  detections.sort(key=lambda x: x[2], reverse=True)

  for _bbox, text, score in detections:
    text = text.strip().replace(" ", "")
    if show_img:
      print("Extract voter part", text, score)
    if '#' in text:
      voter_part = text.split("#")[1]
      if len(voter_part) > 1:
        return voter_part
  return 'N/A'

def extract_img_metadata(filepath, show_img=False):
  """
  Print the EXIF metadata of an image when `show_img` is set.

  :param filepath: str, path of the image
  :param show_img: bool, when False nothing is printed
  :return: None
  """
  # The context manager closes the underlying file handle once the EXIF data is read.
  with Image.open(filepath) as image:
    exifdata = image.getexif()
    if not show_img:
      return

    print(exifdata)
    for tagid in exifdata:
      # Use the tag name when known, otherwise fall back to the numeric id.
      tagname = TAGS.get(tagid, tagid)
      value = exifdata.get(tagid)
      print(f"{tagname:25}: {value}")

def extract_date(reader, filepath, show_img=False, attempt=2):
  """
  Look for a date in the bottom 50 pixel rows of the screenshot.

  The detections are checked in passes: every pass but the last uses the
  strict is_date() check, the pass with one attempt left uses relax=True.

  :param reader: object whose readtext() returns a list of (bbox, text, score)
  :param filepath: str, path of the screenshot
  :param show_img: bool, print every candidate text with its score
  :param attempt: int, number of passes over the detections
  :return: the date formatted with DATE_FORMAT, or 'N/A' when nothing matched
  """
  img = cv2.imread(filepath, cv2.IMREAD_GRAYSCALE)
  h, w = img.shape[:2]
  restrictions = ("AM", "PM", ":", ".")

  # Clamp the start row: on images shorter than 50px a negative start would
  # wrap around and select only part of the image.
  bottom_strip = img[max(h - 50, 0):h, 0:w]
  # The OCR input is identical for every pass, so it is run once and reused
  # instead of being repeated for each retry.
  detections = reader.readtext(bottom_strip, workers=2)
  detections.sort(key=lambda x: x[2], reverse=True)

  remaining = attempt
  while True:
    relax = remaining == 1
    for _bbox, text, score in detections:
      # Keep only the last space-separated token of the detection.
      text = text.strip().split(" ")[-1]
      if show_img:
        print("Extract date", text, score)
      text_date = len(text) > 6 and is_date(text, relax=relax)
      # Reject clock times and dotted numbers.
      if text_date and not any(token in text for token in restrictions):
        return text_date.strftime(DATE_FORMAT)
    if remaining <= 1:
      return 'N/A'
    remaining -= 1


# Build both OCR engines once and share them across all images.
ocr = PaddleOCR(cls=False, det=False, lang='en', use_gpu=use_gpu) # need to run only once to download and load model into memory
reader = easyocr.Reader(
    ["en"],
    gpu=use_gpu,
    quantize=True
)

# The first dir_path value is immediately replaced by the dated upload folder two lines below.
dir_path = 'pathdao_raw/0729_ID1'
# Sample file name; the processing loop further down rebinds img_path for every file.
img_path = '0729_15431_14259 - NaDa.png'
# Upload folder named after the (shifted) reference date, formatted day-month-year.
dir_path = f"upload/{now.strftime('%d-%m-%Y')}"
# dir_path = "upload/02-08-2023"


# Stop early when there is no upload folder for that date.
if not os.path.isdir(dir_path):
  print(f"No upload folder: {dir_path}")
  sys.exit(1)

"""
Measure date difference due to timezone
"""
def is_date_similar(d1, d2):
  """Return True when d2 lies within one day of d1, both bounds included."""
  tolerance = timedelta(days=1)
  earliest, latest = d1 - tolerance, d1 + tolerance
  return earliest <= d2 <= latest

def generate_pred(ocr_instances, dir_path, filename, debug=False, show_img=False):
  """
  Extract voter, voter part, date and vote status from one screenshot.

  PaddleOCR reads the whole image first; the EasyOCR-based helpers are used
  as fallbacks for the fields it did not yield.

  :param ocr_instances: dict with "reader" (EasyOCR reader) and "ocr" (PaddleOCR instance)
  :param dir_path: str, folder containing the screenshot
  :param filename: str, screenshot file name inside dir_path
  :param debug: bool, when True decode the expected values from the file name
      and add voter_score, voter_part_score, date_score and accuracy
  :param show_img: bool, when True display the image and print OCR output
  :return: dict with filename, voter, voter_part, date, has_voted, to_check
      and accuracy (plus the score keys in debug mode)
  """
  easyocr_reader = ocr_instances["reader"]
  paddleocr_reader = ocr_instances["ocr"]
  truth = extract_truth(filename) if debug else {}
  # Build the path from the `filename` argument rather than from a
  # notebook-level variable, so the function works for any caller.
  full_path = f"{dir_path}/{filename}"
  if show_img:
    show_image(full_path)

  pred = {
    "filename": filename,
    "voter": "N/A",
    "voter_part": "N/A",
    "date": "N/A",
    "has_voted": False,
    "to_check": True,
    "accuracy": "N/A"
  }

  result = paddleocr_reader.ocr(full_path)
  potential_voter_part = []
  potential_date = []

  for res in result or []:
    if not res:
      # NOTE(review): PaddleOCR appears to yield None/empty for a page with
      # no detected text — skip it instead of failing on .sort().
      continue
    # Highest recognition score first.
    res.sort(key=lambda x: x[1][1], reverse=True)
    for line in res:
      text = line[1][0].strip().replace(" ", "")
      if show_img:
        print(line[1])
      text_date = len(text) > 6 and is_date(text)
      if text == 'VOTED':
        pred['has_voted'] = True
      if 'forge' in text and 'game/' in text:
        pred["voter"] = text.split("/")[-1]
      elif pred['voter_part'] == 'N/A' and len(text) > 1 and text.startswith('#'):
        pred['voter_part'] = text.split("#")[-1]
      elif text_date and 'PM' not in text and 'AM' not in text and ':' not in text:
        pred['date'] = text_date.strftime(DATE_FORMAT)
      elif (pred['voter_part'] == 'N/A' and len(text) > 1
            and text.startswith(('1', '4')) and line[0][0][1] < 100):
        # Text near the top edge starting with 1 or 4: keep it as a positional
        # candidate with its first character dropped.
        # NOTE(review): presumably a misread "#" — confirm.
        potential_voter_part.append([line[0][0], text[1:]])
      else:
        # Parse once and reuse the result for the relaxed-date candidates.
        relaxed_date = is_date(text, relax=True)
        if relaxed_date:
          potential_date.append(relaxed_date.strftime(DATE_FORMAT))

  if pred['date'] == 'N/A':
    pred['date'] = extract_date(easyocr_reader, full_path, show_img)

  if pred['voter_part'] == 'N/A' or (debug and pred['voter_part'] != truth['voter_part']):
    # Topmost positional candidate (smallest y) is the last-resort answer.
    potential_voter_part.sort(key=lambda x: x[0][1])
    positional_guess = potential_voter_part[0][1] if potential_voter_part else 'N/A'
    extracted_voter_part = extract_voter_part(easyocr_reader, full_path, show_img)
    pred['voter_part'] = extracted_voter_part if extracted_voter_part != 'N/A' else positional_guess

  if pred['voter'] == 'N/A' or (debug and pred['voter'] != truth['voter']):
    pred['voter'] = extract_voter(easyocr_reader, full_path, show_img)

  if pred['date'] == 'N/A':
    pred['date'] = potential_date[0] if potential_date else 'N/A'

  if debug:
    if pred['date'] != 'N/A' and pred['date'] != truth['date']:
      # NOTE(review): any extracted date that differs from the file name is
      # replaced by the file-name date, so date_score below is 1 whenever a
      # date was extracted. Kept as-is; confirm this is the intended scoring.
      pred['date'] = truth['date']

    pred["voter_score"] = 1 if truth["voter"] == pred["voter"] else 0
    pred["voter_part_score"] = 1 if truth["voter_part"] == pred["voter_part"] else 0

    date_score = 0
    if pred["date"] != 'N/A':
      year = now.strftime('%Y')
      try:
        # The strings are built day-month-year, so dayfirst=True is required;
        # otherwise "02-08-<year>" would be read as 8 February.
        truth_dt = parse(f"{truth['date'][2:4]}-{truth['date'][:2]}-{year}", dayfirst=True)
        pred_dt = parse(f"{pred['date'][2:4]}-{pred['date'][:2]}-{year}", dayfirst=True)
        date_score = 1 if is_date_similar(truth_dt, pred_dt) else 0
      except (OverflowError, ValueError):
        # An unparsable date simply scores zero.
        date_score = 0
    pred["date_score"] = date_score
    pred["accuracy"] = (pred["voter_score"] + pred["voter_part_score"] + pred["date_score"]) / 3

  pred["to_check"] = is_buggy(pred)

  return pred

# Every file in the upload folder except spreadsheet/text outputs is treated as a screenshot.
files = [x for x in os.listdir(f"{dir_path}") if x.split('.')[-1] not in ['xlsx', 'txt', 'gsheet', 'csv']]
collections = []
samples = files
# The later assignments win; the earlier ones are kept as quick toggles.
sample_range = [0, len(files)]
sample_range = [2000, len(files)]
samples = samples[sample_range[0]: sample_range[1]]
samples = [
    "0802_22064_1 - Sinyo Sinyo.jpg"
]
total_len = len(samples)
test = True
debug = False
debug = True

selectors = ["filename", "voter", "voter_part", "date", "has_voted", "to_check", "accuracy"]

st = time.time()

for i, img_path in enumerate(samples):
  print(f"{i+1}/{total_len}: {img_path}")
  # `test` switches on scoring against the file name (generate_pred's `debug`),
  # `debug` switches on image display and OCR printing (its `show_img`).
  record = generate_pred({"reader": reader, "ocr": ocr}, dir_path, img_path, debug=test, show_img=debug)
  collections.append(record)
  print("Result: ", record)


df = pd.DataFrame.from_dict(collections)
output_df = df[[*selectors]]
accuracy_text = "_"
# The per-field score columns only exist when scoring is enabled, so their
# counts are added to the file name only in that case.
score_text = ""
to_check_count = df[df['to_check'] == True].shape[0]

if test:
  failed_df = df[df['accuracy'] < 1]
  accuracy_text = f"_{math.floor((len(output_df.index) - len(failed_df.index)) / len(output_df.index) * 100)}%_"
  score_text = f"_{df[df['voter_score'] < 1].shape[0]}_{df[df['voter_part_score'] < 1].shape[0]}"

output_df.to_excel(f"{dir_path}/result_{now.strftime('%H:%M')}_{sample_range[0]}-{sample_range[1]}{accuracy_text}{to_check_count}{score_text}.xlsx")

print(f"Done: {time.time() - st}")
if test:
  # Record the files that did not reach full accuracy for a later re-run.
  with open(f"{dir_path}/failed_{now.strftime('%H:%M')}_{sample_range[0]}-{sample_range[1]}.txt", 'w') as f:
    f.write("\n".join(failed_df["filename"].values.tolist()) + "\n")

# with open(f"{dir_path}/failed.txt", 'r') as f:
#   failed = [x.strip() for x in f.readlines()]
#   collections = []
#   samples = failed
#   samples = [
#       "0729_20859_5435 - yaz - Jay Em.jpg"
#   ]
#   total_len = len(samples)


#   for i, img_path in enumerate(samples):
#     print(f"{i+1}/{total_len}")
#     record = generate_pred({"reader": reader, "ocr": ocr}, dir_path, img_path, True, True)
#     print("Result: ", record)
#     collections.append(record)
#   df = pd.DataFrame.from_dict(collections)
#   selectors = ["filename", "voter", "voter_part", "date", "has_voted", "to_check", "accuracy"]
#   output_df = df[[*selectors]]
#   failed_df = df[df['accuracy'] < 1]
#   output_df.to_excel(f"output/failed/{dir_path.split('/')[-1]}_{now.strftime('%H:%M')}_{df.shape[0]}_{math.floor((len(output_df.index) - len(failed_df.index)) / len(output_df.index) * 100)}%_{df[df['to_check'] == True].shape[0]}_{df[df['voter_score'] < 1].shape[0]}_{df[df['voter_part_score'] < 1].shape[0]}.xlsx")


#   with open('failed_latest.txt', 'w') as f:
#     f.write("\n".join(failed_df["filename"].values.tolist()) + "\n")



In [None]:
from pickle import FALSE
from Levenshtein import distance
import time
import easyocr
import random
import os
import math
import matplotlib.pyplot as plt
import pandas as pd
from dateutil.parser import parse
import cv2
from google.colab.patches import cv2_imshow
from datetime import datetime
import pytz


# EasyOCR reader for English, created with gpu=True and quantize=True.
reader = easyocr.Reader(
    ["en"],
    gpu=True,
    quantize=True
)


# Folder with the labelled screenshots and one sample file name.
dir_path = 'pathdao_raw/pathdao_raw'
sample_img = '0721_25418_20159 - Antonio.jpg'
# Substring that marks an OCR text as the game URL in process_img.
game_url = 'forge'
# Format applied to extracted dates: zero-padded month then day.
DATE_FORMAT = '%m%d'

# Outputs are grouped in a folder named after today's date in Asia/Kuala_Lumpur.
now = datetime.now(pytz.timezone('Asia/Kuala_Lumpur'))
date = now.strftime('%Y-%m-%d')
if not os.path.isdir(f"output/{date}"):
    os.makedirs(f"output/{date}")

def show_image(filepath):
  """Render the image at `filepath` inline in the notebook, loaded in colour."""
  cv2_imshow(cv2.imread(filepath, cv2.IMREAD_COLOR))

def is_buggy(record):
  """Tell whether `record` still has 'N/A' for its voter, voter_part or date."""
  for field in ("voter", "voter_part", "date"):
    if record[field] == 'N/A':
      return True
  return False


def is_date(string, fuzzy=False):
    """
    Parse `string` as a date.

    :param string: str, text handed to dateutil's parse
    :param fuzzy: bool, forwarded to parse (ignore unknown tokens when True)
    :return: the parsed datetime, or False when parsing raises
        OverflowError or ValueError
    """
    try:
        parsed = parse(string, fuzzy=fuzzy)
    except (OverflowError, ValueError):
        return False
    else:
        return parsed

def extract_truth(filepath):
  """
  Recover the expected values encoded in a screenshot file name.

  Expected layout: "<date>_<voter>_<voter_part> - <player>[ - ...]".
  Extra "_" fields and extra " - " segments are ignored.

  :param filepath: str, file name to decode
  :return: dict with keys original_filepath, date, voter, voter_part and
      player; every value is 'N/A' when the name does not follow the layout
      (instead of raising ValueError while unpacking)
  """
  not_available = {
      "original_filepath": 'N/A',
      "date": 'N/A',
      "voter": 'N/A',
      "voter_part": 'N/A',
      "player": 'N/A'
  }

  segments = filepath.split(" - ")
  if len(segments) < 2:
    return not_available
  truth, player = segments[0], segments[1]

  fields = truth.split("_")
  if len(fields) < 3:
    return not_available
  date, voter, voter_part = fields[:3]

  return {
      "original_filepath": filepath,
      "date": date,
      "voter": voter,
      "voter_part": voter_part,
      "player": player
  }

def process_img(reader, filepath, game_url, debug=False):
  """
  OCR a whole screenshot and pick out voter, voter part and date.

  :param reader: object whose readtext() yields (bbox, text, score) triples
  :param filepath: str, path of the screenshot
  :param game_url: str, substring that marks a text as the game URL
  :param debug: bool, print every recognised text with its score
  :return: dict with keys voter, voter_part and date; 'N/A' for anything not
      found. When several texts match a field, the last match wins.
  """
  # IMREAD_GRAYSCALE is the imread flag for a single-channel load;
  # cv2.COLOR_BGR2GRAY is a cvtColor code and is not valid here.
  img = cv2.imread(filepath, cv2.IMREAD_GRAYSCALE)
  pred = {
    "voter": "N/A",
    "voter_part": "N/A",
    "date": "N/A"
  }

  for _bbox, text, score in reader.readtext(img, workers=2):
    text = text.strip()

    if text.startswith("#"):
      pred["voter_part"] = text[1:]

    text_date = is_date(text)
    if text_date:
      pred["date"] = text_date.strftime(DATE_FORMAT)

    if text.startswith('http') or game_url in text:
      # The voter id is the last path segment of the URL.
      pred["voter"] = text.split("/")[-1]

    if debug:
      print(text, score)
  return pred




def generate_record(info, dir_path, filename, game_url):
  """
  Score the OCR prediction for one screenshot against its file name.

  Uses the notebook-level `reader` for OCR.

  :param info: progress label printed before processing (e.g. "3/600")
  :param dir_path: str, folder containing the screenshot
  :param filename: str, screenshot file name inside dir_path
  :param game_url: str, forwarded to process_img
  :return: dict with the expected value, prediction and 0/1 score for voter,
      voter_part and date, plus "problematic" and "total_score"
  """
  print(info)
  truth = extract_truth(filename)
  pred = process_img(reader, f"{dir_path}/{filename}", game_url)
  record = {
      "original_filepath": filename,
      "voter_truth": truth["voter"],
      "voter_pred": pred["voter"],
      "voter_score": 1 if truth["voter"] == pred["voter"] else 0,
      "voter_part_truth": truth["voter_part"],
      "voter_part_pred": pred["voter_part"],
      "voter_part_score": 1 if truth["voter_part"] == pred["voter_part"] else 0,
      "date_truth": truth["date"],
      "date_pred": pred["date"],
      "date_score": 1 if truth["date"] == pred["date"] else 0,
      "problematic": is_buggy(pred)
  }
  # Mean of the three 0/1 field scores; no per-record DataFrame is built
  # because callers assemble the table from the returned dicts.
  record["total_score"] = (record["voter_score"] + record["voter_part_score"] + record["date_score"]) / 3
  return record


def test(dir_path, game_url, all=False, sample_size=10):
  """
  Score OCR predictions for a set of files in `dir_path`.

  :param dir_path: str, folder containing the screenshots
  :param game_url: str, forwarded to generate_record
  :param all: falsy to draw `sample_size` files at random (with replacement),
      or a [start, stop] pair selecting a slice of the directory listing
  :param sample_size: int, number of random draws when `all` is falsy
  :return: (result, df) where result holds "accuracy" (percentage) and
      "incorrect" (rows with total_score below 1) and df has one row per sample
  """
  files = os.listdir(f"{dir_path}")
  if all:
    start, stop = all[0], min(all[1], len(files))
    samples = files[start:stop]
  else:
    samples = random.choices(files, k=sample_size)

  total_len = len(samples)
  records = []
  for idx, s in enumerate(samples):
    records.append(generate_record(f"{idx + 1}/{total_len}", dir_path, s, game_url))
  df = pd.DataFrame.from_dict(records)

  accuracy = df["total_score"].sum() / len(samples) * 100
  incorrect = df[df["total_score"] < 1]
  result = {"accuracy": accuracy, "incorrect": incorrect}
  return result, df

# Easy OCR (Test)

In [None]:
dir_path = 'pathdao_raw/0729_ID1'
st = time.time()
# Named file_range rather than `range` so the builtin range() is not shadowed
# for the rest of the notebook session.
file_range = [0, 600]
result, df = test(dir_path, game_url, file_range)
print(result["accuracy"], time.time() - st)
# Keep only the prediction columns, renamed to the output schema.
selector_d = {'original_filepath': 'filename', 'voter_pred': 'voter', 'voter_part_pred': 'voter_part', 'date_pred': 'date', 'problematic': 'to_check'}
output_df = df.rename(columns=selector_d)[[*selector_d.values()]]
output_df.to_excel(f"output/{date}/{now.strftime('%H:%M')}_{file_range[0]}-{file_range[1]}_{df.shape[0]}_{df[df['problematic'] == True].shape[0]}.xlsx")
# Append the files that still need a manual check to failed.txt.
failed_df = output_df[output_df["to_check"] == True]
with open('failed.txt', 'a') as f:
  f.write("\n".join(failed_df["filename"].values.tolist()) + "\n")

In [None]:
# Append the file names flagged as to_check to failed.txt, one per line.
failed_df = output_df.loc[output_df["to_check"] == True]
failed_names = failed_df["filename"].values.tolist()
with open('failed.txt', 'a') as f:
  f.write("\n".join(failed_names) + "\n")

In [None]:
import numpy as np

# Large default figure size for the annotated screenshots shown by process_img.
plt.rcParams["figure.figsize"] = (30,20)
dir_path = 'pathdao_raw/0729_ID1'
# Reference string the candidate URL texts are compared against (by edit distance) in process_img.
# NOTE(review): the spelling looks like a typical OCR rendering of the real URL — confirm it is intentional.
game_url = 'forge.hv-mtlcom/game'

# EasyOCR reader for English, created with gpu=True.
reader = easyocr.Reader(
    ["en"],
    gpu=True
)


def show_image(filepath):
  """Load `filepath` as a colour image and display it in the notebook output."""
  colour_image = cv2.imread(filepath, cv2.IMREAD_COLOR)
  cv2_imshow(colour_image)

def process_voter_part(reader, img):
  """
  Binarise `img` at 200 and look for the voter part in the OCR output.

  :param reader: object whose readtext() yields (bbox, text, score) triples
  :param img: image array accepted by cv2.threshold
  :return: dict with keys voter, voter_part and date; only voter_part can be
      filled, from the first text starting with "#", "01" or "4" (its first
      character is dropped)
  """
  _, binary = cv2.threshold(img, 200,255, cv2.THRESH_BINARY)

  pred = {
    "voter": "N/A",
    "voter_part": "N/A",
    "date": "N/A"
  }

  for _bbox, raw_text, score in reader.readtext(binary, workers=2):
    text = raw_text.strip()
    print(text, score)

    # All three prefixes are handled the same way: drop the first character.
    if text.startswith(("#", "01", "4")):
      pred["voter_part"] = text[1:]
      break
  return pred


def process_img(reader, filepath, game_url, debug=False):
  """
  OCR a screenshot, pick out voter, voter part and date, and plot the
  image with the matched regions outlined.

  :param reader: object whose readtext() yields (bbox, text, score) triples
  :param filepath: str, path of the screenshot
  :param game_url: str, reference URL; the candidate text with the smallest
      edit distance to it supplies the voter id
  :param debug: bool, print every recognised text with its score
  :return: dict with keys voter, voter_part and date; 'N/A' for anything not found
  """
  img = cv2.imread(filepath)
  # Boxes and labels are drawn on a copy so the fallback OCR pass below reads
  # the untouched screenshot rather than our own overlays.
  canvas = img.copy()

  def corners(bbox):
    # Top-left and bottom-right corners of the detection, rounded to pixels.
    return [round(v) for v in bbox[0]], [round(v) for v in bbox[2]]

  def annotate(pt1, pt2, label):
    cv2.rectangle(canvas, pt1, pt2, (0, 255, 0), 5)
    cv2.putText(canvas, label, pt1, cv2.FONT_HERSHEY_COMPLEX, 0.65, (255, 0, 0), 2)

  pred = {
    "voter": "N/A",
    "voter_part": "N/A",
    "date": "N/A"
  }

  voters = []
  voters_map = {}
  for bbox, text, score in reader.readtext(img, workers=2):
    text = text.strip()
    text_date = is_date(text)
    pt1, pt2 = corners(bbox)

    if text.startswith('http') or 'forge' in text:
      # Collected first; the best URL candidate is chosen after the loop.
      voters.append(text)
      voters_map[text] = (pt1, pt2)
    elif text.startswith("#"):
      pred["voter_part"] = text[1:]
      annotate(pt1, pt2, text)
    elif text_date:
      pred["date"] = text_date.strftime(DATE_FORMAT)
      annotate(pt1, pt2, text)

    if debug:
      print(text, score)

  if voters:
    # First candidate with the smallest edit distance to the reference URL.
    best_voter = min(voters, key=lambda x: distance(x, game_url))
    pred["voter"] = best_voter.split("/")[-1]
    # Label the box with the chosen candidate itself.
    annotate(voters_map[best_voter][0], voters_map[best_voter][1], best_voter)

  if pred["voter_part"] == "N/A":
    # Second pass on a binarised, eroded copy of the clean screenshot.
    _, new_img = cv2.threshold(img, 200,255, cv2.THRESH_BINARY)
    new_img = cv2.erode(new_img, np.ones((2, 2), np.uint8))
    for bbox, text, score in reader.readtext(new_img, workers=2):
      text = text.strip()
      if debug:
        print(text, score)
      pt1, pt2 = corners(bbox)

      if text.startswith(("#", "H")):
        # NOTE(review): "H" is presumably a misread "#" — confirm.
        pred["voter_part"] = text[1:]
        annotate(pt1, pt2, text)
      elif text.startswith("4"):
        # NOTE(review): a leading "4" is mapped to voter part "1" — presumably
        # a misread "#1"; confirm.
        pred["voter_part"] = "1"
        annotate(pt1, pt2, text)

  plt.imshow(cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB))
  plt.show()
  return pred


# Re-run the annotating OCR on the first two files listed in failed_2.txt.
with open('failed_2.txt', 'r') as f:
  failed = [x.strip() for x in f.readlines()]

# img_path = '0729_10993_5435 - Dian Kristina.png'
# show_image(f"{dir_path}/{img_path}")
collections = []
for img_path in failed[:2]:
  record = process_img(reader, f"{dir_path}/{img_path}", game_url, True)
  # process_img does not know the file name; add it so the "filename" column exists.
  record["original_filepath"] = img_path
  record["problematic"] = is_buggy(record)
  # Collect this file's record (not the `result` of an earlier cell).
  collections.append(record)

if collections:
  df = pd.DataFrame.from_dict(collections)
  selector_d = {'original_filepath': 'filename', 'voter_pred': 'voter', 'voter_part_pred': 'voter_part', 'date_pred': 'date', 'problematic': 'to_check'}
  output_df = df.rename(columns=selector_d)[[*selector_d.values()]]
  output_path = f"output/{dir_path}_{now.strftime('%H:%M')}_{df.shape[0]}_{df[df['problematic'] == True].shape[0]}.xlsx"
  # dir_path contains a slash, so the workbook lands in a sub-folder of output/.
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
  output_df.to_excel(output_path)
else:
  print("No files listed in failed_2.txt")


# Architecture

- https://datascience.stackexchange.com/questions/64486/how-to-automatically-mount-my-google-drive-to-google-colab
- Schedule a periodic notebook run. Decide which uploads have already been processed by checking whether their output file exists.
- Download the OCR model once and reuse the local copy instead of downloading it on every run
- `ocr.readtext(img, workers=4)`
- `ocr = easyocr.Reader(["en"], gpu=True, quantize=True)`

# Preprocessing

- `gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)`
- `mn = np.min(frame)`
- `mx = np.max(frame)`
- `norm = (frame - mn) * (1.0 / (mx - mn))`


# Output

- filename, voter, voter_part, date, confidence / flag

# References

- [Using tesseract OCR with OpenCV](https://nanonets.com/blog/ocr-with-tesseract/#:~:text=Tesseract%20is%20an%20open%20source,a%20wide%20variety%20of%20languages.)
- [Tesseract on Colab](https://bhadreshpsavani.medium.com/how-to-use-tesseract-library-for-ocr-in-google-colab-notebook-5da5470e4fe0)
- [Kaggle compare ocr.space vs tesseract](https://www.kaggle.com/code/naim99/ocr-text-recognition-ocr-space-api-tesseract)
- [tesseract, easy_ocr, keras_ocr](https://www.youtube.com/watch?v=oyqNdcbKhew)
- [EasyOCR Pytorch](https://www.youtube.com/watch?v=ZVKaWPW9oQY)
- [EasyOCR opencv](https://github.com/computervisioneng/text-detection-python-easyocr/blob/master/main.py)

## Advanced

- [Deepsolo ABCNet](https://www.kaggle.com/code/kunalpurkayastha/pretrained-abcnet-and-deepsolo)
- [Text recognition from image](https://www.kaggle.com/datasets/robikscube/textocr-text-extraction-from-images-dataset)
- [Scene text recognition: Shopee](https://www.kaggle.com/competitions/shopee-product-matching/discussion/225942)
- [OCR Trick for e-commerce product](https://www.kaggle.com/c/cdiscount-image-classification-challenge/discussion/45863)