## Using OCR for Language Detection

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import random
import easyocr

from PIL import Image
from tqdm import tqdm
import numpy as np
tqdm.pandas()

In [None]:
# Load the image table; the first CSV column is used as the index.
# Later cells read its `path`, `task` and `label` columns.
# NOTE(review): the file path is an empty placeholder - fill it in before running.
df = pd.read_csv('', index_col = 0)

### EasyOCR

In [None]:
def run_easyocr(df, crop_fraction=0.95):
    """
    Runs EasyOCR on the image referenced by a single DataFrame row.

    EasyOCR predicts the actual text in the image. But since we are only interested
    in whether there is text or not, we additionally save a binary output.

    Meant to be passed to ``DataFrame.apply(..., axis=1)``: ``df`` is one row
    exposing a ``path`` entry. The OCR model is read from the module-level
    ``reader`` variable, which must be assigned before this function is called.

    Args:
        df: Row whose ``path`` entry is the image file to read.
        crop_fraction: Fraction of the height and of the width kept by the
            center crop. The default 0.95 trims 2.5% from every side.

    Returns:
        The same row with two entries added: ``result_easy`` (the value
        returned by ``reader.readtext``) and ``pred_easy`` (1 if that result
        is non-empty, otherwise 0).
    """
    image = mpimg.imread(df.path)

    # Center crop image to avoid picking out "Google".
    # image.shape is (rows, cols[, channels]): the first entry is the height
    # and the second the width, so each axis is cropped using its own size.
    img_h, img_w = image.shape[:2]
    h = img_h * crop_fraction
    w = img_w * crop_fraction
    y = img_h / 2 - h / 2
    x = img_w / 2 - w / 2
    # Only the two spatial axes are sliced, so 2-D grayscale arrays work too.
    crop_img = image[int(y):int(y + h), int(x):int(x + w)]

    result = reader.readtext(crop_img)

    df["result_easy"] = result
    df["pred_easy"] = 1 if len(result) > 0 else 0

    return df

In [None]:
# Task 0: build a reader for the 'en' language code. `reader` is the global
# that run_easyocr looks up, so it must be assigned before the apply below.
reader = easyocr.Reader(['en'])
is_task_0 = df["task"] == 0
df_0 = df.loc[is_task_0]
# Row-wise apply with a tqdm progress bar; each row gains the OCR columns.
df_0 = df_0.progress_apply(run_easyocr, axis=1)

In [None]:
# Task 1: build a reader for the 'sv' language code. `reader` is the global
# that run_easyocr looks up, so it must be assigned before the apply below.
reader = easyocr.Reader(['sv'])
is_task_1 = df["task"] == 1
df_1 = df.loc[is_task_1]
# Row-wise apply with a tqdm progress bar; each row gains the OCR columns.
df_1 = df_1.progress_apply(run_easyocr, axis=1)

In [None]:
# Task 2: build a reader for the 'ar' language code. `reader` is the global
# that run_easyocr looks up, so it must be assigned before the apply below.
reader = easyocr.Reader(['ar'])
is_task_2 = df["task"] == 2
df_2 = df.loc[is_task_2]
# Row-wise apply with a tqdm progress bar; each row gains the OCR columns.
df_2 = df_2.progress_apply(run_easyocr, axis=1)

In [None]:
# Task 3: build a reader for the 'ch_tra' language code. `reader` is the global
# that run_easyocr looks up, so it must be assigned before the apply below.
reader = easyocr.Reader(['ch_tra'])
is_task_3 = df["task"] == 3
df_3 = df.loc[is_task_3]
# Row-wise apply with a tqdm progress bar; each row gains the OCR columns.
df_3 = df_3.progress_apply(run_easyocr, axis=1)

In [None]:
# Stack the four per-task results into one frame with a fresh 0..n-1 index.
# drop=True discards the old index directly instead of first materialising it
# as an "index" column and dropping that column by name, which fails when the
# index is named or when a column called "index" already exists.
df_easy = pd.concat([df_0, df_1, df_2, df_3]).reset_index(drop=True)

In [None]:
# Persist the combined EasyOCR results as a pickle.
# NOTE(review): the output path is an empty placeholder - fill it in before running.
df_easy.to_pickle('')

In [None]:
def calculate_pr(df, suffix, tasks=range(4)):
    """
    Prints precision, recall and F1 score of a binary prediction, per task.

    For every task id, the rows with ``df.task == task`` are selected and the
    ``label`` column is compared against the ``pred_<suffix>`` column. A value
    equal to 1 counts as positive; anything else counts as negative.

    A metric whose denominator is zero (no positive predictions, no positive
    labels, or no rows at all for the task) is reported as 0.0 instead of
    raising ``ZeroDivisionError``.

    Args:
        df: DataFrame with ``task``, ``label`` and ``pred_<suffix>`` columns.
        suffix: Suffix of the prediction column, e.g. ``"easy"`` selects
            ``pred_easy``.
        tasks: Iterable of task ids to report on. Defaults to ``range(4)``.

    Returns:
        None. The metrics are written to standard output.
    """
    for task in tasks:
        rows = df[df.task == task]
        actual_pos = np.asarray(rows.label) == 1
        predicted_pos = np.asarray(rows[f"pred_{suffix}"]) == 1

        # Plain Python ints keep the divisions below in ordinary float
        # arithmetic (no NumPy divide-by-zero warnings or NaN results).
        tp = int(np.sum(predicted_pos & actual_pos))
        fp = int(np.sum(predicted_pos & ~actual_pos))
        fn = int(np.sum(~predicted_pos & actual_pos))

        p = tp / (tp + fp) if tp + fp else 0.0
        r = tp / (tp + fn) if tp + fn else 0.0
        f1 = 2 * p * r / (p + r) if p + r else 0.0

        print(f"Task {task}")
        print(f"Precision: {p}")
        print(f"Recall: {r}")
        print(f"F1 Score: {f1}")

In [None]:
# Report per-task precision / recall / F1 for the `pred_easy` column.
calculate_pr(df_easy, "easy")

### Google OCR

In [None]:
# Install the Google Cloud client libraries into the notebook environment.
# NOTE(review): bare `pip install` lines are not Python; presumably they rely on
# IPython automagic - confirm, or prefix them with `%`.
# NOTE(review): only google.cloud.vision is imported in the visible cells -
# confirm google-cloud-datastore is actually needed.
pip install google-cloud-datastore
pip install google-cloud-vision

In [None]:
from __future__ import print_function
from google.cloud import vision
import os
import io
import cv2

In [None]:
# Path to the service-account JSON credentials file.
# Left as an empty placeholder (like the other paths in this notebook) so the
# cell is at least valid Python; fill it in before running.
CREDENTIALS_PATH = ''
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = CREDENTIALS_PATH
# Vision API client used by the text-detection loop.
client = vision.ImageAnnotatorClient()

In [None]:
# Image paths to send to the Vision API, plus two accumulators that the
# detection loop fills with one entry per path.
paths = df["path"].tolist()
responses, languages = [], []

In [None]:
# Send every image to the Vision API, keeping the raw response and the
# language codes reported for the first page of its full-text annotation.
for file in tqdm(paths):
    with open(file, mode='rb') as image_file:
        content = image_file.read()
    image = vision.Image(content=content)

    response = client.text_detection(image=image)
    responses.append(response)

    try:
        first_page = response.full_text_annotation.pages[0]
    except IndexError:
        # The annotation has no pages: record an empty language list.
        language = []
    else:
        language = [a.language_code for a in first_page.property.detected_languages]
    languages.append(language)

In [None]:
# One row per image: its path, the raw Vision API response and the detected
# language codes. The detection loop appends exactly one response and one
# language list per path, so the three lists are used in full; slicing any of
# them would misalign the rows and make the columns differ in length.
gocr_results = pd.DataFrame.from_dict({'path': paths,
                                       'responses': responses,
                                       'languages': languages})

In [None]:
# Persist the Google OCR results as a pickle.
# NOTE(review): the output path is an empty placeholder - fill it in before running.
gocr_results.to_pickle('')