In [1]:
import fasttext
import torch
from pdf2image import convert_from_path
import pdfplumber
import numpy as np
import easyocr
import pandas as pd
from pdf2image.exceptions import (
     PDFInfoNotInstalledError,
     PDFPageCountError,
     PDFSyntaxError
     )
import json
from tqdm.notebook import tqdm
from collections import defaultdict, Counter
import gc
import decimal
from functools import partial
import re
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

In [3]:
def convert_pdf_to_text(source_pdf_path, ocr_lang_list, batch_size, use_ocr=True):
    """Extract paragraph-level text blocks from every page of a PDF.

    Each page is rendered to an image and EasyOCR (``paragraph=True``) is run
    on it to find paragraph bounding boxes. The text of each box is then taken
    either from the OCR result itself (``use_ocr=True``) or from the PDF's
    text layer by cropping the matching pdfplumber page (``use_ocr=False``).

    Args:
        source_pdf_path: Path of the PDF to process.
        ocr_lang_list: Language codes passed to ``easyocr.Reader``.
        batch_size: Batch size forwarded to ``readtext``.
        use_ocr: When True the recognised OCR text is used; when False the OCR
            boxes are only used to crop the pdfplumber page and the text comes
            from ``extract_text()`` (which may yield ``None`` for a box).

    Returns:
        pandas.DataFrame with one row per detected box and the columns
        ``page`` (0-based page index), ``page_size`` (shape of the rendered
        image array), ``bboxs`` (JSON-encoded box corner points) and
        ``contents`` (the extracted text).
    """
    images = convert_from_path(source_pdf_path)
    total_images = len(images)
    print("Number of images {}".format(total_images))

    # OCR is needed in both modes: it is what locates the paragraph boxes.
    ocr_reader = easyocr.Reader(ocr_lang_list)

    # The text layer is only opened when it is actually going to be read;
    # previously the page lookup ran unconditionally and failed with a
    # NameError whenever use_ocr was True.
    plumber_pdf = None if use_ocr else pdfplumber.open(source_pdf_path)

    pages = []
    page_sizes = []
    bboxs = []
    contents = []

    try:
        for i in tqdm(range(total_images)):
            image_array = np.array(images[i])
            image_height, image_width = image_array.shape[:2]
            results = ocr_reader.readtext(image_array, paragraph=True, batch_size=batch_size)

            pdf_page = None
            if plumber_pdf is not None:
                pdf_page = plumber_pdf.pages[i]
                # float() accepts both Decimal and float page dimensions, so
                # the scaling below never mixes Decimal with float.
                page_width = float(pdf_page.width)
                page_height = float(pdf_page.height)

            for result in results:
                pages.append(i)
                page_sizes.append(image_array.shape)
                bboxs.append(json.dumps(result[0]))

                if pdf_page is None:
                    contents.append(result[1])
                    continue

                # Box corners are pixel points used as [x, y]: x is scaled by
                # the image width and y by the image height.
                top_left = result[0][0]
                bottom_right = result[0][2]

                cropped_page = pdf_page.crop((
                    top_left[0] / image_width * page_width,
                    top_left[1] / image_height * page_height,
                    bottom_right[0] / image_width * page_width,
                    bottom_right[1] / image_height * page_height,
                ))
                contents.append(cropped_page.extract_text())

                # Explicit cleanup kept from the original implementation.
                del cropped_page
                gc.collect()

            del pdf_page
            gc.collect()
    finally:
        # Close the PDF even when a page fails part-way through.
        if plumber_pdf is not None:
            plumber_pdf.close()

    return pd.DataFrame({'page': pages, 'page_size': page_sizes, 'bboxs': bboxs, 'contents': contents})


In [4]:
# Run configuration shared by the cells below.
# source_pdf_path = r"/content/drive/MyDrive/auto_translation/Astra Account December 2019.pdf
source_pdf_path = r"../data/BCI_group_holdings.pdf"  # PDF handed to convert_pdf_to_text
ocr_lang_list = ['ch_sim','en']  # language codes passed to easyocr.Reader
# ocr_lang_list = ['id','en']
batch_size = 32  # forwarded to the OCR reader's readtext(batch_size=...)
company_name = "BCI_group"  # prefix of the .xlsx files written by later cells
fasttext_model_path = '../models/lid.176.bin'  # passed to fasttext.load_model

In [None]:
# use_ocr=False: EasyOCR still locates the paragraph boxes, but the text itself
# is read from the PDF text layer through pdfplumber.
raw_text_df = convert_pdf_to_text(source_pdf_path, ocr_lang_list, batch_size, use_ocr=False)
# Persist the extraction so later cells can reload it instead of re-running OCR.
raw_text_df.to_excel(f"{company_name}_raw_text.xlsx")

CUDA not available - defaulting to CPU. Note: This module is much faster with a GPU.


Number of images 206


  0%|          | 0/206 [00:00<?, ?it/s]

In [None]:
# NOTE(review): this reloads a hard-coded "/content/..." workbook whose name does not match
# the f"{company_name}_raw_text.xlsx" written by the previous cell — confirm which file is intended.
raw_text_df = pd.read_excel("/content/yue_kai_holding_raw_text.xlsx", index_col=0)
raw_text_df.head()

## Preprocessing

In [18]:
# Preview only: the cleaned frame is displayed here, not assigned.
raw_text_df.dropna().reset_index(drop=True)

Unnamed: 0,page,page_size,bboxs,contents
0,1,"(2245, 1654, 3)","[[1236, 154], [1518, 154], [1518, 202], [1236,...",CONTENTS 目錄
1,1,"(2245, 1654, 3)","[[138, 355], [523, 355], [523, 400], [138, 400]]",Corporate Information 公司資料
2,1,"(2245, 1654, 3)","[[138, 429], [517, 429], [517, 470], [138, 470]]",Chairman’s Statement 主席報告
3,1,"(2245, 1654, 3)","[[138, 501], [808, 501], [808, 544], [138, 544]]",Management Discussion and Analysis 管理層討論與分析
4,1,"(2245, 1654, 3)","[[1495, 507], [1515, 507], [1515, 535], [1495,...",6
...,...,...,...,...
3663,180,"(2245, 1654, 3)","[[940, 1400], [1026, 1400], [1026, 1426], [940...",100150
3664,180,"(2245, 1654, 3)","[[1108, 1400], [1182, 1400], [1182, 1428], [11...",82037
3665,180,"(2245, 1654, 3)","[[1266, 1400], [1342, 1400], [1342, 1426], [12...",53285
3666,180,"(2245, 1654, 3)","[[1424, 1400], [1498, 1400], [1498, 1426], [14...",20297


In [19]:
def remove_numeric_symbol_rows(df):
    """Return a copy of ``df`` without rows made up only of digits and symbols.

    ``contents`` is stripped of surrounding whitespace, then every row whose
    stripped text consists solely of digits, whitespace and the punctuation
    listed in the pattern is dropped. The caller's frame is left untouched
    (the previous version stripped ``contents`` in place on its argument).

    Args:
        df: DataFrame with a string ``contents`` column.

    Returns:
        A new DataFrame holding the surviving rows with their original index
        labels and with ``contents`` stripped. Rows with missing ``contents``
        are kept, so the caller decides whether to drop them.
    """
    pat = r"^['\[\]`\(\)\d,\s}{\$*&%#@!\-_+=\?><:;\|~\"/]+$"
    # Work on a copy so the input frame is not modified as a side effect.
    df = df.copy()
    df['contents'] = df['contents'].str.strip()
    # na=False keeps the mask strictly boolean so `~` is well defined even
    # when some entries are missing.
    is_numeric_or_symbol = df['contents'].str.contains(pat, na=False)
    return df.loc[~is_numeric_or_symbol]
# Drop rows with missing values and renumber the index, then filter out rows
# whose contents are only digits/symbols.
raw_text_df = raw_text_df.dropna().reset_index(drop=True)
preprocessed_df = remove_numeric_symbol_rows(raw_text_df)
preprocessed_df

Unnamed: 0,page,page_size,bboxs,contents
0,1,"(2245, 1654, 3)","[[1236, 154], [1518, 154], [1518, 202], [1236,...",CONTENTS 目錄
1,1,"(2245, 1654, 3)","[[138, 355], [523, 355], [523, 400], [138, 400]]",Corporate Information 公司資料
2,1,"(2245, 1654, 3)","[[138, 429], [517, 429], [517, 470], [138, 470]]",Chairman’s Statement 主席報告
3,1,"(2245, 1654, 3)","[[138, 501], [808, 501], [808, 544], [138, 544]]",Management Discussion and Analysis 管理層討論與分析
5,1,"(2245, 1654, 3)","[[139, 571], [1147, 571], [1147, 615], [139, 6...",Biographical Details of Directors and Senior M...
...,...,...,...,...
3653,180,"(2245, 1654, 3)","[[485, 1244], [616, 1244], [616, 1348], [485, ...",資產及負債\n總資產\n總負債
3654,180,"(2245, 1654, 3)","[[138, 1246], [406, 1246], [406, 1344], [138, ...",ASSETS AND LIABILITIES\nTotal assets\nTotal li...
3660,180,"(2245, 1654, 3)","[[488, 1396], [592, 1396], [592, 1428], [488, ...",資產淨值
3662,180,"(2245, 1654, 3)","[[140, 1400], [250, 1400], [250, 1426], [140, ...",Net assets


## Language prediction
There are 3 possibilities:
1. lang_1
2. eng
3. mix

In [26]:
def predict_macro_language(fasttext_predictions_list):
    """Decide which language(s) dominate a document from per-paragraph predictions.

    The probabilities of every predicted label are summed over all paragraphs
    and the two labels with the highest totals are compared.

    Args:
        fasttext_predictions_list: Sequence of ``(labels, probabilities)``
            pairs, one per paragraph, where ``labels`` are strings prefixed
            with ``__label__``.

    Returns:
        ``(dominant,)`` when only one language was seen or the top language's
        total is more than five times the runner-up's; otherwise
        ``(dominant, secondary)``. The ``__label__`` prefix is removed.

    Raises:
        ValueError: If there are no predictions to aggregate.
    """
    language_proba = defaultdict(float)
    total_predictions = len(fasttext_predictions_list)
    for item in fasttext_predictions_list:
        for label, pred in zip(item[0], item[1]):
            language_proba[label] += pred

    sorted_language_proba = Counter(language_proba).most_common(2)
    if not sorted_language_proba:
        # Fail with a clear message instead of an IndexError further down.
        raise ValueError("cannot determine the document language: no predictions were given")

    if (len(sorted_language_proba) == 1) or (sorted_language_proba[0][1] > 5*sorted_language_proba[1][1]):
        # Only one language detected, or the most common language is 5 times
        # more likely than the second most common.
        # Each entry is a (label, score) pair, so the label is element [0][0].
        dominant_language = sorted_language_proba[0][0].replace("__label__", "")
        print(f"AR detected as pure language AR of {dominant_language}")
        return (dominant_language,)

    dominant_language_proba, secondary_language_proba = sorted_language_proba
    dominant_language = dominant_language_proba[0].replace("__label__", "")
    secondary_language = secondary_language_proba[0].replace("__label__", "")

    dominant_proba = dominant_language_proba[1]
    secondary_proba = secondary_language_proba[1]

    print(f"AR detected as hybrid language AR of {dominant_language} of {dominant_proba/total_predictions:.5f} and {secondary_language} of {secondary_proba/total_predictions:.5f}")
    return (dominant_language, secondary_language)

def predict_paragraph_language(macro_language_prediction, x):
    """Label one paragraph with a document language or ``"MIXED"``.

    Args:
        macro_language_prediction: ``(dominant, secondary)`` language codes
            of the document.
        x: ``(labels, probabilities)`` for the paragraph holding at least two
            candidates; labels carry the ``__label__`` prefix.

    Returns:
        The top label (prefix removed) when its probability exceeds five times
        the runner-up's, it is one of the two document languages, and it is
        above 0.8; ``"MIXED"`` in every other case.
    """
    document_languages = (macro_language_prediction[0], macro_language_prediction[1])
    labels, probabilities = x[0], x[1]

    best_label = labels[0].replace("__label__", "")
    best_prob = probabilities[0]
    runner_up_prob = probabilities[1]

    # Guard clauses: any failed criterion means the paragraph is ambiguous.
    if not (best_prob > (5 * runner_up_prob)):
        return "MIXED"
    if best_label not in document_languages:
        return "MIXED"
    if not (best_prob > 0.8):
        return "MIXED"
    return best_label



def fasttext_language_predict(df, model_path, top_k=2):
    """Add raw and final language predictions to a copy of ``df``.

    Args:
        df: DataFrame with a string ``contents`` column.
        model_path: Path of the fastText model to load.
        top_k: Number of candidate labels requested per paragraph. The
            per-paragraph rule reads the top two candidates, so values below
            2 only work for single-language documents.

    Returns:
        A copy of ``df`` with two extra columns: ``lang_raw_pred`` (the model
        output for each paragraph, predicted on the lower-cased text with
        newlines replaced by spaces) and ``final_lang_pred`` (a language code
        or ``"MIXED"``).
    """
    model = fasttext.load_model(model_path)
    df = df.copy()
    df['lang_raw_pred'] = df['contents'].map(lambda text: model.predict(text.replace("\n", " ").lower(), k=top_k))
    macro_language_prediction = predict_macro_language(df['lang_raw_pred'].tolist())
    if len(macro_language_prediction) == 1:
        # Single-language document: broadcast the language code itself.
        # Assigning the 1-tuple raised a length mismatch on multi-row frames.
        df['final_lang_pred'] = macro_language_prediction[0]
    else:
        df['final_lang_pred'] = df['lang_raw_pred'].map(partial(predict_paragraph_language, macro_language_prediction))

    return df

# Predict a language per paragraph and save the result for manual review.
lang_prediction_df = fasttext_language_predict(preprocessed_df, fasttext_model_path, top_k=2)
lang_prediction_df.to_excel(f"{company_name}_lang_prediction.xlsx")
lang_prediction_df



AR detected as hybrid language AR of zh of 0.41819 and en of 0.40315


Unnamed: 0,page,page_size,bboxs,contents,lang_raw_pred,final_lang_pred
0,1,"(2245, 1654, 3)","[[1236, 154], [1518, 154], [1518, 202], [1236,...",CONTENTS 目錄,"((__label__zh, __label__en), [0.76993197202682...",MIXED
1,1,"(2245, 1654, 3)","[[138, 355], [523, 355], [523, 400], [138, 400]]",Corporate Information 公司資料,"((__label__zh, __label__ja), [0.93213880062103...",zh
2,1,"(2245, 1654, 3)","[[138, 429], [517, 429], [517, 470], [138, 470]]",Chairman’s Statement 主席報告,"((__label__zh, __label__ja), [0.51523053646087...",MIXED
3,1,"(2245, 1654, 3)","[[138, 501], [808, 501], [808, 544], [138, 544]]",Management Discussion and Analysis 管理層討論與分析,"((__label__zh, __label__en), [0.46013802289962...",MIXED
5,1,"(2245, 1654, 3)","[[139, 571], [1147, 571], [1147, 615], [139, 6...",Biographical Details of Directors and Senior M...,"((__label__en, __label__zh), [0.59745222330093...",MIXED
...,...,...,...,...,...,...
3653,180,"(2245, 1654, 3)","[[485, 1244], [616, 1244], [616, 1348], [485, ...",資產及負債\n總資產\n總負債,"((__label__zh, __label__en), [0.95575839281082...",zh
3654,180,"(2245, 1654, 3)","[[138, 1246], [406, 1246], [406, 1344], [138, ...",ASSETS AND LIABILITIES\nTotal assets\nTotal li...,"((__label__en, __label__fr), [0.84671950340271...",en
3660,180,"(2245, 1654, 3)","[[488, 1396], [592, 1396], [592, 1428], [488, ...",資產淨值,"((__label__zh, __label__fr), [0.87261879444122...",zh
3662,180,"(2245, 1654, 3)","[[140, 1400], [250, 1400], [250, 1426], [140, ...",Net assets,"((__label__en, __label__zh), [0.53379791975021...",MIXED


## Hand labeling and cleaning, then loading the cleaned file

In [7]:
# NOTE(review): hard-coded "/content/..." path, unrelated to company_name — presumably the
# hand-cleaned export of the predictions; confirm before re-running.
lang_prediction_df = pd.read_excel("/content/yue_kai_holding_lang_prediction.xlsx", index_col=0)
lang_prediction_df

Unnamed: 0,page,page_size,bboxs,contents,lang_raw_pred,final_lang_pred
0,1,"(2245, 1654, 3)","[[1236, 154], [1518, 154], [1518, 202], [1236,...",CONTENTS 目錄,"(('__label__zh', '__label__en'), array([0.7699...",MIXED
1,1,"(2245, 1654, 3)","[[138, 355], [523, 355], [523, 400], [138, 400]]",Corporate Information 公司資料,"(('__label__zh', '__label__ja'), array([0.9321...",MIXED
2,1,"(2245, 1654, 3)","[[138, 429], [517, 429], [517, 470], [138, 470]]",Chairman’s Statement 主席報告,"(('__label__zh', '__label__ja'), array([0.5152...",MIXED
3,1,"(2245, 1654, 3)","[[138, 501], [808, 501], [808, 544], [138, 544]]",Management Discussion and Analysis 管理層討論與分析,"(('__label__zh', '__label__en'), array([0.4601...",MIXED
4,1,"(2245, 1654, 3)","[[139, 571], [1147, 571], [1147, 615], [139, 6...",Biographical Details of Directors and Senior M...,"(('__label__en', '__label__zh'), array([0.5974...",MIXED
...,...,...,...,...,...,...
2910,180,"(2245, 1654, 3)","[[485, 1244], [616, 1244], [616, 1348], [485, ...",資產及負債\n總資產\n總負債,"(('__label__zh', '__label__en'), array([0.9557...",zh
2911,180,"(2245, 1654, 3)","[[138, 1246], [406, 1246], [406, 1344], [138, ...",ASSETS AND LIABILITIES\nTotal assets\nTotal li...,"(('__label__en', '__label__fr'), array([0.8467...",en
2912,180,"(2245, 1654, 3)","[[488, 1396], [592, 1396], [592, 1428], [488, ...",資產淨值,"(('__label__zh', '__label__fr'), array([0.8726...",zh
2913,180,"(2245, 1654, 3)","[[140, 1400], [250, 1400], [250, 1426], [140, ...",Net assets,"(('__label__en', '__label__zh'), array([0.5337...",en


## Split mixed English/Chinese text

In [59]:
def split_en_zh_mixed_text(df):
    """Split rows labelled ``"MIXED"`` into separate English and Chinese rows.

    A MIXED row is split at the boundary between its English and Chinese part
    when it either starts with an English word (optionally preceded by digits
    or whitespace) or ends with one; "English word" means a run of at least
    four ASCII letters. Rows matching neither pattern stay MIXED, and rows
    with any other label are copied through unchanged.

    Fragments are stripped of surrounding whitespace and empty fragments are
    skipped, so a MIXED row without any Chinese characters yields a single
    ``"en"`` row instead of an extra empty ``"zh"`` row.

    Args:
        df: DataFrame with ``page``, ``contents`` and ``final_lang_pred``
            columns.

    Returns:
        pandas.DataFrame with the columns ``page``, ``content`` and
        ``final_lang_pred``, in the original row order.
    """
    pages = []
    contents = []
    final_lang_preds = []

    def _emit(page, text, lang):
        # Append one output row.
        pages.append(page)
        contents.append(text)
        final_lang_preds.append(lang)

    rows = list(zip(df['page'], df['contents'], df['final_lang_pred']))
    for page, content, final_lang_pred in tqdm(rows):
        if final_lang_pred != 'MIXED':
            _emit(page, content, final_lang_pred)
            continue

        if re.match(r"^[\s\d]*[a-zA-Z]{4,}", content) is not None:
            # English at the front: everything before the first character in
            # the \u4e00-\ufaff range is the English part.
            split_at = re.match(r"[^\u4e00-\ufaff]*", content).end()
            fragments = [(content[:split_at], "en"), (content[split_at:], "zh")]
        elif re.search(r"[a-zA-Z]{4,}$", content) is not None:
            # English at the back: the trailing run of characters outside the
            # \u4e00-\ufaff range is the English part.
            split_at = re.search(r"[^\u4e00-\ufaff]*$", content).start()
            fragments = [(content[:split_at], "zh"), (content[split_at:], "en")]
        else:
            _emit(page, content, "MIXED")
            continue

        for fragment, lang in fragments:
            fragment = fragment.strip()
            if fragment:
                _emit(page, fragment, lang)

    return pd.DataFrame({"page": pages, "content": contents, "final_lang_pred": final_lang_preds})


In [60]:
# Split the MIXED rows of the loaded predictions and save the result.
splited_df = split_en_zh_mixed_text(lang_prediction_df)
splited_df.to_excel(f"{company_name}_splited.xlsx")
splited_df

  0%|          | 0/2915 [00:00<?, ?it/s]

In [81]:
# Load the sentence-embedding model by name.
similarity_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")

In [88]:
# Sanity check: cosine similarity between the embeddings of a Chinese phrase
# and an English phrase.
embeddings1 = similarity_model.encode(["公司資料"], convert_to_tensor=True)
embeddings2 = similarity_model.encode(["Corporate Information"], convert_to_tensor=True)
cosine_score = util.cos_sim(embeddings1, embeddings2)
cosine_score

tensor([[0.8587]])

In [None]:
KPI B3.2: The average training hours completed per employee
by gender and employee category
關鍵績效指標 B3.2：按性別及僱員類別劃分，每名僱員完成受訓的平均時數