<a href="https://colab.research.google.com/github/zenon10/POC-OCR/blob/main/Extract_text_wrapper_vRG20230213.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Library to work with Operating System
import os
import glob
import pandas as pd
from PIL import Image
import pytesseract
import shutil
import textract
import random
from statistics import mean

# Library to create, inspect, manipulate and render PDFs (turning pages into images)
import pypdfium2 as pdfium

# If you don't have tesseract executable in your PATH, include the following:
#pytesseract.pytesseract.tesseract_cmd = r'C:\Users\RiyaGupta\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'

In [None]:
## Build the metadata table: one row per entry found in the input folder.
## sorted() keeps the row order reproducible - glob returns matches in arbitrary order.
path = sorted(glob.glob("1_input/*"))
metadata = pd.DataFrame({"input_file_path":path})
## index=False keeps the row index out of the file, so reading the CSV back
## does not add an "Unnamed: 0" column.
metadata.to_csv("metadata.csv", index=False)
metadata.head()

In [None]:
def pdf_to_txt(input_file_path, text_file_name, output_folder_path):
    '''
    This function OCRs every page of a PDF with the French Tesseract model
    and saves the combined text in a txt file.

    Each page is rendered to an image, passed through pytesseract, and its
    text is written after a "**********"-delimited banner carrying the
    1-based page number.

    Input parameters
        input_file_path: Location of the PDF to read
        text_file_name: Location of the UTF-8 txt file to write
        output_folder_path: Not used by this function; kept so existing callers keep working
    Output
        Mean of the per-page mean word confidences reported by Tesseract.
        Pages on which no word was recognised are left out of the average,
        and NaN is returned when no page produced any word.
    '''
    pdf_info = pdfium.PdfDocument(input_file_path)

    # get the number of pages in the document
    n_pages = len(pdf_info)
    print('Number of pages {}'.format(n_pages))

    ## scale = 300/72 renders at 300 DPI (PDF canvas units are 1/72 inch)
    renderer = pdf_info.render_to(
        pdfium.BitmapConv.pil_image,
        page_indices = list(range(n_pages)),
        scale = 300/72,
    )

    page_texts = []
    ocr_conf = []
    for page, image in enumerate(renderer, start=1):
        text = pytesseract.image_to_string(image, lang='fra')
        ## Each page is preceded by a banner holding its page number
        page_texts.append("\n**********\n"+str(page)+"\n**********\n"+text)

        ## Confidence score from pytesseract; use the same language model as
        ## the text extraction so the score describes the text that is saved
        words = pytesseract.image_to_data(image, lang='fra', output_type = 'data.frame')
        ## conf == -1 marks layout rows (page/block/line), not recognised words
        words = words[words.conf != -1]
        ## A page without any word would contribute NaN and wipe out the
        ## document-level average, so it is skipped instead
        if not words.empty:
            ocr_conf.append(words.conf.mean())

    ## The context manager closes the file even if the write fails
    with open(text_file_name, 'w', encoding="utf8") as text_file:
        text_file.write("".join(page_texts))

    return mean(ocr_conf) if ocr_conf else float("nan")

In [None]:
def extract_text(input_file_path, output_folder_path):
    '''
    This function takes one French PDF, doc, docx or xlsx file,
    extracts its text and saves it in a txt file.

    PDFs are OCR'd through pdf_to_txt; the other formats are converted
    with textract.

    Input parameters
        input_file_path: Input file location
        output_folder_path: Folder that receives the txt file (created if missing)
    Output
        (text_file_name, ocr_conf): location of the txt file that was written
        and a confidence score - the value returned by pdf_to_txt for PDFs,
        a fixed 100 for doc, docx and xlsx files.
    Raises
        ValueError: if the input file does not exist, or its extension is
        not one of PDF, doc, docx, xlsx
    '''
    if not os.path.exists(input_file_path):
        raise ValueError("File does not exist!!")

    print("Text extraction process starting for...",input_file_path)

    ## splitext separates only the last extension, so names containing extra
    ## dots keep their full stem and names without an extension yield ''
    ## instead of failing with an IndexError
    file_name, file_name_extn = os.path.splitext(os.path.basename(input_file_path))
    file_name_extn = file_name_extn.lstrip('.').upper()

    ## Keeping a list of acceptable extensions
    pdf_extension = ['PDF']
    non_pdf_extension = ['DOC', 'DOCX', 'XLSX']

    if file_name_extn not in pdf_extension + non_pdf_extension:
        raise ValueError("File does not match required formats - PDF, doc, docx, xlsx")

    os.makedirs(output_folder_path, exist_ok=True)

    ## os.path.join works whether or not output_folder_path ends with a separator
    ## NOTE(review): inputs that differ only by extension (e.g. f2.pdf and
    ## f2.docx) map to the same txt file, so the later one overwrites the earlier
    text_file_name = os.path.join(output_folder_path, file_name + ".txt")

    ## Applying text extraction process based on formats
    if file_name_extn in pdf_extension:
        ## Converting each page of PDF to combined text file
        ocr_conf = pdf_to_txt(input_file_path, text_file_name, output_folder_path)
    else:
        text_file = textract.process(input_file_path)
        ## No OCR is involved for these formats, so a fixed score is reported
        ocr_conf = 100
        ## Saving text as UTF-8, matching the PDF branch, instead of relying
        ## on the platform default encoding
        with open(text_file_name, 'w', encoding="utf8") as f:
            f.write(text_file.decode('utf8', 'strict'))

    return text_file_name, ocr_conf

In [None]:
def extract_text_wrapper(metadata_file_path, output_folder_path):
    '''
    This function runs extract_text on every file listed in a metadata CSV
    and records the results in that same CSV.

    Input parameters
        metadata_file_path: Location of a CSV with an "input_file_path" column
        output_folder_path: Folder that receives the txt files
    Output
        The metadata table with two added columns, "text_file_path" and
        "confidence". The table is also saved back to metadata_file_path,
        without the row index.
    '''
    metadata = pd.read_csv(metadata_file_path)

    ## Collect the (text_file_path, confidence) pairs first and assign them
    ## one column at a time; this also works when the table has no rows
    results = [extract_text(file_path, output_folder_path) for file_path in metadata["input_file_path"]]
    metadata["text_file_path"] = [text_file_path for text_file_path, _ in results]
    metadata["confidence"] = [confidence for _, confidence in results]

    ## Save to the file that was read rather than a hard-coded name, and leave
    ## out the row index so repeated runs do not pile up "Unnamed" columns
    metadata.to_csv(metadata_file_path, index=False)
    return metadata

In [None]:
## Run the extraction for the metadata file created above;
## the two locations are kept in variables so they are easy to change.
metadata_file_path, output_folder_path = "metadata.csv", "2_output/"
extract_text_wrapper(
    metadata_file_path=metadata_file_path,
    output_folder_path=output_folder_path,
)


Text extraction process starting for... 1_input\f1.pdf
Number of pages 6
Text extraction process starting for... 1_input\f2.docx
Text extraction process starting for... 1_input\f2.pdf
Number of pages 4
Text extraction process starting for... 1_input\f3.docx
