In [1]:
import io
import os
import time
import re

import pandas as pd
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation, \
    strip_multiple_whitespaces, stem_text, strip_tags, strip_short

In [2]:
def detect_handwritten_ocr(path, verbose=False):
    """Detects handwritten characters in a local image.

    Sends the image bytes to the Google Cloud Vision
    ``document_text_detection`` endpoint with the English handwriting
    language hint and returns the recognised text.

    Args:
        path: The path to the local file.
        verbose: When true, additionally print the full text followed by the
            confidence of every block, paragraph, word and symbol.

    Returns:
        The ``text`` attribute of the response's ``full_text_annotation``.

    Raises:
        RuntimeError: If the response carries a non-empty error message.
    """
    # Function-local import: the client library is only loaded when OCR is
    # actually requested.
    from google.cloud import vision_v1p3beta1 as vision
    client = vision.ImageAnnotatorClient()

    with io.open(path, 'rb') as image_file:
        content = image_file.read()

    image = vision.types.Image(content=content)

    # Language hint codes for handwritten OCR:
    # en-t-i0-handwrit, mul-Latn-t-i0-handwrit
    # Note: Use only one language hint code per request for handwritten OCR.
    image_context = vision.types.ImageContext(
        language_hints=['en-t-i0-handwrit'])

    response = client.document_text_detection(image=image,
                                              image_context=image_context)

    # Fail loudly on an API-level error. Without this check a failed request
    # falls through and its (typically empty) text is returned as if it were
    # a genuine OCR result, which callers may then persist.
    if response.error.message:
        raise RuntimeError(
            'Vision API error for {}: {}'.format(path, response.error.message))

    annotation = response.full_text_annotation
    if verbose:
        _print_annotation_details(annotation)

    return annotation.text


def _print_annotation_details(annotation):
    """Print the full text and the per-level confidences of an annotation."""
    print('Full Text: {}'.format(annotation.text))

    for page in annotation.pages:
        for block in page.blocks:
            print('\nBlock confidence: {}\n'.format(block.confidence))

            for paragraph in block.paragraphs:
                print('Paragraph confidence: {}'.format(
                    paragraph.confidence))

                for word in paragraph.words:
                    # A word is delivered as a list of symbols; glue them
                    # back together for display.
                    word_text = ''.join(
                        symbol.text for symbol in word.symbols)
                    print('Word text: {} (confidence: {})'.format(
                        word_text, word.confidence))

                    for symbol in word.symbols:
                        print('\tSymbol: {} (confidence: {})'.format(
                            symbol.text, symbol.confidence))

In [3]:
# Post-it images are read from <PROJECT_PATH>/<FOLDER>/notes; the OCR text of
# each image is cached as a .txt file in <PROJECT_PATH>/<FOLDER>/text so the
# Vision API is only called for images that have not been processed yet.
PROJECT_PATH = '../../data/postits'
FOLDER = 'data_priority_area_all'
IMAGES_PATH = f'{PROJECT_PATH}/{FOLDER}/notes'
TEXT_PATH = f'{PROJECT_PATH}/{FOLDER}/text'
# Matches everything between the first '[' and the last ']' of a file name,
# e.g. 'note01 [Data cleaning].jpg' -> 'Data cleaning'. Written as a raw
# string so that '\[' and '\]' are not treated as (invalid) string escapes.
GROUP_PATTERN = r'(?<=\[)(.*)(?=\])'

img_data = []
# Sorted so the resulting row order does not depend on directory listing order.
for img_name in sorted(os.listdir(IMAGES_PATH)):
    img_path = f'{IMAGES_PATH}/{img_name}'
    print(img_path)

    group_match = re.search(GROUP_PATTERN, img_name)
    if group_match is None:
        # Files without a "[group]" tag (e.g. .DS_Store) cannot be assigned
        # to a group; skip them instead of crashing on `None.group(0)`.
        print('  skipped: no [group] tag in file name')
        continue
    img_group = group_match.group(0)

    # Swap only the extension; str.replace('jpg', 'txt') would also rewrite
    # any "jpg" that happens to occur elsewhere in the name.
    txt_name = os.path.splitext(img_name)[0] + '.txt'
    txt_path = f'{TEXT_PATH}/{txt_name}'

    if os.path.isfile(txt_path):
        # Cached OCR result from an earlier run.
        with open(txt_path, encoding='utf-8') as f:
            img_text = f.read()
    else:
        img_text = detect_handwritten_ocr(img_path)
        # The cache folder may not exist on a fresh checkout.
        os.makedirs(TEXT_PATH, exist_ok=True)
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(img_text)

    img_data.append({
        'name': img_name,
        'group': img_group,
        'text': img_text
    })

/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note01 [Data cleaning].jpg
/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note02 [Data cleaning].jpg
/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note03 [Data cleaning].jpg
/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note04 [Data cleaning].jpg
/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note05 [Data cleaning].jpg
/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note06 [Data cleaning].jpg
/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note07 [Data cleaning].jpg
/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note08 [Data cleaning].jpg
/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note09 [Data cleaning].jpg
/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note10 [Data cleaning].jpg
/Users/sorrosn/Deskt



/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note62 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note63 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note64 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note65 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note66 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note67 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note68 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note69 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note70 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note71 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note72 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note73 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note74 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note75 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note76 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note77 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note78 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note79 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note80 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note81 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note82 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note83 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note84 [Open EHR].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note85 [Ready ML models].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note86 [Ready ML models].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note87 [Ready ML models].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note88 [Ready ML models].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note89 [Ready ML models].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note90 [Ready ML models].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note91 [Ready ML models].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note92 [Ready ML models].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note93 [Ready ML models].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note94 [Ready ML models].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note95 [Ready ML models].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note96 [Ready ML models].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note97 [Ready ML models].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note98 [Data counceling].jpg




/Users/sorrosn/Desktop/ocr_images/postits/data_priority_area_all/notes/note99 [Data counceling].jpg




In [4]:
# Build the frame with an explicit column list: the column order is then the
# same on every pandas version, and an empty notes folder still yields a frame
# that has a 'text' column for the preprocessing step to read.
img_data = pd.DataFrame(img_data, columns=['name', 'group', 'text'])
img_data.head()

Unnamed: 0,group,name,text
0,Data cleaning,note01 [Data cleaning].jpg,807 OF BIOB\nINFORMANES IS\nCLEANING ZORG\nis ...
1,Data cleaning,note02 [Data cleaning].jpg,"Data cleaning\ngoolines of code,\ntoo was abou..."
2,Data cleaning,note03 [Data cleaning].jpg,Data cleaning\nBiggest challenge\nI think we h...
3,Data cleaning,note04 [Data cleaning].jpg,"CLEANING DATA ..\nESSENTIAL, UN-\nREWALED A Wi..."
4,Data cleaning,note05 [Data cleaning].jpg,THAT IS PROSAIC\nBUT NEARU Tape\nINSURMOUNTABL...


In [5]:
def preprocess_text(text):
    """Normalise a note's text for downstream analysis.

    The text is lower-cased and then passed through the gensim filters in a
    fixed order: stop-word removal, whitespace collapsing, punctuation
    stripping, tag stripping, stemming, and finally removal of tokens shorter
    than four characters.

    NOTE(review): tag stripping runs after punctuation stripping, so angle
    brackets have presumably already been removed by then - confirm whether
    this order is intended.

    Args:
        text: The raw text as a string.

    Returns:
        The cleaned text as a string.
    """
    # Order matters: each filter consumes the output of the previous one.
    pipeline = (
        remove_stopwords,
        strip_multiple_whitespaces,
        strip_punctuation,
        strip_tags,
        stem_text,
    )

    result = text.lower()
    for step in pipeline:
        result = step(result)

    # Short tokens are dropped last, i.e. on the stemmed forms.
    return strip_short(result, minsize=4)

In [6]:
# Run every note's OCR text through the cleaning pipeline and keep the result
# next to the raw text.
img_data['clean_text'] = img_data['text'].map(preprocess_text)

In [7]:
# Persist raw and cleaned text side by side; the file is named after the
# processed folder and written next to it.
output_csv_path = f'{PROJECT_PATH}/{FOLDER}.csv'
img_data.to_csv(output_csv_path)