In [1]:
from PIL import Image
import pytesseract

In [2]:
# OCR the February electric bill scan. Opening the image in a `with` block
# closes the underlying file as soon as the text has been extracted instead
# of leaving the handle open for the lifetime of the kernel.
with Image.open('images/feb_elec.jpg') as feb_elec_image:
    feb_elec_text = pytesseract.image_to_string(feb_elec_image)
print(feb_elec_text[:100])

TesseractNotFoundError: tesseract is not installed or it's not in your path

## Attempt 1 - Common words

In [3]:
import re
# Lower-case the OCR text and keep only whole words made purely of the
# letters a-z; digits, punctuation and mixed tokens are discarded.
word_pattern = re.compile(r'\b[a-z]+\b')
feb_elec_text_list = word_pattern.findall(feb_elec_text.lower())
print(feb_elec_text_list[:50])

['malra', 'pu', 'hlic', 'p', 'nwer', 'diskin', 'for', 'bill', 'inquiries', 'call', 'the', 'omaha', 'see', 'back', 'for', 'toll', 'free', 'number', 'page', 'of', 'account', 'number', 'due', 'date', 'total', 'amount', 'due', 'feb', 'customer', 'name', 'haffner', 'robert', 'm', 'date', 'february', 'billing', 'information', 'for', 'service', 'addressm', 'rate', 'billing', 'period', 'meter', 'meter', 'reading', 'usa', 'e', 'from', 'to']


In [4]:
from nltk.corpus import stopwords
# Remove English stop words so that only the remaining tokens are counted.
stop_words = set(stopwords.words('english'))
words = [token for token in feb_elec_text_list if token not in stop_words]

print(words[:10])

['malra', 'pu', 'hlic', 'p', 'nwer', 'diskin', 'bill', 'inquiries', 'call', 'omaha']


In [5]:
from collections import Counter

# Tally how often each remaining word occurs, then show the three most
# frequent (word, count) pairs.
word_count = Counter()
word_count.update(words)
print(word_count.most_common(3))

[('due', 4), ('date', 3), ('usage', 3)]


In [15]:
# Join the three most frequent words with underscores for use in a file name.
words_for_name = '_'.join(word for word, _freq in word_count.most_common(3))

def get_datestamp():
    """Return today's local date as a ``'<year>_<month>_<day>'`` string.

    Month and day are not zero-padded, e.g. ``'2017_8_5'``.
    """
    # Imported locally so the function does not rely on a `datetime` name
    # leaked from some other cell; without it a fresh kernel raises NameError.
    from datetime import datetime

    tdy = datetime.today()
    return '{}_{}_{}'.format(tdy.year, tdy.month, tdy.day)

# Assemble the new file name as '<datestamp>_<most common words>.jpg'.
datestamp = get_datestamp()
new_file_name = '{}_{}.jpg'.format(datestamp, words_for_name)
print(new_file_name)

2017_8_5_due_date_usage.jpg


## Attempt 2 - String comparison

In [9]:
# OCR the March electric bill scan. The `with` block closes the image file
# once the text has been extracted rather than leaving the handle open.
with Image.open('images/mar_elec.jpg') as mar_elec_image:
    mar_elec_text = pytesseract.image_to_string(mar_elec_image)
print(mar_elec_text[:100])

ifs?

ﬂmaha Publi: Paws! Districl

For bill inquiries can the Omaha Office
(402) 536-4131 See back f


In [10]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Similarity score between the February and March electric bill texts.
feb_vs_mar_score = fuzz.ratio(feb_elec_text, mar_elec_text)
print(feb_vs_mar_score)

85


In [11]:
# OCR the February gas/water bill scan. The `with` block closes the image
# file once the text has been extracted rather than leaving the handle open.
with Image.open('images/feb_gas_water.jpg') as feb_gas_water_image:
    feb_gas_water_text = pytesseract.image_to_string(feb_gas_water_image)
print(feb_gas_water_text[:100])

Share the warmth and make a
difference.

Join us at the 10m annual Heat the
Streets Run and Walk for


In [16]:
# Similarity score between the February electric and gas/water bill texts.
feb_elec_vs_gas_water_score = fuzz.ratio(feb_elec_text, feb_gas_water_text)
print(feb_elec_vs_gas_water_score)

32


In [13]:
# Reference texts keyed by the label that should end up in the file name.
choices = {
    'electric_bill': feb_elec_text,
    'water_gas_bill': feb_gas_water_text,
}

# With dict choices, extractOne yields (matched text, score, key), or None
# when no candidate reaches score_cutoff.
best_match = process.extractOne(
    mar_elec_text,
    choices=choices,
    score_cutoff=75,
)
if best_match is not None:
    print(best_match[2])

electric_bill


In [14]:
# extractOne returns None when no reference text reaches score_cutoff, so
# fail with a clear message instead of a TypeError from indexing None.
if best_match is None:
    raise ValueError('No reference document matched closely enough to name the file')

# Index 2 of the match is the key of the winning entry in `choices`.
new_file_name = '{}_{}.jpg'.format(get_datestamp(), best_match[2])
print(new_file_name)

2017_8_5_electric_bill.jpg
