In [1]:

from pytesseract import *
import os
import cv2
from PIL import Image
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', None)


import numpy as np
import re
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
#from wine_catalog import *

In [2]:
def img_df3(img):
    """Run Tesseract on ``img`` and locate the wine numbers and listed prices.

    Parameters
    ----------
    img :
        Image object handed straight to ``image_to_data`` (the caller in this
        notebook passes an array returned by ``cv2.imread``).

    Returns
    -------
    df : pandas.DataFrame
        The tab-separated ``image_to_data`` output parsed into a frame, with
        rows whose ``text`` is missing removed and the index reset to
        ``0..n-1``.  The first TSV column is consumed as the index and
        discarded, exactly as the previous ``DataFrame.from_csv`` call did.
    df3 : pandas.DataFrame
        The rows of ``df`` that look like a listed price or a wine number,
        sorted by position, with two extra columns:
        ``old_index`` (the row's index in ``df``) and ``begin``
        (``1`` for a wine number, ``0`` for a price).
    """
    # Imported locally so this cell does not depend on edits to the import cell.
    import csv
    from io import StringIO

    tsv_text = image_to_data(img)

    # Parse in memory instead of round-tripping through a file called 'tmp' in
    # the working directory (which leaked whenever parsing failed).
    # - ``DataFrame.from_csv`` no longer exists in current pandas; ``read_csv``
    #   with ``index_col=0`` reproduces its column layout.
    # - QUOTE_NONE: OCR tokens may contain a stray '"', which the default
    #   quoting rules would treat as the start of a multi-field quoted string.
    # - ``text`` is forced to str so purely numeric tokens keep their exact
    #   spelling (e.g. '3.50') and the regex tests below always see strings.
    df = pd.read_csv(
        StringIO(tsv_text),
        sep='\t',
        index_col=0,
        quoting=csv.QUOTE_NONE,
        dtype={'text': str},
    )

    # if the text is Null/None/Nan, drop the row
    df = df[df.text.notnull()].reset_index(drop=True)

    ############### prices --> df1  ##################
    # Every token that starts with "<digits>.<two digits>" is a price candidate.
    is_price = df.text.str.match(r"\d+\.\d{2}").astype(bool)
    df1 = df[is_price].reset_index().rename(columns={'index': 'old_index'})

    # Keep only candidates that sit directly next to another candidate in the
    # token stream (``old_index`` differs by exactly 1).  Listed prices come in
    # groups, whereas a price mentioned inside a description stands alone.
    token_pos = df1['old_index']
    follows_price = token_pos.diff().eq(1)
    precedes_price = token_pos.diff(-1).eq(-1)
    # .copy() so that adding 'begin' does not write into a view of the slice.
    df1 = df1[follows_price | precedes_price].copy()
    df1['begin'] = 0

    ################   wine No. --> df2  ###################
    # A wine number is an all-digit token that is the first word on its line
    # ("word_num" == 1).
    is_integer = df.text.str.match(r"[0-9]+$").astype(bool)
    df2 = df[is_integer].reset_index().rename(columns={'index': 'old_index'})
    df2 = df2[df2['word_num'] == 1].copy()
    df2['begin'] = 1

    ################  concatenate --> df3  ##################
    df3 = pd.concat([df1, df2]).sort_values('old_index')

    return df, df3

In [3]:
def _extract_wine_rows(df, df3):
    """Build one ``[No., name, description, price, price, ...]`` list per wine.

    ``df`` is the token table and ``df3`` the marker table returned by
    ``img_df3``.  A wine starts at a marker with ``begin == 1`` that is
    immediately followed by a marker with ``begin == 0`` (its first price).
    """
    words = df['text'].tolist()
    flags = df3['begin'].tolist()
    positions = df3['old_index'].tolist()
    labels = df3['text'].tolist()
    n_markers = len(flags)

    def joined(start, stop):
        # Same formatting as before: each word prefixed by a space, then
        # trailing spaces/dots stripped.
        return ''.join(' ' + w for w in words[start:stop]).rstrip(' .')

    rows = []
    for i in range(n_markers - 1):
        if not (flags[i] == 1 and flags[i + 1] == 0):
            continue

        # Collect every consecutive price marker.  The bound is n_markers (not
        # n_markers - 1) so the final price on the page is not dropped.
        end = i + 1
        prices = []
        while end < n_markers and flags[end] == 0:
            prices.append(labels[end])
            end += 1

        # Name: the tokens between the wine number and its first price.
        name = joined(positions[i] + 1, positions[i + 1])

        # Description: the tokens after the last price, up to the next marker,
        # or to the end of the token table when this is the last entry.
        desc_stop = positions[end] if end < n_markers else len(words)
        description = joined(positions[end - 1] + 1, desc_stop)

        rows.append([labels[i], name, description] + prices)
    return rows


def table_reader(filename, margin=100):
    """Extract the wine table from one scanned page and save it as CSV.

    The image is OCR'd twice: once in full to find the vertical band that
    contains wine numbers / prices, then again on that band only.

    Parameters
    ----------
    filename : str
        Path of the image to read.
    margin : int, optional
        Number of pixel rows kept above and below the detected band
        (default 100, the value that used to be hard-coded).

    Returns
    -------
    pandas.DataFrame
        Columns ``No.``, ``Name``, ``Description`` and ``Prices1..PricesN``;
        rows with fewer prices are padded with empty strings.  The same frame
        is written next to the image with the extension replaced by ``.csv``.

    Raises
    ------
    IOError
        If the image cannot be read.
    ValueError
        If no wine number / price markers, or no complete wine entry, is found.
    """
    ### (1) Read the image:
    img = cv2.imread(filename)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise IOError("could not read image: %s" % filename)

    ### (2) Find the box we want to crop and do the cropping
    _, markers = img_df3(img)
    if markers.empty:
        raise ValueError("no wine numbers or prices detected in %s" % filename)
    top = max(int(markers.top.min()) - margin, 0)
    # Clamp to the real image height rather than a fixed 6000 px.
    bottom = min(int(markers.top.max()) + margin, img.shape[0])
    df, df3 = img_df3(img[top:bottom])

    ### (3) add wine info. to Pandas DataFrame
    rows = _extract_wine_rows(df, df3)
    if not rows:
        raise ValueError("no wine entries found in %s" % filename)

    # The number of prices varies from image to image; pad to the widest row.
    width = max(len(row) for row in rows)
    for row in rows:
        row += [''] * (width - len(row))

    columns = ["No.", "Name", "Description"] + ["Prices" + str(i + 1) for i in range(width - 3)]
    res_df = pd.DataFrame(rows, columns=columns)

    # splitext instead of filename[:-3] so extensions such as ".jpeg" also work.
    res_df.to_csv(os.path.splitext(filename)[0] + ".csv")

    return res_df

## Try on Single Image

In [5]:
# Extract the wine table from one sample page and preview its first rows.
filename = os.path.join(
    os.getcwd(),
    '../sample1/UCD_Lehmann_0922.jpg',
)
df = table_reader(filename)
df.head()

top, bottom 2229 4980


Unnamed: 0,No.,Name,Description,Prices1,Prices2
0,209,"VOLNAY, CLOS CHAMPANS",Estate—bottled by Marquis d’Angerville. We sh...,4.79,51.75
1,403,CHAPELLE-CHAMBERTIN,"Estate-bottled, P. Damoy. In a blind tasting,...",5.99,64.7
2,608,"VOSNE ROMANEE, LES REIGNOTS","This velvety, soft Vosne-Romanée. from the ou...",6.75,72.9
3,199,CORTON BRESSANDES,"Estate-bottled, Prince de Merode. A remarkabl...",7.39,79.8
4,775,CLOS DE VOUGEOT,Estate-bottled by R. Engel. Because of the si...,7.39,79.8


## Try on Multiple/All Images

In [None]:
# Run the table reader over every .jpg page in the sample folder.
path = os.path.join('../sample1','')
# Sorted so the processing order does not depend on the filesystem.
imgs = sorted(img for img in os.listdir(path) if img.endswith('.jpg'))
for img in tqdm(imgs):
    print()
    print("Working on %s ......" %img)
    file_path = os.path.join(os.getcwd(), path, img)
    try:
        df = table_reader(file_path)
        print(df.head())
    except Exception as err:
        # Catch Exception rather than using a bare except, so that interrupting
        # the kernel (KeyboardInterrupt) still stops this long-running loop,
        # and report why the page failed instead of hiding the reason.
        print("doesn't work for ", img, "(%s: %s)" % (type(err).__name__, err))

  0%|          | 0/100 [00:00<?, ?it/s]


Working on UCD_Lehmann_0006.jpg ......


  1%|          | 1/100 [00:26<44:16, 26.83s/it]

doesn't work for  UCD_Lehmann_0006.jpg

Working on UCD_Lehmann_0007.jpg ......


  2%|▏         | 2/100 [00:41<37:53, 23.20s/it]

doesn't work for  UCD_Lehmann_0007.jpg

Working on UCD_Lehmann_0015.jpg ......


  3%|▎         | 3/100 [01:18<44:18, 27.40s/it]

   No.                                       Name  \
0  842            CHATEAU HANTEILLAN (Haut Medoc)   
1  325   CHATEAU TEILLAC (Puisseguin-St. Emilion]   
2  311                   CHATEAU POTENSAC (Medoc)   
3  848    CHATEAU COUTELIN MERVILLE (St. Estephe]   
4  359            CHATEAU PICQUE-CAILLOU (Graves)   

                                         Description Prices1 Prices2 Prices3  \
0   This very wine was selected by a tasting pane...    1.99   21.50           
1   The fruit and softness of the St. Emilion are...    1.99   21.50           
2           Chateau-bottled, lovely, soft, ready now    1.99   21.50           
3   A discovery—not far from Calon Segur and simi...    2.29   24.75           
4   Entirely remarkable. Extraordinary example of...    2.79   30.15           

  Prices4  
0          
1          
2          
3          
4          

Working on UCD_Lehmann_0029.jpg ......


  4%|▍         | 4/100 [01:30<36:04, 22.55s/it]

doesn't work for  UCD_Lehmann_0029.jpg

Working on UCD_Lehmann_0033.jpg ......


  5%|▌         | 5/100 [01:45<32:14, 20.36s/it]

   No.                                        Name Description Prices1 Prices2
0  603                    BEAUJOLAIS 1967, Nicolas                1.49   16.10
1  801   BEAUIOLAIA VILLAGES, 1967, Vin de I’Annee                2.19   23.65
2  789       CHENAS, 1966, Rt. Hon. Ernest Marples                2.19   23.65
3  333     BROUILLY, 1966, Rt. Hon. Ernest Marples                2.19   24.97
4  794      FLEURIE, 1966, Rt. Hon. Ernest Marples                2.85   32.49

Working on UCD_Lehmann_0038.jpg ......


  6%|▌         | 6/100 [02:00<29:15, 18.68s/it]

doesn't work for  UCD_Lehmann_0038.jpg

Working on UCD_Lehmann_0059.jpg ......


  7%|▋         | 7/100 [02:42<39:48, 25.68s/it]

doesn't work for  UCD_Lehmann_0059.jpg

Working on UCD_Lehmann_0103.jpg ......


  8%|▊         | 8/100 [03:34<51:53, 33.84s/it]

doesn't work for  UCD_Lehmann_0103.jpg

Working on UCD_Lehmann_0106.jpg ......
