In [18]:
import ocrfunction
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pytesseract
import string
from PIL import Image
from os.path import isfile, join
import cv2
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import nltk.stem
from nltk.tokenize import word_tokenize
from datetime import datetime
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer

  from ._conv import register_converters as _register_converters


## Data Preparation

### Wine data

In [8]:
wine = pd.read_csv('https://raw.githubusercontent.com/zy2292/Wine-Master/master/Data/wine_data_cleaned.csv')
wine.head()

Unnamed: 0,index,country,province,region_1,designation,variety,winery,year,title,unique_name,description,points,price
0,0.0,Italy,Sicily & Sardinia,Etna,Vulkà Bianco,White Blend,Nicosia,2013.0,Nicosia 2013 Vulkà Bianco (Etna),Nicosia 2013 Vulkà Bianco (Etna) White Blend,"Aromas include tropical fruit, broom, brimston...",87.0,
1,1.0,Portugal,Douro,,Avidagos,Portuguese Red,Quinta dos Avidagos,2011.0,Quinta dos Avidagos 2011 Avidagos Red (Douro),Quinta dos Avidagos 2011 Avidagos Red (Douro) ...,"This is ripe and fruity, a wine that is smooth...",87.0,15.0
2,2.0,US,Oregon,Willamette Valley,,Pinot Gris,Rainstorm,2013.0,Rainstorm 2013 Pinot Gris (Willamette Valley),Rainstorm 2013 Pinot Gris (Willamette Valley) ...,"Tart and snappy, the flavors of lime flesh and...",87.0,14.0
3,3.0,US,Michigan,Lake Michigan Shore,Reserve Late Harvest,Riesling,St. Julian,2013.0,St. Julian 2013 Reserve Late Harvest Riesling ...,St. Julian 2013 Reserve Late Harvest Riesling ...,"Pineapple rind, lemon pith and orange blossom ...",87.0,13.0
4,4.0,US,Oregon,Willamette Valley,Vintner's Reserve Wild Child Block,Pinot Noir,Sweet Cheeks,2012.0,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,"Much like the regular bottling from 2012, this...",87.0,65.0


In [9]:
def str_process(string):
    string=string.replace('\n','')
    string=string.replace(' ', '')
    string=''.join([*filter(str.isalnum, string)])
    string=''.join(string)
    string=string.upper()
    return string

In [10]:
wine['designation']=wine['designation'].apply(str).apply(str_process)
wine['province']=wine['province'].apply(str).apply(str_process)
wine['region_1']=wine['region_1'].apply(str).apply(str_process)
wine['variety']=wine['variety'].apply(str).apply(str_process)
wine['winery']=wine['winery'].apply(str).apply(str_process)
wine['year']=wine['year'].fillna(0).apply(int).apply(str).apply(str_process)

### True label

In [11]:
labeldf = pd.read_csv('/Users/zhenli/training_label.csv')
labeldf = labeldf.iloc[0:100,:]
label = labeldf.iloc[:,1]
label_set = label.apply(lambda x: set(list(map(int,x.split(',')))))
label_set.head()

0                               {42190, 16919}
1    {73926, 8461, 29905, 62291, 16918, 30040}
2                                      {41747}
3                               {30024, 84417}
4                                      {87400}
Name: unique_name, dtype: object

### Boxes location

In [None]:
with open('/Users/zhenli/text-detection-ctpn-ocr/boxes.json') as f:
    boxes = json.load(f)

## Function preparation

In [24]:
def text_ocr(boxes):
    result=dict()
    for i in list(range(len(boxes))):
        num=i+1
        img = cv2.imread('/Users/zhenli/wineImage/train_'+ str(num) +'.jpg',0)
        res=''
        for j in list(range(len(boxes[str(i+1)]))):
            location=list(map(int,boxes[str(i+1)][str(j+1)].split(',')))
            min_x,min_y,max_x,max_y=location[0],location[1],location[2],location[3]
            image = img[min_y:max_y,min_x:max_x]
            text = pytesseract.image_to_string(image, config = '--psm 3', lang = 'eng+fra+deu+ita+spa+por+afr')
            res=res+text
            # height=int((max_y-min_y)/3)
            # bilateral = cv2.bilateralFilter(image, d=height, sigmaColor=height*2, sigmaSpace=height/2)
            # edges = cv2.Canny(bilateral,100,200)
            # text2 = pytesseract.image_to_string(edges, config = '--psm 3', lang = 'eng+fra+deu+ita+spa+por+afr'
        result[i]=res
    return result

In [12]:
from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer
from py_stringmatching.similarity_measure.tversky_index import TverskyIndex
def get_tversky_index(proto, query, n=3, beta=0.5):
    tversky = TverskyIndex(beta=beta)
    qgram = QgramTokenizer(qval=n, padding=False)
    inters = tversky.get_sim_score(qgram.tokenize(query), qgram.tokenize(proto))
    return inters

In [14]:
from multiprocessing import Pool
from functools import partial
def match_col(arg, query, n=3):
    col, beta = arg
    table = wine[col]
    score = table.apply(lambda x: get_tversky_index(x, query, n=n, beta=beta))
    return score

def get_index(query, betas):
    cols = ['designation', 'province','region_1','variety','winery','year']
    betas = betas
    args = zip(cols, betas)
    
    pool = Pool()
    func = partial(match_col, query=query, n=3)
    score = pool.map(func, args)
    wine['scores'] = sum(score)
    result = wine[wine['scores']==max(wine['scores'])]
    if sum(result['scores'])==0:
        res_index=[]
    else:
        if result.shape[0] > 8:
            result = result.sample(8)
        res_index=list(result.index)
    pool.close()
    pool.join()
    return list(result.index)

In [15]:
def get_match(x, y):
    return int(len(x.intersection(y)) != 0)
vec_match = np.vectorize(get_match)

In [16]:
def select_matching(betas, text_str, label_set):
    result = text_str[0].apply(lambda x: get_index(x,betas)).apply(set)
    match = vec_match(result, label_set)
    return sum(match)

In [22]:
def get_weighted_index(query, weights):
    cols = ['designation', 'province','region_1','variety','winery','year']
    betas = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
    args = zip(cols, betas)
    
    pool = Pool()
    func = partial(match_col, query=query, n=3)
    score = pool.map(func, args)
    wine['scores'] = np.average(score, axis=0, weights=weights)
    result = wine[wine['scores']==max(wine['scores'])]
    if sum(result['scores'])==0:
        res_index=[]
    else:
        if result.shape[0] > 8:
            result = result.sample(8)
        res_index=list(result.index)
    pool.close()
    pool.join()
    return res_index

def select_weighted_matching(text_str, label_set, weights):
    result = text_str[0].apply(lambda x: get_weighted_index(x,weights)).apply(set)
    match = vec_match(result, label_set)
    return sum(match)

# Model 1: baseline ocr only

In [3]:
text_unstructure={}
for i in range(1,101):
    Num = i
    imageOriginal = Image.open('/Users/zhenli/wineImage/train_'+ str(Num) +'.jpg').convert("L")
    image = np.asarray(imageOriginal)

    text = pytesseract.image_to_string(image, config = '--psm 3', lang = 'eng+fra+deu+ita+spa+por+afr')
    text_unstructure[i] = text

In [7]:
text_str = pd.DataFrame(text_unstructure,index=[0]).T
text_str.head()

Unnamed: 0,0
1,M U M M N A PA\n\nBRUT PRESTIGE
2,DE TRADITIONN\n\nC\n«Y‘ 54\nxx“ <6\n\nMUMM NAP...
3,A\nI CHAMPAGNE Oh\n\nBRUT\nMAISON FONDEE EN 1811
4,MUMM NAPA\n\nBRUT ROSE
5,JOSEPH PHELPS\n\nINSIGNIA\n\nNAPA VALLEY\n\nES...


In [17]:
text_str[0] = text_str[0].apply(str_process)
text_str.head()

Unnamed: 0,0
1,MUMMNAPABRUTPRESTIGE
2,DETRADITIONNCY54XX6MUMMNAPABLANCDEBLANCS
3,AICHAMPAGNEOHBRUTMAISONFONDEEEN1811
4,MUMMNAPABRUTROSE
5,JOSEPHPHELPSINSIGNIANAPAVALLEYESTATEGROWNREDWI...


In [19]:
print(str(datetime.now()))
betas=[0.5,0.5,0.5,0.5,0.5,0.5]
accuracy_model1=select_matching(betas,text_str,label_set)
print(accuracy_model1)
print(str(datetime.now()))

2018-05-09 20:34:10.264884
35
2018-05-09 20:54:41.437248


# Model 2: CTPN text detection + OCR with image preprocessing (biliteral and canny edge detection)

Modify **text_ocr** function

In [29]:
def img_text_ocr(boxes,height_divide,canny_range):
    result = dict()
    for i in list(range(len(boxes))):
        num = i+1
        img = cv2.imread('/Users/zhenli/wineImage/train_'+ str(num) +'.jpg',0)
        res = ''
        for j in list(range(len(boxes[str(i+1)]))):
            location=list(map(int,boxes[str(i+1)][str(j+1)].split(',')))
            min_x,min_y,max_x,max_y=location[0],location[1],location[2],location[3]
            image = img[min_y:max_y,min_x:max_x]
            height = int((max_y-min_y)/height_divide)
            bilateral = cv2.bilateralFilter(image, d=height, sigmaColor=height*2, sigmaSpace=height/2)
            edges = cv2.Canny(bilateral,canny_range[0],canny_range[1])
            text = pytesseract.image_to_string(edges, config = '--psm 3', lang = 'eng+fra+deu+ita+spa+por+afr')
            res = res+text
        result[i] = res
    return result

In [30]:
def select_ocr_matching(boxes,height_divide, canny_range,betas, label_set):
    text = img_text_ocr(boxes,height_divide, canny_range)
    text_str = pd.DataFrame(text,index=[0]).T
    text_str[0] = text_str[0].apply(str_process)
    result = text_str[0].apply(lambda x: get_index(x,betas)).apply(set)
    match = vec_match(result, label_set)
    return sum(match)

In [35]:
betas1=[0.5,0.5,0.5,0.5,0.5,0.5]
for k in [100,150,200]:
    print(str(datetime.now()))
    accuracy2_1=select_ocr_matching(boxes=boxes,height_divide=8, canny_range=[0.5*k,k],betas=betas1,label_set=label_set)
    print(str(8)+" "+str(0.5)+" "+str(k)+" "+'accuracy2_1:', accuracy2_1)

2018-05-09 22:23:35.180423
8 0.5 100 accuracy2_1: 12
2018-05-09 22:43:23.053058
8 0.5 150 accuracy2_1: 12
2018-05-09 23:02:42.836483
8 0.5 200 accuracy2_1: 14


In [42]:
betas2=[0.5,0.5,0.8,0.5,0.5,0.9]
for k in [100,150,200]:
    print(str(datetime.now()))
    accuracy2_2=select_ocr_matching(boxes=boxes,height_divide=8, canny_range=[0.5*k,k],betas=betas2,label_set=label_set)
    print(str(15)+" "+str(0.5)+" "+str(k)+" "+'accuracy2_2:', accuracy2_2)

2018-05-10 01:00:05.939767
15 0.5 100 accuracy2_2: 12
2018-05-10 01:22:34.699573
15 0.5 150 accuracy2_2: 13
2018-05-10 01:41:24.227373
15 0.5 200 accuracy2_2: 14


# Model 3: CTPN text detection+ baseline OCR

In [49]:
with open('/Users/zhenli/text-detection-ctpn-ocr/boxes.json') as f:
    boxes = json.load(f)

In [50]:
print(str(datetime.now()))
text=text_ocr(boxes)
print(str(datetime.now()))

2018-05-10 14:42:40.286934
2018-05-10 14:45:16.969882


In [51]:
text_str = pd.DataFrame(text,index=[0]).T
text_str.head()

Unnamed: 0,0
0,MUMM NAPABRUT PRESTIGE
1,09“ TRADITION~\nY\\n\n4 4‘4\n3’ ‘eMUMM NAPABLA...
2,"AND, 1:91.PR\n0\nk DU\nCT or EMF“;MAISON FONDE..."
3,BRUT Ros:MUMM NAPAw “z
4,INSIGNIAJOSEPH PHELPSRED WINEIAPA VALLEYESTATE...


In [52]:
text_str[0] = text_str[0].apply(str_process)
text_str.head()

Unnamed: 0,0
0,MUMMNAPABRUTPRESTIGE
1,09TRADITIONY4443EMUMMNAPABLANCDEBLANCS
2,AND191PR0KDUCTOREMFMAISONFONDEEEN1811750MGRAND...
3,BRUTROSMUMMNAPAWZ
4,INSIGNIAJOSEPHPHELPSREDWINEIAPAVALLEYESTATEGROWN


In [53]:
print(str(datetime.now()))
betas=[0.5,0.5,0.5,0.5,0.5,0.5]
accuracy_model3=select_matching(betas,text_str,label_set)
print("Accuracy:", accuracy_model3)
print(str(datetime.now()))

2018-05-10 14:45:43.752027
Accuracy: 38
2018-05-10 15:01:56.556376


# Model 4: CTPN text detection+ baseline OCR (4 gram)

Modify **get_index** function

In [43]:
def get_index_4g(query, betas):
    cols = ['designation', 'province','region_1','variety','winery','year']
    betas = betas
    args = zip(cols, betas)
    
    pool = Pool()
    func = partial(match_col, query=query, n=4)
    score = pool.map(func, args)
    wine['scores'] = sum(score)
    result = wine[wine['scores']==max(wine['scores'])]
    if sum(result['scores'])==0:
        res_index=[]
    else:
        if result.shape[0] > 8:
            result = result.sample(8)
        res_index=list(result.index)
    pool.close()
    pool.join()
    return list(result.index)

In [44]:
def select_matching_4g(betas, text_str, label_set):
    result = text_str[0].apply(lambda x: get_index_4g(x,betas)).apply(set)
    match = vec_match(result, label_set)
    return sum(match)

In [None]:
print(str(datetime.now()))
for year in [1, 1.6]:
    for region in [0.7, 1,1.3]:
        accuracy = select_weighted_matching(text_str, label_set, 
                                            weights=[1, (3-(region+year)), region, 1, 1, year])
        print('For year=' +str(year)+ ', region=' +str(region)+ ', accuracy:', accuracy)
        print(str(datetime.now()))

In [None]:
print(str(datetime.now()))
betas=[0.5,0.5,0.5,0.5,0.5,0.5]
accuracy_model4=select_matching_4g(betas,text_str,label_set)
print("Accuracy:", accuracy_model4)
print(str(datetime.now()))

# Model 5: baseline OCR+ weighted score

In [36]:
text_unstructure={}
for i in range(1,101):
    Num = i
    imageOriginal = Image.open('/Users/zhenli/wineImage/train_'+ str(Num) +'.jpg').convert("L")
    image = np.asarray(imageOriginal)

    text = pytesseract.image_to_string(image, config = '--psm 3', lang = 'eng+fra+deu+ita+spa+por+afr')
    text_unstructure[i] = text

In [37]:
text_str = pd.DataFrame(text_unstructure,index=[0]).T
text_str.head()

Unnamed: 0,0
1,M U M M N A PA\n\nBRUT PRESTIGE
2,DE TRADITIONN\n\nC\n«Y‘ 54\nxx“ <6\n\nMUMM NAP...
3,A\nI CHAMPAGNE Oh\n\nBRUT\nMAISON FONDEE EN 1811
4,MUMM NAPA\n\nBRUT ROSE
5,JOSEPH PHELPS\n\nINSIGNIA\n\nNAPA VALLEY\n\nES...


In [38]:
text_str.to_csv("text_str.csv")

In [None]:
text_str[0] = text_str[0].apply(str_process)
text_str.head()

In [None]:
print(str(datetime.now()))
for year in [1, 1.6]:
    for region in [0.7, 1,1.3]:
        accuracy = select_weighted_matching(text_str, label_set, 
                                            weights=[1, (3-(region+year)), region, 1, 1, year])
        print('For year=' +str(year)+ ', region=' +str(region)+ ', accuracy:', accuracy)
        print(str(datetime.now()))

# Model 6: CTPN text detection+ baseline OCR+ weighted score