In [16]:
import numpy as np
import pandas as pd
import seaborn as sns
from itertools import chain, product
from model.CM_Label import CM_Label

import utils

# Settings

In [2]:
# Folder locations, written relative to the notebook's working directory
FD_DATA = './data/'
FD_WEIGHT = './weight/'
# Input CSV that is read into `df`
FN_DATA = FD_DATA + 'data_sample.csv'
# File the model is saved to and then reloaded from
FN_WEIGHT = FD_WEIGHT + 'weight_CM.p'

# Load Data

In [3]:
# Read the sample records; the file is decoded as latin-1.
df = pd.read_csv(FN_DATA, encoding='latin-1')
N = len(df)

# Look-up table: label -> first description recorded for that label
first_desc_per_label = df.groupby(by='LABEL')['LABEL_DESC'].first()
map_l2d = first_desc_per_label.to_dict()

df.columns, N

(Index(['DESC', 'LABEL', 'LABEL_DESC'], dtype='object'), 9327)

In [4]:
def _drop_category_markers(desc):
    """Return `desc` without any whitespace-separated token containing '(CAT)' or '(SUBCAT)'."""
    kept = [token for token in desc.split()
            if '(CAT)' not in token and '(SUBCAT)' not in token]
    return ' '.join(kept)


df['DESC'] = df['DESC'].apply(_drop_category_markers)

# Prepare data

In [5]:
# Tokenise both text columns on whitespace; every entry becomes a list of words.
x, y = (df[column].str.split().values for column in ('DESC', 'LABEL'))

In [6]:
# Distinct words seen on the description side and on the label side
universe_desc = {word for words in x for word in words}
universe_label = {word for words in y for word in words}

len(universe_desc), len(universe_label)

(4855, 5438)

In [7]:
# Build (description word, label word) training pairs

# NOTE: this cell rebinds `x` and `y`; re-run the cell that defines them from `df`
# before executing this one a second time.


def _as_object_array(words):
    """Return a 1-D object ndarray whose i-th element is the one-item list [words[i]].

    The array is filled element by element on purpose: handing a ragged sequence to
    np.asarray (the former `x + (0,)` trick) raises ValueError on NumPy >= 1.24.
    """
    arr = np.empty(len(words), dtype=object)
    for i, word in enumerate(words):
        arr[i] = [word]
    return arr


# Break every record down into all of its (word, word) combinations
pairs = [(w1, w2) for arr_x, arr_y in zip(x, y) for w1, w2 in product(arr_x, arr_y)]
x = _as_object_array([w1 for w1, _ in pairs])
y = _as_object_array([w2 for _, w2 in pairs])

# Shuffle both arrays with the same permutation (global NumPy seed fixed at 42)
idx = np.arange(x.shape[0])
np.random.seed(42)
np.random.shuffle(idx)
x, y = x[idx], y[idx]

# Hold-out split (disabled). `np.int` was removed in NumPy 1.24; use the builtin `int` if revived.
# n_test = int(np.floor(N * 0.1))
# x_train, y_train = x[:-n_test], y[:-n_test]
# x_test, y_test = x[-n_test:], y[-n_test:]

# Train CM Labeller

In [8]:
# NOTE(review): `param_optim` is not referenced again in the visible cells.
param_optim = {}

# Create model
model = CM_Label()

# Build: the model receives both vocabularies (sets of words)
model.build(universe_x=universe_desc, universe_y=universe_label)

# Compile
model.compile()

# Fit on the shuffled (description word, label word) pairs
model.fit(x, y, verbose=True)

# Save to FN_WEIGHT
model.save_model(FN_WEIGHT)

# Load straight back from the same file
# NOTE(review): presumably a round-trip check of the saved file -- confirm.
model.load_model(FN_WEIGHT)

# Inference (disabled)
# y_pred = model.predict(x, n_best=1)

# Evaluate (disabled)
# perc_match = (y_pred == y).sum() / y.shape[0]
# print('train acc {}'.format(perc_match))


In [9]:
def postprocess(word_src, candidates_tgt, fuzzy_pl=True):
    """Filter the candidate target words proposed for one source word.

    Args:
        word_src: source word (string).
        candidates_tgt: sequence of (word, score) pairs; it is walked in reverse order.
        fuzzy_pl: when true, a source word ending in 's'/'S' is also tried without
            that last character.

    Returns:
        List of [word, score] entries, in reversed input order, for every candidate
        whose score is positive and for which `utils.lcs` of the upper-cased source
        and target equals the source length.
        NOTE(review): presumably `utils.lcs` is a longest-common-subsequence length,
        i.e. the source letters must all occur, in order, in the target -- confirm.
    """
    def _fully_matched(abbr, target):
        # True when the lcs value accounts for every character of `abbr`
        return len(abbr) == utils.lcs(abbr.upper(), target.upper())

    accepted = []
    for candidate, score in candidates_tgt[::-1]:
        if not (float(score) > 0):
            continue
        keep = _fully_matched(word_src, candidate) or (
            fuzzy_pl
            and word_src[-1].upper() == 'S'
            # fuzzy match: retry without the trailing plural marker
            and _fully_matched(word_src[:-1], candidate)
        )
        if keep:
            accepted.append([candidate, score])
    return accepted

def map_word(x, model, n_best=30):
    """Ask `model` for candidates for every word in `x` and filter them with `postprocess`.

    Args:
        x: iterable of source words.
        model: object exposing `predict(inputs, n_best=..., score=True)` that returns
            a (predictions, scores) pair.
        n_best: number of candidates requested per word.

    Returns:
        A list with one `postprocess` result per source word, in input order.
    """
    # One single-word row per input, as passed to model.predict
    x_pad = np.asarray([[word] for word in x])
    y_pred, score_pred = model.predict(x_pad, n_best=n_best, score=True)
    # Pair each predicted word with its score along a new last axis
    candidates = np.stack((y_pred, score_pred), axis=-1)
    return [postprocess(word_src, word_candidates)
            for word_src, word_candidates in zip(x_pad.flatten(), candidates)]

In [10]:
# Sort the vocabulary: the iteration order of a set of str can change between kernel
# sessions (hash randomisation), so sorting keeps the word order reproducible.
words_src = sorted(universe_desc)
words_tgt = map_word(words_src, model, n_best=30)

In [17]:
# Source word -> list of accepted [target word, score] candidates.
# NOTE(review): this rebinds `map_word` from the function defined earlier to a dict,
# so the cell that calls map_word(...) cannot be re-run afterwards without
# re-defining the function first.
map_word = dict(zip(words_src, words_tgt))

map_word

{'BNN': [['Banana', '34.0']],
 'MDCN': [['Medicine', '3.0']],
 'GLPC': [['Gelpacs', '3.0']],
 'KRL': [['Krill', '6.0']],
 'HSHBR': [['Hashbrowns', '1.0']],
 'RANCH': [],
 'SCRT': [['Secret', '9.0']],
 'NGT': [['Nuggets', '4.0']],
 'KC': [['KC', '1.0']],
 'HRBD': [['Herbed', '1.0']],
 'SMLSN': [['Similasan', '3.0']],
 'LTRCY': [],
 'RNCH': [['Ranch', '50.0']],
 'STRP': [['Strips', '13.0'], ['Stripes', '4.0']],
 'CNTNR': [['Containers', '17.0'], ['Container', '7.0']],
 'CND': [['Canned', '1.0']],
 'JRDNS': [['Jordan', '1.0']],
 'DHYDR': [],
 'SPR': [['Super', '55.0'], ['Spears', '9.0']],
 'HYDRC': [['Hydrochloride', '4.0'], ['Hydrocortisone', '3.0']],
 'KD': [['KIDS', '1.0']],
 'FLFY': [['Fluffy', '4.0']],
 'AMRCS': [],
 'SCBLS': [['Sociables', '1.0']],
 'MCS': [['Mucus', '3.0']],
 'MDW': [['Meadow', '4.0']],
 'STRNR': [['Strainer', '1.0']],
 'SCHSN': [],
 'JELLY': [['Jelly', '1.0']],
 'RNS': [['Rinse', '10.0'],
  ['Detergent', '3.0'],
  ['Drying', '3.0'],
  ['ProClean', '2.0']],
 'DUOZ'

In [15]:
def _expand_desc(desc):
    """Replace each token of `desc` by its first mapped word; tokens with no candidates are dropped."""
    expanded = (map_word[token][0][0] if map_word[token] else '' for token in desc.split())
    return ' '.join(word for word in expanded if len(word))


df['DESC'].apply(_expand_desc)

0                     Doritos Tortilla Chips Nacho Cheese
1                            Lay Potato Chips Family Size
2       Coca Cola Low Sodium Can High Corn Syrup Water...
3                                    Ritz Butter Crackers
4       Kraft Original Macaroni Cheese Dinner Blue Wat...
5                          Hellmann Spread Mayonnaise Jar
6       Cheetos Cheese Puffs Flavored Crunch Gelatin F...
7       Pepsi Very Low Sodium High Corn Syrup Sugar Wa...
8           Fritos Corn Chips Gelatin Free Original Fresh
9           Cheetos Cheese Puffs Baked Gelatin Free Rolls
10                      Doritos Tortilla Chips Cola Ranch
11        Miracle Whip Original Liquid Salad Dressing Jar
12       Little Debbie Oatmeal Cream Pie Flavored Cookies
13                          Campbell Cream Mushroom Water
14         Doritos Tortilla Chips Nacho Cheese Party Size
15                       Mission Flour Soft Taco Tortilla
16         Oreo Sandwich Cream Flavored Chocolate Cookies
17      Pepper

In [18]:
# Write the word mapping to 'map_word.p' (path is relative to the working directory)
utils.save_pickle(map_word, 'map_word.p')

In [19]:
# Read the word mapping back from 'map_word.p'.
# NOTE(review): presumably this unpickles the file -- only load pickle files from a trusted source.
map_word = utils.load_pickle('map_word.p')