In [7]:
import numpy as np
import pandas as pd
from itertools import chain
from model.SVD_Label import SVD_Label
from model.NMF_Label import NMF_Label
from model.WE_Label import WE_Label

# Setting 

In [2]:
# Folders; each ends with a slash so a file name can be appended directly.
FD_DATA, FD_WEIGHT = './data/', './weight/'

# Input sample file.
FN_DATA = ''.join((FD_DATA, 'data_sample.csv'))

# Weight file shared by the training cells (save) and the validation cell (load).
# Exactly one of the three lines below should be active.
# FN_WEIGHT = FD_WEIGHT + 'weight_SVD.p'
# FN_WEIGHT = FD_WEIGHT + 'weight_NMF.p'
FN_WEIGHT = ''.join((FD_WEIGHT, 'weight_WE.p'))

# Load Data

In [3]:
# Read the sample file (latin-1 encoded) and record its row count.
df = pd.read_csv(FN_DATA, encoding='latin-1')
N = len(df)

# Map label to label description: for each LABEL keep the first
# (non-null) LABEL_DESC found in its group.
first_desc_per_label = df.groupby(by='LABEL')['LABEL_DESC'].first()
map_l2d = dict(first_desc_per_label.reset_index().values.tolist())

# Show the column names and the number of rows.
df.columns, N

(Index(['DESC', 'LABEL', 'LABEL_DESC'], dtype='object'), 9327)

# Train SVD Labeller

In [5]:
# Inputs: tokenise every description once; the same tokens feed both
# the vocabulary and the fit.
x = df.DESC.str.split().values
y_true = df.LABEL.values

# Universes: distinct tokens and distinct labels
universe_desc = set(chain.from_iterable(x))
universe_label = set(y_true)

# Create -> build -> compile
model = SVD_Label(n_components=22)
model.build(universe_x=universe_desc, universe_y=universe_label)
model.compile()

# Fit
model.fit(x, y_true, verbose=True)

# Save
# NOTE(review): FN_WEIGHT comes from the Setting cell, where 'weight_WE.p'
# is the active choice; switch it to 'weight_SVD.p' there before running
# this cell, otherwise the SVD weights are written under the WE file name.
model.save_model(FN_WEIGHT)

# Train NMF Labeller

In [27]:
# Inputs: tokenise every description once; the same tokens feed both
# the vocabulary and the fit.
x = df.DESC.str.split().values
y_true = df.LABEL.values

# Universes: distinct tokens and distinct labels
universe_desc = set(chain.from_iterable(x))
universe_label = set(y_true)

# Create -> build -> compile
model = NMF_Label(n_components=78)
model.build(universe_x=universe_desc, universe_y=universe_label)
model.compile()

# Fit
model.fit(x, y_true, verbose=True)

# Save
# NOTE(review): FN_WEIGHT comes from the Setting cell, where 'weight_WE.p'
# is the active choice; switch it to 'weight_NMF.p' there before running
# this cell, otherwise the NMF weights are written under the WE file name.
model.save_model(FN_WEIGHT)

Recon error: 12.689146348762879, Raw matrix norm: 1382.1798881565035


# Train WE Labeller

In [52]:
# Universes: distinct whitespace-separated tokens and distinct labels
tokenised_desc = df.DESC.str.split().values
universe_desc = set(chain.from_iterable(tokenised_desc))
universe_label = set(df.LABEL.values)

# Create model
# NOTE(review): word_size is set to the row count N - confirm this is the
# quantity WE_Label expects.
we_config = dict(
    word_size=N,
    vocabulary_size=len(universe_desc),
    label_size=len(universe_label),
    embedding_size=20,
)
model = WE_Label(**we_config)

# Build
model.build(universe_x=universe_desc, universe_y=universe_label)

# Compile
# num_sampled: no. of negative samples in tf.nn.nce_loss
model.compile(num_sampled=5)

# Fit (generator)
def generator(batch_size, rng=None):
    """Draw one random training batch from the module-level ``df``.

    Row positions are sampled from ``range(N)`` with ``choice`` (NumPy's
    default, i.e. with replacement), so a row may appear more than once
    in a batch.

    Args:
        batch_size: number of rows to draw.
        rng: optional random source exposing ``choice`` (for example
            ``np.random.RandomState(seed)``) so batches can be reproduced.
            When omitted, NumPy's global random state is used, which is
            the original behaviour.

    Returns:
        tuple ``(batch, labels)``: an array holding, for each sampled row,
        the list of whitespace-separated DESC tokens, and an array with
        the matching LABEL values.
    """
    sampler = np.random if rng is None else rng
    idx = sampler.choice(N, size=batch_size)
    # Select the sampled rows once and read both columns from that slice.
    rows = df.iloc[idx]
    batch = rows.DESC.str.split().values
    labels = rows.LABEL.values
    return batch, labels
# Full tokenised inputs and reference labels. fit_generator below does not
# receive them; the validation cell reads x and y_true.
x = df.DESC.str.split().values
y_true = df.LABEL.values

# Train from batches produced by generator
fit_options = dict(
    batch_size=10,
    epochs=1,
    steps_per_epoch=40001,
    verbose=True,
)
model.fit_generator(generator, **fit_options)

# Save
model.save_model(FN_WEIGHT)

Initialized
Average loss at step  0 :  12.969268798828125
Average loss at step  2000 :  4.7141783863082525
Average loss at step  4000 :  0.8928284910805523
Average loss at step  6000 :  0.6263821849692612
Average loss at step  8000 :  0.49908627847209575
Average loss at step  10000 :  0.3859197879666463
Average loss at step  12000 :  0.35697544111404567
Average loss at step  14000 :  0.3271955153606832
Average loss at step  16000 :  0.3261461468618363
Average loss at step  18000 :  0.30747248181235043
Average loss at step  20000 :  0.29119278528355064
Average loss at step  22000 :  0.2786038213027641
Average loss at step  24000 :  0.27221421768609433
Average loss at step  26000 :  0.26151013309834525
Average loss at step  28000 :  0.26197119642701
Average loss at step  30000 :  0.258291011323221
Average loss at step  32000 :  0.25221168718580156
Average loss at step  34000 :  0.2577984863622114
Average loss at step  36000 :  0.262198032848537
Average loss at step  38000 :  0.2583162902

# Validate

In [53]:
# Load the weights stored under FN_WEIGHT into the current model
model.load_model(FN_WEIGHT)

# Inference on x, the same tokenised descriptions the training cells
# define from df (so the score below is an in-sample match rate)
y_pred = model.predict(x)

# Evaluate: share of predictions equal to the reference label
is_match = (y_pred == y_true)
perc_match = is_match.sum() / N
perc_match

0.9181944891176155

In [54]:
# Translate each predicted label into its description
pred_descriptions = [map_l2d[label] for label in y_pred]
label_pred = np.asarray(pred_descriptions)

# Work on a copy so df itself is left untouched, and attach the
# inferred description next to the original columns
df_result = df.copy()
df_result['LABEL_DESC_INFER'] = label_pred

In [14]:
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)
# pd.set_option('display.max_colwidth', -1)

# df_result[df_result['LABEL_DESC'] != df_result['LABEL_DESC_INFER']]