In [10]:
import numpy as np
import pandas as pd
from itertools import chain
from model.SVD_Label import SVD_Label
from model.NMF_Label import NMF_Label
from model.WE_Label import WE_Label

# Settings

In [11]:
# Base folders; the trailing slash lets file names be appended directly.
FD_DATA = './data/'
FD_WEIGHT = './weight/'

# Input dataset.
FN_DATA = '{}{}'.format(FD_DATA, 'data_sample.csv')

# Weight file used by the save/load calls that reference FN_WEIGHT.
# Alternative file names for the other labellers: 'weight_SVD.p', 'weight_NMF.p'.
FN_WEIGHT = '{}{}'.format(FD_WEIGHT, 'weight_WE.p')

# Load Data

In [12]:
# Read the dataset (the CSV is decoded as Latin-1) and remember its row count.
df = pd.read_csv(FN_DATA, encoding='latin-1')
N = len(df)

# Lookup from each LABEL to the first LABEL_DESC recorded for it.
first_desc_per_label = df.groupby(by='LABEL')['LABEL_DESC'].first()
map_l2d = dict(zip(first_desc_per_label.index.tolist(),
                   first_desc_per_label.tolist()))

# Show the column names and the number of rows.
df.columns, N

(Index(['DESC', 'LABEL', 'LABEL_DESC'], dtype='object'), 9327)

# Train SVD Labeller

In [None]:
# Train the SVD labeller on the full dataset and persist its weights.

# Tokenise every description once; the tokens feed both the vocabulary
# and the training input.
x = df.DESC.str.split().values
y_true = df.LABEL.values

# Universes: every distinct token and every distinct label in the data.
universe_desc = set(chain.from_iterable(x))
universe_label = set(y_true)

# Create model
model = SVD_Label(n_components=22)

# Build
model.build(universe_x=universe_desc, universe_y=universe_label)

# Compile
model.compile()

# Fit
model.fit(x, y_true, verbose=True)

# Save
# Use a dedicated file for this model: FN_WEIGHT is configured for the WE
# labeller, so saving there would store SVD weights under the WE file name.
fn_weight_svd = FD_WEIGHT + 'weight_SVD.p'
model.save_model(fn_weight_svd)

# Train NMF Labeller

In [None]:
# Train the NMF labeller on the full dataset and persist its weights.

# Tokenise every description once; the tokens feed both the vocabulary
# and the training input.
x = df.DESC.str.split().values
y_true = df.LABEL.values

# Universes: every distinct token and every distinct label in the data.
universe_desc = set(chain.from_iterable(x))
universe_label = set(y_true)

# Create model
model = NMF_Label(n_components=78)

# Build
model.build(universe_x=universe_desc, universe_y=universe_label)

# Compile
model.compile()

# Fit
model.fit(x, y_true, verbose=True)

# Save
# Use a dedicated file for this model: FN_WEIGHT is configured for the WE
# labeller, so saving there would store NMF weights under the WE file name.
fn_weight_nmf = FD_WEIGHT + 'weight_NMF.p'
model.save_model(fn_weight_nmf)

# Train WE Labeller

In [13]:
# Train the WE labeller on a shuffled 90% split and keep 10% for validation.

# Tokenise every description once; the tokens feed both the vocabulary
# and the model input.
x = df.DESC.str.split().values
y = df.LABEL.values

# Universes: every distinct token and every distinct label in the data.
universe_desc = set(chain.from_iterable(x))
universe_label = set(y)

# Create model
model = WE_Label(word_size=N,
                 vocabulary_size=len(universe_desc),
                 label_size=len(universe_label),
                 embedding_size=40)

# Build
model.build(universe_x=universe_desc, universe_y=universe_label)

# Compile
model.compile(num_sampled=5)     # num_sampled:   no. of negative samples in tf.nn.nce_loss

# Shuffle
# A fixed seed makes the train/test split reproducible across kernel restarts.
# RandomState is used instead of the global np.random state so the seed only
# affects this split.
idx = np.random.RandomState(42).permutation(N)
x, y = x[idx], y[idx]

# Hold out the last 10% of the shuffled rows for validation.
# The size is a plain int (np.int was removed in NumPy 1.24) and the split
# uses a positive index, so a zero-sized hold-out leaves the training set
# intact instead of emptying it (x[:-0] is empty).
n_test = int(N * 0.1)
n_train = N - n_test
x_train, y_train = x[:n_train], y[:n_train]
x_test, y_test = x[n_train:], y[n_train:]

# Fit
model.fit(x_train, y_train, verbose=True)

# Save
model.save_model(FN_WEIGHT)

# Validate

In [14]:
# Restore the weights written by the training step.
model.load_model(FN_WEIGHT)

# Predict labels for the held-out descriptions.
y_pred = model.predict(x_test)

# Fraction of held-out rows whose predicted label equals the true label.
n_match = (y_pred == y_test).sum()
perc_match = n_match / len(y_test)
perc_match

0.9120171673819742

In [9]:
# Disabled: translate predicted labels into their descriptions via map_l2d
# and attach them to a copy of df as LABEL_DESC_INFER.
# NOTE(review): y_pred is produced from x_test (the shuffled hold-out split),
# so it presumably has fewer rows than df and a different order -- confirm the
# alignment before re-enabling the column assignment.
# label_pred = np.asarray([ map_l2d[x] for x in y_pred ])

# df_result = df.copy()
# df_result['LABEL_DESC_INFER'] = label_pred

In [None]:
# Disabled: widen the pandas display limits, then list the rows whose inferred
# description differs from the recorded LABEL_DESC.
# Depends on df_result, which is only created by the (also disabled) cell above.
# NOTE(review): newer pandas releases deprecate -1 for display.max_colwidth;
# use None when re-enabling.
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)
# pd.set_option('display.max_colwidth', -1)

# df_result[df_result['LABEL_DESC'] != df_result['LABEL_DESC_INFER']]