In [471]:
from __future__ import print_function, division

from keras.datasets import mnist
(X_train, y_train), (_, _) = mnist.load_data()



In [472]:

print(X_train[0].shape)

(28, 28)


# Get receipt dataset

In [473]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.utils import shuffle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.preprocessing import sequence

df_col = ["sentence","brand_name","info","index","content","total","thank_you"]
y_col = ["brand_name","info","index","content","total","thank_you"]
train_df = pd.read_csv('../text_classification/31-07-vigroupped.csv',   encoding='utf-8')

seed = 120
np.random.seed(seed)
train_df = shuffle(train_df)
train_df.head()

X_train = train_df["sentence"].fillna("fillna").values
Y_train = train_df[['brand_name', 'info', 'index', 'content', 'total', 'thank_you']].values

tokenizer = Tokenizer()
texts = X_train

tokenizer.fit_on_texts(texts) 
Tokenizer_vocab_size = len(tokenizer.word_index) + 1

X_train_encoded_words = tokenizer.texts_to_sequences(X_train)

maxWordCount= 10
maxDictionary_size=Tokenizer_vocab_size
X_train_encoded_padded_words = sequence.pad_sequences(X_train_encoded_words, maxlen=maxWordCount)

In [474]:
X_train_encoded_padded_words.shape
# Y_train

(644, 10)

In [475]:
tokenizer.word_index['ngừ']

1195

In [476]:
# Convert Y_train
# range(0,len(y_col))[5]
y_train = []
for row in Y_train:
    for index,col in enumerate(range(0,len(y_col))):
        if row[col] == 1:
            y_train.append(index)
print(len(y_train))

644


# Gan 

In [477]:
from __future__ import print_function, division

from keras.datasets import mnist
from keras.layers import Input, Dense, Reshape, Flatten, Dropout, multiply
from keras.layers import BatchNormalization, Activation, Embedding, ZeroPadding2D
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.convolutional import UpSampling2D, Conv2D
from keras.models import Sequential, Model
from keras.optimizers import Adam

import matplotlib.pyplot as plt

import numpy as np


In [478]:
def build_generator(img_shape, latent_dim,num_classes):

        model = Sequential()

        model.add(Dense(Tokenizer_vocab_size, input_dim=latent_dim))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(Tokenizer_vocab_size*2))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(Tokenizer_vocab_size*4))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(np.prod(img_shape), activation='tanh'))
        model.add(Reshape(img_shape))

        model.summary()

        noise = Input(shape=(latent_dim,))
        label = Input(shape=(1,), dtype='int32')
        label_embedding = Flatten()(Embedding(num_classes, latent_dim)(label))

        model_input = multiply([noise, label_embedding])
        img = model(model_input)

        return Model([noise, label], img)

In [479]:
def build_discriminator(img_shape, num_classes):

        model = Sequential()

        model.add(Dense(Tokenizer_vocab_size*2, input_dim=np.prod(img_shape)))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(Tokenizer_vocab_size*2))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dropout(0.4))
        model.add(Dense(Tokenizer_vocab_size*2))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dropout(0.4))
        model.add(Dense(1, activation='sigmoid'))
        model.summary()

        img = Input(shape=img_shape)
        label = Input(shape=(1,), dtype='int32')

        label_embedding = Flatten()(Embedding(num_classes, np.prod(img_shape))(label))
        flat_img = Flatten()(img)

        model_input = multiply([flat_img, label_embedding])

        validity = model(model_input)

        return Model([img, label], validity)


In [480]:
# Setting variable

img_rows = 5
img_cols = 2
channels = 1
img_shape = (img_rows, img_cols, channels)
num_classes = len(y_col)
latent_dim = 100

optimizer = Adam(0.0002, 0.5)

# Build and compile the discriminator
discriminator = build_discriminator(img_shape,num_classes)
discriminator.compile(loss=['binary_crossentropy'],
    optimizer=optimizer,
    metrics=['accuracy'])

# Build the generator
generator = build_generator(img_shape,latent_dim,num_classes)

# The generator takes noise and the target label as input
# and generates the corresponding digit of that label
noise = Input(shape=(latent_dim,))
label = Input(shape=(1,))
img = generator([noise, label])

# For the combined model we will only train the generator
discriminator.trainable = False

# The discriminator takes generated image as input and determines validity
# and the label of that image
valid = discriminator([img, label])

# The combined model  (stacked generator and discriminator)
# Trains generator to fool discriminator
combined = Model([noise, label], valid)
combined.compile(loss=['binary_crossentropy'],
    optimizer=optimizer)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_166 (Dense)            (None, 2392)              26312     
_________________________________________________________________
leaky_re_lu_160 (LeakyReLU)  (None, 2392)              0         
_________________________________________________________________
dense_167 (Dense)            (None, 2392)              5724056   
_________________________________________________________________
leaky_re_lu_161 (LeakyReLU)  (None, 2392)              0         
_________________________________________________________________
dropout_105 (Dropout)        (None, 2392)              0         
_________________________________________________________________
dense_168 (Dense)            (None, 2392)              5724056   
_________________________________________________________________
leaky_re_lu_162 (LeakyReLU)  (None, 2392)              0         
__________

In [481]:
# Save model function

def save_model(generator,discriminator):
    def save(model, model_name):
        model_path = "saved_model/%s.json" % model_name
        weights_path = "saved_model/%s_weights.hdf5" % model_name
        options = {"file_arch": model_path,
                    "file_weight": weights_path}
        json_string = model.to_json()
        open(options['file_arch'], 'w').write(json_string)
        model.save_weights(options['file_weight'])

    save(generator, "generator")
    save(discriminator, "discriminator")

In [482]:
print(X_train_encoded_words[0])
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

# Function takes a tokenized sentence and returns the words
def sequence_to_text(list_of_indices):
    # Looking up words in dictionary
    words = [reverse_word_map.get(letter) for letter in list_of_indices]
    return(words)

# Creating texts 


[7, 227, 13, 228, 1]


In [483]:
def padded_sequence_to_text(int_arr):
    
    padded_sequence = int_arr.reshape((maxWordCount))
    padded_sequence = padded_sequence.tolist()
#     print(padded_sequence)
    started = False
    word_seq = []
    for word in padded_sequence:
        if started:
            word_seq.append(word)
        else:
            if word != 0:
                started = True
                word_seq.append(word)
    
    sentences = list(map(sequence_to_text, [word_seq]))
    if len(sentences)>0:
        my_texts = []
        for word in sentences[0]:
            if word:
                my_texts.append(word)
            
        return ' '.join(my_texts)
    return None
# print(X_train_encoded_padded_words[0])
# print(padded_sequence_to_text(X_train_encoded_padded_words[0]))

In [484]:
def convert_y(y):
    result = []
    for index, col in enumerate(y_col):
        if index == y:
            result.append(1)
        else:
            result.append(0)
#     print(result)
    return result

In [485]:
# sample_

def sample_images(epoch, generator):
        csvfile = 'cgan2d.csv'
        c = len(y_col)
        noise = np.random.normal(0, 1, (c, 100))
        sampled_labels = np.arange(0, len(y_col)).reshape(-1, 1)

        gen_imgs = generator.predict([noise, sampled_labels])
        # Rescale images 0 - 1
#         print(sampled_labels)
        gen_imgs = 1-(0.5 * gen_imgs + 0.5)
        gen_imgs = Tokenizer_vocab_size*gen_imgs
        
        
        int_arr = np.array(gen_imgs, dtype='int')
#         print(int_arr[0])
        
        
#         print(len(int_arr[0,:,:,0]))
#         fig, axs = plt.subplots(r, c)
        cnt = 0

        for j in range(c):
            sentence = padded_sequence_to_text(int_arr[cnt])
            result = convert_y(sampled_labels[cnt])
            if len(sentence) <= 0:
                continue
            print(sentence,':',sampled_labels[cnt])
            cnt += 1
            df = pd.read_csv(csvfile)# Loading a csv file with headers 
            data = {
                'sentence':sentence,
            }
            for index, col in enumerate(y_col):
                data[col] = result[index]
            df = df.append(data, ignore_index=True)
            df.to_csv(csvfile, index = False,  encoding='utf-8')
#                 axs[i,j].imshow(gen_imgs[cnt,:,:,0], cmap='gray')
#                 axs[i,j].axis('off')
#                 cnt += 1
#         fig.savefig("images/%d.png" % epoch)
#         plt.close()

In [486]:
X_train_encoded_padded_words

array([[   0,    0,    0, ...,   13,  228,    1],
       [   0,    0,    0, ...,    0,  415,  416],
       [   0,    0,    0, ...,  417,  418,  419],
       ...,
       [   0,    0,    0, ...,    0,  377,    1],
       [   0,    0,    0, ...,    0,    0,   13],
       [ 212, 1193, 1194, ...,    1,   19,    1]], dtype=int32)

In [487]:
x_train = []
for row in X_train_encoded_padded_words:
    aa = np.array(row)
    
    aa = np.reshape(aa,(5,2))
    print(aa)
    x_train.append(aa)

[[  0   0]
 [  0   0]
 [  0   7]
 [227  13]
 [228   1]]
[[  0   0]
 [  0   0]
 [  0   0]
 [  0   0]
 [415 416]]
[[  0   0]
 [  0   0]
 [  0   0]
 [  0 417]
 [418 419]]
[[ 0  0]
 [ 0  0]
 [ 0  0]
 [35  4]
 [36  1]]
[[  0   0]
 [  0 420]
 [ 11   1]
 [  1  37]
 [  1   1]]
[[ 0  0]
 [25 12]
 [66 78]
 [16  9]
 [90 46]]
[[  0   0]
 [  0   0]
 [  3 229]
 [128   3]
 [ 17   8]]
[[  0   0]
 [  0   0]
 [  0   0]
 [230 111]
 [129 130]]
[[  0   0]
 [  0   0]
 [  7 112]
 [421   2]
 [422   1]]
[[  0   0]
 [  0   0]
 [  0 231]
 [  9 113]
 [423   1]]
[[232  40]
 [233 159]
 [234 235]
 [236 424]
 [237 238]]
[[160 239]
 [240 161]
 [241 242]
 [162 243]
 [425  91]]
[[  0   0]
 [  0   0]
 [426 427]
 [428  67]
 [244   1]]
[[  0   0]
 [  0   0]
 [  0   0]
 [  0 245]
 [246 131]]
[[  0   0]
 [  0   0]
 [132  31]
 [429  67]
 [  2 430]]
[[ 0  0]
 [25 66]
 [ 2 79]
 [16 92]
 [80 80]]
[[  0   0]
 [  0   0]
 [  0   0]
 [  0 431]
 [432 433]]
[[  0   0]
 [  0   0]
 [ 41  53]
 [247 434]
 [435  10]]
[[  0   0]
 [  0   0]


In [488]:
x_train = np.array(x_train)
x_train.shape

(644, 5, 2)

In [None]:
# Traing
epochs = 601
batch_size=32
sample_interval=50

# Load the dataset
# Load the dataset
# (X_train, y_train), (_, _) = mnist.load_data()
X_train = x_train
y_train = np.array(y_train)

# Configure input
# X_train = (X_train.astype(np.float32) - 127.5) / 127.5
X_train = np.expand_dims(X_train, axis=3)
y_train = y_train.reshape(-1, 1)

# Adversarial ground truths
valid = np.ones((batch_size, 1))
fake = np.zeros((batch_size, 1))

for epoch in range(epochs):

    # ---------------------
    #  Train Discriminator
    # ---------------------

    # Select a random half batch of images
    idx = np.random.randint(0, X_train.shape[0], batch_size)
    imgs, labels = X_train[idx], y_train[idx]

    # Sample noise as generator input
    noise = np.random.normal(0, 1, (batch_size, 100))

    # Generate a half batch of new images
    gen_imgs = generator.predict([noise, labels])

    # Train the discriminator
    d_loss_real = discriminator.train_on_batch([imgs, labels], valid)
    d_loss_fake = discriminator.train_on_batch([gen_imgs, labels], fake)
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

    # ---------------------
    #  Train Generator
    # ---------------------

    # Condition on labels
    sampled_labels = np.random.randint(0, len(y_col), batch_size).reshape(-1, 1)

    # Train the generator
    g_loss = combined.train_on_batch([noise, sampled_labels], valid)

    # Plot the progress
    print ("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100*d_loss[1], g_loss))

    # If at save interval => save generated image samples
    if epoch % sample_interval == 0:
        sample_images(epoch, generator)
        save_model(generator,discriminator)

0 [D loss: 0.689363, acc.: 32.81%] [G loss: 0.683072]
ngừ khách tấn tên 1 ngừ châu mo số 32 : [0]
gian gia ttiền 02 vụ den ngừ 000 cashier đồ : [1]
giờ 000 count 000 đc ngừ 20141 68 3536 : [2]
kiem 08 nẵng exchangeable 20 ngừ món 880 leather chin : [3]
đằng 00 ct muỗi chivas bhd trâu đơn hoa khuyến : [4]
phác tram address q 000 ngừ 604 hàng ptt : [5]
1 [D loss: 0.438122, acc.: 50.00%] [G loss: 0.684036]
2 [D loss: 0.415822, acc.: 50.00%] [G loss: 0.689953]
3 [D loss: 0.392383, acc.: 70.31%] [G loss: 0.697532]
4 [D loss: 0.359233, acc.: 100.00%] [G loss: 0.707232]
5 [D loss: 0.352914, acc.: 100.00%] [G loss: 0.720255]
6 [D loss: 0.353294, acc.: 100.00%] [G loss: 0.737399]
7 [D loss: 0.347522, acc.: 98.44%] [G loss: 0.756218]
8 [D loss: 0.333782, acc.: 100.00%] [G loss: 0.777443]
9 [D loss: 0.309337, acc.: 100.00%] [G loss: 0.810131]
10 [D loss: 0.314901, acc.: 98.44%] [G loss: 0.850926]
11 [D loss: 0.292178, acc.: 98.44%] [G loss: 0.907675]
12 [D loss: 0.251361, acc.: 100.00%] [G loss: 

140 [D loss: 0.001030, acc.: 100.00%] [G loss: 6.184925]
141 [D loss: 0.077509, acc.: 98.44%] [G loss: 5.136583]
142 [D loss: 0.004039, acc.: 100.00%] [G loss: 5.011353]
143 [D loss: 0.003339, acc.: 100.00%] [G loss: 5.466925]
144 [D loss: 0.002021, acc.: 100.00%] [G loss: 5.670588]
145 [D loss: 0.001877, acc.: 100.00%] [G loss: 5.912338]
146 [D loss: 0.001547, acc.: 100.00%] [G loss: 6.071253]
147 [D loss: 0.001267, acc.: 100.00%] [G loss: 6.198174]
148 [D loss: 0.000998, acc.: 100.00%] [G loss: 6.461793]
149 [D loss: 0.000853, acc.: 100.00%] [G loss: 6.652143]
150 [D loss: 0.001383, acc.: 100.00%] [G loss: 6.504743]
151 [D loss: 0.000866, acc.: 100.00%] [G loss: 6.633652]
152 [D loss: 0.000880, acc.: 100.00%] [G loss: 6.866128]
153 [D loss: 0.000566, acc.: 100.00%] [G loss: 6.701253]
154 [D loss: 0.005390, acc.: 100.00%] [G loss: 6.401785]
155 [D loss: 0.000739, acc.: 100.00%] [G loss: 6.442341]
156 [D loss: 0.073501, acc.: 98.44%] [G loss: 5.215784]
157 [D loss: 0.003645, acc.: 100.

283 [D loss: 0.000164, acc.: 100.00%] [G loss: 8.236633]
284 [D loss: 0.000305, acc.: 100.00%] [G loss: 8.226313]
285 [D loss: 0.000142, acc.: 100.00%] [G loss: 8.127022]
286 [D loss: 0.000171, acc.: 100.00%] [G loss: 8.279829]
287 [D loss: 0.000132, acc.: 100.00%] [G loss: 8.043544]
288 [D loss: 0.002067, acc.: 100.00%] [G loss: 7.950860]
289 [D loss: 0.000189, acc.: 100.00%] [G loss: 7.878786]
290 [D loss: 0.000268, acc.: 100.00%] [G loss: 7.933727]
291 [D loss: 0.000314, acc.: 100.00%] [G loss: 8.031113]
292 [D loss: 0.000160, acc.: 100.00%] [G loss: 8.207396]
293 [D loss: 0.009694, acc.: 100.00%] [G loss: 6.254866]
294 [D loss: 0.001253, acc.: 100.00%] [G loss: 5.998363]
295 [D loss: 0.001179, acc.: 100.00%] [G loss: 6.258185]
296 [D loss: 0.000852, acc.: 100.00%] [G loss: 6.774701]
297 [D loss: 0.000675, acc.: 100.00%] [G loss: 7.029397]
298 [D loss: 0.000494, acc.: 100.00%] [G loss: 7.317335]
299 [D loss: 0.000340, acc.: 100.00%] [G loss: 7.404319]
300 [D loss: 0.000388, acc.: 10

426 [D loss: 0.000034, acc.: 100.00%] [G loss: 10.013780]
427 [D loss: 0.000040, acc.: 100.00%] [G loss: 9.858253]
428 [D loss: 0.000042, acc.: 100.00%] [G loss: 9.896381]
429 [D loss: 0.000027, acc.: 100.00%] [G loss: 9.686293]
430 [D loss: 0.000048, acc.: 100.00%] [G loss: 9.565679]
431 [D loss: 0.000035, acc.: 100.00%] [G loss: 9.741426]
432 [D loss: 0.000030, acc.: 100.00%] [G loss: 9.676340]
433 [D loss: 0.000038, acc.: 100.00%] [G loss: 9.964025]
434 [D loss: 0.000029, acc.: 100.00%] [G loss: 9.857815]
435 [D loss: 0.000032, acc.: 100.00%] [G loss: 9.956065]
436 [D loss: 0.000032, acc.: 100.00%] [G loss: 9.895308]
437 [D loss: 0.000028, acc.: 100.00%] [G loss: 9.954895]
438 [D loss: 0.000028, acc.: 100.00%] [G loss: 9.768171]
439 [D loss: 0.000035, acc.: 100.00%] [G loss: 10.139891]
440 [D loss: 0.000029, acc.: 100.00%] [G loss: 9.792784]
441 [D loss: 0.000024, acc.: 100.00%] [G loss: 9.962890]
442 [D loss: 0.000028, acc.: 100.00%] [G loss: 9.935460]
443 [D loss: 0.000027, acc.: 

567 [D loss: 0.000022, acc.: 100.00%] [G loss: 9.974796]
568 [D loss: 0.000024, acc.: 100.00%] [G loss: 10.123374]
569 [D loss: 0.000025, acc.: 100.00%] [G loss: 10.171907]
570 [D loss: 0.000018, acc.: 100.00%] [G loss: 10.473286]
571 [D loss: 0.000092, acc.: 100.00%] [G loss: 10.324974]
572 [D loss: 0.000057, acc.: 100.00%] [G loss: 10.220660]
573 [D loss: 0.000026, acc.: 100.00%] [G loss: 9.967640]
574 [D loss: 0.000057, acc.: 100.00%] [G loss: 10.524193]
575 [D loss: 0.000021, acc.: 100.00%] [G loss: 10.224580]
576 [D loss: 0.000026, acc.: 100.00%] [G loss: 10.357075]
577 [D loss: 0.000020, acc.: 100.00%] [G loss: 10.584774]
578 [D loss: 0.000024, acc.: 100.00%] [G loss: 10.102859]
579 [D loss: 0.000022, acc.: 100.00%] [G loss: 10.379586]
580 [D loss: 0.000020, acc.: 100.00%] [G loss: 10.446190]
581 [D loss: 0.000021, acc.: 100.00%] [G loss: 10.683065]
582 [D loss: 0.000024, acc.: 100.00%] [G loss: 10.478712]
583 [D loss: 0.000021, acc.: 100.00%] [G loss: 10.241052]
584 [D loss: 0.0