In [471]:
from __future__ import print_function, division

from keras.datasets import mnist
(X_train, y_train), (_, _) = mnist.load_data()



In [472]:

print(X_train[0].shape)

(28, 28)


# Get receipt dataset

In [69]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.utils import shuffle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.preprocessing import sequence

df_col = ["sentence","brand_name","info","index","content","total","thank_you"]
y_col = ["brand_name","info","index","content","total","thank_you"]
train_df = pd.read_csv('../text_classification/31-07-vigroupped.csv',   encoding='utf-8')

seed = 120
np.random.seed(seed)
train_df = shuffle(train_df)
train_df.head()

X_train = train_df["sentence"].fillna("fillna").values
Y_train = train_df[['brand_name', 'info', 'index', 'content', 'total', 'thank_you']].values

tokenizer = Tokenizer()
texts = X_train

tokenizer.fit_on_texts(texts) 
Tokenizer_vocab_size = len(tokenizer.word_index) + 1

X_train_encoded_words = tokenizer.texts_to_sequences(X_train)

maxWordCount= 16
maxDictionary_size=Tokenizer_vocab_size
X_train_encoded_padded_words = sequence.pad_sequences(X_train_encoded_words, maxlen=maxWordCount)

In [14]:
X_train_encoded_padded_words.shape
# Y_train

(644, 10)

In [15]:
tokenizer.word_index['ngừ']

1195

In [29]:
# Convert Y_train
# range(0,len(y_col))[5]
y_train = []
for row in Y_train:
    for index,col in enumerate(range(0,len(y_col))):
        if row[col] == 1:
            y_train.append(index)
print(len(y_train))

644


# Gan 

In [17]:
from __future__ import print_function, division

from keras.datasets import mnist
from keras.layers import Input, Dense, Reshape, Flatten, Dropout, multiply
from keras.layers import BatchNormalization, Activation, Embedding, ZeroPadding2D
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.convolutional import UpSampling2D, Conv2D
from keras.models import Sequential, Model
from keras.optimizers import Adam

import matplotlib.pyplot as plt

import numpy as np


In [66]:
def build_generator(img_shape, latent_dim,num_classes):

        model = Sequential()

        model.add(Dense(128 * 1 * 1, activation="relu", input_dim=latent_dim))
        model.add(Reshape((1, 1, 128)))
        model.add(BatchNormalization(momentum=0.8))
        model.add(UpSampling2D())
        model.add(Conv2D(128, kernel_size=3, padding="same"))
        model.add(Activation("relu"))
        model.add(BatchNormalization(momentum=0.8))
        model.add(UpSampling2D())
        model.add(Conv2D(64, kernel_size=3, padding="same"))
        model.add(Activation("relu"))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Conv2D(1, kernel_size=3, padding='same'))
        model.add(Activation("tanh"))

        model.summary()

        noise = Input(shape=(latent_dim,))
        label = Input(shape=(1,), dtype='int32')
        label_embedding = Flatten()(Embedding(num_classes, 10)(label))

        model_input = multiply([noise, label_embedding])
        img = model(model_input)

        return Model([noise, label], img)

In [30]:
def build_discriminator(img_shape, num_classes):

        model = Sequential()

        model.add(Dense(Tokenizer_vocab_size*2, input_dim=np.prod(img_shape)))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(Tokenizer_vocab_size*2))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dropout(0.4))
        model.add(Dense(Tokenizer_vocab_size*2))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dropout(0.4))
        model.add(Dense(1, activation='sigmoid'))
        model.summary()

        img = Input(shape=img_shape)
        label = Input(shape=(1,), dtype='int32')

        label_embedding = Flatten()(Embedding(num_classes, np.prod(img_shape))(label))
        flat_img = Flatten()(img)

        model_input = multiply([flat_img, label_embedding])

        validity = model(model_input)

        return Model([img, label], validity)


In [67]:
# Setting variable

img_rows = 4
img_cols = 4
channels = 1
img_shape = (img_rows, img_cols, channels)
num_classes = len(y_col)
latent_dim = 10

optimizer = Adam(0.0002, 0.5)

# Build and compile the discriminator
discriminator = build_discriminator(img_shape,num_classes)
discriminator.compile(loss=['binary_crossentropy'],
    optimizer=optimizer,
    metrics=['accuracy'])

# Build the generator
generator = build_generator(img_shape,latent_dim,num_classes)

# The generator takes noise and the target label as input
# and generates the corresponding digit of that label
noise = Input(shape=(latent_dim,))
label = Input(shape=(1,))
img = generator([noise, label])
print(img.shape)
# For the combined model we will only train the generator
discriminator.trainable = False

# The discriminator takes generated image as input and determines validity
# and the label of that image
valid = discriminator([img, label])

# The combined model  (stacked generator and discriminator)
# Trains generator to fool discriminator
combined = Model([noise, label], valid)
combined.compile(loss=['binary_crossentropy'],
    optimizer=optimizer)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_80 (Dense)             (None, 2392)              40664     
_________________________________________________________________
leaky_re_lu_49 (LeakyReLU)   (None, 2392)              0         
_________________________________________________________________
dense_81 (Dense)             (None, 2392)              5724056   
_________________________________________________________________
leaky_re_lu_50 (LeakyReLU)   (None, 2392)              0         
_________________________________________________________________
dropout_33 (Dropout)         (None, 2392)              0         
_________________________________________________________________
dense_82 (Dense)             (None, 2392)              5724056   
_________________________________________________________________
leaky_re_lu_51 (LeakyReLU)   (None, 2392)              0         
__________

In [44]:
# Save model function

def save_model(generator,discriminator):
    def save(model, model_name):
        model_path = "saved_model/%s.json" % model_name
        weights_path = "saved_model/%s_weights.hdf5" % model_name
        options = {"file_arch": model_path,
                    "file_weight": weights_path}
        json_string = model.to_json()
        open(options['file_arch'], 'w').write(json_string)
        model.save_weights(options['file_weight'])

    save(generator, "generator")
    save(discriminator, "discriminator")

In [45]:
print(X_train_encoded_words[0])
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

# Function takes a tokenized sentence and returns the words
def sequence_to_text(list_of_indices):
    # Looking up words in dictionary
    words = [reverse_word_map.get(letter) for letter in list_of_indices]
    return(words)

# Creating texts 


[7, 227, 13, 228, 1]


In [46]:
def padded_sequence_to_text(int_arr):
    
    padded_sequence = int_arr.reshape((maxWordCount))
    padded_sequence = padded_sequence.tolist()
#     print(padded_sequence)
    started = False
    word_seq = []
    for word in padded_sequence:
        if started:
            word_seq.append(word)
        else:
            if word != 0:
                started = True
                word_seq.append(word)
    
    sentences = list(map(sequence_to_text, [word_seq]))
    if len(sentences)>0:
        my_texts = []
        for word in sentences[0]:
            if word:
                my_texts.append(word)
            
        return ' '.join(my_texts)
    return None
# print(X_train_encoded_padded_words[0])
# print(padded_sequence_to_text(X_train_encoded_padded_words[0]))

In [47]:
def convert_y(y):
    result = []
    for index, col in enumerate(y_col):
        if index == y:
            result.append(1)
        else:
            result.append(0)
#     print(result)
    return result

In [59]:
# sample_

def sample_images(epoch, generator):
        csvfile = 'cgan2d.csv'
        c = len(y_col)
        noise = np.random.normal(0, 1, (c, 10))
        sampled_labels = np.arange(0, len(y_col)).reshape(-1, 1)

        gen_imgs = generator.predict([noise, sampled_labels])
        # Rescale images 0 - 1
#         print(sampled_labels)
        gen_imgs = 1-(0.5 * gen_imgs + 0.5)
        gen_imgs = Tokenizer_vocab_size*gen_imgs
        
        
        int_arr = np.array(gen_imgs, dtype='int')
#         print(int_arr[0])
        
        
#         print(len(int_arr[0,:,:,0]))
#         fig, axs = plt.subplots(r, c)
        cnt = 0

        for j in range(c):
            sentence = padded_sequence_to_text(int_arr[cnt])
            result = convert_y(sampled_labels[cnt])
            if len(sentence) <= 0:
                continue
            print(sentence,':',sampled_labels[cnt])
            cnt += 1
#             df = pd.read_csv(csvfile)# Loading a csv file with headers 
#             data = {
#                 'sentence':sentence,
#             }
#             for index, col in enumerate(y_col):
#                 data[col] = result[index]
#             df = df.append(data, ignore_index=True)
#             df.to_csv(csvfile, index = False,  encoding='utf-8')
#                 axs[i,j].imshow(gen_imgs[cnt,:,:,0], cmap='gray')
#                 axs[i,j].axis('off')
#                 cnt += 1
#         fig.savefig("images/%d.png" % epoch)
#         plt.close()

In [50]:
X_train_encoded_padded_words

array([[  0,   0,   0, ...,  13, 228,   1],
       [  0,   0,   0, ...,   0, 415, 416],
       [  0,   0,   0, ..., 417, 418, 419],
       ...,
       [  0,   0,   0, ...,   0, 377,   1],
       [  0,   0,   0, ...,   0,   0,  13],
       [  0,   0,   0, ...,   1,  19,   1]], dtype=int32)

In [70]:
x_train = []
for row in X_train_encoded_padded_words:
    aa = np.array(row)
    
    aa = np.reshape(aa,(4,4))
#     print(aa)
    x_train.append(aa)

In [488]:
x_train = np.array(x_train)
x_train.shape

(644, 5, 2)

In [71]:
# Traing
epochs = 601
batch_size=32
sample_interval=50

# Load the dataset
# Load the dataset
# (X_train, y_train), (_, _) = mnist.load_data()
X_train = x_train
y_train = np.array(y_train)

# Configure input
X_train = (X_train.astype(np.float32) - 127.5) / 127.5
X_train = np.expand_dims(X_train, axis=3)
y_train = y_train.reshape(-1, 1)

# Adversarial ground truths
valid = np.ones((batch_size, 1))
fake = np.zeros((batch_size, 1))

for epoch in range(epochs):

    # ---------------------
    #  Train Discriminator
    # ---------------------

    # Select a random half batch of images
    idx = np.random.randint(0, X_train.shape[0], batch_size)
    imgs, labels = X_train[idx], y_train[idx]

    # Sample noise as generator input
    noise = np.random.normal(0, 1, (batch_size, 10))

    # Generate a half batch of new images
    gen_imgs = generator.predict([noise, labels])

    # Train the discriminator
    d_loss_real = discriminator.train_on_batch([imgs, labels], valid)
    d_loss_fake = discriminator.train_on_batch([gen_imgs, labels], fake)
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

    # ---------------------
    #  Train Generator
    # ---------------------

    # Condition on labels
    sampled_labels = np.random.randint(0, len(y_col), batch_size).reshape(-1, 1)

    # Train the generator
    g_loss = combined.train_on_batch([noise, sampled_labels], valid)

    # Plot the progress
    print ("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100*d_loss[1], g_loss))

    # If at save interval => save generated image samples
    if epoch % sample_interval == 0:
        sample_images(epoch, generator)
        save_model(generator,discriminator)

  'Discrepancy between trainable weights and collected trainable'


0 [D loss: 0.685560, acc.: 35.94%] [G loss: 0.683342]
muoi điển gst g8000 uỳnh not vat trứng hen 55 phe cua 0001 ——— lail dang : [0]
ức giờ 16 2x ngọt tây 2q ie 25 só lương hẽn 2dbtn 0971 nẵng con : [1]
py đằng dam hẹn cc book b4n farm tv khánh nguyễm cây 1111 ngọt 0902096 và : [2]
khuyến 900 ch 020 í bò menu hồ 941 xèo muỗi haineken tênhàng phs kdc bàn : [3]
f13q4 đongiá thank 420 tea boom tây 1851 100 12 80 matcha 48 tphcm 121 taxrm : [4]
thảo gọng 093 mr namu nẵng 020 165 cong 125 nôi come daie 540000 sling tax : [5]


  'Discrepancy between trainable weights and collected trainable'


1 [D loss: 0.406744, acc.: 50.00%] [G loss: 0.685641]
2 [D loss: 0.377031, acc.: 50.00%] [G loss: 0.692050]
3 [D loss: 0.375310, acc.: 82.81%] [G loss: 0.700904]
4 [D loss: 0.360473, acc.: 100.00%] [G loss: 0.710604]
5 [D loss: 0.364887, acc.: 100.00%] [G loss: 0.722484]
6 [D loss: 0.355819, acc.: 100.00%] [G loss: 0.738301]
7 [D loss: 0.341995, acc.: 100.00%] [G loss: 0.755489]
8 [D loss: 0.344479, acc.: 100.00%] [G loss: 0.775899]
9 [D loss: 0.327681, acc.: 100.00%] [G loss: 0.800324]
10 [D loss: 0.298904, acc.: 100.00%] [G loss: 0.834894]
11 [D loss: 0.289212, acc.: 100.00%] [G loss: 0.880019]
12 [D loss: 0.271650, acc.: 100.00%] [G loss: 0.940845]
13 [D loss: 0.249953, acc.: 100.00%] [G loss: 1.014797]
14 [D loss: 0.232133, acc.: 100.00%] [G loss: 1.134345]
15 [D loss: 0.210655, acc.: 98.44%] [G loss: 1.281694]
16 [D loss: 0.161189, acc.: 100.00%] [G loss: 1.471349]
17 [D loss: 0.136825, acc.: 100.00%] [G loss: 1.717657]
18 [D loss: 0.116112, acc.: 100.00%] [G loss: 2.000389]
19 [D

137 [D loss: 0.001525, acc.: 100.00%] [G loss: 5.929322]
138 [D loss: 0.074191, acc.: 98.44%] [G loss: 4.900976]
139 [D loss: 0.005857, acc.: 100.00%] [G loss: 4.989885]
140 [D loss: 0.003260, acc.: 100.00%] [G loss: 5.285584]
141 [D loss: 0.002328, acc.: 100.00%] [G loss: 5.612805]
142 [D loss: 0.001699, acc.: 100.00%] [G loss: 5.882111]
143 [D loss: 0.063449, acc.: 98.44%] [G loss: 4.541778]
144 [D loss: 0.005914, acc.: 100.00%] [G loss: 4.770958]
145 [D loss: 0.004051, acc.: 100.00%] [G loss: 5.293525]
146 [D loss: 0.002479, acc.: 100.00%] [G loss: 5.689938]
147 [D loss: 0.001744, acc.: 100.00%] [G loss: 5.913934]
148 [D loss: 0.001225, acc.: 100.00%] [G loss: 6.131908]
149 [D loss: 0.059081, acc.: 98.44%] [G loss: 4.882552]
150 [D loss: 0.006647, acc.: 100.00%] [G loss: 5.026805]
ngừ hàn sl 000 ngừ trích 4732 10 cá cốc cuộn 000 : [0]
tiền invoice price băp dao ngừ ngừ đà ngừ ngừ ngừ cá 1 1 đơn : [1]
00098 mantis lai khoai ngừ py đơn cá 4l puddlngtiủng ngừ ngừ : [2]
11 buom quản 17 

267 [D loss: 0.000151, acc.: 100.00%] [G loss: 8.345077]
268 [D loss: 0.000330, acc.: 100.00%] [G loss: 8.475836]
269 [D loss: 0.000117, acc.: 100.00%] [G loss: 8.360518]
270 [D loss: 0.000132, acc.: 100.00%] [G loss: 8.384274]
271 [D loss: 0.000141, acc.: 100.00%] [G loss: 8.622971]
272 [D loss: 0.000128, acc.: 100.00%] [G loss: 8.402697]
273 [D loss: 0.000100, acc.: 100.00%] [G loss: 8.589273]
274 [D loss: 0.000137, acc.: 100.00%] [G loss: 8.396505]
275 [D loss: 0.000561, acc.: 100.00%] [G loss: 8.592674]
276 [D loss: 0.000112, acc.: 100.00%] [G loss: 8.715261]
277 [D loss: 0.000109, acc.: 100.00%] [G loss: 8.624945]
278 [D loss: 0.000114, acc.: 100.00%] [G loss: 8.656914]
279 [D loss: 0.000298, acc.: 100.00%] [G loss: 8.610714]
280 [D loss: 0.000104, acc.: 100.00%] [G loss: 8.647027]
281 [D loss: 0.000367, acc.: 100.00%] [G loss: 8.699375]
282 [D loss: 0.000102, acc.: 100.00%] [G loss: 8.609598]
283 [D loss: 0.000180, acc.: 100.00%] [G loss: 8.641330]
284 [D loss: 0.000097, acc.: 10

401 [D loss: 0.000162, acc.: 100.00%] [G loss: 9.107002]
402 [D loss: 0.000073, acc.: 100.00%] [G loss: 9.403670]
403 [D loss: 0.000112, acc.: 100.00%] [G loss: 9.241503]
404 [D loss: 0.000143, acc.: 100.00%] [G loss: 9.231047]
405 [D loss: 0.000062, acc.: 100.00%] [G loss: 9.174343]
406 [D loss: 0.000056, acc.: 100.00%] [G loss: 9.181536]
407 [D loss: 0.000064, acc.: 100.00%] [G loss: 9.098770]
408 [D loss: 0.000066, acc.: 100.00%] [G loss: 9.487074]
409 [D loss: 0.000058, acc.: 100.00%] [G loss: 9.210809]
410 [D loss: 0.000075, acc.: 100.00%] [G loss: 9.343449]
411 [D loss: 0.000056, acc.: 100.00%] [G loss: 9.134583]
412 [D loss: 0.000065, acc.: 100.00%] [G loss: 9.477962]
413 [D loss: 0.000053, acc.: 100.00%] [G loss: 9.531595]
414 [D loss: 0.000049, acc.: 100.00%] [G loss: 9.508815]
415 [D loss: 0.000053, acc.: 100.00%] [G loss: 9.535431]
416 [D loss: 0.000062, acc.: 100.00%] [G loss: 9.285282]
417 [D loss: 0.000063, acc.: 100.00%] [G loss: 9.287171]
418 [D loss: 0.000050, acc.: 10

535 [D loss: 0.000022, acc.: 100.00%] [G loss: 10.213668]
536 [D loss: 0.000054, acc.: 100.00%] [G loss: 10.406489]
537 [D loss: 0.000021, acc.: 100.00%] [G loss: 10.134104]
538 [D loss: 0.000023, acc.: 100.00%] [G loss: 10.394485]
539 [D loss: 0.000023, acc.: 100.00%] [G loss: 10.193195]
540 [D loss: 0.000020, acc.: 100.00%] [G loss: 10.266439]
541 [D loss: 0.000021, acc.: 100.00%] [G loss: 10.506254]
542 [D loss: 0.000076, acc.: 100.00%] [G loss: 10.387527]
543 [D loss: 0.000023, acc.: 100.00%] [G loss: 10.422640]
544 [D loss: 0.000024, acc.: 100.00%] [G loss: 10.280684]
545 [D loss: 0.000024, acc.: 100.00%] [G loss: 10.444990]
546 [D loss: 0.000023, acc.: 100.00%] [G loss: 10.373248]
547 [D loss: 0.000044, acc.: 100.00%] [G loss: 10.375472]
548 [D loss: 0.000038, acc.: 100.00%] [G loss: 10.379084]
549 [D loss: 0.000020, acc.: 100.00%] [G loss: 10.418591]
550 [D loss: 0.000021, acc.: 100.00%] [G loss: 10.325291]
23 ngừ ngừ cuộn cuộn ngừ 000 1 biét dương ngừ : [0]
2 ngừ khoa đằng ngừ 