In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
def print_file_sizes(root):
    """Print every file found under ``root`` together with its size in bytes.

    Args:
        root: Directory to walk recursively. A directory that does not exist
            produces no output, because ``os.walk`` yields nothing for it.
    """
    for dirname, _, filenames in os.walk(root):
        for filename in filenames:
            file_path = os.path.join(dirname, filename)
            print(file_path, os.path.getsize(file_path))


# Same listing for the read-only inputs, the persisted outputs and the
# session-only scratch directory (previously three copy-pasted loops).
for kaggle_dir in ('/kaggle/input', '/kaggle/working', '/kaggle/temp'):
    print_file_sizes(kaggle_dir)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install deepcut -q -q -q --exists-action i
!pip install pythainlp['full'] -q -q -q --exists-action i

In [None]:
import subprocess
from keras.callbacks import ModelCheckpoint
import tensorflow as tf
import csv
import numpy as np
import deepcut
from keras.models import Model, load_model
from keras.layers import Input, Dense
from tensorflow.keras.utils import to_categorical, set_random_seed
import matplotlib.pyplot as plt
from random import shuffle
import pathlib
import random
import string
import re
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
import pythainlp
import json

# Fix the random seeds; `random` is used further down for shuffling and sampling.
# NOTE(review): Keras documents set_random_seed as seeding Python, NumPy and the
# backend together, which would make the explicit random.seed call redundant — confirm.
set_random_seed(99)
random.seed(99)

In [None]:
# Probe for an NVIDIA GPU by running `nvidia-smi`; any failure of that command
# is reported as "no GPU" instead of stopping the notebook.
try:
    subprocess.check_output('nvidia-smi')
    print('Nvidia GPU detected!')
except Exception: # this command not being found can raise quite a few different errors depending on the configuration
    print('No Nvidia GPU in system!')

In [None]:
print(tf.__version__)
# tf.test.is_gpu_available() is deprecated, and its result was discarded here
# because only a cell's last expression is displayed. Print the GPU list instead.
print(tf.config.list_physical_devices('GPU'))
print(tf.config.list_physical_devices())

In [None]:
def sentence_loader(input_file):
    """Turn ``<id>::<text>`` lines into space-separated token strings.

    Args:
        input_file: Iterable of lines. The field at index 1 after splitting
            on ``::`` is taken as the sentence text.

    Returns:
        A list with one string per line: the stripped text segmented by
        ``pythainlp.word_tokenize`` (``deepcut`` engine) with the tokens
        joined by single spaces.

    Raises:
        IndexError: If a line contains no ``::`` separator.
    """
    def _segment(line):
        # Split first so a malformed line fails before the tokenizer runs.
        text = line.split('::')[1].strip()
        tokens = pythainlp.word_tokenize(text, engine='deepcut')
        return ' '.join(tokens)

    lines = list(input_file)
    return [_segment(line) for line in lines]

def answer_loader(ans_file):
    """Turn ``<id>::<labels>`` lines into decoder target strings.

    Args:
        ans_file: Iterable of lines. The field at index 1 after splitting on
            ``::`` is taken as the comma-separated answer.

    Returns:
        A list with one string per line: the stripped answer with every comma
        replaced by a space, wrapped in ``[start]`` / ``[end]`` markers.

    Raises:
        IndexError: If a line contains no ``::`` separator.
    """
    return [
        '[start] ' + line.split('::')[1].strip().replace(",", " ") + ' [end]'
        for line in ans_file
    ]

def pair_gen(s,a):
    """Pair sentences with answers position by position.

    Args:
        s: Iterable of input sentences.
        a: Iterable of answers.

    Returns:
        A list of ``(sentence, answer)`` tuples. As with ``zip``, pairing
        stops at the shorter of the two iterables.
    """
    return [(sentence, answer) for sentence, answer in zip(s, a)]

In [None]:
# Raw corpus files; the 'utf-8-sig' codec drops a leading BOM if one is present.
# NOTE(review): these handles are never closed in the visible code — they are
# consumed later by sentence_loader / answer_loader.
input_file = open('/kaggle/input/typeof/input.txt', 'r',encoding = 'utf-8-sig')
ans_file = open('/kaggle/input/typeof/ans.txt', 'r',encoding = 'utf-8-sig')

In [None]:
# Check the same absolute cache path that the loading cell reads and writes,
# so the answer does not depend on the current working directory.
os.path.exists('/kaggle/working/text_pairs.json')

In [None]:
text_pairs_path = '/kaggle/working/text_pairs.json'
text_pairs = []
if os.path.exists(text_pairs_path):
    # Cache hit: reuse the stored pairs and skip re-tokenising every sentence.
    with open(text_pairs_path, 'rb') as fp:
        # JSON stores tuples as lists; convert back so both branches produce
        # the same element type.
        text_pairs = [tuple(pair) for pair in json.load(fp)]
else:
    # Cache miss: only now read and tokenise the raw files.
    input_data = sentence_loader(input_file)
    ans_data = answer_loader(ans_file)
    text_pairs = pair_gen(input_data, ans_data)
    with open(text_pairs_path, "w") as fp:
        json.dump(text_pairs, fp)
        print("Done writing JSON data into .json file")

# Quick sanity check: total count plus five random samples.
print(len(text_pairs))
for _ in range(5):
    print(random.choice(text_pairs))

In [None]:
# Shuffle in place, then carve out 15% for validation and 15% for test;
# whatever remains is the training split.
random.shuffle(text_pairs)
num_val_samples = int(len(text_pairs) * 0.15)
num_train_samples = len(text_pairs) - num_val_samples * 2
train_pairs, val_pairs, test_pairs = (
    text_pairs[:num_train_samples],
    text_pairs[num_train_samples:][:num_val_samples],
    text_pairs[num_train_samples:][num_val_samples:],
)

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

In [None]:
from posixpath import split
# Characters removed by custom_standardization: ASCII punctuation plus "¿",
# minus the square brackets so that the "[start]" / "[end]" markers survive.
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

vocab_size = 15000  # max_tokens of the input vectorizer; also reused as the model's vocabulary size
sequence_length = 32  # tokens per vectorized input sequence
batch_size = 32  # examples per tf.data batch


def custom_standardization(input_string):
    """Lower-case the text and delete every character listed in ``strip_chars``.

    Args:
        input_string: String tensor handed over by ``TextVectorization``.

    Returns:
        The lower-cased tensor with all ``strip_chars`` characters removed.
    """
    # Character class built from the escaped strip set.
    strip_pattern = "[%s]" % re.escape(strip_chars)
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, strip_pattern, "")


# Input sentences: default standardization and a whitespace split (the text was
# already joined with spaces by sentence_loader).
input_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
    split='whitespace',
)
# Answers: custom standardization keeps the "[start]" / "[end]" markers; one
# extra position is produced so the sequence can be offset by one step into
# decoder inputs and targets (see format_dataset).
# NOTE(review): max_tokens=10 caps the answer vocabulary (padding/OOV entries
# presumably count towards it) — confirm the label set fits.
ans_vectorization = TextVectorization(
    max_tokens=10,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
    split='whitespace',
)
train_input_texts = [pair[0] for pair in train_pairs]
train_ans_texts = [pair[1] for pair in train_pairs]
# Vocabularies are learned from the training split only.
input_vectorization.adapt(train_input_texts)
ans_vectorization.adapt(train_ans_texts)

In [None]:
def format_dataset(input, ans):
    """Vectorize a batch of texts into the model's (features, labels) layout.

    Args:
        input: Batch of input sentence strings.
        ans: Batch of answer strings.

    Returns:
        A ``(features, labels)`` tuple. ``features`` holds ``encoder_inputs``
        (vectorized inputs) and ``decoder_inputs`` (vectorized answers without
        their last position); ``labels`` is the vectorized answers without
        their first position, i.e. the decoder inputs shifted by one step.
    """
    encoder_tokens = input_vectorization(input)
    answer_tokens = ans_vectorization(ans)
    features = {
        "encoder_inputs": encoder_tokens,
        "decoder_inputs": answer_tokens[:, :-1],
    }
    labels = answer_tokens[:, 1:]
    return features, labels


def make_dataset(pairs):
    """Build a batched, vectorized ``tf.data`` pipeline from text pairs.

    Args:
        pairs: Non-empty sequence of ``(input_text, answer_text)`` pairs.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, labels)`` batches as
        produced by ``format_dataset``, with the batch order reshuffled on
        every iteration.

    Raises:
        ValueError: If ``pairs`` is empty (nothing to unpack from ``zip``).
    """
    input_texts, ans_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices(
        (list(input_texts), list(ans_texts))
    )
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    # Order matters: caching AFTER shuffle would freeze the first epoch's
    # order and replay it forever. Cache the vectorized batches first, then
    # shuffle so each epoch sees a new order, and prefetch last so it overlaps
    # with training instead of feeding the cache.
    return dataset.cache().shuffle(2048).prefetch(16)


# One pipeline per split, built in train / validation / test order.
train_ds, val_ds, test_ds = map(
    make_dataset, (train_pairs, val_pairs, test_pairs)
)

In [None]:
# Sanity check: print the tensor shapes of a single training batch.
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

In [None]:
class TransformerEncoder(layers.Layer):
    """Encoder block: self-attention followed by a two-layer feed-forward net.

    Each of the two sub-steps is wrapped in a residual connection and a
    ``LayerNormalization``.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        """Create the sub-layers.

        Args:
            embed_dim: Width of the block's output; also used as ``key_dim``
                of the attention layer.
            dense_dim: Width of the hidden (ReLU) feed-forward layer.
            num_heads: Number of attention heads.
            **kwargs: Forwarded to ``layers.Layer``.
        """
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [
                layers.Dense(dense_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        """Apply self-attention and the feed-forward projection to ``inputs``."""
        # NOTE(review): `mask` is accepted but never forwarded to the attention
        # layer; this presumably relies on Keras' implicit mask handling —
        # confirm for the installed version.
        attention_output = self.attention(query=inputs, value=inputs, key=inputs)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        """Return the base config extended with this layer's constructor arguments."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "dense_dim": self.dense_dim,
                "num_heads": self.num_heads,
            }
        )
        return config


class PositionalEmbedding(layers.Layer):
    """Sum of a token embedding and a position embedding.

    Positions are looked up in a second ``Embedding`` table indexed by
    ``0 .. length - 1``, where ``length`` is the last dimension of the input.
    """

    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        """Create the two embedding tables.

        Args:
            sequence_length: Number of rows in the position table.
            vocab_size: Number of rows in the token table.
            embed_dim: Output width of both embeddings.
            **kwargs: Forwarded to ``layers.Layer``.
        """
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        """Embed token ids and add the embedding of each position index."""
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        """Mark every non-zero token id as a valid (unmasked) position."""
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the base config extended with this layer's constructor arguments."""
        config = super().get_config()
        config.update(
            {
                "sequence_length": self.sequence_length,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config


class TransformerDecoder(layers.Layer):
    """Decoder block: causal self-attention, cross-attention, feed-forward.

    Each of the three sub-steps is followed by a residual ``Add`` and a
    ``LayerNormalization``.
    """

    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        """Create the sub-layers.

        Args:
            embed_dim: Width of the block's output; also used as ``key_dim``
                of both attention layers.
            latent_dim: Width of the hidden (ReLU) feed-forward layer.
            num_heads: Number of attention heads.
            **kwargs: Forwarded to ``layers.Layer``.
        """
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [
                layers.Dense(latent_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.add = layers.Add()  # instead of `+` to preserve mask
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        """Run the block on ``inputs`` while attending to ``encoder_outputs``."""
        # Self-attention over the decoder sequence, with a causal mask.
        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, use_causal_mask=True
        )
        out_1 = self.layernorm_1(self.add([inputs, attention_output_1]))

        # Cross-attention: decoder states query the encoder outputs.
        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
        )
        out_2 = self.layernorm_2(self.add([out_1, attention_output_2]))

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(self.add([out_2, proj_output]))

    def get_config(self):
        """Return the base config extended with this layer's constructor arguments."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "latent_dim": self.latent_dim,
                "num_heads": self.num_heads,
            }
        )
        return config

In [None]:
# Model hyper-parameters.
embed_dim = 300
latent_dim = 2048
num_heads = 16
num_en_transformer_blocks = 1
num_de_transformer_blocks = 1


# Encoder: positional embedding followed by the encoder block(s).
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
for _ in range(num_en_transformer_blocks):
    x = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder_outputs = x
encoder = keras.Model(encoder_inputs, encoder_outputs)

# Decoder: a separate model that takes the answer tokens plus the encoded sequence.
# NOTE(review): the decoder embedding and the output layer are sized with
# vocab_size (15000) although ans_vectorization is capped at max_tokens=10 —
# confirm this width is intended.
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
for _ in range(num_de_transformer_blocks):
    x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
    # NOTE(review): a dropout rate of 0 leaves activations unchanged.
    x = layers.Dropout(0)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

# End-to-end model: feed the encoder outputs into the decoder model.
decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

In [None]:
epochs = 100  # This should be at least 30 for convergence

# Save the model only when the monitored validation accuracy improves.
checkpoint = ModelCheckpoint('transformer_model_best_val.h5', # remember to change this path to your own file
                             verbose=1,
                             monitor='val_accuracy',
                             save_best_only=True,
                             mode='max')

transformer.summary()
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

In [None]:
# Train on the training split, validating each epoch; the checkpoint callback
# writes the best model to disk.
transformer.fit(train_ds, epochs=epochs, validation_data=val_ds,callbacks=[checkpoint])

In [None]:
# Index -> token table used to turn predicted ids back into answer tokens.
ans_vocab = ans_vectorization.get_vocabulary()
ans_index_lookup = dict(enumerate(ans_vocab))
max_decoded_sentence_length = 32

# The checkpoint contains the notebook's custom layer classes. Keras can only
# rebuild them during deserialisation when they are supplied via
# `custom_objects`; without it load_model fails with an unknown-layer error.
transformer = load_model(
    'transformer_model_best_val.h5',
    custom_objects={
        "PositionalEmbedding": PositionalEmbedding,
        "TransformerEncoder": TransformerEncoder,
        "TransformerDecoder": TransformerDecoder,
    },
)

def decode_sequence(input_sentence):
    """Greedily decode the answer sequence for one input sentence.

    Starting from ``"[start]"``, the most probable next token is appended at
    every step until ``"[end]"`` is produced or
    ``max_decoded_sentence_length`` steps have run.

    Args:
        input_sentence: Space-separated, already tokenized input string.

    Returns:
        The decoded string, beginning with ``"[start]"`` and ending with
        ``"[end]"`` when the model emitted it within the step limit.
    """
    tokenized_input_sentence = input_vectorization([input_sentence])
    # The model's output layer can be wider than the answer vocabulary. Only
    # ids present in the lookup table can be mapped back to a token, so the
    # argmax is limited to that range to avoid a KeyError.
    num_known_tokens = len(ans_index_lookup)
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = ans_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        # Position i holds the prediction for the token that follows the
        # i + 1 tokens decoded so far.
        sampled_token_index = int(np.argmax(predictions[0, i, :num_known_tokens]))
        sampled_token = ans_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break
    return decoded_sentence


# Spot check: decode 30 randomly chosen test sentences (drawn with replacement)
# and print each input followed by the model's output.
test_input_texts = [pair[0] for pair in test_pairs]
for _ in range(30):
    input_sentence = random.choice(test_input_texts)
    print(input_sentence)
    translated = decode_sequence(input_sentence)
    print(translated)

In [None]:
# Report loss and accuracy on the held-out test split.
transformer.evaluate(test_ds, verbose=1)