In [1]:
# Collapse-show
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

import pandas as pd

from pathlib import Path

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    IS_COLAB = True
except Exception:
    IS_COLAB = False

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
assert tf.__version__ >= "2.0"

if not tf.test.is_gpu_available():
    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")

# Common imports
import numpy as np
import pandas as pd
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "rnn"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings('ignore')

if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. Neural nets can be very slow without a GPU.")
    if "google.colab" in sys.modules:
        print("Go to Runtime > Change runtime and select a GPU hardware "
              "accelerator.")
    if "kaggle_secrets" in sys.modules:
        print("Go to Settings > Accelerator and select GPU.")

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
No GPU was detected. LSTMs and CNNs can be very slow without a GPU.
No GPU was detected. Neural nets can be very slow without a GPU.


## An Encoder–Decoder Network for Neural Machine Translation

### Load and Explore data

In [2]:
data = pd.read_csv('mozilla_common_voice.csv')
data.head(10) # display the first 10 rows

Unnamed: 0,en_text,th_text
0,"The fool wanders, the wise man travels.",คนโง่พเนจร คนฉลาดท่องเที่ยว
1,One of these days is none of these days.,หนึ่งในวันเหล่านี้คือไม่มีวันเหล่านี้เลย
2,"Necessity is a hard nurse, but she raises stro...","ความจำเป็นเป็นสิ่งที่ยาก, แต่เธอสามารถเลี้ยงลู..."
3,In one ear and out the other.,เข้าหู้ข้างหนึ่งและออกอีกข้างหนึ่ง
4,It can't happen here is number one on the list...,ไม่สามารถเกิดขึ้นที่นี่ได้คือคำพูดสุดท้ายที่ดั...
5,Facts do not cease to exist because they are i...,ความจริงไม่ได้หายไปเพราะถูกเมิน
6,Pride cometh before a fall.,ความภาคภูมิมาก่อนที่จะล้ม
7,There is no fool like an old fool.,ไม่มีคนโง่อย่างเช่นคนแก่
8,Use not today what tomorrow will need.,อย่าใช้สิ่งวันนี้ถ้าต้องการวันพรุ่งนี้
9,"See nothing, say nothing, know nothing.",ไม่มอง ไม่พูด ไม่รู้


In [3]:
data.info # summary of the dataset

<bound method DataFrame.info of                                                  en_text  \
0                The fool wanders, the wise man travels.   
1               One of these days is none of these days.   
2      Necessity is a hard nurse, but she raises stro...   
3                          In one ear and out the other.   
4      It can't happen here is number one on the list...   
...                                                  ...   
33792                   If you can't help, don't hinder.   
33793                           It's all in a days work.   
33794  Laziness travels so slowly that poverty soon o...   
33795  Pushchairs can be folded when the toddler want...   
33796  Quantum computing machines are rare and hard t...   

                                                 th_text  
0                            คนโง่พเนจร คนฉลาดท่องเที่ยว  
1               หนึ่งในวันเหล่านี้คือไม่มีวันเหล่านี้เลย  
2      ความจำเป็นเป็นสิ่งที่ยาก, แต่เธอสามารถเลี้ยงลู...  
3          

In [4]:
data.isnull().sum() # Check if there is null value

en_text    0
th_text    0
dtype: int64

In [5]:
df = pd.read_csv('C:/Users/poope/env/MLII_asg/mozilla_common_voice.csv', encoding='utf-8')

# Extract only the desired columns
desired_columns = ["en_text", "th_text"]
df_subset = df[desired_columns]

# Write the subset to a new text file with tab separation
text_file_path = 'C:/Users/poope/env/MLII_asg/output.txt'
newData = df_subset.to_csv(text_file_path, sep='\t', index=False, header=False, encoding='utf-8')


In [6]:
# Open and read the file
with open('C:/Users/poope/env/MLII_asg/output.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [7]:
import numpy as np

text = text.replace("!", "").replace(".", "")
pairs = [line.split("\t") for line in text.splitlines()]
np.random.seed(42)  # extra code – ensures reproducibility on CPU
np.random.shuffle(pairs)
sentences_en, sentences_th = zip(*pairs)  # separates the pairs into 2 lists

In [8]:
for i in range(3):
    print(sentences_en[i], "=>", sentences_th[i])

You see, a minute goes by so fearfully quick => คุณเห็นแล้วใช่ไหมว่าแต่ละนาทีผ่านไปอย่างรวดเร็ว
Such is my passage engaged on the steamer => นี่คือข้อที่ฉันใช้บนเรือกลไฟ
The Queen sampled the selection of cakes => พระราชินีลองชิมเค้กที่แบ่งเป็นชิ้นไว้


In [9]:
vocab_size = 1000
max_length = 500
text_vec_layer_en = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length)
text_vec_layer_th = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length)
text_vec_layer_en.adapt(sentences_en)
text_vec_layer_th.adapt([f"startofseq {s} endofseq" for s in sentences_th])

In [10]:
text_vec_layer_en.get_vocabulary()[20:30]

['be', 'this', 'her', 'are', 'his', 'with', 'on', 'we', 'what', 'said']

In [11]:
text_vec_layer_th.get_vocabulary()[20:30]

['5',
 '3',
 'ฉันกล่าว',
 'เบอร์ตี้',
 'ฉันพูด',
 'คือ',
 'กล่าว',
 'มา',
 'คุณผู้หญิง',
 'adele']

In [12]:
text_vec_layer_en.dtype

'string'

In [13]:
non_string_items = []
for idx, val in enumerate(sentences_th):
    if not isinstance(val, str):
        non_string_items.append((idx, val))
non_string_items

[]

In [14]:
# Create a subset
subset = sentences_th[:10]  # Starting with the first 10 elements

# Try processing the subset
try:
    processed_subset = text_vec_layer_th(subset)
    print("Subset processed successfully!")
except Exception as e:
    print(f"Error encountered: {e}")

subset

Subset processed successfully!


('คุณเห็นแล้วใช่ไหมว่าแต่ละนาทีผ่านไปอย่างรวดเร็ว',
 'นี่คือข้อที่ฉันใช้บนเรือกลไฟ',
 'พระราชินีลองชิมเค้กที่แบ่งเป็นชิ้นไว้',
 'มีดยาวปักทะลุหัวใจของเขาจนทำให้เขาต้องล้มลงไปกับพื้น',
 'มันอาจจะเป็นสิ่งแรกที่พวกเขาคิดถึง',
 'แล้วเกิดอะไรขึ้นกับเขา',
 'เอสเธอร์เองก็คิดเช่นนั้น',
 'เศษที่ขาดไปของจดหมาย หมายถึงอะไร?',
 'ที่อยู่นี้ราคาเอื้อมถึงและสะดวกสบาย',
 'วัสดุอินทรีย์จะสลายตัวได้ตามธรรมชาติซึ่งแตกต่างจากพลาสติกอนินทรีย์')

In [15]:
X_train = tf.constant(sentences_en[:33_000])
X_valid = tf.constant(sentences_en[33_000:])
X_train_dec = tf.constant([f"startofseq {s}" for s in sentences_th[:33_000]])
X_valid_dec = tf.constant([f"startofseq {s}" for s in sentences_th[33_000:]])
Y_train = text_vec_layer_th([f"{s} endofseq" for s in sentences_th[:33_000]])
Y_valid = text_vec_layer_th([f"{s} endofseq" for s in sentences_th[33_000:]])

In [16]:
tf.random.set_seed(42)  # extra code – ensures reproducibility on CPU
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

In [17]:
embed_size = 128
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_th(decoder_inputs)
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                    mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                    mask_zero=True)
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

In [18]:
encoder = tf.keras.layers.LSTM(512, return_state=True)
encoder_outputs, *encoder_state = encoder(encoder_embeddings)

In [19]:
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

In [20]:
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)

In [None]:
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])

# Regularize validation set and stop once the model creases to improve.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

model.fit((X_train, X_train_dec), Y_train, epochs=10,
          validation_data=((X_valid, X_valid_dec), Y_valid), batch_size=256, callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10

In [None]:
def translate(sentence_en):
    translation = ""
    for word_idx in range(max_length):
        X = np.array([sentence_en])  # encoder input 
        X_dec = np.array(["startofseq " + translation])  # decoder input
        y_proba = model.predict((X, X_dec))[0, word_idx]  # last token's probas
        predicted_word_id = np.argmax(y_proba)
        predicted_word = text_vec_layer_th.get_vocabulary()[predicted_word_id]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()

In [None]:
translate('Facts)

In [None]:
translate('Sound as a bell')

In [None]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

tf.config.list_physical_devices('GPU')

from tensorflow.python.client import device_lib

print(device_lib.list_local_devices())

tf.test.is_built_with_cuda()

tf.debugging.set_log_device_placement(True)