In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from tensorflow.keras.layers import Embedding, Dense, Flatten, Dropout
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string, re

In [2]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
batch_size = 64
raw_data_ds = tf.data.TextLineDataset(["tr_corpus.txt"])

In [4]:
text=""
for elem in raw_data_ds:
   text=text+(elem.numpy().decode('utf-8'))

In [5]:
maxlen = 20
step = 3
input_chars = []
next_char = []

In [6]:
for i in range(0, len(text) - maxlen, step):
    input_chars.append(text[i : i + maxlen])
    next_char.append(text[i + maxlen])

In [7]:
print("Number of sequences:", len(input_chars))
print("input X  (input_chars)  --->   output y (next_char) ")

for i in range(5):
  print( input_chars[i],"   --->  ", next_char[i])

Number of sequences: 378737
input X  (input_chars)  --->   output y (next_char) 
﻿1948 yılında on ark    --->   a
48 yılında on arkada    --->   ş
yılında on arkadaş,     --->   N
ında on arkadaş, Nez    --->   i
a on arkadaş, Nezih     --->   B


In [8]:
X_train_ds_raw=tf.data.Dataset.from_tensor_slices(input_chars)
y_train_ds_raw=tf.data.Dataset.from_tensor_slices(next_char)

In [9]:
def custom_standardization(input_data):
    lowercase     = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    stripped_num  = tf.strings.regex_replace(stripped_html, "[\d-]", " ")
    stripped_punc  =tf.strings.regex_replace(stripped_num, "[%s]" % re.escape(string.punctuation), "")    
    return stripped_punc

def char_split(input_data):
  return tf.strings.unicode_split(input_data, 'UTF-8')

def word_split(input_data):
  return tf.strings.split(input_data)

In [10]:
max_features = 96   # Number of distinct chars / words
embedding_dim = 16             # Embedding layer output dimension
sequence_length = maxlen       # Input sequence size

In [11]:
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    split=char_split, # word_split or char_split
    output_mode="int",
    output_sequence_length=sequence_length,
)

In [12]:
vectorize_layer.adapt(X_train_ds_raw.batch(batch_size))

In [13]:
print("The size of the vocabulary (number of distinct characters): ", len(vectorize_layer.get_vocabulary()))

The size of the vocabulary (number of distinct characters):  55


In [14]:
def vectorize_text(text):
  text = tf.expand_dims(text, -1)
  return tf.squeeze(vectorize_layer(text))

In [15]:
# Vectorize the data.
X_train_ds = X_train_ds_raw.map(vectorize_text)
y_train_ds = y_train_ds_raw.map(vectorize_text)

X_train_ds.element_spec, y_train_ds.element_spec

(TensorSpec(shape=(20,), dtype=tf.int64, name=None),
 TensorSpec(shape=(20,), dtype=tf.int64, name=None))

In [16]:
y_train_ds=y_train_ds.map(lambda x: x[0])

In [17]:
train_ds =  tf.data.Dataset.zip((X_train_ds,y_train_ds))

In [18]:
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.shuffle(buffer_size=512).batch(batch_size, drop_remainder=True).cache().prefetch(buffer_size=AUTOTUNE)

In [19]:
for sample in train_ds.take(1):
  print("input (X) dimension: ", sample[0].numpy().shape, "\noutput (y) dimension: ",sample[1].numpy().shape)

input (X) dimension:  (64, 20) 
output (y) dimension:  (64,)


In [20]:
for sample in train_ds.take(1):
  print("input (sequence of chars): ", sample[0][0].numpy(), "\noutput (next char to complete the input): ",sample[1][0].numpy())

input (sequence of chars):  [ 9 11  3  7  2 11 13 15  8 13 26  9  7  3  6  2 21  3 30  0] 
output (next char to complete the input):  4


In [21]:
def decode_sequence (encoded_sequence):
  deceoded_sequence=[]
  for token in encoded_sequence:
    deceoded_sequence.append(vectorize_layer.get_vocabulary()[token])
  sequence= ''.join(deceoded_sequence)
  print(sequence.capitalize())
  return sequence

In [22]:
for sample in train_ds.take(1):
  print("input (sequence of chars): ", decode_sequence (sample[0][0].numpy()), "\noutput (next char to complete the input): ",vectorize_layer.get_vocabulary()[sample[1][0].numpy()])

Acıkır mülk köyü s
input (sequence of chars):  acıkır mülk köyü s 
output (next char to complete the input):  i


In [23]:
def softmax(z):
   return np.exp(z)/sum(np.exp(z))

In [24]:
def temperature_sampling (conditional_probability, temperature=1.0):
  conditional_probability = np.asarray(conditional_probability).astype("float64")
  conditional_probability = np.log(conditional_probability) / temperature
  reweighted_conditional_probability = softmax(conditional_probability)
  probas = np.random.multinomial(1, reweighted_conditional_probability, 1)
  return np.argmax(probas)

In [25]:
inputs = tf.keras.Input(shape=(sequence_length), dtype="int64")
x = Embedding(max_features, embedding_dim)(inputs)
x = CuDNNLSTM(256,return_sequences=True)(x)
x = CuDNNLSTM(256, return_sequences=True)(x)
x = Dropout(0.1)(x)
x = CuDNNLSTM(256, return_sequences=True)(x)
x = Flatten()(x)
predictions=  Dense(max_features, activation='softmax')(x)
model = tf.keras.Model(inputs, predictions,name="model")

In [26]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])
print(model.summary())

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20)]              0         
                                                                 
 embedding (Embedding)       (None, 20, 16)            1536      
                                                                 
 cu_dnnlstm (CuDNNLSTM)      (None, 20, 256)           280576    
                                                                 
 cu_dnnlstm_1 (CuDNNLSTM)    (None, 20, 256)           526336    
                                                                 
 dropout (Dropout)           (None, 20, 256)           0         
                                                                 
 cu_dnnlstm_2 (CuDNNLSTM)    (None, 20, 256)           526336    
                                                                 
 flatten (Flatten)           (None, 5120)              0     

In [27]:
try:
    model.fit(train_ds,epochs=17)
except KeyboardInterrupt:
    print("\nEğitim sonlandırıldı.")

Epoch 1/17



Eğitim sonlandırıldı.


In [28]:
model.save("TRcudnn_version.keras")

In [30]:
model.load_weights("cudnn_version.keras")

In [31]:
def generate_text(model, seed_original, step,temperatures=[]):
    seed_original=seed_original.lower()
    seed= vectorize_text(seed_original)
    print("The prompt is :",end="")
    decode_sequence(seed.numpy().squeeze())
    seed= vectorize_text(seed_original).numpy().reshape(1,-1)
    #Text Generated by Temperature Sampling
    for temperature in temperatures:
        print("Temperature: ", temperature)
        seed= vectorize_text(seed_original).numpy().reshape(1,-1)
        generated_temperature = (seed)
        for i in range(step):
            predictions=model.predict(seed)
            next_index = temperature_sampling(predictions.squeeze(), temperature)
            generated_temperature = np.append(generated_temperature, next_index)
            seed= generated_temperature[-sequence_length:].reshape(1,sequence_length)
        print("Output: ",end="")
        decode_sequence(generated_temperature)

In [34]:
generate_text(model,"Charlie and john ",95,[0.2])

The prompt is :Charlie and john 
Temperature:  0.3
Output: Charlie and john elıniluakeat e  ç                                                                             
