## Veri Çekme

In [1]:
with open('dataset.txt', encoding='utf-8') as file:
    data = file.readlines()

temp= list(map(lambda x : x.split('~'),list(map(lambda x:x.replace('\n',''),data))))

## Soru-Cevap çiftlerinin ayrılması

In [2]:
questions = list(map(lambda x : x[0], temp))
answers = list(map(lambda x : x[1], temp))
del(temp, file, data)

## Veri temizliği

In [3]:
turkish_stopwords = None

with open("turkce-stop-words.txt", "r", encoding='utf-8') as file:
    turkish_stopwords = set(file.read().replace("\n"," ").split())

import re

def veri_temizligi(text):
    metin = re.sub("[^a-zA-ZçÇğĞıİöÖşŞüÜ]", " ", text).lower()
    kelimeler = metin.split()
    kelimeler = [i for i in kelimeler if not i in turkish_stopwords]
    
    return kelimeler

## Verilerin filtrelenip güncellenmesi

In [4]:
def update_dataset(data):
    MAX_LEN = 0
    for i in range(len(data)):
        kokler = veri_temizligi(data[i])
        MAX_LEN = len(kokler) if MAX_LEN < len(kokler) else MAX_LEN  
        data[i] = " ".join(kokler)
        
    return data, MAX_LEN

questions_data , MAX_LEN_QUESTION = update_dataset(questions)
answers_data , MAX_LEN_ANSWER = update_dataset(answers)

In [5]:
MAX_LEN = max(MAX_LEN_ANSWER, MAX_LEN_QUESTION)
del(MAX_LEN_QUESTION, MAX_LEN_ANSWER, answers, questions, file)

## Kelime sözlüğü oluşturulması

In [6]:
vocab = {}

temp_list = answers_data + questions_data
word_num = 0

for line in temp_list:
    for i in line.split():
        if not i in vocab:
            vocab[i] = word_num
            word_num += 1

for i in range(len(answers_data)):
    answers_data[i] = '<SOS> ' + answers_data[i] + ' <EOS>'

del(i, line, word_num, temp_list)

## Cümlelere özel tokenların eklenmesi

In [7]:
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']

length_of_vocab = len(vocab)

for token in tokens:
    vocab[token] = length_of_vocab
    length_of_vocab += 1

vocab[list(vocab.items())[0][0]] = vocab['<PAD>']
vocab['<PAD>'] = 0

inv_vocab = {k:v for v,k in vocab.items()}
del(length_of_vocab, token, tokens)

## Tokenler için decoder-encoder oluşturulması

In [8]:
def decoder_encoder(input):
    main_list = []
    for line in input:
        temp_list = []
        for word in line.split():
            temp_list.append(vocab['<OUT>'] if word not in vocab else vocab[word])

        main_list.append(temp_list)
    return main_list

encoder_input = decoder_encoder(questions_data)
decoder_input = decoder_encoder(answers_data)

## Verilen kelime matrisine çevrilmesi

In [9]:
from keras.utils import pad_sequences

encoder_input = pad_sequences(encoder_input, MAX_LEN, padding="post", truncating="post")
decoder_input = pad_sequences(decoder_input, MAX_LEN, padding="post", truncating="post")

In [10]:
decoder_final_output = pad_sequences(list(map(lambda x:x[1:],decoder_input)), MAX_LEN, padding="post", truncating="post")

## LSTM katmanı oluşturulması
#### LSTM'e verilen nöron sayısı 128e çekildi. 64e çekilebilir

In [11]:
from keras.models import Model
from keras.layers import Dense, Embedding, LSTM, Input
from keras.utils import to_categorical
from keras import optimizers

VOCAB_SIZE = len(vocab)

decoder_final_output = to_categorical(decoder_final_output, len(vocab))

enc_inp = Input(shape=(MAX_LEN,))
dec_inp = Input(shape=(MAX_LEN,))

embed = Embedding(len(vocab) + 1, output_dim=50, input_length=MAX_LEN, trainable=True)

enc_embed = embed(enc_inp)
enc_lstm = LSTM(128, return_sequences=True, return_state=True)
enc_op, h, c = enc_lstm(enc_embed)
enc_states = [h,c]

dec_embed = embed(dec_inp)
dec_lstm = LSTM(128, return_sequences=True, return_state=True)
dec_op, _, _ = dec_lstm(dec_embed, initial_state=enc_states)

dense = Dense(len(vocab), activation='softmax')
dense_op = dense(dec_op)

model = Model([enc_inp, dec_inp], dense_op)

opt = optimizers.Adam(learning_rate=0.0125)

model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=opt)
model.fit([encoder_input, decoder_input], decoder_final_output, epochs=60)

2024-06-30 18:50:31.211173: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-06-30 18:50:31.211196: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-06-30 18:50:31.211202: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-06-30 18:50:31.211551: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-30 18:50:31.212037: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/60


2024-06-30 18:50:32.948482: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-06-30 18:50:33.194981: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-06-30 18:50:33.266913: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-06-30 18:50:33.405926: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-06-30 18:50:33.511792: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


<keras.src.callbacks.History at 0x301b378e0>

## Model oluşturulması

In [12]:
enc_model = Model([enc_inp], enc_states)

decoder_state_input_h = Input(shape=(128,))
decoder_state_input_c = Input(shape=(128,))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = dec_lstm(dec_embed, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]

dec_model = Model([dec_inp] + decoder_states_inputs, [decoder_outputs] + decoder_states)

## Tahmin

In [13]:
import numpy as np
from keras.utils import pad_sequences

prepro1 = ""
sayac = 0
while sayac < 3:
    sayac += 1
    prepro1 = input("you : ")
    soru = prepro1
    prepro1 = ' '.join(veri_temizligi(prepro1))
    prepro = [prepro1]


    txt = []
    for x in prepro:
        lst = []
        for y in x.split():
            try:
                lst.append(vocab[y])
            except:
                lst.append(vocab['<OUT>'])
        
        txt.append(lst)

    txt = pad_sequences(txt, MAX_LEN, padding='post', truncating="post")

    stat = enc_model.predict(txt)
    empty_target_seq = np.zeros((1,1))
    empty_target_seq[0,0] = vocab['<SOS>']

    stop_condition = False
    decoded_translation = ''

    while not stop_condition:
        dec_outputs, h, c = dec_model.predict([empty_target_seq] + stat)
        decoder_concat_input = dense(dec_outputs)

        sample_word_index = np.argmax(decoder_concat_input[0, -1, :])
        sample_word = inv_vocab[sample_word_index] + ' '
        
        if sample_word != '<EOS> ':
            decoded_translation += sample_word

        if sample_word == '<EOS> ' or len(decoded_translation.split()) > MAX_LEN:
            stop_condition = True


        empty_target_seq = np.zeros((1, 1))
        empty_target_seq[0,0] = sample_word_index
        stat = [h, c]
        
    print(f'Sen: {soru}')    
    print(f'Chatbot: {decoded_translation.title()}')
    



2024-06-30 18:51:34.793447: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-06-30 18:51:34.844266: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-06-30 18:51:35.015053: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-06-30 18:51:35.055097: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Sen: merhaba nasılsın
Chatbot: Merhaba Iyiyim Nasılsın 
Sen: 
Chatbot: Atatürk Parkı Batıpark Açık Hava Konser Alanlarıdır 
Sen: 
Chatbot: Atatürk Parkı Batıpark Açık Hava Konser Alanlarıdır 
