In [1]:
#read the dataset
import pandas as pd
import numpy as np
QA = pd.read_csv('QA.csv')

In [2]:
#construct a Q&A pair list
qListTemp = []
aListTemp = []
for q in QA['Q']:
    qListTemp.append(q)
for a in QA['A']:
    aListTemp.append(a)
questionTweets = qListTemp
answerTweets = aListTemp
pairs = list(zip(questionTweets,answerTweets))

In [3]:
print(pairs[500])

('I accidentally deleted some important files on my iPhone. Can I recover them? ', "If you have an iCloud or iTunes backup, you can restore your device from the backup to recover the deleted files. If you don't have a backup, there's a chance that the files may be recoverable using a third-party data recovery tool. We can recommend some options if you'd like.")


In [4]:
import re
input_docs = []
target_docs = []
input_tokens = set()
target_tokens = set()

for tweet in pairs:
    input_doc, target_doc = tweet[0], tweet[1]
    input_docs.append(input_doc)
    target_doc = " ".join(re.findall(r"[\w']+|[^\s\w]", target_doc))
    # add <START> and <END> token to the target sentences
    target_doc = '<START> ' + target_doc + ' <END>' 
    target_docs.append(target_doc)
    # create vocabulary lists for both input and target sentences
    for token in re.findall(r"[\w']+|[^\s\w]", input_doc):
        if token not in input_tokens:
            input_tokens.add(token)
    for token in target_doc.split():
        if token not in target_tokens:
            target_tokens.add(token)

input_tokens = sorted(list(input_tokens))
print("INPUT TOKENS")
print(input_tokens)
target_tokens = sorted(list(target_tokens))
print("TARGET TOKENS")
print(target_tokens)
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)

#create a dictionary mapping vocab to index
input_features_dict = dict(
    [(token, i) for i, token in enumerate(input_tokens)])
target_features_dict = dict(
    [(token, i) for i, token in enumerate(target_tokens)])

print("INPUT FEATURES")
print(input_features_dict)



#create a reverse dictionary mapping index to vocab
reverse_input_features_dict = dict(
    (i, token) for token, i in input_features_dict.items())
reverse_target_features_dict = dict(
    (i, token) for token, i in target_features_dict.items())

#find out max input length and max output length
max_encoder_seq_length = max([len(re.findall(r"[\w']+|[^\s\w]", input_doc)) for input_doc in input_docs])
max_decoder_seq_length = max([len(re.findall(r"[\w']+|[^\s\w]", target_doc)) for target_doc in target_docs])

#create encoder input data with size: number of input * max input length * input vocab size
encoder_input_data = np.zeros(
    (len(input_docs), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
#create decoder input data with size: number of output * max output length * output vocab size
decoder_input_data = np.zeros(
    (len(input_docs), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
#create decoder target data with size: number of output * max output length * output vocab size
decoder_target_data = np.zeros(
    (len(input_docs), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

print("encoder input")
print(encoder_input_data)

INPUT TOKENS
['"', ',', '-', '.', '/', '?', 'Activation', 'AirDrop', 'AirPods', 'Android', 'App', 'Apple', 'Bluetooth', 'Books', 'But', 'CDs', 'Calendar', 'Can', 'Contacts', 'Cut', 'DVDs', 'Error', 'Face', 'FaceTime', 'Family', 'Fi', 'Final', 'Find', 'GPS', 'GarageBand', 'Health', 'Hello', 'Hi', 'Home', 'HomePod', 'How', 'I', "I'll", "I'm", "I've", 'ID', 'IMEI', 'Is', 'It', "It's", 'Keychain', 'Keynote', 'MEID', 'Mac', 'MacBook', 'Machine', 'Mail', 'Maps', 'Messages', 'Music', 'My', 'News', 'No', 'Notes', 'Numbers', 'Okay', 'Pages', 'Pay', 'Pencil', 'Podcasts', 'Pro', "Pro's", 'Reminders', 'SIM', 'Safari', 'Sharing', 'Siri', 'Stocks', 'Store', 'TV', 'The', 'They', 'Time', 'Touch', 'Wallet', 'Watch', 'Weather', 'What', "What's", 'Whenever', 'Why', 'Wi', 'Yes', 'Your', 'a', 'about', 'access', 'accessing', 'accidentally', 'account', 'accurately', 'activate', 'activating', 'activity', 'ads', 'alerts', 'all', 'almost', 'am', 'an', 'and', 'any', 'anymore', 'anything', 'app', 'appointment', '

In [5]:
# create encoder input data, decoder input data and decoder output data 
for line, (input_doc, target_doc) in enumerate(zip(input_docs, target_docs)):
    for timestep, token in enumerate(re.findall(r"[\w']+|[^\s\w]", input_doc)):
        encoder_input_data[line, timestep, input_features_dict[token]] = 1. 

    
    for timestep, token in enumerate(target_doc.split()):
        decoder_input_data[line, timestep, target_features_dict[token]] = 1. 
        if timestep > 0: 
            decoder_target_data[line, timestep - 1, target_features_dict[token]] = 1.

In [6]:
#seq2seq model implementation and training
from tensorflow import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model
import tensorflow as tf


dimensionality = 256
batch_size = 10 
epochs = 100

encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder_lstm = LSTM(dimensionality, return_state=True)
encoder_outputs, state_hidden, state_cell = encoder_lstm(encoder_inputs)
encoder_states = [state_hidden, state_cell]

decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(dimensionality, return_sequences=True, return_state=True)
decoder_outputs, decoder_state_hidden, decoder_state_cell = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


training_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
training_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'], sample_weight_mode='temporal')
training_model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size = batch_size, epochs = epochs, validation_split = 0.1)
training_model.save('training_model3.h5')

2023-04-18 00:04:13.618625: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-18 00:04:19.013779: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-04-18 00:04:19.072939: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


Epoch 1/100


2023-04-18 00:04:22.183151: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-18 00:04:22.186529: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-18 00:04:22.188685: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-04-18 00:04:28.193970: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-18 00:04:28.196079: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-18 00:04:28.198526: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7