<a href="https://colab.research.google.com/github/tobby-lie/transformer_chatbot_example/blob/main/Transformer_chatbot_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import sys

# On Colab, select the TensorFlow 2.x runtime; this has to happen before
# TensorFlow itself is imported below.
if 'google.colab' in sys.modules:
  %tensorflow_version 2.x
import tensorflow as tf

# Fix TensorFlow's global random seed so that repeated runs are comparable.
tf.random.set_seed(1234)
# Short alias for tf.data's AUTOTUNE constant.
AUTO = tf.data.experimental.AUTOTUNE

# Optional pinned install of tensorflow-datasets, left disabled:
#!pip install tensorflow-datasets==1.2.0
import tensorflow_datasets as tfds

import os
import re
import numpy as np
from time import time
import matplotlib.pyplot as plt

# Record the TensorFlow version this run actually used.
print("Tensorflow version {}".format(tf.__version__))

Tensorflow version 2.4.1


In [2]:
# Mount Google Drive at /content/drive (Colab only). force_remount=True
# requests a fresh mount even if the drive is already mounted.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [3]:
# Pick a tf.distribute strategy: a TPU strategy when a TPU cluster can be
# resolved, otherwise whatever strategy is currently the default.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU {}'.format(tpu.cluster_spec().as_dict()['worker']))
except ValueError:
    # No TPU could be resolved in this runtime; fall back below.
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.experimental.TPUStrategy is the deprecated spelling.
    # Prefer the stable tf.distribute.TPUStrategy when this TensorFlow build
    # provides it, and keep the experimental name as a fallback for older
    # 2.x releases.
    if hasattr(tf.distribute, 'TPUStrategy'):
        strategy = tf.distribute.TPUStrategy(tpu)
    else:
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

# Number of replicas the chosen strategy keeps in sync (used to scale the
# batch size later in the notebook).
print("REPLICAS: {}".format(strategy.num_replicas_in_sync))

REPLICAS: 1


In [4]:
# Maximum sentence length
# NOTE(review): the unit (tokens vs. characters) is not visible in this
# section -- confirm where MAX_LENGTH is consumed.
MAX_LENGTH = 30

# For tf.data.Dataset
# Global batch size: 64 per replica times the number of replicas in sync.
BATCH_SIZE = int(64 * strategy.num_replicas_in_sync)
BUFFER_SIZE = 20000

# For Transformer
# The trailing "#6", "#512" and "#2048" notes record alternative, larger
# values; presumably the original Transformer "base" settings -- confirm.
NUM_LAYERS = 2 #6
D_MODEL = 256 #512
NUM_HEADS = 8
UNITS = 512 #2048
DROPOUT = 0.1

# Number of training epochs.
EPOCHS = 100

In [5]:
# NOTE: data.txt (taken from the final output data) must be added every time this notebook is run.

In [6]:
def textPreprocess(input_text):
  """Normalise a raw utterance to lowercase ASCII letters, digits and spaces.

  The steps are applied in this order:

  1. Lowercase the whole string.
  2. Replace the Polish letters ``ąćęłńóśżź`` with ``a c e l n o s z z``.
  3. Delete every character that is not an ASCII letter, digit or space
     (punctuation and newlines are removed, not replaced by a space).
  4. Collapse any run of three or more identical characters into a single
     character. Runs of exactly two (including double spaces) are kept.

  Args:
    input_text: the string to clean.

  Returns:
    The cleaned string.
  """
  # Step 1 + 2: lowercase, then strip the Polish diacritics.
  accent_table = str.maketrans('ąćęłńóśżź', 'acelnoszz')
  text = input_text.lower().translate(accent_table)

  # Step 3: drop everything outside [A-Za-z0-9 ].
  text = re.sub('[^A-Za-z0-9 ]+', '', text)

  # Step 4: "sooo" -> "so"; note this also shortens digit runs ("1000" -> "10").
  return re.sub(r'(.)\1{2,}', r'\1', text, flags=re.IGNORECASE)

In [9]:
# Read the conversation file. Every usable line has the form
# "<initiation>|||<response>"; lines without the "|||" separator are skipped.
with open('/content/drive/MyDrive/OSN_Project/data.txt', 'r', encoding="utf-8") as file:
    lines = file.readlines()

# Split each usable line once and keep only its first two fields.
pairs = [line.split('|||') for line in lines if '|||' in line]
initiates = [pair[0] for pair in pairs]
responses = [pair[1] for pair in pairs]

In [10]:
# Number of initiation utterances loaded.
len(initiates)

155358

In [11]:
# Number of response utterances loaded.
len(responses)

155358

In [12]:
# Show one raw (not yet preprocessed) initiation/response pair.
print('Sample initiation: {}'.format(initiates[20]))
print("\n")
print('Sample response: {}'.format(responses[20]))

Sample initiation: congrats to you and to all of the virtual conference speakers . hopefully , we will still socialize in the evenings and mornings like irl . just think - - no long lines for the restrooms and elevators . 


Sample response:  thanks ! yes , i think some online socializing is a great idea - i will miss the dancing , tho !



In [13]:
def text_preprocessor(x):
    """Return the cleaned form of the string ``x`` by delegating to textPreprocess.

    Written as a ``def`` rather than a lambda bound to a name (PEP 8), which
    gives the callable a real ``__name__`` in tracebacks. textPreprocess is
    still looked up at call time, as it was with the lambda.
    """
    return textPreprocess(x)

In [14]:
# Clean every utterance on both sides of the conversation pairs; the raw
# lists are left untouched.
initiates_preprocessed = [text_preprocessor(utterance) for utterance in initiates]
responses_preprocessed = [text_preprocessor(utterance) for utterance in responses]

In [15]:
# Same sample initiation as printed earlier, after preprocessing.
initiates_preprocessed[20]

'congrats to you and to all of the virtual conference speakers  hopefully  we will still socialize in the evenings and mornings like irl  just think no long lines for the restrooms and elevators  '

In [16]:
# Same sample response as printed earlier, after preprocessing.
responses_preprocessed[20]

' thanks  yes  i think some online socializing is a great idea  i will miss the dancing  tho '

In [23]:
# Build tokenizer using tfds for both questions and answers
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    initiates_preprocessed + responses_preprocessed, target_vocab_size=2**13)

# Define start and end token to indicate the start and end of a sentence
START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]

# Vocabulary size plus start and end token
VOCAB_SIZE = tokenizer.vocab_size + 2

In [25]:
# Encode the preprocessed sample initiation to inspect the subword ids.
print('Tokenized sample question: {}'.format(tokenizer.encode(initiates_preprocessed[20])))

Tokenized sample question: [2009, 4, 8, 7, 4, 40, 9, 3, 2138, 3885, 4265, 8062, 1, 6496, 175, 1, 29, 37, 98, 4662, 1804, 12, 3, 4212, 1188, 7, 968, 13, 34, 2992, 8055, 1, 30, 70, 64, 224, 1786, 15, 3, 7772, 6312, 13, 7, 7656, 3987, 8062, 1]
