<a href="https://colab.research.google.com/github/hululuzhu/chinese-ai-writing-share/blob/main/training/transformer_supervised/couplet_Transformer_Source_Code_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chinese Couplet Transformer model source code

Example couplets (上 = first line, 下 = second line):

```
上: 欢天喜地度佳节
下: 举国迎春贺新年
上: 不待鸣钟已汗颜，重来试手竟何艰
下: 只缘沧海常风雨，再去翻身只等闲
上: 相思俱付三更月
下: 寂寞难留一夜风
```

# Connect to Google Drive and prepare local files

In [None]:
from google.colab import drive
drive.mount('/content/gdrive') # mount to google drive to save models after training

import tensorflow as tf
import os
os.environ['TF_KERAS'] = '1'
!pip install keras-transformer &> /dev/null
from keras_transformer import get_model, decode, get_custom_objects

import pathlib
import numpy as np
import pandas as pd
import pickle

## TPU setup

In [None]:
# Resolve the TPU attached to this runtime and connect TensorFlow to its cluster.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(resolver)
# Initialize the TPU system, then build the distribution strategy on top of it.
tf.tpu.experimental.initialize_tpu_system(resolver)
# print("All devices: ", tf.config.list_logical_devices('TPU'))
# `strategy` is used further down (`strategy.scope()`) when the model is built and compiled.
strategy = tf.distribute.TPUStrategy(resolver)

# Fetch and extract the data

In [None]:
# Scratch directory that holds the downloaded archive and, later, the extracted files.
working_dir = "/tmp/working_dir"
!mkdir -p {working_dir}
# Download the couplet dataset release archive into the scratch directory.
!wget https://github.com/wb14123/couplet-dataset/releases/download/1.0/couplet.tar.gz -P {working_dir}
# List the directory to confirm the archive is there.
!ls -l {working_dir}

In [None]:
# Unpack the downloaded archive into a `couplet_files` sub-directory of the scratch directory.
!mkdir -p {working_dir}/couplet_files
!tar -xf {working_dir}/couplet.tar.gz -C {working_dir}/couplet_files
# !ls -l -R  /tmp/working_dir/couplet_files

In [None]:
# Show the first line of the training input file and of the training output file.
!head -1 {working_dir}/couplet_files/couplet/train/in.txt {working_dir}/couplet_files/couplet/train/out.txt

## Build the vocabulary of all characters

In [None]:
# Build the character vocabulary: three reserved ids first, then every character
# in first-seen order (the dataset's vocab file, then the train/test text files).
COUPLET_PATH = f'{working_dir}/couplet_files/couplet'
token_dict = {
    '<PAD>': 0,
    '<START>': 1,
    '<END>': 2,
}

# Seed from the vocab file. Only the first character of each entry is kept, so a
# multi-character entry is registered under its first character.
# The file is opened as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open(f"{COUPLET_PATH}/vocabs", "r", encoding="utf-8") as f:
  for x in f:
    entry = x.strip()
    if not entry:
      continue  # a blank line carries no token; indexing [0] on it would raise IndexError
    c = entry[0]
    if c not in token_dict:
      token_dict[c] = len(token_dict)

# Add every character of the corpus itself that the vocab file did not list.
# Lines are cleaned the same way as in the encoding step (strip + drop spaces).
for t in ['train', 'test']:
  for i in ['in', 'out']:
    with open(f"{COUPLET_PATH}/{t}/{i}.txt", "r", encoding="utf-8") as f:
      for line in f:
        for c in line.strip().replace(' ', ''):
          if c not in token_dict:
            token_dict[c] = len(token_dict)

# Guard against a changed dataset: the rest of the notebook was written for this size.
assert 9132 == len(token_dict), f"unexpected vocabulary size: {len(token_dict)}"

In [None]:
# Persist the vocabulary on Google Drive (presumably so the inference notebook can
# rebuild the same char <-> id mapping).
VOCAB_DIR = '/content/gdrive/MyDrive/ML/Models/szhu_public_062021'
# Create the folder first: open(..., 'wb') raises FileNotFoundError if it is missing.
os.makedirs(VOCAB_DIR, exist_ok=True)
with open(os.path.join(VOCAB_DIR, 'couplet_vocab.pickle'), 'wb') as handle:
    pickle.dump(token_dict, handle)

In [None]:
# Inverse lookup (id -> character) used when decoding id sequences back to text.
# This is a snapshot of `token_dict` at this point: characters that `encode()`
# adds to `token_dict` later are not reflected here.
rev_token_dict = {token_id: char for char, token_id in token_dict.items()}

# Encode data (chars to char-ids)

In [None]:
MAX_SEQ_LEN = 34  # 32 chars plus start/end

def clean_input(rawq):
  """Return `rawq` without surrounding whitespace and without any ASCII spaces."""
  return rawq.strip().replace(' ', '')

def encode(rawq, is_decode_output = False, is_2d=False):
  """Convert one raw text line into a list of token ids padded to MAX_SEQ_LEN.

  Args:
    rawq: Raw line of text; it is normalised with `clean_input` before encoding.
    is_decode_output: When True the leading start id (1) is omitted.
    is_2d: When True every id is wrapped in its own one-element list.

  Returns:
    A list made of the optional start id, one id per character, the end id (2)
    and padding ids (0) up to MAX_SEQ_LEN entries. A sequence that is already
    longer than MAX_SEQ_LEN is returned as is (no truncation).

  Side effects:
    A character missing from the global `token_dict` is registered there with
    the next free id.
  """
  ids = [] if is_decode_output else [1]  # start id, skipped for decoder targets
  for c in clean_input(rawq):
    if c not in token_dict:
      token_dict[c] = len(token_dict)
    ids.append(token_dict[c])
  ids.append(2)  # end id
  # A non-positive multiplier yields no padding, so over-long input is left untouched.
  ids.extend([0] * (MAX_SEQ_LEN - len(ids)))
  return [[token_id] for token_id in ids] if is_2d else ids

# Per-split containers:
#   in        - encoder input ids (start + chars + end, padded)
#   decode_in - decoder input ids (start + chars + end, padded)
#   out       - decoder target ids (chars + end, padded, each id wrapped in a list)
#   pre/post  - cleaned text of the input line / output line
train_raw = {"in": [], "out": [], "pre": [], "post": [], "decode_in": []}
test_raw = {"in": [], "out": [], "pre": [], "post": [], "decode_in": []}
total_raw = {'train': train_raw, 'test': test_raw}

for t in ['train', 'test']:
  split = total_raw[t]
  for i in ['in', 'out']:
    is_target = (i == 'out')
    text_key = 'post' if is_target else 'pre'
    # Explicit UTF-8 so reading the Chinese text does not depend on the platform default.
    with open(f"{COUPLET_PATH}/{t}/{i}.txt", "r", encoding="utf-8") as f:
      for line in f:
        if is_target:
          # The same output line also feeds the decoder: flat ids with the start id.
          split['decode_in'].append(encode(line, False, False))
        # Targets drop the start id and are 2-D; encoder inputs keep it and are flat.
        split[i].append(encode(line, is_target, is_target))
        split[text_key].append(clean_input(line))

In [None]:
def decode_tokens(token_ids):
  """Turn a flat sequence of token ids back into text.

  Decoding stops at the first padding id (0). Ids 1 and 2 (start/end) produce no
  character; every id above 2 is looked up in the global `rev_token_dict`.
  """
  chars = []
  for tid in token_ids:
    if tid == 0:
      break  # padding reached: nothing meaningful follows
    if tid > 2:
      chars.append(rev_token_dict[tid])
  return "".join(chars)

# Sanity check: print the first three training examples as raw ids, then decoded text.
train_split = total_raw['train']
preview = zip(train_split['in'][:3], train_split['decode_in'][:3], train_split['out'][:3])
for enc_ids, dec_in_ids, dec_out_ids in preview:
  print(enc_ids, "\n", dec_in_ids, "\n", dec_out_ids)
  # Targets are stored with one id per inner list, so flatten before decoding.
  flat_out_ids = np.asarray(dec_out_ids).reshape(-1)
  print(decode_tokens(enc_ids), decode_tokens(flat_out_ids))

In [None]:
# One DataFrame per split: a row per couplet, plus the length of each encoded sequence.
dfs = {}
base_columns = ['in', 'out', 'pre', 'post', 'decode_in']
length_columns = {'in': 'in_length', 'out': 'out_length', 'decode_in': 'de_in_length'}

for split_name in ['train', 'test']:
  split = total_raw[split_name]
  # Transpose the parallel per-column lists into one tuple per example.
  rows = list(zip(*(split[col] for col in base_columns)))
  frame = pd.DataFrame(rows, columns=base_columns)
  # `.str.len()` gives the length of the list held in each cell.
  for src, dst in length_columns.items():
    frame[dst] = frame[src].str.len()
  dfs[split_name] = frame

In [None]:
# Summary statistics for the training frame. `encode()` pads to MAX_SEQ_LEN but never
# truncates, so a max length above MAX_SEQ_LEN here would reveal over-long lines.
dfs['train'].describe()

# Transformer model and training

In [None]:
# Stack the per-example id lists of the training split into numpy arrays:
# encoder input, decoder input and decoder target, in that order.
train_df = dfs['train']
in_np, decode_in_np, out_np = (
    np.array(train_df[col].values.tolist()) for col in ('in', 'decode_in', 'out')
)

In [None]:
# Same conversion for the test split: encoder input, decoder input, decoder target.
test_df = dfs['test']
in_np_test, decode_in_np_test, out_np_test = (
    np.array(test_df[col].values.tolist()) for col in ('in', 'decode_in', 'out')
)

In [None]:
# Print the array shapes: training split on the first line, test split on the second.
for arrays in ((in_np, decode_in_np, out_np),
               (in_np_test, decode_in_np_test, out_np_test)):
  print(*(arr.shape for arr in arrays))

In [None]:
# Hyperparameters. They are plain Python values, so they are set before entering the scope.
num_encoders = 4
num_docoders = 4
num_heads = 8
# NOTE(review): the width is derived from the decoder count (64 * 4 = 256), not from
# num_heads - confirm this is intended.
embed_size = 64 * num_docoders
drop_out_rate = 0.1
vocab_size = len(token_dict)

# Build and compile the model under the distribution strategy scope.
with strategy.scope():
  model = get_model(
    token_num=vocab_size,
    embed_dim=embed_size,
    encoder_num=num_encoders,
    decoder_num=num_docoders,
    head_num=num_heads,
    hidden_dim=embed_size,
    attention_activation='gelu',
    feed_forward_activation='gelu',
    dropout_rate=drop_out_rate,
    # NOTE(review): keras-transformer appears to freeze the token embedding when
    # `embed_weights` is given and `embed_trainable` is left unset - confirm that
    # random, non-trainable embeddings are really what is wanted here.
    embed_weights=np.random.random((vocab_size, embed_size)),
  )
  # Sparse loss: targets are integer ids, not one-hot vectors.
  model.compile(
      optimizer=tf.keras.optimizers.Adam(),
      loss='sparse_categorical_crossentropy',
  )

In [None]:
# Training configuration.
epochs = 80
batch_size = 256

# The model takes the encoder input and the decoder input as a pair; the test split
# doubles as validation data.
train_inputs = [in_np, decode_in_np]
validation_set = ([in_np_test, decode_in_np_test], out_np_test)
model.fit(
  x=train_inputs,
  y=out_np,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=validation_set,
)

## Save model weights

In [None]:
# Destination on the mounted Google Drive for the trained weights.
DRIVE_MODEL_DIR = '/content/gdrive/MyDrive/ML/Models/chinese_couplet_v1'
!mkdir -p {DRIVE_MODEL_DIR}
# NOTE(review): the directory path itself is passed as the save path. With Keras'
# TensorFlow checkpoint format the path presumably acts as a file prefix, so the
# weight files may land next to (not inside) the directory created above - confirm
# against the path the inference notebook loads from before changing it.
model.save_weights(DRIVE_MODEL_DIR)

# Inference, see [this colab](https://github.com/hululuzhu/chinese-ai-writing-share/blob/main/RC_01_AI_Writing_Demo_06_2021.ipynb)