In [1]:
!pip install -q --upgrade keras-nlp
!pip install -q --upgrade keras

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m508.4/508.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import keras_nlp
import pathlib
import random

import keras
from keras import ops
import tensorflow as tf
import tensorflow.data as tf_data
from tensorflow_text.tools.wordpiece_vocab import (
    bert_vocab_from_dataset as bert_vocab,
)

In [3]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

# Read the pre-split train / validation / test pair tables from Drive.
train_pairs, valid_pairs, test_pairs = map(
    pd.read_csv,
    (
        'drive/MyDrive/train_pairs.csv',
        'drive/MyDrive/valid_pairs.csv',
        'drive/MyDrive/test_pairs.csv',
    ),
)

Mounted at /content/drive


In [4]:
import ast

# Modified from the pairs used in T5 fine-tuning: the task prefix is sliced
# off the front of the source text.
prefix = 'translate biased to unbiased: '


def _to_text_pair(serialized_pair):
    """Turn one serialized ``text_pairs_dict`` cell into a ``(source, target)`` tuple.

    The cell is parsed with ``ast.literal_eval`` (Python literals only) instead
    of ``eval``, so text read from the CSV is never executed as code.

    Args:
        serialized_pair: String holding a dict literal; its first two values
            (in insertion order) are the source and target texts.

    Returns:
        ``(source_without_prefix, target)``. The first ``len(prefix)``
        characters of the source are dropped unconditionally; the target is
        returned untouched.
    """
    values = tuple(ast.literal_eval(serialized_pair).values())
    return (values[0][len(prefix):], values[1])


# NOTE: `train_pairs` and `test_pairs` are rebound here from DataFrames to
# Series of tuples, so this cell cannot be re-run without reloading the CSVs.
train_pairs = train_pairs.text_pairs_dict.apply(_to_text_pair)
val_pairs = valid_pairs.text_pairs_dict.apply(_to_text_pair)
test_pairs = test_pairs.text_pairs_dict.apply(_to_text_pair)

In [5]:
# Destination CSVs for the reformatted (source, target) pairs.
train_file = 'drive/MyDrive/train_pairsS2S.csv'
valid_file = 'drive/MyDrive/valid_pairsS2S.csv'
test_file = 'drive/MyDrive/test_pairsS2S.csv'

# Persist each split, wrapping the Series in a DataFrame before writing.
for split_pairs, destination in (
    (train_pairs, train_file),
    (val_pairs, valid_file),
    (test_pairs, test_file),
):
    pd.DataFrame(split_pairs).to_csv(destination)

In [5]:
# Report the size of each split along with the grand total.
n_train, n_val, n_test = len(train_pairs), len(val_pairs), len(test_pairs)
print(f"{n_train + n_val + n_test} total pairs")
print(f"{n_train} training pairs")
print(f"{n_val} validation pairs")
print(f"{n_test} test pairs")

181473 total pairs
127033 training pairs
27220 validation pairs
27220 test pairs


In [5]:
def train_word_piece(text_samples, vocab_size, reserved_tokens):
    """Learn a WordPiece vocabulary from raw text samples.

    Args:
        text_samples: Texts handed to ``tf.data.Dataset.from_tensor_slices``.
        vocab_size: Forwarded as ``vocabulary_size`` to the vocabulary trainer.
        reserved_tokens: Forwarded unchanged as ``reserved_tokens``.

    Returns:
        The value returned by
        ``keras_nlp.tokenizers.compute_word_piece_vocabulary`` for the batched
        dataset.
    """
    # Feed the trainer batches of 1000 samples, prefetching two batches ahead.
    batched_samples = (
        tf.data.Dataset.from_tensor_slices(text_samples).batch(1000).prefetch(2)
    )
    return keras_nlp.tokenizers.compute_word_piece_vocabulary(
        batched_samples,
        vocabulary_size=vocab_size,
        reserved_tokens=reserved_tokens,
    )

In [6]:
# Training configuration.
BATCH_SIZE = 64  # Number of sentence pairs per batch in `make_dataset`.
EPOCHS = 10  # Number of passes over the training set; author's note: use at least 10 for convergence.
MAX_SEQUENCE_LENGTH = 256  # Token length that encoder/decoder inputs are packed to.

# Vocabulary sizes for the source (biased/original) and target
# (unbiased/modified) WordPiece vocabularies.
ORG_VOCAB_SIZE = 15000
MOD_VOCAB_SIZE = 15000

# Transformer hyperparameters shared by the encoder and the decoder.
EMBED_DIM = 256  # Embedding width; also the feature size of the encoder output fed to the decoder.
INTERMEDIATE_DIM = 2048  # Passed as `intermediate_dim` to the transformer layers.
NUM_HEADS = 8  # Passed as `num_heads` to the transformer layers.

In [7]:
# Special tokens reserved in both vocabularies.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

# Split every training pair into its biased (original) and unbiased
# (modified) side.
org_samples, mod_samples = [], []
for text_pair in train_pairs:
    org_samples.append(text_pair[0])
    mod_samples.append(text_pair[1])

# Learn one WordPiece vocabulary per side.
org_vocab = train_word_piece(org_samples, ORG_VOCAB_SIZE, reserved_tokens)
mod_vocab = train_word_piece(mod_samples, MOD_VOCAB_SIZE, reserved_tokens)

In [9]:
# Show the same 20-token slice of each learned vocabulary.
for vocab_label, vocab in (
    ("Biased Tokens: ", org_vocab),
    ("Unbiased Tokens: ", mod_vocab),
):
    print(vocab_label, vocab[1000:1020])

Biased Tokens:  ['african', 'half', 'woman', 'announced', 'information', 'least', '##ts', 'numerous', 'reported', 'stated', 'founder', 'legendary', 'present', '##um', '19', 'production', 'russia', '21', 'eastern', 'association']
Unbiased Tokens:  ['areas', 'street', 'medical', 'themselves', '##re', '28', 'once', 'half', 'natural', 'commonly', 'list', 'notable', 'arab', 'important', 'performance', 'project', 're', 'woman', 'continued', 'gay']


In [8]:
# One lowercase WordPiece tokenizer per side, each built from its own vocabulary.
org_tokenizer, mod_tokenizer = (
    keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab, lowercase=True)
    for vocab in (org_vocab, mod_vocab)
)

In [10]:
# Round-trip the first training pair through each tokenizer as a sanity check.
first_pair = train_pairs[0]
org_input_ex, mod_input_ex = first_pair[0], first_pair[1]

org_tokens_ex = org_tokenizer.tokenize(org_input_ex)
org_recovered_ex = org_tokenizer.detokenize(org_tokens_ex)
print("Biased sentence: ", org_input_ex)
print("Tokens: ", org_tokens_ex)
print("Recovered text after detokenizing: ", org_recovered_ex)

print()

mod_tokens_ex = mod_tokenizer.tokenize(mod_input_ex)
mod_recovered_ex = mod_tokenizer.detokenize(mod_tokens_ex)
print("Unbiased sentence: ", mod_input_ex)
print("Tokens: ", mod_tokens_ex)
print("Recovered text after detokenizing: ", mod_recovered_ex)

Biased sentence:  this loris is a small, slender , cute looking primate with big forward facing eyes.
Tokens:  tf.Tensor(
[  318    50 10019   304   297    39   622    14    57  8957  1810    14
  8854  2743    54  5896   997   305  1139  2405  4587  3087    16], shape=(23,), dtype=int32)
Recovered text after detokenizing:  tf.Tensor(b'this loris is a small , slender , cute looking primate with big forward facing eyes .', shape=(), dtype=string)

Unbiased sentence:  this loris is a small, slender primate with big forward facing eyes.
Tokens:  tf.Tensor(
[ 322   50 9088  307  300   39  682   14   57 8918 1560   54 6348  983
  308 1247 2529 4642 3096   16], shape=(20,), dtype=int32)
Recovered text after detokenizing:  tf.Tensor(b'this loris is a small , slender primate with big forward facing eyes .', shape=(), dtype=string)


In [9]:
def preprocess_batch(org, mod):
    """Tokenize and pack one batch of (biased, unbiased) sentence pairs.

    Args:
        org: Batch of source (biased) strings.
        mod: Batch of target (unbiased) strings.

    Returns:
        A ``(features, targets)`` tuple. ``features`` is a dict with
        ``"encoder_inputs"`` (source ids packed to ``MAX_SEQUENCE_LENGTH``,
        no start/end tokens) and ``"decoder_inputs"`` (target ids with the
        last position dropped). ``targets`` is the same target sequence with
        the first position dropped, i.e. the decoder input shifted left by one.
    """
    org = org_tokenizer(org)
    mod = mod_tokenizer(mod)

    # Pad `biased` to `MAX_SEQUENCE_LENGTH`.
    org_start_end_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH,
        pad_value=org_tokenizer.token_to_id("[PAD]"),
    )
    org = org_start_end_packer(org)

    # Add special tokens (`"[START]"` and `"[END]"`) to `unbiased` and pad it
    # as well. One extra position is packed so that both the input slice and
    # the shifted target slice come out at `MAX_SEQUENCE_LENGTH`.
    mod_start_end_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH + 1,
        start_value=mod_tokenizer.token_to_id("[START]"),
        end_value=mod_tokenizer.token_to_id("[END]"),
        pad_value=mod_tokenizer.token_to_id("[PAD]"),
    )
    mod = mod_start_end_packer(mod)

    return (
        {
            "encoder_inputs": org,
            "decoder_inputs": mod[:, :-1],
        },
        mod[:, 1:],
    )

def make_dataset(pairs):
    """Build a batched, tokenized ``tf.data`` pipeline from text pairs.

    Args:
        pairs: Iterable of ``(source_text, target_text)`` tuples.

    Returns:
        A dataset yielding the ``(features, targets)`` tuples produced by
        ``preprocess_batch``, in batches of ``BATCH_SIZE``.
    """
    org_texts, mod_texts = zip(*pairs)
    org_texts = list(org_texts)
    mod_texts = list(mod_texts)
    dataset = tf.data.Dataset.from_tensor_slices((org_texts, mod_texts))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(preprocess_batch, num_parallel_calls=tf.data.AUTOTUNE)
    # Cache the tokenized batches first, then shuffle: a cache placed after
    # the shuffle replays the first epoch's order on every later epoch.
    # The shuffle works on whole batches because it runs after `batch`.
    return dataset.cache().shuffle(2048).prefetch(16)



In [10]:
# Build the input pipelines for the training and validation splits.
train_ds, val_ds = (make_dataset(split) for split in (train_pairs, val_pairs))

In [14]:
# Peek at a single batch to confirm the tensor shapes fed to the model.
for inputs, targets in train_ds.take(1):
    encoder_shape = inputs["encoder_inputs"].shape
    decoder_shape = inputs["decoder_inputs"].shape
    print(f'inputs["encoder_inputs"].shape: {encoder_shape}')
    print(f'inputs["decoder_inputs"].shape: {decoder_shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (64, 256)
inputs["decoder_inputs"].shape: (64, 256)
targets.shape: (64, 256)


In [11]:
# Encoder: token + position embedding followed by one transformer encoder
# layer, wrapped as a standalone `encoder` model.
# Variable-length sequence of source token ids.
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=ORG_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    # NOTE(review): masking is disabled here, so [PAD] positions are
    # presumably attended to like real tokens — confirm this is intended.
    #mask_zero=True,
)(encoder_inputs)

encoder_outputs = keras_nlp.layers.TransformerEncoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(inputs=x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

In [12]:
# Decoder: token + position embedding, one transformer decoder layer that
# also reads the encoder output, dropout, then a softmax over the target
# vocabulary.
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
# Placeholder for the encoder output (feature size EMBED_DIM) so the decoder
# can be built as a standalone model.
encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=MOD_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    # NOTE(review): masking is disabled here, so [PAD] target positions
    # presumably count toward the loss and accuracy — confirm.
    #mask_zero=True,
)(decoder_inputs)

x = keras_nlp.layers.TransformerDecoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(decoder_sequence=x, encoder_sequence=encoded_seq_inputs)
x = keras.layers.Dropout(0.5)(x)
# The softmax activation means the model outputs per-token probabilities,
# not raw logits.
decoder_outputs = keras.layers.Dense(MOD_VOCAB_SIZE, activation="softmax")(x)
decoder = keras.Model(
    [
        decoder_inputs,
        encoded_seq_inputs,
    ],
    decoder_outputs,
)
# Rebind `decoder_outputs` to the decoder applied to the real encoder output,
# which connects the two graphs for the end-to-end model.
decoder_outputs = decoder([decoder_inputs, encoder_outputs])

In [13]:
# Chain the encoder and decoder graphs into one end-to-end model.
seq2seq = keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
    name="s2sTransformer",
)

In [18]:
seq2seq.summary()

In [14]:
# RMSprop optimizer, integer-label cross-entropy loss, token-level accuracy.
seq2seq.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [17]:
# Train for EPOCHS passes, evaluating on the validation split each epoch.
seq2seq.fit(x=train_ds, validation_data=val_ds, epochs=EPOCHS)

Epoch 1/10
[1m1985/1985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m746s[0m 354ms/step - accuracy: 0.8682 - loss: 1.1801 - val_accuracy: 0.8802 - val_loss: 0.8656
Epoch 2/10
[1m1985/1985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m654s[0m 330ms/step - accuracy: 0.8848 - loss: 0.8441 - val_accuracy: 0.9151 - val_loss: 0.6250
Epoch 3/10
[1m1985/1985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m653s[0m 329ms/step - accuracy: 0.9197 - loss: 0.5978 - val_accuracy: 0.9347 - val_loss: 0.4621
Epoch 4/10
[1m1985/1985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m681s[0m 329ms/step - accuracy: 0.9350 - loss: 0.4642 - val_accuracy: 0.9414 - val_loss: 0.3904
Epoch 5/10
[1m1985/1985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m652s[0m 328ms/step - accuracy: 0.9426 - loss: 0.3933 - val_accuracy: 0.9441 - val_loss: 0.3575
Epoch 6/10
[1m1985/1985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m649s[0m 327ms/step - accuracy: 0.9469 - loss: 0.3519 - val_accuracy: 0.9462 - val_loss:

<keras.src.callbacks.history.History at 0x7ab8bf016290>

In [15]:
# Restore previously saved weights into the model.
# NOTE(review): this cell sits before the cell that writes this file, so a
# fresh top-to-bottom run needs a weights file from an earlier session and
# would load it over the weights just trained — confirm the intended order.
seq2seq.load_weights("drive/MyDrive/seq2seq.weights.h5")

  trackable.load_own_variables(weights_store.get(inner_path))


In [19]:
# Persist the full model and, separately, its weights to Drive.
seq2seq.save("drive/MyDrive/seq2seq.keras")
seq2seq.save_weights("drive/MyDrive/seq2seq.weights.h5")


In [16]:
def decode_sequences(input_sentences, max_length=128):
    """Greedily decode one output sentence per input sentence.

    Args:
        input_sentences: Batch of raw source strings (callers pass a rank-1
            string tensor).
        max_length: Total length of the decoder prompt, including the leading
            ``[START]`` token. Defaults to 128, the value that was previously
            hard-coded.

    Returns:
        The result of ``mod_tokenizer.detokenize`` on the sampled token ids,
        one string per input sentence. Special tokens are not removed here;
        callers strip them.
    """
    batch_size = tf.shape(input_sentences)[0]

    # Tokenize the encoder input and densify it to a fixed width.
    encoder_input_tokens = org_tokenizer(input_sentences).to_tensor(
        shape=(None, MAX_SEQUENCE_LENGTH)
    )

    # Scoring callback for the sampler. It is named so that it does not
    # shadow the builtin `next`; the sampler receives it positionally.
    def next_token_probs(prompt, cache, index):
        # The model ends in a softmax layer, so these are probabilities rather
        # than raw logits; greedy (argmax) sampling is unaffected.
        probs = seq2seq([encoder_input_tokens, prompt])[:, index - 1, :]
        # Ignore hidden states for now; only needed for contrastive search.
        hidden_states = None
        return probs, hidden_states, cache

    # Build a prompt of `max_length` tokens: one start token, then padding.
    start = tf.fill((batch_size, 1), mod_tokenizer.token_to_id("[START]"))
    pad = tf.fill(
        (batch_size, max_length - 1), mod_tokenizer.token_to_id("[PAD]")
    )
    prompt = tf.concat((start, pad), axis=-1)

    # NOTE(review): no stop token is passed to the sampler, so generation
    # presumably always runs to `max_length`; callers should discard anything
    # that follows "[END]".
    generated_tokens = keras_nlp.samplers.GreedySampler()(
        next_token_probs,
        prompt,
        index=1,  # Start sampling after start token.
    )
    return mod_tokenizer.detokenize(generated_tokens)

examples = ['the player must not make any move that would place his king in check.',
            "the lyrics are about mankind 's perceived idea of hell.",
            'marriage is a holy union of individuals.']
for input_sentence in examples:
    translated = decode_sequences(tf.constant([input_sentence]))
    translated = translated.numpy()[0].decode("utf-8")
    # Keep only the text before the first "[END]": decoding does not stop at
    # the end token, so anything after it is not part of the answer.
    translated = translated.split("[END]", 1)[0]
    translated = (
        translated.replace("[PAD]", "")
        .replace("[START]", "")
        .strip()
    )

    print("Input: " + input_sentence)
    print("Output: " + translated)
    print()

Input: the player must not make any move that would place his king in check.
Output: the player must not make any move that would place in his .

Input: the lyrics are about mankind 's perceived idea of hell.
Output: the lyrics are about humankind ' s perceived idea of hell .

Input: marriage is a holy union of individuals.
Output: marriage is a holy union of individuals .



In [16]:
# Split the test pairs into their source (index 0) and target (index 1) texts.
test_source_sequences = test_pairs.map(lambda pair: pair[0])
test_target_sequences = test_pairs.map(lambda pair: pair[1])

In [26]:
test_source_sequences[0]

'he devoted his enormous energies to the destruction of what he considered the slave power, that is the conspiracy he saw of slave owners to seize control of the federal government and block the progress of liberty .'

In [27]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
predicted_sequences3 = []

# Number of sentences decoded per `decode_sequences` call. Decoding used to
# happen one sentence at a time, which made the outer batching a no-op.
batch_size = 64

# Position in the test set at which decoding starts.
# NOTE(review): presumably rows before this index were decoded in an earlier
# session — confirm before combining results.
start_index = 8500

# Materialise the remaining source sentences once (positional slice).
remaining_sequences = list(test_source_sequences[start_index:])

for batch_start in range(0, len(remaining_sequences), batch_size):
    batch_sequences = remaining_sequences[batch_start:batch_start + batch_size]

    # Decode the whole batch in a single model call.
    decoded_batch = decode_sequences(tf.constant(batch_sequences)).numpy()

    for raw_prediction in decoded_batch:
        predicted_sequence = raw_prediction.decode("utf-8")
        # Keep only the text before the first "[END]": decoding does not stop
        # at the end token, so anything after it is not part of the answer.
        predicted_sequence = predicted_sequence.split("[END]", 1)[0]
        predicted_sequence = (
            predicted_sequence.replace("[PAD]", "")
            .replace("[START]", "")
            .strip()
        )
        predicted_sequences3.append(predicted_sequence)


In [None]:
import multiprocessing

# Define the function for processing a batch of sequences
def process_batch(batch_sequences, decode_batch_size=64):
    """Decode a collection of source sentences into cleaned prediction strings.

    Args:
        batch_sequences: Iterable of source sentences (strings).
        decode_batch_size: How many sentences are passed to
            ``decode_sequences`` per call.

    Returns:
        A list with one prediction per input sentence, in input order. Each
        prediction is cut at the first ``"[END]"`` marker, has ``"[PAD]"`` and
        ``"[START]"`` removed, and is stripped of surrounding whitespace.
        An empty input yields an empty list.
    """
    sentences = list(batch_sequences)
    batch_predictions = []
    for offset in range(0, len(sentences), decode_batch_size):
        chunk = sentences[offset:offset + decode_batch_size]
        # Generate predictions for the whole chunk in one model call.
        decoded_chunk = decode_sequences(tf.constant(chunk)).numpy()
        for raw_prediction in decoded_chunk:
            predicted_sequence = raw_prediction.decode("utf-8")
            # Decoding does not stop at the end token, so text after the
            # first "[END]" is discarded rather than kept.
            predicted_sequence = predicted_sequence.split("[END]", 1)[0]
            predicted_sequence = (
                predicted_sequence.replace("[PAD]", "")
                .replace("[START]", "")
                .strip()
            )
            batch_predictions.append(predicted_sequence)
    return batch_predictions

# Define batch size and start index
batch_size = 1000
start_index = 10500

# Divide source sequences into batches
num_batches = ((len(test_source_sequences) - start_index) + batch_size - 1) // batch_size

# NOTE(review): each worker process needs its own usable copy of the model
# and tokenizers; TensorFlow/GPU state may not survive being forked into pool
# workers — confirm this cell actually completes in the target environment.
predicted_sequences4 = []

# The context manager tears the pool down even if a worker raises, so no
# worker processes are leaked on failure.
with multiprocessing.Pool() as pool:
    # Submit every batch up front so the workers can run in parallel.
    results = []
    for i in range(num_batches):
        start_idx = start_index + (i * batch_size)
        end_idx = min(start_index + ((i + 1) * batch_size), len(test_source_sequences))
        batch_sequences = test_source_sequences[start_idx:end_idx]
        results.append(pool.apply_async(process_batch, args=(batch_sequences,)))

    # Collect the results in submission order, which keeps test-set order.
    for result in results:
        predicted_sequences4.extend(result.get())

    # All work is done: shut the pool down gracefully.
    pool.close()
    pool.join()


In [19]:
# Write the predictions collected in `predicted_sequences3` to a CSV on Drive.
pd.DataFrame(predicted_sequences3).to_csv("drive/MyDrive/test_seq2seq_sequences3.csv")

In [None]:
#evaluate metrics

import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher



# NOTE(review): `predicted_sequences` is not defined in any visible cell (only
# `predicted_sequences3` / `predicted_sequences4` are) — confirm where the
# full prediction list is assembled before running this cell.
assert len(predicted_sequences) == len(test_target_sequences), (
    "predictions and targets must be aligned one-to-one"
)

# Compute BLEU score
# NLTK's BLEU functions expect token lists. A raw string is treated as a
# sequence of characters, which would score character n-grams, not words.
reference_tokens = [target_sequence.split() for target_sequence in test_target_sequences]
hypothesis_tokens = [predicted_sequence.split() for predicted_sequence in predicted_sequences]

bleu_scores = [
    sentence_bleu([reference], hypothesis)
    for reference, hypothesis in zip(reference_tokens, hypothesis_tokens)
]
# Guard against an empty evaluation set instead of dividing by zero.
average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0
overall_bleu_score = corpus_bleu([[reference] for reference in reference_tokens], hypothesis_tokens)

# Compute ROUGE score (hypotheses first, references second, averaged over pairs)
rouge = Rouge()
rouge_scores = rouge.get_scores(list(predicted_sequences), list(test_target_sequences), avg=True)

# Compute accuracy (fraction of predictions that equal their target exactly)
accuracy = accuracy_score(test_target_sequences, predicted_sequences)

# Similarity helper built on difflib's SequenceMatcher.
def similar(a, b):
    """Return the ``SequenceMatcher`` similarity ratio between ``a`` and ``b``.

    No junk heuristic is supplied. The value is whatever
    ``SequenceMatcher.ratio()`` returns for the two sequences.
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
# `similar` compares two sequences element by element, so it is applied to
# each (target, prediction) string pair and the ratios are averaged. Calling
# it once on the two whole collections would treat each sentence as a single
# item instead of measuring how close the texts are.
pair_similarities = [
    similar(target_sequence, predicted_sequence)
    for target_sequence, predicted_sequence in zip(test_target_sequences, predicted_sequences)
]
similarity_score = (
    sum(pair_similarities) / len(pair_similarities) if pair_similarities else 0.0
)

# Print or store the evaluation metrics
print("BLEU Score (Average):", average_bleu_score)
print("BLEU Score (Overall):", overall_bleu_score)
print("ROUGE Score (Avg):", rouge_scores)
print("Accuracy:", accuracy)
print("Similarity:", similarity_score)