# Probing the Dynamics of Language Using Word-Embedding and Text-Generation Models


**Authors:** Roman Hall, Gopal Iyer, Zaul Tavangar

**Description:** Notebook containing our language models (miniature version of GPT) for each year of headlines. Adapted from "Text generation with a miniature GPT" by Apoorv Nandan. https://keras.io/examples/generative/text_generation_with_miniature_gpt/


## Introduction

We implement an autoregressive language model
using a miniature version of the GPT model.
The model consists of a single Transformer block with causal masking
in its attention layer.

## Setup

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
import numpy as np
import os
import re
import string
import random
import pickle
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


## Implement a Transformer block as a layer

In [None]:

def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """Build a batched causal attention mask.

    Position ``[b, i, j]`` is set when destination position ``i`` is allowed
    to attend to source position ``j``. The set entries form a lower triangle
    aligned to the lower-right corner, so a token never sees later tokens.

    Args:
        batch_size: Scalar number of times the mask is repeated.
        n_dest: Number of destination (query) positions.
        n_src: Number of source (key) positions.
        dtype: Dtype the boolean mask is cast to.

    Returns:
        Tensor of shape ``[batch_size, n_dest, n_src]``.
    """
    dest_pos = tf.range(n_dest)[:, None]
    src_pos = tf.range(n_src)
    visible = dest_pos >= src_pos - n_src + n_dest
    # Add a leading batch axis of size 1, then repeat it batch_size times.
    single_mask = tf.expand_dims(tf.cast(visible, dtype), axis=0)
    repeats = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(single_mask, repeats)


class TransformerBlock(layers.Layer):
    """Transformer block: causal self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: Size of the feed-forward output; also used as the
            attention key dimension.
        num_heads: Number of attention heads.
        ff_dim: Hidden size of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(ff_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        """Apply masked self-attention and the feed-forward net to ``inputs``."""
        shape = tf.shape(inputs)
        n_batch, n_tokens = shape[0], shape[1]
        # Each position may only attend to itself and earlier positions.
        mask = causal_attention_mask(n_batch, n_tokens, n_tokens, tf.bool)
        attended = self.dropout1(self.att(inputs, inputs, attention_mask=mask))
        normed = self.layernorm1(inputs + attended)
        transformed = self.dropout2(self.ffn(normed))
        return self.layernorm2(normed + transformed)


In [None]:
# Load the pretrained embedding matrices: one for the full corpus and one
# for each year from 2003 through 2021.
pretrained_embeddings_full = np.loadtxt(
    '/content/drive/My Drive/Deep Learning Final Project/vectors/all_vectors.tsv'
)

pretrained_embeddings_yearwise = [
    np.loadtxt(f'/content/drive/My Drive/Deep Learning Final Project/vectors/{year}_vectors.tsv')
    for year in range(2003, 2022)
]

## Implement an embedding layer

Create two separate embedding layers: one for tokens and one for token index
(positions).

In [None]:

class TokenAndPositionEmbedding(layers.Layer):
    """Add a position embedding to a frozen, pretrained token embedding.

    Args:
        maxlen: Number of distinct positions the position embedding covers.
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Width of both embeddings.
        pretrained_embeddings: Values used to initialize the token embedding
            table (presumably shaped ``(vocab_size, embed_dim)``; confirm
            against the loaded vector files).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, pretrained_embeddings):
        super().__init__()
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

        pretrained_init = tf.keras.initializers.Constant(pretrained_embeddings)
        self.token_emb = layers.Embedding(
            input_dim=vocab_size,
            output_dim=embed_dim,
            embeddings_initializer=pretrained_init,
            # Keep the pretrained vectors fixed; use True to fine-tune them.
            trainable=False,
            # Index 0 is treated as the padding token.
            mask_zero=True,
        )

    def call(self, x):
        """Return token embeddings of ``x`` plus the embeddings of their positions."""
        seq_len = tf.shape(x)[-1]
        position_vectors = self.pos_emb(tf.range(start=0, limit=seq_len, delta=1))
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors


## Implement the miniature GPT model

In [None]:
# Size of the token embedding table and of the output logits
vocab_size = 4095
# Maximum number of tokens in a model input sequence
maxlen = 14
# Width of each token/position embedding
embed_dim = 256
# Attention heads in the transformer block
num_heads = 2
# Hidden units of the feed-forward network inside the transformer block
feed_forward_dim = 256


def create_model(year):
    """Build and compile a miniature GPT model for one slice of the corpus.

    Args:
        year: ``-1`` selects the embeddings trained on the full dataset; any
            other value is a calendar year, starting at 2003, that selects
            the matching entry of ``pretrained_embeddings_yearwise``.

    Returns:
        A compiled ``keras.Model`` with two outputs: the next-token logits
        and the transformer block's output. Only the logits contribute to
        the loss.

    Raises:
        ValueError: If ``year`` is neither ``-1`` nor a year for which
            embeddings were loaded.
    """
    if year == -1:  # Full dataset
        pretrained_embeddings = pretrained_embeddings_full
    else:
        year_index = year - 2003
        # Reject out-of-range years explicitly: a negative index would
        # otherwise wrap around and silently pick another year's embeddings.
        if not 0 <= year_index < len(pretrained_embeddings_yearwise):
            raise ValueError(
                f"year must be -1 or between 2003 and "
                f"{2003 + len(pretrained_embeddings_yearwise) - 1}, got {year!r}"
            )
        pretrained_embeddings = pretrained_embeddings_yearwise[year_index]

    inputs = layers.Input(shape=(maxlen,), dtype=tf.int32)
    embedding_layer = TokenAndPositionEmbedding(
        maxlen, vocab_size, embed_dim, pretrained_embeddings
    )
    x = embedding_layer(inputs)
    transformer_block = TransformerBlock(embed_dim, num_heads, feed_forward_dim)
    x = transformer_block(x)
    outputs = layers.Dense(vocab_size)(x)
    model = keras.Model(inputs=inputs, outputs=[outputs, x])
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # No loss and optimization based on word embeddings from transformer block
    model.compile("adam", loss=[loss_fn, None])
    return model


## Prepare the data for word-level language modelling

In [None]:
batch_size = 1024

# File lists: each dataset is built from a one-element list of headline files.
all_headlines_file = ['/content/drive/My Drive/Deep Learning Final Project/data/all_headlines.txt']
filenames_full = [all_headlines_file]

filenames_yearwise = [
    [f'/content/drive/My Drive/Deep Learning Final Project/data/{year}_headlines.txt']
    for year in range(2003, 2022)
]

# One line per headline, shuffled and grouped into batches.
text_ds_full = (
    tf.data.TextLineDataset(filenames_full)
    .shuffle(buffer_size=256)
    .batch(batch_size)
)

text_ds_yearwise = [
    tf.data.TextLineDataset(year_files).shuffle(buffer_size=256).batch(batch_size)
    for year_files in filenames_yearwise
]

# Text standardization used by the vectorization layers
def custom_standardization(input_data):
    """Lowercase ``input_data`` and delete every ``string.punctuation`` character."""
    punctuation_class = f'[{re.escape(string.punctuation)}]'
    lowered = tf.strings.lower(input_data)
    return tf.strings.regex_replace(lowered, punctuation_class, '')

def _build_vectorize_layer(dataset):
    """Create a TextVectorization layer and adapt it to the text in ``dataset``."""
    layer = TextVectorization(
        standardize=custom_standardization,
        max_tokens=vocab_size - 1,
        output_mode="int",
        # One extra token so inputs and shifted targets can both be maxlen long.
        output_sequence_length=maxlen + 1,
    )
    layer.adapt(dataset)
    return layer


# Vectorization layer and vocabulary for the full set of headlines
vectorize_layer_full = _build_vectorize_layer(text_ds_full)
vocab_full = vectorize_layer_full.get_vocabulary()  # To get words back from token indices

# One vectorization layer and vocabulary per year, in the same order as text_ds_yearwise
vectorize_layer_yearwise = [_build_vectorize_layer(ds) for ds in text_ds_yearwise]
vocab_yearwise = [layer.get_vocabulary() for layer in vectorize_layer_yearwise]

print(vocab_full)
# Print each vocabulary itself; passing a generator expression to print()
# would only show the generator object's repr.
for year_vocab in vocab_yearwise:
    print(year_vocab)


def prepare_lm_inputs_labels(text, year=-1):
    """Tokenize a batch of text and split it into model inputs and targets.

    Shift word sequences by 1 position so that the target for position (i) is
    word at position (i+1). The model will use all words up till position (i)
    to predict the next word.

    Args:
        text: Batch of raw text lines.
        year: ``-1`` to tokenize with the full-corpus vectorization layer, or
            a calendar year starting at 2003 to use that year's layer.

    Returns:
        Tuple ``(x, y)`` where ``x`` drops the last token of every sequence
        and ``y`` drops the first.

    Raises:
        ValueError: If ``year`` is neither ``-1`` nor a year with a
            vectorization layer.
    """
    if year == -1:
        vectorize_layer = vectorize_layer_full
    else:
        year_index = year - 2003
        # Reject out-of-range years explicitly: a negative index would
        # otherwise wrap around and silently use another year's vocabulary.
        if not 0 <= year_index < len(vectorize_layer_yearwise):
            raise ValueError(
                f"year must be -1 or between 2003 and "
                f"{2003 + len(vectorize_layer_yearwise) - 1}, got {year!r}"
            )
        vectorize_layer = vectorize_layer_yearwise[year_index]

    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y


def _make_year_mapper(year):
    """Return a dataset map function that tokenizes with ``year``'s vocabulary.

    Binding ``year`` through this factory avoids relying on a lambda that
    closes over the loop variable, which would see the variable's latest
    value if it were ever called after the loop advanced.
    """

    def _map_fn(text):
        return prepare_lm_inputs_labels(text=text, year=year)

    return _map_fn


# Turn raw text batches into (input, target) token batches
text_ds_full = text_ds_full.map(prepare_lm_inputs_labels)
text_ds_full = text_ds_full.prefetch(tf.data.AUTOTUNE)

for i in range(2003, 2022):
    text_ds_yearwise[i - 2003] = text_ds_yearwise[i - 2003].map(_make_year_mapper(i))
    text_ds_yearwise[i - 2003] = text_ds_yearwise[i - 2003].prefetch(tf.data.AUTOTUNE)


<generator object <genexpr> at 0x7feeab320c10>


## Implement a Keras callback for generating text

In [None]:

class TextGenerator(keras.callbacks.Callback):
    """A callback to generate text from a trained model.
    1. Feed some starting prompt to the model
    2. Predict probabilities for the next token
    3. Sample the next token and add it to the next input

    Arguments:
        max_tokens: Integer, the number of tokens to be generated after prompt.
        start_tokens: List of integers, the token indices for the starting prompt.
        index_to_word: List of strings, obtained from the TextVectorization layer.
        top_k: Integer, sample from the `top_k` token predictions.
        print_every: Integer, print after this many epochs.
    """

    def __init__(
        self, max_tokens, start_tokens, index_to_word, top_k=10, print_every=1
    ):
        super().__init__()
        self.max_tokens = max_tokens
        self.start_tokens = start_tokens
        self.index_to_word = index_to_word
        self.print_every = print_every
        self.k = top_k

    def sample_from(self, logits):
        """Sample one token id from the softmax over the ``top_k`` largest logits."""
        logits, indices = tf.math.top_k(logits, k=self.k, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)

    def detokenize(self, number):
        """Return the word stored at index ``number`` of ``index_to_word``."""
        return self.index_to_word[number]

    def on_epoch_end(self, epoch, logs=None):
        """Generate and print ``max_tokens`` tokens every ``print_every`` epochs."""
        if (epoch + 1) % self.print_every != 0:
            return
        # Work on a copy so the prompt stored on the callback is never mutated.
        context = list(self.start_tokens)
        tokens_generated = []
        # Strict "<" so exactly max_tokens tokens are produced.
        while len(tokens_generated) < self.max_tokens:
            pad_len = maxlen - len(context)
            if pad_len < 0:
                # Context longer than the model input: truncate it and read
                # the prediction at the last input position.
                x = context[:maxlen]
                sample_index = maxlen - 1
            else:
                # Pad with token 0 and read the prediction at the last real token.
                x = context + [0] * pad_len
                sample_index = len(context) - 1
            y, _ = self.model.predict(np.array([x]))
            sample_token = self.sample_from(y[0][sample_index])
            tokens_generated.append(sample_token)
            # Feed the sampled token back so the next prediction is
            # conditioned on it (step 3 of the class docstring).
            context.append(sample_token)
        txt = " ".join(
            [self.detokenize(token) for token in list(self.start_tokens) + tokens_generated]
        )
        print(f"generated text:\n{txt}\n")


# Build the word -> token index lookup for the full-corpus vocabulary
word_to_index_full = {word: index for index, word in enumerate(vocab_full)}

# Prompt shared by every text-generation callback; words missing from a
# vocabulary fall back to index 1.
start_prompt = "the police are"
num_tokens_generated = 3
start_tokens = [word_to_index_full.get(word, 1) for word in start_prompt.split()]
text_gen_callback_full = TextGenerator(num_tokens_generated, start_tokens, vocab_full)

# Same lookup and callback for every yearly vocabulary, in year order
word_to_index_yearwise = []
text_gen_callback_yearwise = []
for year_vocab in vocab_yearwise:
    year_dict = {word: index for index, word in enumerate(year_vocab)}
    word_to_index_yearwise.append(year_dict)
    start_tokens = [year_dict.get(word, 1) for word in start_prompt.split()]
    text_gen_callback_yearwise.append(
        TextGenerator(num_tokens_generated, start_tokens, year_vocab)
    )

## Train the model

Note: This code should preferably be run on GPU.

In [None]:
# Fit one model on the complete set of headlines
model_full = create_model(-1)
model_full.fit(
    text_ds_full,
    epochs=20,
    verbose=2,
    callbacks=[text_gen_callback_full],
)

# Fit a separate model for each year, stored at index (year - 2003)
models_yearwise = [None for _ in range(2003, 2022)]
for year in range(2003, 2022):
    slot = year - 2003
    models_yearwise[slot] = create_model(year)
    models_yearwise[slot].fit(
        text_ds_yearwise[slot],
        epochs=20,
        verbose=2,
        callbacks=[text_gen_callback_yearwise[slot]],
    )

Epoch 1/20


In [None]:
def _top_next_tokens(model, prompt_tokens, num_candidates=4):
    """Return the ids of the highest-scoring candidates for the next token.

    The model is run once on ``prompt_tokens``. The result holds the
    ``num_candidates`` token ids with the largest logits at the last prompt
    position, ordered from lowest to highest score. These are alternative
    candidates for a single position, not a multi-token continuation.
    """
    input_sequence = np.array(prompt_tokens)[np.newaxis, :]
    logits = model.predict(input_sequence)[0]
    return list(np.argsort(logits[0, -1, :])[-num_candidates:])


start_prompt = "obama"

print("Full model:")
start_tokens_full = [word_to_index_full.get(_, 1) for _ in start_prompt.split()]
start_tokens_full += _top_next_tokens(model_full, start_tokens_full)
print(len(start_tokens_full))

generated_text = ' '.join([vocab_full[token] for token in start_tokens_full])
print(generated_text)

for year in range(2003, 2022):
    print("\n ********* \n")
    print(year, "model:")
    print(start_prompt.split())
    # Words missing from this year's vocabulary map to index 1.
    start_tokens_year = [word_to_index_yearwise[year - 2003].get(_, 1) for _ in start_prompt.split()]
    print(start_tokens_year)

    start_tokens_year += _top_next_tokens(models_yearwise[year - 2003], start_tokens_year)
    print(len(start_tokens_year))

    generated_text = ' '.join([vocab_yearwise[year - 2003][token] for token in start_tokens_year])
    print(generated_text)

Full model:
5
obama announces of says [UNK]

 ********* 

2003 model:
['obama']
[1]
5
[UNK] in  to [UNK]

 ********* 

2004 model:
['obama']
[1]
5
[UNK] in for to [UNK]

 ********* 

2005 model:
['obama']
[1]
5
[UNK] in for to [UNK]

 ********* 

2006 model:
['obama']
[1]
5
[UNK] in  to [UNK]

 ********* 

2007 model:
['obama']
[3168]
5
obama in for to [UNK]

 ********* 

2008 model:
['obama']
[156]
5
obama in for to [UNK]

 ********* 

2009 model:
['obama']
[176]
5
obama for  to [UNK]

 ********* 

2010 model:
['obama']
[339]
5
obama in for to [UNK]

 ********* 

2011 model:
['obama']
[314]
5
obama for  to [UNK]

 ********* 

2012 model:
['obama']
[272]
5
obama on for to [UNK]

 ********* 

2013 model:
['obama']
[525]
5
obama for  to [UNK]

 ********* 

2014 model:
['obama']
[599]
5
obama for in to [UNK]

 ********* 

2015 model:
['obama']
[712]
5
obama for of to [UNK]

 ********* 

2016 model:
['obama']
[439]
5
obama in of to [UNK]

 ********* 

2017 model:
['obama']
[892]
5
obama in