# Code to generate the plots used in the presentation.

In [None]:
# Environment setup (run these commands in a shell, not in the notebook):
# uv venv
# uv pip install pandas matplotlib seaborn scipy scikit-learn statsmodels palmerpenguins pip ipykernel ipywidgets setuptools jinja2 transformers tqdm torch

In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from itertools import cycle
from IPython.display import display, HTML

import torch
from transformers import pipeline, set_seed
from transformers import GPT2Tokenizer, GPT2Model
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

# Next word probability

In [None]:
# Prompt to complete -- try changing it (swap which line is commented out).
# text = "It was the best of times,"
text = "The wizard"

# Code adapted from: https://stackoverflow.com/questions/76397904/generate-the-probabilities-of-all-the-next-possible-word-for-a-given-text

t = GPT2TokenizerFast.from_pretrained("gpt2")
m = GPT2LMHeadModel.from_pretrained("gpt2")

# 100 tokens are generated below, but only the first 3 x 6 = 18 steps get a panel.
fig, axs = plt.subplots( 3, 6, figsize = (16,9), layout = 'constrained', dpi = 100 )
panels = axs.flatten()   # Flattened once, instead of on every iteration

for i in tqdm(range(100)):

    # Probability distribution of the token following the text generated so far
    encoded_text = t(text, return_tensors="pt")
    with torch.inference_mode():
        outputs = m(**encoded_text)
    next_token_logits = outputs.logits[0, -1, :]
    next_token_probs = torch.softmax(next_token_logits, -1)
    topk_next_tokens = torch.topk(next_token_probs, 10)

    # Keep the 10 most likely tokens, renormalise their probabilities so they
    # sum to 1, and reverse the rows so that the most likely token comes last.
    next_word = pd.DataFrame(
        [(t.decode(idx), prob.item()) for idx, prob in zip(topk_next_tokens.indices, topk_next_tokens.values)],
        columns = ['token', 'probability'],
    )
    next_word['probability'] /= next_word['probability'].sum()
    next_word = next_word.iloc[::-1,:]

    w = np.random.choice( next_word['token'], p = next_word['probability'] )   # Random token
    #w = next_word['token'].iloc[-1]                                            # Most likely token

    if i < len(panels):

        ax = panels[i]
        b = ax.barh( next_word['token'], next_word['probability'] )
        # Positional index of the sampled token (the first match).  The mask is
        # converted to a plain NumPy array first, so the lookup does not depend
        # on how NumPy wraps results computed from pandas objects.
        j = np.flatnonzero( (next_word['token'] == w).to_numpy() )[0]
        b[j].set_color('tab:red')   # Highlight the token that was drawn
        for side in ['left', 'top', 'right']:
            ax.spines[side].set_visible(False)
        ax.tick_params(axis='y', length=0)
        ax.set_xlim(0,1)
        # Only the first three panels are titled (with the text so far, whitespace collapsed)
        if i < 3:  # axs.shape[1]:
            ax.set_title( re.sub(r'\s+', ' ', text) )

    # Append the chosen token; the longer text is re-encoded on the next iteration
    text += w

plt.show()
print( text )

# Text completion

In [None]:
# Generate five continuations (20 new tokens each at most) of a fixed prompt with GPT-2.
generator = pipeline(task='text-generation', model='gpt2')
set_seed(42)   # Fixed seed, set just before generating
generator(
    "Hello, I'm a language model,",
    max_new_tokens=20,
    num_return_sequences=5,
)

# Tokenization

In [None]:
# Sample passage for the tokenization demo: the opening sentence of Charles
# Dickens's "A Tale of Two Cities", stored here with hard line breaks.
text = """It was the best of times, it was the worst of times, it was the age of
wisdom, it was the age of foolishness, it was the epoch of belief, it
was the epoch of incredulity, it was the season of Light, it was the
season of Darkness, it was the spring of hope, it was the winter of
despair, we had everything before us, we had nothing before us, we were
all going direct to Heaven, we were all going direct the other way--in
short, the period was so far like the present period, that some of its
noisiest authorities insisted on its being received, for good or for
evil, in the superlative degree of comparison only."""

In [None]:
# HTML emitted before the coloured tokens: opens the document and loads the
# Alegreya web font for the body text.  The empty last item yields the
# trailing newline.
before = "\n".join( [
    '<html>',
    '<head>',
    '<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Alegreya">',
    '<style>',
    'body {',
    '  font-family: "Alegreya", serif;',
    '}',
    '</style>',
    '</head>',
    '<body>',
    '',
] )

# HTML emitted after the coloured tokens: closes the document.
after = "</body>\n</html>\n"

# Turn every line break of the passage into a single space.
text = " ".join( text.split( "\n" ) )

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
token_ids = tokenizer(text)['input_ids']
tokens = [ tokenizer.decode(u) for u in token_ids ]
colors = cycle( [ 'blue', 'red' ] )
colors = cycle( ['Aqua', 'Bisque', 'LightGreen', 'LightSalmon', 'SkyBlue', 'Pink' ] )
''.join( [ f'<span style="color:{color}">{text}</span>' for text, color in zip( tokens, colors ) ] )
result = ''.join( [ f'<span style="background:{color}">{text}</span>' for text, color in zip( tokens, colors ) ] )
with open('a.html','w') as f:
    print( 
        before + 
        result +
        after,
        file = f,
    )
! perl -p -e 's#</?span.*?>##g' a.html  > b.html  
HTML( f"<span style='font-size: 2em'>" + result + "</span>" )

# Token embeddings

In [None]:
# Load the GPT-2 tokenizer and base model, then run one forward pass on a
# placeholder sentence.
text = "Replace me by any text you'd like."
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2")
encoded_input = tokenizer(text, return_tensors="pt")   # PyTorch tensors
output = model(**encoded_input)

In [None]:
# Embedding vectors of the current `token_ids`, one row per token, looked up
# in the model's `wte` layer.
x = model.wte( torch.tensor( token_ids ) )

# Print the first 20 tokens (or fewer, if the text is shorter) with the first
# 10 components of their embedding, rounded to 3 decimals.
for token, row in zip( tokens[:20], x ):
    label = f'"{token}"'
    values = [ round(u,3) for u in row[:10].tolist() ]
    # str() of the list puts quotes around the ellipsis marker: remove them
    print( f'{label:10s}' + str( values + ["⋯"] ).replace( "'⋯'", "⋯" ) )

In [None]:
# Same display for a short phrase: tokenize it, decode each token id back to
# text, and look up the embedding vectors in the model's `wte` layer.
text = "central bank"
token_ids = tokenizer(text)['input_ids']
tokens = [ tokenizer.decode(u) for u in token_ids ]
x = model.wte( torch.tensor( token_ids ) )

# One line per token actually produced by the tokenizer (no hard-coded count),
# with the first 10 components of its embedding, rounded to 3 decimals.
for token, row in zip( tokens, x ):
    label = f'"{token}"'
    values = [ round(u,3) for u in row[:10].tolist() ]
    # str() of the list puts quotes around the ellipsis marker: remove them
    print( f'{label:10s}' + str( values + ["⋯"] ).replace( "'⋯'", "⋯" ) )

# Plots used in the presentation

In [None]:
# Six activation functions, one per panel, all drawn on the same
# [-a, a] x [-a, a] window with hand-drawn axes.
fig, axs = plt.subplots( 2, 3, figsize = (15, 8), dpi = 100 )

a = 3
xs = np.linspace( -a, a, 101 )

# Curves in panel order (row by row)
relu = np.maximum( xs, 0 )
curves = [
    1 + 1.5 * xs,                         # Linear
    relu,                                 # ReLU
    np.where( xs >= 0, relu, .2 * xs ),   # Leaky ReLU
    np.log( 1 + np.exp(xs) ),             # Softplus
    np.tanh( xs ),                        # tanh
    1 / ( 1 + np.exp(-xs) ),              # Sigmoid
]

# Integer positions of the '+' tick marks along each axis
ticks = np.arange( -a, a + 1 )
zeros = [0] * len(ticks)

for ax, ys in zip( axs.flatten(), curves ):
    ax.plot( xs, ys, linewidth = 5, zorder = 10 )
    # Axes through the origin, with tick marks, replacing the default frame
    ax.axhline( 0, color = 'black' )
    ax.axvline( 0, color = 'black' )
    ax.scatter( ticks, zeros, marker = '+', color = 'black' )
    ax.scatter( zeros, ticks, marker = '+', color = 'black' )
    ax.set_xlim( -1.02*a, 1.02*a )
    ax.set_ylim( -1.02*a, 1.02*a )
    ax.axis('off')
    ax.set_aspect(1)

fig.subplots_adjust(hspace=0.2)
plt.show()