In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

%matplotlib inline
%matplotlib ipympl
import matplotlib.pyplot as plt 
import random
import math 
import time
import numpy as np

#local imports
from TransformerModules import Modelconfig, Block, MLP, CausalSelfAttention

In [None]:
class AttentionOnlyModel(nn.Module):
    """Language model made of embeddings, one attention layer and a linear head.

    The forward pass sums a token embedding and a position embedding, adds the
    output of a single ``CausalSelfAttention`` module as a residual, and maps
    the result to vocabulary logits with a bias-free linear layer. This class
    itself applies no MLP and no normalisation.
    """

    def __init__(self, config):
        """Create the embedding tables, the attention layer and the output head.

        Args:
            config: object exposing ``vocab_size``, ``block_size`` and
                ``n_embed``. It is stored on the model and also passed to
                ``CausalSelfAttention``.
        """
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(
            dict(
                wce = nn.Embedding(config.vocab_size, config.n_embed),
                wpe = nn.Embedding(config.block_size, config.n_embed),
                h = CausalSelfAttention(config)
                )
            )
        self.lm_head = nn.Linear(config.n_embed, config.vocab_size, bias=False)

    def forward(self, idx, targets = None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: integer tensor of token ids with shape ``(B, T)`` where
                ``T <= config.block_size``.
            targets: optional integer tensor holding ``B*T`` target ids.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            ``(B, T, vocab_size)`` and ``loss`` is ``None``. With targets,
            ``logits`` is flattened to ``(B*T, vocab_size)`` and ``loss`` is
            the scalar cross-entropy.
        """
        B, T = idx.shape
        assert T <= self.config.block_size, f"Cannot forward sequence of lenght {T}, block size is {self.config.block_size}"

        char_emb = self.transformer.wce(idx)
        # Build the position ids on the same device as the input so the model
        # also works when it has been moved off the CPU.
        positions = torch.arange(0, T, dtype=torch.long, device=idx.device)
        pos_emb = self.transformer.wpe(positions)

        # pos_emb is (T, n_embed) and broadcasts over the batch dimension.
        x = char_emb + pos_emb

        # Residual connection around the single attention layer.
        x = x + self.transformer.h(x)

        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, C) logits and (N,) targets.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokes):
        """Sample ``max_new_tokes`` tokens and append them to ``idx``.

        Sampling runs without gradient tracking. The model's train/eval mode
        is left untouched.

        Args:
            idx: integer tensor of token ids with shape ``(B, T)``.
            max_new_tokes: number of tokens to append to every row.

        Returns:
            Integer tensor of shape ``(B, T + max_new_tokes)``.
        """
        for _ in range(max_new_tokes):
            # Only the last block_size tokens fit into the position table.
            idx_cond = idx[:, -self.config.block_size:]
            logits, _ = self(idx_cond)
            # Keep the prediction for the final position only.
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx



In [None]:
# Toy corpus: the repeating pattern A, B, C, A, B, C, ... as a list of characters.
data = list('ABC' * 1000)

# Character <-> integer id lookup tables.
c_to_i = {'A': 0, 'B': 1, 'C': 2}
i_to_c = {idx: ch for ch, idx in c_to_i.items()}


def encode(s):
    """Map an iterable of characters to a list of integer ids."""
    return [c_to_i[ch] for ch in s]


def decode(l):
    """Map an iterable of integer ids back to a string."""
    return ''.join(i_to_c[idx] for idx in l)


# The whole corpus as a 1-D tensor of token ids.
i_data = torch.tensor(encode(data), dtype=torch.long)

In [None]:
# get a batch of training data
def get_batch(model, batch_size, data=None):
    """Draw a random batch of input windows and their next-token targets.

    Args:
        model: object whose ``config.block_size`` gives the window length.
        batch_size: number of windows to draw.
        data: optional 1-D tensor of token ids to sample from. When omitted,
            the notebook-level ``i_data`` tensor is used.

    Returns:
        ``(x, y)``, both of shape ``(batch_size, block_size)``. ``y`` is ``x``
        shifted one position to the right in ``data``.

    Raises:
        ValueError: if ``data`` is too short to hold one window and its target.
    """
    if data is None:
        data = i_data
    block_size = model.config.block_size

    # Each window needs block_size inputs plus one extra token for the target.
    n_starts = len(data) - block_size
    if n_starts <= 0:
        raise ValueError(
            f"data of length {len(data)} is too short for block size {block_size}"
        )

    ix = torch.randint(n_starts, (batch_size,))
    # Row r of `window` holds the indices ix[r], ix[r]+1, ..., ix[r]+block_size-1.
    window = ix[:, None] + torch.arange(block_size)
    x = data[window]
    y = data[window + 1]
    return x, y

In [None]:
@torch.no_grad()
def estimate_loss(model, batch_size, train_data, val_data, eval_iters):
    """Estimate the mean loss on the training and validation data.

    Each split is evaluated on batches sampled from its own tensor, so the
    two reported numbers differ whenever ``train_data`` and ``val_data`` do.

    Args:
        model: module with ``config.block_size`` whose call
            ``model(X, Y)`` returns ``(logits, loss)``.
        batch_size: number of windows per evaluation batch.
        train_data: 1-D tensor of token ids for the ``'train'`` estimate.
        val_data: 1-D tensor of token ids for the ``'val'`` estimate.
        eval_iters: number of random batches averaged per split.

    Returns:
        Dict with keys ``'train'`` and ``'val'``, each a 0-dim tensor holding
        the mean loss over ``eval_iters`` batches.
    """
    splits = {'train': train_data, 'val': val_data}
    block_size = model.config.block_size
    offsets = torch.arange(block_size)

    # Remember the current mode so evaluation does not silently flip it.
    was_training = model.training
    model.eval()
    out = {}
    try:
        for split, data in splits.items():
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                # Sample windows from this split's own data; targets are the
                # same windows shifted by one token.
                starts = torch.randint(len(data) - block_size, (batch_size,))
                window = starts[:, None] + offsets
                X, Y = data[window], data[window + 1]
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        model.train(was_training)
    return out

In [None]:
def training_loop(model, optimizer, batch_size, max_iters, eval_interval, train_data, val_data, eval_iters):
    """Optimise ``model`` for ``max_iters`` steps, printing losses periodically.

    Every ``eval_interval`` steps, and once more after the last step, the
    train/val loss estimate from ``estimate_loss`` is printed. The elapsed
    wall-clock time in whole minutes is printed at the end.

    Note: training batches come from ``get_batch(model, batch_size)``;
    ``train_data`` and ``val_data`` are only forwarded to ``estimate_loss``.
    """

    def report(step):
        # Print the current loss estimate, labelled with the step number.
        stats = estimate_loss(model, batch_size, train_data, val_data, eval_iters)
        print(f"step {step}: train loss {stats['train']:.4f}, val loss {stats['val']:.4f}")

    began = time.time()

    for step in range(max_iters):
        if step % eval_interval == 0:
            report(step)

        # One optimisation step on a freshly sampled batch.
        xb, yb = get_batch(model, batch_size)
        _, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    report(max_iters)
    print(f"Took {(time.time() - began)//60} minutes to train")

In [None]:
class TinyConfig(Modelconfig):
    """Smallest configuration used in this notebook.

    Declared as a subclass so the overrides below do not modify the imported
    ``Modelconfig`` class, which the previous direct attribute assignments did.
    """
    block_size = 3   # context length: one full A, B, C cycle
    vocab_size = 3   # tokens A, B, C
    n_layer = 1
    n_head = 1
    n_embed = 2      # 2-D embeddings so they can be drawn in the plane


# Kept as a class rather than an instance, as the original binding was, so
# attribute lookups such as tinyconfig.block_size resolve to the values above.
tinyconfig = TinyConfig


In [None]:
# Seed before building the model so the initial weights are reproducible.
torch.manual_seed(1234)  # alternative seed that was tried: 2345
one_layer_model = AttentionOnlyModel(tinyconfig)
optimizer = torch.optim.AdamW(one_layer_model.parameters())

# Training schedule.
max_iters = 5000
eval_interval = 500  # alternative: max_iters // 100
eval_iters = 200
batch_size = 64

In [None]:
# Train the one-layer model. i_data is passed as both train_data and val_data,
# so the printed "val" loss is not measured on held-out data.
training_loop(one_layer_model, optimizer, batch_size, max_iters, eval_interval, i_data, i_data, eval_iters)

In [None]:
# Print every entry of the trained model's state dict as "<name> \t <tensor>".
for param_name, param_value in one_layer_model.state_dict().items():
    print(param_name, "\t", param_value)

In [None]:
# Pull the learned embedding tables out of the model as NumPy arrays for
# plotting: one row per token (emb_weights) and one row per position
# (pos_weights), each row of length n_embed.
emb_weights = one_layer_model.transformer.wce.weight.detach().numpy()
pos_weights = one_layer_model.transformer.wpe.weight.detach().numpy()
# Display the token embedding table as the cell output.
emb_weights

In [None]:
# Arrow tails: row 0 holds the x-coordinates and row 1 the y-coordinates of
# three arrows, all anchored at (0, 0).
origin = np.array([[0, 0, 0], [0, 0, 0]])
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 8))

# Left panel: token embeddings. Right panel: position embeddings.
panels = (
    (ax1, emb_weights, 'Token Embeddings, A ~ red, B ~ blue, C ~ green'),
    (ax2, pos_weights, 'Position Embeddings, 0 ~ red, 1 ~ blue, 2 ~ green'),
)
for panel_ax, table, panel_title in panels:
    # One arrow per table row, drawn in data coordinates at true length.
    panel_ax.quiver(*origin, table[:, 0], table[:, 1], color=['r', 'b', 'g'], angles='xy', scale_units='xy', scale=1)
    panel_ax.axis([-2, 2, -2, 2])
    panel_ax.set_aspect('equal')
    panel_ax.set_title(panel_title)

plt.show()

In [None]:
# Row i is the embedding-layer output for token i placed at position i,
# i.e. the sequence A, B, C at positions 0, 1, 2.
emb_x1 = emb_weights + pos_weights
print(emb_x1)

# Draw on a new figure of the intended size. The earlier plt.plot(figsize=...)
# call created no figure and ignored the size, and plt.clf() wiped whichever
# figure happened to be current.
fig, ax = plt.subplots(figsize=(14, 8))
ax.quiver(*origin, emb_x1[:, 0], emb_x1[:, 1], color=['r', 'b', 'g'], angles='xy', scale_units='xy', scale=1)
ax.axis([-3, 3, -3, 3])
ax.set_aspect('equal', adjustable='box')
ax.set_title('Result of Embedding Layer, A at pos 0:red, B at pos 1:blue, C at pos 2:green')

plt.show()

In [None]:
# Run the model on a batch holding the single token id 0 ('A' in c_to_i).
# No targets are passed, so loss is None.
logits, loss= one_layer_model(torch.zeros((1,1), dtype=torch.long))
print(logits.dtype)
print(logits)

In [None]:
# Repeat the embedding step of forward() by hand for the same input:
# token id 0 at position 0.
x = torch.zeros((1,1), dtype = torch.long)
wce_x = one_layer_model.transformer.wce(x)
wpe_x = one_layer_model.transformer.wpe(torch.arange(0, 1, dtype = torch.long))
# Display the dtype of the index tensor as the cell output.
x.dtype

In [None]:
# wce_x is the token-embedding lookup (labelled "Weight embedding" in the
# output) and wpe_x is the position-embedding lookup.
print("Weight embedding", "\t", wce_x)
print("Position embedding", "\t", wpe_x)

In [None]:
# Transposed weight and bias of the attention input projection c_attn.
# NOTE(review): presumably c_attn is an nn.Linear computing x @ W.T + b - confirm
# in TransformerModules.
wt = one_layer_model.transformer.h.c_attn.weight.transpose(0,1)
bs = one_layer_model.transformer.h.c_attn.bias
# Split the columns into chunks of width 2 (n_embed in tinyconfig) and
# display them as the cell output.
wt.split(2, 1)

In [None]:
# Embedding-layer output for the single token: token embedding + position embedding.
embed_x = wce_x + wpe_x
print("Embedding", "\t", embed_x, embed_x.dtype)

# Push it through the attention input projection and cut the result into
# width-2 chunks along the last dimension, named q, k and v.
transformer_x = one_layer_model.transformer.h.c_attn(embed_x)
q, k, v = transformer_x.split(2, dim=2)
print("Project x into transformer", "\t", transformer_x)
for label, chunk in zip(("Q projection:", "K projection:", "V projection:"), (q, k, v)):
    print(label, "\t", chunk)

In [None]:
# Apply the projection by hand from the extracted weight and bias, to compare
# against the c_attn(embed_x) output printed above.
embed_x @ wt + bs

In [None]:
# Batch of one sequence with token ids 0, 1, 2, i.e. 'ABC' under c_to_i.
x1 = torch.arange(0,3, dtype=torch.long).view(1,3)
wte_x1 = one_layer_model.transformer.wce(x1)
wpe_x1 = one_layer_model.transformer.wpe(torch.arange(0,3, dtype = torch.long))
print(wte_x1, wpe_x1)
# Embedding-layer output: token embedding plus position embedding, per position.
embed_x1 = wte_x1+wpe_x1
print(embed_x1)

In [None]:
# Project the three-token sequence through c_attn and cut the result into
# width-2 chunks along the last dimension, named q, k and v.
transformer_x = one_layer_model.transformer.h.c_attn(embed_x1)
q, k, v = transformer_x.split(2, dim=2)
print("Project x into transformer", "\n", transformer_x)
for label, chunk in zip(("Q projection:", "K projection:", "V projection:"), (q, k, v)):
    print(label, "\n", chunk)

In [None]:
# Raw (untransposed) weight matrix and bias vector of the c_attn projection.
print('Q, K, V weights: ', one_layer_model.transformer.h.c_attn.weight)
print('Q, K, V biases: ', one_layer_model.transformer.h.c_attn.bias)

In [None]:
# Manual projection of the three-token sequence: first without the bias,
# then with it, to compare against the c_attn(embed_x1) output.
print(embed_x1 @ wt)
print(embed_x1 @ wt + bs)

In [None]:
# Full forward pass on the sequence 0, 1, 2; the cell output is the
# (logits, loss) tuple, with loss None because no targets are given.
one_layer_model(torch.arange(0,3, dtype=torch.long).view(1,3))

In [None]:
# Weight matrix of the output head, which maps embeddings to vocabulary logits.
out_proj = one_layer_model.lm_head.weight
out_proj

In [None]:
# Reduced SVD of the output-head weight matrix.
# NOTE: np.linalg.svd returns the right singular vectors already transposed
# (Vh), so the array bound to V here is V^T: out_proj = U @ diag(S) @ V.
U, S, V = np.linalg.svd(out_proj.detach().numpy(), full_matrices=False)
print("U = \n", U)
print("Sigma = \n", np.diag(S))
print("V = \n", V)

In [None]:
# Eigen-decomposition of Sigma @ V^T, the right-hand factor of the SVD.
# V, as returned by np.linalg.svd, already holds V^T, so it must not be
# transposed again. Plain matrix products also avoid np.matrix_transpose,
# which only exists in NumPy 2.0 and later.
eigenvalues, eigenvectors = np.linalg.eig(np.matmul(np.diag(S), V))
print(eigenvalues)
print(eigenvectors)

In [None]:
# Sanity check: rebuild the output-head matrix from its SVD factors.
# V, as returned by np.linalg.svd, is already V^T, so the product is
# U @ diag(S) @ V with no further transpose.
np.matmul(U, np.matmul(np.diag(S), V))

In [None]:
plt.clf()
from mpl_toolkits.mplot3d import Axes3D

# Sample n points on the unit circle of the 2-D embedding plane.
n = 1000
theta = np.linspace(0, 2*np.pi, n, endpoint=False)
x, y = np.cos(theta), np.sin(theta)

# Default colormap; points are coloured by their angle in both panels.
cmap = plt.get_cmap(None)

# Columns of U, each scaled by its singular value.
U_scaled = U @ np.diag(S)

# Map the circle through the output-head weight matrix.
A = out_proj.detach().numpy()
XY = np.stack([x, y])
XYZ = A @ XY
X, Y, Z = XYZ

fig = plt.figure(figsize=(14, 8))

# Left: the input circle.
ax1 = fig.add_subplot(1, 2, 1)
ax1.scatter(x, y, c=theta, cmap=cmap)
ax1.axis([-2, 2, -2, 2])
ax1.set_aspect('equal')

# Right: its image in 3-D, with one arrow from the origin per column of U_scaled.
ax2 = fig.add_subplot(1, 2, 2, projection='3d')
ax2.scatter(X, Y, Z, c=theta, cmap=cmap)
tails = np.array([0, 0])
ax2.quiver(tails, tails, tails, *U_scaled)
ax2.set_xlabel('x')
ax2.set_ylabel('y')
ax2.set_zlabel('z')
ax2.set_aspect('equal')

In [None]:
# Orthogonality check on the right singular vectors: the product should be
# close to the identity matrix.
V @ V.transpose()

In [None]:
import numpy as np

In [None]:
# 90% of 75.
# NOTE(review): 75 is presumably the maximum raw midterm score, since the
# scores are divided by 75 below - confirm.
.9*75

In [None]:
# Load the raw scores, drop the first entry, and rescale from a 75-point
# maximum to percent.
scores = np.genfromtxt('Midterm_1_scores_only.csv', delimiter=',')
x = scores[1:] / 75 * 100

# Candidate curves applied to the percentage scores.
y = np.sqrt(x) * 10   # square-root curve
z = x                 # no curve; used with the alternative cutoffs
w = 0.75 * x + 25     # linear curve


def _percent_between(values, low=None, high=None):
    """Return the percentage of ``values`` with ``low <= value < high``.

    A bound left as ``None`` is not applied.
    """
    if low is None:
        selected = values[values < high]
    elif high is None:
        selected = values[values >= low]
    else:
        selected = values[np.logical_and(values < high, values >= low)]
    return 100 * (len(selected) / len(values))


# Default scheme: A >= 90, B in [75, 90), C in [50, 75), D in [30, 50).
def percent_a(x):
    return _percent_between(x, low=90)


def percent_b(x):
    return _percent_between(x, low=75, high=90)


def percent_c(x):
    return _percent_between(x, low=50, high=75)


def percent_d(x):
    return _percent_between(x, low=30, high=50)


# Alternative scheme: B in [80, 90), C in [70, 80), D in [60, 70).
def percent_b_alt(x):
    return _percent_between(x, low=80, high=90)


def percent_c_alt(x):
    return _percent_between(x, low=70, high=80)


def percent_d_alt(x):
    return _percent_between(x, low=60, high=70)


# NOTE(review): the fail bucket uses the cutoff 30, so scores in [30, 60) fall
# into no bucket of the alternative scheme - confirm this is intended.
def percent_f_alt(x):
    return _percent_between(x, high=30)


print(percent_a(x), percent_b(x))

In [None]:
plt.clf()
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(14,8))


def _hist_panel(ax, values, cutoff, title):
    """Draw a histogram of ``values`` with a green vertical line at ``cutoff``."""
    ax.hist(values, edgecolor='white')
    # The line's label is only shown if a legend is added; none is added here.
    ax.axvline(x=cutoff, color='green', label='A cutoff')
    ax.set_title(title)


# Top-left: uncurved scores, default grade cutoffs.
_hist_panel(
    axs[0,0], x, 30,
    f"A: {percent_a(x):.1f}%, B: {percent_b(x):.1f}%, C: {percent_c(x):.1f}%, D: {percent_d(x):.1f}%",
)

# Top-right: square-root curve, default grade cutoffs.
_hist_panel(
    axs[0,1], y, 55,
    f"A: {percent_a(y):.1f}%, B: {percent_b(y):.1f}% C: {percent_c(y):.1f}%, D: {percent_d(y):.1f}%",
)

# Bottom-left: uncurved scores, alternative grade cutoffs.
_hist_panel(
    axs[1,0], z, 30,
    f"A: {percent_a(z):.1f}%, B: {percent_b_alt(z):.1f}% C: {percent_c_alt(z):.1f}%, D: {percent_d_alt(z):.1f}%, Fail: {percent_f_alt(z):.1f}",
)

# Bottom-right: linear curve, default grade cutoffs.
_hist_panel(
    axs[1,1], w, 55,
    f"A: {percent_a(w):.1f}%, B: {percent_b(w):.1f}% C: {percent_c(w):.1f}%, D: {percent_d(w):.1f}%",
)
plt.show()


In [None]:
# Close the current figure.
plt.close()