In [1]:
import pandas as pd

In [28]:
df = pd.read_json("ascii_art_data.json")


In [3]:
print(df['text'][1])

       _.---._    /\\
    ./'       "--`\//
  ./              o \          .-----.
 /./\  )______   \__ \        ( help! )
./  / /\ \   | \ \  \ \       /`-----'
   / /  \ \  | |\ \  \7--- ooo ooo ooo ooo ooo ooo


In [29]:
# prompt: create a variable called ASCII_TEXT which is a list of all the df['text'] items where the width is less than 32 AND the height is less than 32. You might have to first calculate the width and height for each item before filtering since they're just strings. Width in this case is the width of the longest line, not the average width of the ascii image

import pandas as pd

df = pd.read_json("ascii_art_data.json")

df['width'] = df['text'].apply(lambda x: max(len(line) for line in x.splitlines()))
df['height'] = df['text'].apply(lambda x: len(x.splitlines()))

ASCII_TEXT = df[ (df['width'] < 32) & (df['height'] < 32) ]['text'].tolist()
len(ASCII_TEXT)


2108

In [30]:
import torch
from transformers import AutoTokenizer

class ASCIITokenizer:
    def __init__(self):
        self.vocab = [chr(i) for i in range(32, 127)] + ['<PAD>']
        self.vocab_size = len(self.vocab)
        self.token_to_char = {i: c for i, c in enumerate(self.vocab)}
        self.char_to_token = {c: i for i, c in enumerate(self.vocab)}

    def tokenize(self, text):
        tokens = [self.char_to_token[c] for c in text if c in self.vocab]
        return tokens

tokenizer = ASCIITokenizer()

tokenized_data = []
for ascii_art in ASCII_TEXT:
    tokens = tokenizer.tokenize(ascii_art)
    tokenized_data.append(tokens)

print(f'Num Unique Tokens: {len(tokenizer.token_to_char)}')

Num Unique Tokens: 96


In [50]:
tokenizer.char_to_token['\\']

60

One thing to note is that the tokenization here tokenizes each character individually, meaning that `\` and `n` are individual tokens, I think it'll be fine and the model will learn that they should be combined for a new line but I don't know if we have enough data and time for it to properly learn that.

I think the padding I'm doing underneath is definitely an area we gotta do some more experimentation on. Right now it's just flattening the art into a long string and padding it at the end but it might be better to add padding in between to center the image. Not sure.

In [31]:
def pad_ascii_art(ascii_art, max_width=32, max_height=32, pad_char='<PAD>'):
    lines = ascii_art.splitlines()
    padded_lines = [line.ljust(max_width, pad_char) for line in lines]
    while len(padded_lines) < max_height:
        padded_lines.append(pad_char * max_width)
    return '\n'.join(padded_lines)

padded_data = [pad_ascii_art(art) for art in ASCII_TEXT]

# Tokenize the padded data
tokenized_data = [tokenizer.tokenize(art) for art in padded_data]

# Convert to tensor and pad to 32x32 (1024 tokens)
max_length = 32 * 32
tokenized_data = [tokens + [0] * (max_length - len(tokens)) for tokens in tokenized_data]
tokenized_data = torch.tensor(tokenized_data)

TypeError: The fill character must be exactly one character long

In [7]:
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(tokenized_data)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

## Modeling

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ASCIITransformer(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, hidden_dim, output_dim):
        super(ASCIITransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads, dim_feedforward=hidden_dim)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=6)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(1, 0, 2)  # batch_size, seq_len, embedding_dim
        x = self.transformer_encoder(x)
        x = x.permute(1, 0, 2)  # batch_size, seq_len, embedding_dim
        x = self.fc(x)
        return x

model = ASCIITransformer(vocab_size=len(tokenizer.token_to_char),
                          embedding_dim=128,
                          num_heads=4,
                          hidden_dim=256,
                          output_dim=len(tokenizer.token_to_char)
        ).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    for batch in dataloader:
        input_ids = batch[0].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs.view(-1, len(tokenizer.token_to_char)), input_ids.view(-1))
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')



Epoch 1, Loss: 0.03230954334139824
Epoch 2, Loss: 0.023365095257759094
Epoch 3, Loss: 0.008311551995575428
Epoch 4, Loss: 0.005041135940700769
Epoch 5, Loss: 0.002155726309865713
Epoch 6, Loss: 0.0019831599202007055
Epoch 7, Loss: 0.0011665376368910074
Epoch 8, Loss: 0.001104850904084742
Epoch 9, Loss: 0.0013060752535238862
Epoch 10, Loss: 0.0007990804733708501


In [27]:
def generate_ascii_art(model, tokenizer, device, max_length=1024, temperature=1.0):
    input_ids = torch.tensor([[tokenizer.char_to_token[' ']]]).to(device)  # start with a space character
    generated_art = []

    for i in range(max_length):
        outputs = model(input_ids)
        next_token_logits = outputs[:, -1, :]
        next_token_probs = F.softmax(next_token_logits / temperature, dim=-1)
        next_token = torch.multinomial(next_token_probs, num_samples=1)
        generated_art.append(tokenizer.token_to_char[next_token.item()])
        input_ids = torch.cat((input_ids, next_token), dim=1)

    generated_art = ''.join(generated_art)
    return generated_art
    generated_art = generated_art.replace(' ', '')  # remove spaces
    generated_art = generated_art.split('\n')  # split into lines
    max_width = max(len(line) for line in generated_art)
    generated_art = [line.ljust(max_width) for line in generated_art]
    generated_art = '\n'.join(generated_art)
    return generated_art

generated_art = generate_ascii_art(model, tokenizer, device)
print(len(generated_art))
# check if all generated_art tokens are ' '
if generated_art.count(' ') == len(generated_art):
    print("All generated_art tokens are ' '")
else:
    print(generated_art)

1024
All generated_art tokens are ' '
