In [1]:
import sys
import ast
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch import einsum
from torch.utils.data import DataLoader, Dataset

from sentence_transformers import SentenceTransformer

from ascii_utils import ASCIITransformer, ASCIITokenizer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Resolve the compute device once for the whole notebook.
# Probes are evaluated lazily in priority order: Apple Metal (MPS), then
# CUDA, and finally the always-available CPU fallback.
_DEVICE_PROBES = (
    ("mps", torch.backends.mps.is_available, "Using apple metal GPU"),
    ("cuda", torch.cuda.is_available, "Using nvidia GPU"),
    ("cpu", lambda: True, "GPU not available, using CPU"),
)
for _backend, _is_usable, _banner in _DEVICE_PROBES:
    if _is_usable():
        device = torch.device(_backend)
        print(_banner)
        break

Using apple metal GPU


In [3]:
# class ASCIITokenizer:
#     def __init__(self):
#         self.vocab = [chr(i) for i in range(32, 127)] + ['']
#         self.vocab_size = len(self.vocab)
#         self.token_to_char = {i: c for i, c in enumerate(self.vocab)}
#         self.char_to_token = {c: i for i, c in enumerate(self.vocab)}

#     def tokenize(self, text):
#         tokens = [self.char_to_token[c] for c in text if c in self.vocab]
#         return tokens

In [4]:
# def pad_ascii_art(ascii_art, max_width=32, max_height=32, pad_char=' '):
#     lines = ascii_art.splitlines()
#     padded_lines = [line.ljust(max_width, pad_char) for line in lines]
#     while len(padded_lines) < max_height:
#         padded_lines.append(pad_char * max_width)
#     return '\n'.join(padded_lines)

# df = pd.read_json("./ascii_data/ascii_art_data.json")
# df['width'] = df['text'].apply(lambda x: max(len(line) for line in x.splitlines()))
# df['height'] = df['text'].apply(lambda x: len(x.splitlines()))
# df['text'] = df['text'].apply(pad_ascii_art)

# df = df[ (df['width'] < 32) & (df['height'] < 32) ][['topic', 'text']]

# sentence_transformer_model="sentence-transformers/all-MiniLM-L6-v2"
# embedding_model = SentenceTransformer(sentence_transformer_model).to(device)
# def encode(x):
#     return embedding_model.encode(x)
# embeddings = df['text'].apply(encode)
# df['embedding'] = [repr(list(e)) for e in embeddings]
# df.to_csv('ascii_embeddings.csv', index=False)
# df[:5]

In [5]:
# Load the cached table and parse the 'embedding' column, which is stored as
# Python-literal text, back into real Python objects.
df = pd.read_csv('ascii_embeddings.csv').assign(
    embedding=lambda frame: frame['embedding'].map(ast.literal_eval)
)
# Sanity check: report the Python type of the first parsed embedding.
print(type(df['embedding'][0]))
df.head(3)

<class 'list'>


Unnamed: 0,topic,text,embedding
0,aardvarks,_.---._ /\\ \n ./' ...,"[-0.07842264, 0.07851021, -0.029405361, -0.022..."
1,bats,"_ ,_, _ \n / `'=...","[-0.074843615, 0.031718493, 0.11798973, 0.0175..."
2,bats,"(_ ,_, _) \n / `'-...","[-0.09386347, 0.02620181, 0.065080315, -0.0151..."


In [6]:
class ASCIIDataLoader(Dataset):
    """Map-style dataset pairing each ASCII-art sample with its embedding.

    Despite the name, this is a ``torch.utils.data.Dataset``, not a
    ``DataLoader``; wrap it in a ``DataLoader`` to obtain batches.

    Args:
        df: DataFrame with ``'topic'``, ``'text'`` and ``'embedding'``
            columns. Every ``'embedding'`` entry must be a sequence of
            numbers, and all entries must have the same length.
        tokenizer: Object exposing ``tokenize(text)`` that returns a list of
            integer token ids.
        max_length: Fixed number of tokens each sample is padded to.
        pad_token: Token id appended to samples shorter than ``max_length``.

    Raises:
        ValueError: If a tokenized sample is longer than ``max_length``.
    """

    def __init__(self, df, tokenizer, max_length=32 * 32, pad_token=0):
        # Drop the original index so samples are addressed purely by position,
        # even if `df` was filtered and no longer has a 0..n-1 index.
        self.labels = df['topic'].reset_index(drop=True)

        padded_rows = []
        for position, art in enumerate(df['text']):
            tokens = list(tokenizer.tokenize(art))
            if len(tokens) > max_length:
                # Fail early with a clear message instead of letting
                # torch.tensor choke on ragged rows.
                raise ValueError(
                    f"sample {position} has {len(tokens)} tokens, "
                    f"more than max_length={max_length}"
                )
            padded_rows.append(tokens + [pad_token] * (max_length - len(tokens)))
        self.asciis = torch.tensor(padded_rows, dtype=torch.long)

        # Convert to a plain nested list first: the column is an object-dtype
        # Series of Python lists, which torch.tensor should not be handed directly.
        self.embedding = torch.tensor(df['embedding'].tolist(), dtype=torch.float32)

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(embedding, topic label, padded token ids)`` for sample ``idx``."""
        # Index the embedding per sample; returning the whole tensor would hand
        # every item the full embedding matrix.
        return self.embedding[idx], self.labels.iloc[idx], self.asciis[idx]

# Wrap the loaded frame in the dataset and serve it in shuffled mini-batches of five.
dataset = ASCIIDataLoader(df=df, tokenizer=ASCIITokenizer())
loader = DataLoader(dataset=dataset, shuffle=True, batch_size=5)

In [7]:
# Build the model only when running as the main program, then move it onto
# the selected compute device.
if __name__ == "__main__":
    model = ASCIITransformer()
    model = model.to(device)

In [8]:
# Optimiser over all model parameters, and the loss used by the training loop.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [9]:
# Keep `loader` bound to the re-iterable DataLoader so that every epoch of the
# training loop starts a fresh pass over the data. Rebinding `loader` to
# `iter(loader)` would leave a one-shot iterator that is exhausted after the
# first epoch. Use a separately named iterator for manual batch inspection.
batch_iterator = iter(loader)

: 

In [10]:
# Train for a fixed number of epochs and report the mean batch loss per epoch.
for epoch in range(10):
    epoch_loss_total = 0.0
    batches_seen = 0
    for embedding, _, ascii_text in loader:
        ascii_text = ascii_text.to(device)
        embedding = embedding.to(device)
        optimizer.zero_grad()
        outputs = model(embedding)
        # NOTE(review): logits and targets are both flattened to 1-D and the
        # targets cast to float, which nn.CrossEntropyLoss treats as class
        # probabilities for a single sample. If the model emits per-character
        # logits, this should probably be
        # `criterion(outputs.view(-1, vocab_size), ascii_text.view(-1))` with
        # integer targets — confirm against ASCIITransformer's output shape.
        loss = criterion(outputs.view(-1), ascii_text.view(-1).float())
        loss.backward()
        optimizer.step()
        epoch_loss_total += loss.item()
        batches_seen += 1
    if batches_seen == 0:
        # `loader` yielded nothing (e.g. it is empty or an already-exhausted
        # iterator); avoid reporting an undefined or stale loss.
        print(f'Epoch {epoch+1}, no batches were produced by the loader')
        continue
    print(f'Epoch {epoch+1}, Loss: {epoch_loss_total / batches_seen}')