In [1]:
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
from sklearn import datasets

%matplotlib inline
device = "cuda:0" if torch.cuda.is_available() else "cpu"



# prepare data

In [2]:
# download dictionary

#! curl https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt -o words_alpha.txt
# ! curl https://www.mit.edu/~ecprice/wordlist.10000 -o words_alpha.txt

# corpus_fn = 'words_alpha.txt'
corpus_fn = '/kaggle/input/eng-dictionary-data/words_alpha.txt'

with open(corpus_fn, 'r') as f:
    wordlist = f.read().split()

In [3]:
import pandas as pd

corpus_fn = '/kaggle/input/english-freq/english.txt'

df = pd.read_csv(corpus_fn, sep='\t', header=None)
df[0] = df[0].apply(lambda x: round(x / 5000) if x / 1000 > 1 else 1)
wordlist = []

for row in df.iterrows():
    wordlist.extend([row[1][1]] * row[1][0])

In [4]:
wordlist[::10000]

['you',
 'are',
 'fucking',
 'willoughby',
 'flagship',
 'ruprecht',
 'deneen',
 'misdirect',
 'atrophies',
 'damita']

In [5]:
# wordlist = [word for word in wordlist if len(word) > 3]
wordlist = [word for word in wordlist if isinstance(word, str) and len(word) > 2]

In [6]:
len(wordlist)

92550

In [7]:
import random

random.choice(wordlist)
# wordlist

'sarris'

In [8]:
sum([isinstance(word, float) for word in wordlist])

0

In [9]:
# make character list

def build_char_list(wordlist):
    charlist = set()
    for word in wordlist:
        charlist.update(word)
    charlist.add('_') #begin char
    charlist.add('^') # end character
    return sorted(charlist)

build_char_list(['abc', 'abd', 'aba'])

['^', '_', 'a', 'b', 'c', 'd']

In [10]:
charlist = build_char_list(wordlist)
input_dim = len(charlist)
input_length = max(32, len(max(wordlist, key=len)))
print('Number of unique characters: ', input_dim)
print('Max word length (32 if less): ', input_length)

Number of unique characters:  28
Max word length (32 if less):  34


In [11]:
charlist

['^',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [12]:
id2char = dict(zip(range(input_dim), charlist))
char2id = dict(zip(charlist, range(input_dim)))

In [13]:
#спиздил от шадовского курса
def to_matrix(lines, max_len=None, pad_begin=char2id['_'], pad_end=char2id['^'], dtype=np.int64):
    """Casts a list of lines into torch-digestable matrix"""
    max_len = (max_len or max(map(len, lines))) + 2
    lines_ix = np.full([len(lines), max_len], pad_end, dtype=dtype)
    lines_ix[:, 0] = pad_begin
    for i in range(len(lines)):
        line_ix = list(map(char2id.get, lines[i][:max_len]))
        lines_ix[i, 1:len(line_ix)+1] = line_ix
    return lines_ix

In [14]:
to_matrix(wordlist[:5])

array([[ 1, 26, 16, 22,  0],
       [ 1, 26, 16, 22,  0],
       [ 1, 26, 16, 22,  0],
       [ 1, 26, 16, 22,  0],
       [ 1, 26, 16, 22,  0]])

In [15]:
import torch.nn as nn
import torch
import torch.nn.functional as F

def compute_mask(input_ix, eos_ix=char2id['^']):
    """ compute a boolean mask that equals "1" until first EOS (including that EOS) """
    return F.pad(torch.cumsum(input_ix == eos_ix, dim=-1)[..., :-1] < 1, pad=(1, 0, 0, 0), value=True)

compute_mask(torch.tensor(to_matrix(wordlist[:5]))).to(torch.int32)

tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]], dtype=torch.int32)

[source](https://machinelearningmastery.com/text-generation-with-lstm-in-pytorch/)

## prepare model

In [62]:
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
 
class CharModel(nn.Module):
    def __init__(self, n_vocab=len(charlist)):
        super().__init__()
        self.embed = nn.Embedding(len(charlist), 64)
        self.lstm = nn.GRU(input_size=64, hidden_size=256, num_layers=5, batch_first=True)
        self.linear = nn.Linear(256, n_vocab)
    def forward(self, x):
        x = self.embed(x)
        x, _ = self.lstm(x)
        x = self.linear(x)
        return x
    def get_next_tokens(self, prefix, temperature=1.0, max_len=256):
        prefix_ix = torch.as_tensor(to_matrix([prefix]), dtype=torch.int64).to('cuda')
        with torch.no_grad():
            probs = torch.softmax(self(prefix_ix)[0, -1], dim=-1).cpu().numpy()  # shape: [n_tokens]
        return dict(zip(charlist, probs))

In [67]:
model.eval()
model.to('cuda')
with torch.no_grad():
    probs = torch.softmax(model(torch.zeros((1, 1)).long().cuda())[0, -1], dim=-1).cpu().numpy()
probs.round(2)

array([0.03, 0.04, 0.04, 0.04, 0.04, 0.03, 0.04, 0.03, 0.04, 0.03, 0.04,
       0.04, 0.04, 0.03, 0.04, 0.04, 0.04, 0.04, 0.04, 0.03, 0.04, 0.03,
       0.04, 0.03, 0.03, 0.04, 0.04, 0.04], dtype=float32)

In [64]:
model = CharModel()

In [57]:
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

train, test = train_test_split(wordlist)

In [69]:
class Words(Dataset):
    def __init__(self, wordlist):
        self.wordlist = wordlist
        
    def __getitem__(self, idx):
        return self.wordlist[idx]
    
    def __len__(self):
        return len(self.wordlist)
    
def custom_collate(words):
    return torch.tensor(to_matrix(words))

train_dataset = Words(train)
train_loader = DataLoader(dataset=train_dataset, batch_size=256, collate_fn=custom_collate, shuffle=True, num_workers=2)

test_dataset = Words(test)
test_loader = DataLoader(dataset=test_dataset, batch_size=256, collate_fn=custom_collate, shuffle=True, num_workers=2)

In [70]:
def loss(logits, answers, mask):
    loss = nn.CrossEntropyLoss(reduction='none')(logits, answers) * mask
    return loss[loss != 0.0].mean()  

In [72]:
from tqdm import tqdm, trange
from IPython.display import clear_output

device = 'cuda'
model = model.to(device)
model.train()
optim = torch.optim.Adam(model.parameters(), lr=0.0001)

th, vh = [], []
for epoch in trange(30):
    train_history = []
    val_history = []
    for i, x in enumerate(train_loader):
        x = x.to(device)
        logits = model(x[:, :-1]).permute(0, 2, 1)
        answers = x[:, 1:]
        mask = compute_mask(x[:, 1:])
        l_t = loss(logits, answers, mask)
        train_history.append(l_t.detach().cpu().numpy())
        optim.zero_grad()
        l_t.backward()
        optim.step()
    th.append((epoch, np.mean(train_history)))
    with torch.no_grad():
        for i, x in enumerate(test_loader):
            x = x.to(device)
            logits = model(x[:, :-1]).permute(0, 2, 1)
            answers = x[:, 1:]
            mask = compute_mask(x)[:, 1:]
            l_v = loss(logits, answers, mask)
            val_history.append(l_v.detach().cpu().numpy())
    vh.append((epoch, np.mean(val_history)))
    print(f'train {epoch} {np.mean(train_history)}')
    print(f'val {epoch} {np.mean(val_history)}')
        
    
        
    

  3%|▎         | 1/30 [00:04<02:19,  4.82s/it]

train 0 2.8548645973205566
val 0 2.646972179412842


  7%|▋         | 2/30 [00:09<02:14,  4.81s/it]

train 1 2.5361733436584473
val 1 2.4694573879241943


 10%|█         | 3/30 [00:14<02:09,  4.80s/it]

train 2 2.439051866531372
val 2 2.411015033721924


 13%|█▎        | 4/30 [00:19<02:05,  4.81s/it]

train 3 2.387172222137451
val 3 2.363158702850342


 17%|█▋        | 5/30 [00:24<02:00,  4.80s/it]

train 4 2.3398258686065674
val 4 2.3192696571350098


 20%|██        | 6/30 [00:28<01:55,  4.82s/it]

train 5 2.2961318492889404
val 5 2.279724359512329


 23%|██▎       | 7/30 [00:33<01:50,  4.82s/it]

train 6 2.259925365447998
val 6 2.2498133182525635


 27%|██▋       | 8/30 [00:38<01:46,  4.82s/it]

train 7 2.2314658164978027
val 7 2.2242565155029297


 30%|███       | 9/30 [00:43<01:41,  4.81s/it]

train 8 2.207279682159424
val 8 2.2055089473724365


 33%|███▎      | 10/30 [00:48<01:36,  4.82s/it]

train 9 2.1864280700683594
val 9 2.187302827835083


 37%|███▋      | 11/30 [00:53<01:31,  4.84s/it]

train 10 2.167553663253784
val 10 2.173428535461426


 40%|████      | 12/30 [00:57<01:27,  4.85s/it]

train 11 2.1514484882354736
val 11 2.1566061973571777


 43%|████▎     | 13/30 [01:02<01:22,  4.84s/it]

train 12 2.135450839996338
val 12 2.145413398742676


 47%|████▋     | 14/30 [01:07<01:17,  4.82s/it]

train 13 2.12101411819458
val 13 2.1328845024108887


 50%|█████     | 15/30 [01:12<01:12,  4.82s/it]

train 14 2.1075592041015625
val 14 2.1193671226501465


 53%|█████▎    | 16/30 [01:17<01:07,  4.80s/it]

train 15 2.095742702484131
val 15 2.1114795207977295


 57%|█████▋    | 17/30 [01:21<01:02,  4.81s/it]

train 16 2.0839576721191406
val 16 2.101125478744507


 60%|██████    | 18/30 [01:26<00:57,  4.81s/it]

train 17 2.072908878326416
val 17 2.092506170272827


 63%|██████▎   | 19/30 [01:31<00:53,  4.82s/it]

train 18 2.0621588230133057
val 18 2.082587718963623


 67%|██████▋   | 20/30 [01:36<00:48,  4.81s/it]

train 19 2.0526044368743896
val 19 2.076439380645752


 70%|███████   | 21/30 [01:41<00:43,  4.82s/it]

train 20 2.0432965755462646
val 20 2.06821608543396


 73%|███████▎  | 22/30 [01:45<00:38,  4.81s/it]

train 21 2.034022092819214
val 21 2.060514450073242


 77%|███████▋  | 23/30 [01:50<00:33,  4.82s/it]

train 22 2.0252137184143066
val 22 2.0540268421173096


 80%|████████  | 24/30 [01:55<00:28,  4.82s/it]

train 23 2.0169296264648438
val 23 2.047433376312256


 83%|████████▎ | 25/30 [02:00<00:24,  4.83s/it]

train 24 2.00905704498291
val 24 2.042158842086792


 87%|████████▋ | 26/30 [02:05<00:19,  4.82s/it]

train 25 2.0012357234954834
val 25 2.0361275672912598


 90%|█████████ | 27/30 [02:10<00:14,  4.82s/it]

train 26 1.9937429428100586
val 26 2.032172203063965


 93%|█████████▎| 28/30 [02:14<00:09,  4.81s/it]

train 27 1.9870915412902832
val 27 2.0264222621917725


 97%|█████████▋| 29/30 [02:19<00:04,  4.81s/it]

train 28 1.9795420169830322
val 28 2.0220792293548584


100%|██████████| 30/30 [02:24<00:00,  4.82s/it]

train 29 1.9728119373321533
val 29 2.0162577629089355





In [73]:
def generate(model, prefix='_', temperature=1.0, max_len=256):
    with torch.no_grad():
        while True:
            token_probs = model.get_next_tokens(prefix)
            tokens, probs = zip(*token_probs.items())
            if temperature == 0:
                next_token = tokens[np.argmax(probs)]
            else:
                probs = np.array([p ** (1. / temperature) for p in probs])
                probs /= sum(probs)
                next_token = np.random.choice(tokens, p=probs)

            prefix += next_token
            if next_token == '^' or len(prefix) > max_len: break
    return prefix

In [86]:
prefix = '_gr'
for _ in range(10):
    print(generate(model, prefix, temperature=1.0))

_gr^
_gri^
_gras^
_gr^
_grad^
_grifia^
_grg^
_gre^
_gr^
_grik^


In [88]:
test_words = ['tkemali', 'train', 'rtain', 'aenocoeia', ]

In [98]:
from tqdm import tqdm, trange
from IPython.display import clear_output

device = 'cuda'
model = model.to(device)
model.eval()

vh = []
for word in test_words:
    val_history = []
    with torch.no_grad():
#         x = x.to(device)
        word_t = torch.tensor(to_matrix([word])).to(device)
        logits = model(word_t[:, :-1]).permute(0, 2, 1)
        answers = word_t[:, 1:]
        mask = compute_mask(word_t)[:, 1:]
        l_v = loss(logits, answers, mask)
        perp = torch.exp(l_v)
        perp = perp.detach().cpu().numpy()
        val_history.append(perp)
        print(f'perplexity on {word} {perp}')

perplexity on tkemali 23.93702507019043
perplexity on train 5.92509651184082
perplexity on rtain 30.396867752075195
perplexity on aenocoeia 47.84872817993164


In [95]:
?np.round

[0;31mSignature:[0m [0mnp[0m[0;34m.[0m[0mround[0m[0;34m([0m[0ma[0m[0;34m,[0m [0mdecimals[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0mout[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Round an array to the given number of decimals.

See Also
--------
around : equivalent function; see for details.
[0;31mFile:[0m      /opt/conda/lib/python3.10/site-packages/numpy/core/fromnumeric.py
[0;31mType:[0m      function

In [None]:
# exp_params = {
#     'batch_size': [8, 32],
#     'lr': [1e-5, 1e-3],
#     'n_conv_filters': [32, 128],
#     'n_fc_neurons': [32, 128],
#     'ncritic': [-5, 1, 5]
# }

# n_runs = 1
# for val in exp_params.values():
#     n_runs *= len(val)
# print(f'number of runs: {n_runs}')

In [None]:
# def run_experiment(exp_params):
#     trainer = Trainer(
#         models=gan_network,
#         losses_list=[LeastSquaresDiscriminatorLoss(), LeastSquaresGeneratorLoss()],
#         ncritic=exp_params['ncritic'], 
#         epochs=30, 
#         sample_size=1, 
#         checkpoints='./model/gan', 
#         retain_checkpoints=5, 
#         log_dir='./logs/', 
#         test_noise=None, 
#         nrow=8,
#         device='cuda'
#     )
    
#     run_name = '_'.join([f'{param}: {value}' for param, value in exp_params.items()])
    
#     PROJECT = 'nonce-cnn'
#     wandb.init(project=PROJECT, name=run_name, resume=True)
#     trainer.train(trainloader_chars)
#     wandb.finish()
#     print(run_name)
#     print(*decode(trainer.generator(prior_chars.sample((10,))).detach().cpu().numpy()), sep='\n')
#     print('====================================\n')