# Наумкин Владимир, С01-119.

## Задача 3. Модель автокодировщика.

### Подключим библиотеки

In [1]:
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset
from torch.utils.tensorboard import SummaryWriter
from prettytable import PrettyTable
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split

### Уберём предупреждения

In [2]:
import warnings
warnings.filterwarnings("ignore")

### Зададим устройство исполнения кода (вычисления провожу на своём ПК)

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

### Функции для работы с датасетом

In [4]:
class Tokenizer:
    def __init__(self, word_to_ind, tokenizer):
        self.word_to_ind = word_to_ind
        self.tokenizer = tokenizer
    def __call__(self, sentences, max_len = 10, strict_padding_to_max_len = True):
        tokens = self.tokenizer.tokenize_sents(sentences)
        if strict_padding_to_max_len == False: # Пытаемся уменьшить max_len 
            max_len = min(max_len, max(map(len, tokens))) # до длины самого длинного предложения.
        def process_sentence(s, ma_l):
            if len(s) < ma_l:
                return ['[CLS]'] + s + ['[SEP]'] + ['[PAD]'] * (ma_l - len(s))
            else:
                return ['[CLS]'] + s[: ma_l] + ['[SEP]']
        tokens = [process_sentence(sent, max_len) for sent in tokens]
        idxs = [[self.word_to_ind.get(word, self.word_to_ind['[UNK]']) for word in sent] for sent in tokens]
        return torch.tensor(idxs)

In [5]:
def word_dict(dataset, min_count = 1): # Не включаем в словарь редкие слова
    temp = {}
    for sent in tqdm(dataset.values[:, 1]):
        for word in RegexpTokenizer('[a-zA-Z]+|[^\w\s]|\d+').tokenize(sent):
            # слова или знаки препинания (не буквы и цифры, пробелы и табы) или цифры
            if word in temp:
                temp[word] += 1
            else:
                temp[word] = 1
    word2idx = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3}
    idx2word = {0: '[PAD]', 1: '[UNK]', 2: '[CLS]', 3: '[SEP]'}
    for elem, number in temp.items():
        if number >= min_count and elem not in word2idx:
            word2idx[elem] = len(word2idx)
            idx2word[len(idx2word)] = elem
    return word2idx, idx2word

### Функция проверки качества модели

In [6]:
def check_model(batch_size, dataset, model, loss_function, idx2word):
    model.eval()
    batch_generator = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size)
    test_loss = 0
    for it, (x_batch, y_batch) in enumerate(batch_generator):
        x_batch = x_batch.to(model.device)
        y_batch = y_batch.to(model.device)
        with torch.no_grad():
            output = model(x_batch)
        test_loss += loss_function(output.transpose(1,2), y_batch).cpu().item()*len(x_batch)
    test_loss /= len(dataset)
    print(f'loss: {test_loss}')
    dataloader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)
    x, y = next(iter(dataloader))
    x = x.to(device)
    y = y.to(device)
    with torch.no_grad():
        outputs = model(x)
    one_x = x[0].cpu().numpy()
    one_output = outputs[0].argmax(dim=-1).cpu().numpy()
    words = [idx2word[idx] for idx in one_x]
    pred_words = [idx2word[idx] for idx in one_output]
    table = PrettyTable(["Word", "Predict"])
    table.align["Word"], table.align["Predict"] = "l", "l"
    for word, pred in zip(words, pred_words):
        if word != idx2word[word2idx['[PAD]']]:
            table.add_row([word, pred])
    print(table)
    return test_loss

### Код для обучения модели

In [7]:
def train_on_batch(model, x_batch, y_batch, optimizer, loss_function):
    model.train()
    optimizer.zero_grad()
    output = model(x_batch.to(model.device))
    loss = loss_function(output.transpose(1,2), y_batch.to(device))
    loss.backward()
    optimizer.step()
    return loss.cpu().item()

In [8]:
def train_epoch(train_generator, model, loss_function, optimizer, callback = None):
    epoch_loss = 0
    total = 0
    for it, (batch_of_x, batch_of_y) in enumerate(train_generator):
        batch_loss = train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function)
        if callback is not None:
            with torch.no_grad():
                callback(model, batch_loss)
        epoch_loss += batch_loss*len(batch_of_x)
        total += len(batch_of_x)
    return epoch_loss/total

In [9]:
def trainer(count_of_epoch, 
            batch_size, 
            dataset,
            model, 
            loss_function,
            optimizer,
            lr = 0.001,
            callback = None):
    optima = optimizer(model.parameters(), lr=lr)
    iterations = tqdm(range(count_of_epoch), desc='epoch')
    iterations.set_postfix({'train epoch loss': np.nan})
    for it in iterations:
        batch_generator = tqdm(
            torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True), 
            leave=False, total=len(dataset)//batch_size+(len(dataset)%batch_size> 0))
        epoch_loss = train_epoch(train_generator=batch_generator, 
                    model=model, 
                    loss_function=loss_function, 
                    optimizer=optima, 
                    callback=callback)
        iterations.set_postfix({'train epoch loss': epoch_loss})

### Отслеживание обучения

In [10]:
class callback():
    def __init__(self, writer, dataset, loss_function, delimeter = 300, batch_size=64):
        self.step = 0
        self.writer = writer
        self.delimeter = delimeter
        self.loss_function = loss_function
        self.batch_size = batch_size
        self.dataset = dataset
    def forward(self, model, loss):
        model.eval()
        self.step += 1
        self.writer.add_scalar('LOSS/train', loss, self.step)
        if self.step % self.delimeter == 0:
            model.eval()
            batch_generator = torch.utils.data.DataLoader(dataset=self.dataset, batch_size=self.batch_size)
            test_loss = 0
            for it, (x_batch, y_batch) in enumerate(batch_generator):
                x_batch = x_batch.to(model.device)
                y_batch = y_batch.to(model.device)
                output = model(x_batch)
                test_loss += self.loss_function(output.transpose(1,2), y_batch).cpu().item()*len(x_batch)
            test_loss /= len(self.dataset)
            print(f'\t\tstep={self.step}, train_loss={loss}, val_loss={test_loss}')
            self.writer.add_scalar('LOSS/test', test_loss, self.step)
    def __call__(self, model, loss):
        return self.forward(model, loss)

## Модель автокодировщика

In [11]:
class Encoder(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device
    def __init__(self, vocab_dim, emb_dim, latent_dim, num_layers = 3, dropout = 0, batch_norm = False):
        super(type(self), self).__init__()
        self.emb = torch.nn.Embedding(vocab_dim, emb_dim)
        self.lstm = torch.nn.LSTM(emb_dim, latent_dim, num_layers, dropout = dropout, batch_first = True)
        if batch_norm:
            self.batch_norm = torch.nn.BatchNorm1d(latent_dim)
        else:
            self.batch_norm = None
    def forward(self, x):
        out = self.emb(x)
        _, (h, c) = self.lstm(out)
        if self.batch_norm is not None:
            out = self.batch_norm(out.transpose(1,2)).transpose(1,2)
        out = torch.cat([h, c], dim=-1).transpose(0, 1)[:, -1, :] # cat => 2 * latent_dim
        return out

In [26]:
class Decoder(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device
    def __init__(self, vocab_dim, latent_dim, emb_dim, hidden_dim, num_layers = 3, dropout = 0, batch_norm = False):
        super(type(self), self).__init__()
        self.num_layers = num_layers
        self.h0 = torch.nn.Linear(latent_dim, hidden_dim)
        self.c0 = torch.nn.Linear(latent_dim, hidden_dim)
        self.emb = torch.nn.Embedding(1, emb_dim)
        self.lstm = torch.nn.LSTM(emb_dim, hidden_dim, num_layers, dropout = dropout, batch_first = True)
        if batch_norm:
            self.batch_norm = torch.nn.BatchNorm1d(emb_dim)
        else:
            self.batch_norm = None
        self.linear = torch.nn.Linear(hidden_dim, vocab_dim)
    def forward(self, latent_vector):
        h = self.h0(latent_vector).unsqueeze(0).repeat(self.num_layers, 1, 1)
        c = self.c0(latent_vector).unsqueeze(0).repeat(self.num_layers, 1, 1)
        emb = self.emb(torch.zeros(len(latent_vector), 1).long())
        logits = []
        for i in range(12): # в Tokenizer по умолчанию max_len = 10, но ещё + 2 токена начала и конца
            out, (h, c) = self.lstm(emb, (h, c))
            if self.batch_norm is not None:
                out = self.batch_norm(out.transpose(1,2)).transpose(1,2)
            logits.append(out[:,-1,:])
        out = torch.stack(logits, 1)
        out = self.linear(out)
        return out

In [27]:
class Autoencoder(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device
    def __init__(self, vocab_dim, emb_dim, latent_dim, hidden_dim, num_layers = 3, dropout = 0, batch_norm = False):
        super(type(self), self).__init__()
        self.encoder = Encoder(vocab_dim, emb_dim, latent_dim, num_layers, dropout, batch_norm)
        self.decoder = Decoder(vocab_dim, 2 * latent_dim, emb_dim, hidden_dim, num_layers, dropout, batch_norm)
        # 2 * latent_dim - смотри forward в Encoder
    def forward(self, x):
        return self.decoder(self.encoder(x))

### Загрузка датасета

In [14]:
dataset = pd.read_csv('twitter.csv')

In [15]:
dataset = dataset[dataset[['tag', 'message']].notnull().all(1)]
dataset = dataset.sample(100000, random_state = 777)
dataset_train, dataset_test = train_test_split(dataset, test_size = 0.2, random_state = 777)

In [16]:
word2idx, idx2word = word_dict(dataset_train)

  0%|          | 0/80000 [00:00<?, ?it/s]

In [17]:
tokenizer = Tokenizer(word2idx, RegexpTokenizer('[a-zA-Z]+|[^\w\s]|\d+'))

In [18]:
train_data_sent = tokenizer(dataset_train.values[:, 1])
test_data_sent = tokenizer(dataset_test.values[:, 1])

In [19]:
# переводим в формат PyTorch данные, причём т.к. у нас автокодировщик, то выход в идеале совпадает со входом
dataset_train_pt = TensorDataset(train_data_sent, train_data_sent)
dataset_test_pt = TensorDataset(test_data_sent, test_data_sent)

### Обучение модели

In [20]:
loss_function = torch.nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam

Перебираемые параметры модели:

In [21]:
dim_params = [10, 25, 50]
num_layers_params = [3, 5, 7]
dropout_params = [0, 0.25, 0.5]
batch_norm_params = [False, True]
min_count_params = [1, 3, 5]

Разный размер слоя:

In [22]:
for dim in dim_params:
    print(f'dim = {dim}')
    model = Autoencoder(vocab_dim=len(word2idx), emb_dim=dim, latent_dim=dim, hidden_dim=dim)
    model.to(device)
    writer = SummaryWriter(log_dir=f'tensorboard3/dim_{dim}')
    call = callback(writer, dataset_test_pt, loss_function)
    check_model(64, dataset_test_pt, model, loss_function, idx2word)
    trainer(count_of_epoch=1,
            batch_size=64,
            dataset=dataset_train_pt,
            model=model,
            loss_function=loss_function,
            optimizer = optimizer,
            lr=0.001,
            callback = call)
    check_model(64, dataset_test_pt, model, loss_function, idx2word)

dim = 10
loss: 11.426295571899415
+-----------+---------------+
| Word      | Predict       |
+-----------+---------------+
| [CLS]     | sorryy        |
| Is        | Huggg         |
| listening | JackieKessler |
| to        | JackieKessler |
| echelon   | JulietWeybret |
| .         | JulietWeybret |
| It        | JulietWeybret |
| '         | JulietWeybret |
| s         | JulietWeybret |
| been      | JulietWeybret |
| my        | JulietWeybret |
| [SEP]     | JulietWeybret |
+-----------+---------------+


epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

		step=300, train_loss=7.401067733764648, val_loss=7.243672047424316
		step=600, train_loss=6.495371341705322, val_loss=6.445184295654297
		step=900, train_loss=6.391746997833252, val_loss=6.364573110198974
		step=1200, train_loss=6.356290340423584, val_loss=6.3169028869628905
loss: 6.302846558380127
+------------+---------+
| Word       | Predict |
+------------+---------+
| [CLS]      | [CLS]   |
| I          | [SEP]   |
| think      | [SEP]   |
| I          | [SEP]   |
| '          | [SEP]   |
| m          | [SEP]   |
| going      | [SEP]   |
| to         | [SEP]   |
| miss       | [SEP]   |
| #          | [SEP]   |
| masterchef | [SEP]   |
| [SEP]      | [SEP]   |
+------------+---------+
dim = 25
loss: 11.463343186950684
+--------+-----------------+
| Word   | Predict         |
+--------+-----------------+
| [CLS]  | shortstackhater |
| he     | MacFUSE         |
| looks  | MacFUSE         |
| a      | MacFUSE         |
| bit    | MacFUSE         |
| stoned | MacFUSE         |
| .

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

		step=300, train_loss=6.339521884918213, val_loss=6.264262136077881
		step=600, train_loss=6.075460433959961, val_loss=6.059688710021972
		step=900, train_loss=5.932245254516602, val_loss=5.87548848953247
		step=1200, train_loss=5.700625896453857, val_loss=5.753637548065186
loss: 5.736734986877441
+----------+---------+
| Word     | Predict |
+----------+---------+
| [CLS]    | [CLS]   |
| @        | @       |
| dhewlett | [CLS]   |
| I        | I       |
| '        | .       |
| m        | .       |
| still    | .       |
| trying   | .       |
| to       | .       |
| see      | .       |
| Star     | [SEP]   |
| [SEP]    | [SEP]   |
+----------+---------+
dim = 50
loss: 11.462046812438965
+-------------+---------------+
| Word        | Predict       |
+-------------+---------------+
| [CLS]       | SarahAnnGreen |
| @           | McFlyNews     |
| DontTrustMe | McFlyNews     |
| 49          | McFlyNews     |
| thankyou    | McFlyNews     |
| [SEP]       | McFlyNews     |
+---------

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

		step=300, train_loss=5.983788967132568, val_loss=6.018139950561523
		step=600, train_loss=5.899569511413574, val_loss=5.743125173187256
		step=900, train_loss=5.682153224945068, val_loss=5.62994130859375
		step=1200, train_loss=5.431457042694092, val_loss=5.5370668601989745
loss: 5.5228437919616695
+-------+---------+
| Word  | Predict |
+-------+---------+
| [CLS] | [CLS]   |
| In    | @       |
| bed   | '       |
| .     | '       |
| I     | I       |
| know  | I       |
| I     | I       |
| '     | I       |
| m     | I       |
| going | '       |
| to    | .       |
| [SEP] | [SEP]   |
+-------+---------+


Разное число слоёв:

In [23]:
for num_layers in num_layers_params:
    print(f'num_layers = {num_layers}')
    model = Autoencoder(vocab_dim=len(word2idx), emb_dim=10, latent_dim=10, hidden_dim=10, num_layers=num_layers)
    model.to(device)
    writer = SummaryWriter(log_dir=f'tensorboard3/num_layers_{num_layers}')
    call = callback(writer, dataset_test_pt, loss_function)
    check_model(64, dataset_test_pt, model, loss_function, idx2word)
    trainer(count_of_epoch=1,
            batch_size=64,
            dataset=dataset_train_pt,
            model=model,
            loss_function=loss_function,
            optimizer = optimizer,
            lr=0.001,
            callback = call)
    check_model(64, dataset_test_pt, model, loss_function, idx2word)

num_layers = 3
loss: 11.465850242614746
+---------+---------+
| Word    | Predict |
+---------+---------+
| [CLS]   | Deejay  |
| @       | nevah   |
| [UNK]   | nevah   |
| Can     | bikers  |
| '       | bikers  |
| t       | bikers  |
| fricken | bikers  |
| wait    | bikers  |
| !       | bikers  |
| Still   | bikers  |
| sad     | bikers  |
| [SEP]   | bikers  |
+---------+---------+


epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

		step=300, train_loss=7.324864387512207, val_loss=7.288043412017823
		step=600, train_loss=6.365283012390137, val_loss=6.389882967376709
		step=900, train_loss=6.40216588973999, val_loss=6.285347200012207
		step=1200, train_loss=6.322598457336426, val_loss=6.236143389892578
loss: 6.216730666351318
+--------+---------+
| Word   | Predict |
+--------+---------+
| [CLS]  | [CLS]   |
| Sad    | [SEP]   |
| !      | [CLS]   |
| Chris  | [SEP]   |
| has    | [SEP]   |
| to     | [SEP]   |
| work   | [SEP]   |
| on     | [SEP]   |
| Easter | [SEP]   |
| [SEP]  | [SEP]   |
+--------+---------+
num_layers = 5
loss: 11.458915177917481
+---------+---------------+
| Word    | Predict       |
+---------+---------------+
| [CLS]   | anilmujagic   |
| @       | 3133          |
| [UNK]   | withgoodworks |
| sorry   | withgoodworks |
| I       | withgoodworks |
| '       | withgoodworks |
| m       | withgoodworks |
| missing | withgoodworks |
| it      | withgoodworks |
| [SEP]   | withgoodworks |
+-

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

		step=300, train_loss=7.110495567321777, val_loss=7.119984931182861
		step=600, train_loss=6.646178722381592, val_loss=6.5703259887695316
		step=900, train_loss=6.695794582366943, val_loss=6.554778318023682
		step=1200, train_loss=6.4009108543396, val_loss=6.4913670951843265
loss: 6.473345589447021
+---------+---------+
| Word    | Predict |
+---------+---------+
| [CLS]   | [CLS]   |
| Cant    | [SEP]   |
| sleep   | [SEP]   |
| =       | [SEP]   |
| (       | [SEP]   |
| Have    | [SEP]   |
| a       | [SEP]   |
| lot     | [SEP]   |
| of      | [SEP]   |
| packing | [SEP]   |
| to      | [CLS]   |
| [SEP]   | [CLS]   |
+---------+---------+
num_layers = 7
loss: 11.470828549194335
+-------+----------+
| Word  | Predict  |
+-------+----------+
| [CLS] | coons    |
| @     | calamari |
| [UNK] | calamari |
| was   | calamari |
| gonna | calamari |
| DM    | calamari |
| you   | calamari |
| but   | calamari |
| it    | calamari |
| says  | calamari |
| you   | calamari |
| [SEP] | cal

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

		step=300, train_loss=7.208164691925049, val_loss=7.335866539001465
		step=600, train_loss=6.451229095458984, val_loss=6.39777795791626
		step=900, train_loss=6.33244514465332, val_loss=6.2405818359375
		step=1200, train_loss=6.147186279296875, val_loss=6.167121617889404
loss: 6.1477489791870115
+-------+---------+
| Word  | Predict |
+-------+---------+
| [CLS] | [CLS]   |
| @     | @       |
| [UNK] | @       |
| i     | [SEP]   |
| have  | [SEP]   |
| heard | [SEP]   |
| of    | [SEP]   |
| that  | [SEP]   |
| movie | [SEP]   |
| .     | [SEP]   |
| just  | [SEP]   |
| [SEP] | [SEP]   |
+-------+---------+


Зависимость от dropout:

In [24]:
for dropout in dropout_params:
    print(f'dropout = {dropout}')
    model = Autoencoder(vocab_dim=len(word2idx), emb_dim=10, latent_dim=10, hidden_dim=10, dropout=dropout)
    model.to(device)
    writer = SummaryWriter(log_dir=f'tensorboard3/dropout_{dropout}')
    call = callback(writer, dataset_test_pt, loss_function)
    check_model(64, dataset_test_pt, model, loss_function, idx2word)
    trainer(count_of_epoch=1,
            batch_size=64,
            dataset=dataset_train_pt,
            model=model,
            loss_function=loss_function,
            optimizer = optimizer,
            lr=0.001,
            callback = call)
    check_model(64, dataset_test_pt, model, loss_function, idx2word)

dropout = 0
loss: 11.52413955078125
+---------+------------+
| Word    | Predict    |
+---------+------------+
| [CLS]   | IchigoNoiZ |
| playing | IchigoNoiZ |
| intense | IchigoNoiZ |
| bingo   | IchigoNoiZ |
| in      | migre      |
| sunrise | migre      |
| .       | migre      |
| .       | migre      |
| .       | migre      |
| wow     | migre      |
| [SEP]   | migre      |
+---------+------------+


epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

		step=300, train_loss=7.416171550750732, val_loss=7.357627681732177
		step=600, train_loss=6.580418109893799, val_loss=6.41712360534668
		step=900, train_loss=6.361220359802246, val_loss=6.281792028045654
		step=1200, train_loss=6.123701095581055, val_loss=6.179844000244141
loss: 6.167657291412353
+-----------+---------+
| Word      | Predict |
+-----------+---------+
| [CLS]     | [CLS]   |
| is        | @       |
| uploading | @       |
| pics      | [SEP]   |
| on        | [SEP]   |
| her       | [SEP]   |
| facebook  | [SEP]   |
| profile   | [SEP]   |
| .         | [SEP]   |
| .         | [SEP]   |
| .         | [SEP]   |
| [SEP]     | [SEP]   |
+-----------+---------+
dropout = 0.25
loss: 11.480394509887695
+-----------+-------------+
| Word      | Predict     |
+-----------+-------------+
| [CLS]     | oddly       |
| i         | tristankent |
| wish      | lordy       |
| i         | lordy       |
| were      | lordy       |
| going     | lordy       |
| to        | lordy     

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

		step=300, train_loss=7.284395217895508, val_loss=7.219936313629151
		step=600, train_loss=6.483773231506348, val_loss=6.426385845184326
		step=900, train_loss=6.156592845916748, val_loss=6.315860888671875
		step=1200, train_loss=6.134279251098633, val_loss=6.173957640075684
loss: 6.154406372833252
+---------+---------+
| Word    | Predict |
+---------+---------+
| [CLS]   | [CLS]   |
| Is      | [SEP]   |
| gonna   | [SEP]   |
| fry     | [SEP]   |
| some    | [SEP]   |
| chicken | [SEP]   |
| later   | [SEP]   |
| 4       | [SEP]   |
| me      | [SEP]   |
| ,       | [SEP]   |
| my      | [SEP]   |
| [SEP]   | [SEP]   |
+---------+---------+
dropout = 0.5
loss: 11.431422059631348
+-----------+---------+
| Word      | Predict |
+-----------+---------+
| [CLS]     | iyan    |
| @         | godness |
| [UNK]     | godness |
| I         | godness |
| can       | godness |
| see       | godness |
| them      | godness |
| both      | godness |
| switching | godness |
| .         | Mick  

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

		step=300, train_loss=7.099998474121094, val_loss=7.1858977066040035
		step=600, train_loss=6.4759416580200195, val_loss=6.423458170318604
		step=900, train_loss=6.4295806884765625, val_loss=6.342418541717529
		step=1200, train_loss=6.101335048675537, val_loss=6.257410876464844
loss: 6.240466682434082
+---------+---------+
| Word    | Predict |
+---------+---------+
| [CLS]   | [CLS]   |
| my      | [SEP]   |
| girlies | [SEP]   |
| @       | [SEP]   |
| [UNK]   | [SEP]   |
| &       | [SEP]   |
| amp     | [SEP]   |
| ;       | [SEP]   |
| @       | [SEP]   |
| [UNK]   | [SEP]   |
| r       | [SEP]   |
| [SEP]   | [SEP]   |
+---------+---------+


Добавление BatchNorm:

In [28]:
for batch_norm in batch_norm_params:
    print(f'batch_norm = {batch_norm}')
    model = Autoencoder(vocab_dim=len(word2idx), emb_dim=10, latent_dim=10, hidden_dim=10, batch_norm=batch_norm)
    model.to(device)
    writer = SummaryWriter(log_dir=f'tensorboard3/batch_norm_{batch_norm}')
    call = callback(writer, dataset_test_pt, loss_function)
    check_model(64, dataset_test_pt, model, loss_function, idx2word)
    trainer(count_of_epoch=1,
            batch_size=64,
            dataset=dataset_train_pt,
            model=model,
            loss_function=loss_function,
            optimizer = optimizer,
            lr=0.001,
            callback = call)
    check_model(64, dataset_test_pt, model, loss_function, idx2word)

batch_norm = False
loss: 11.495204937744141
+-----------+-----------------+
| Word      | Predict         |
+-----------+-----------------+
| [CLS]     | BryonyRocks     |
| @         | voluptuouspanic |
| [UNK]     | voluptuouspanic |
| Yes       | voluptuouspanic |
| ,         | voluptuouspanic |
| I         | voluptuouspanic |
| agree     | voluptuouspanic |
| .         | voluptuouspanic |
| [UNK]     | voluptuouspanic |
| difficult | voluptuouspanic |
| to        | voluptuouspanic |
| [SEP]     | voluptuouspanic |
+-----------+-----------------+


epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

		step=300, train_loss=7.137426853179932, val_loss=7.25108508682251
		step=600, train_loss=6.502966403961182, val_loss=6.437609629821777
		step=900, train_loss=6.442091464996338, val_loss=6.343654668426514
		step=1200, train_loss=6.122651100158691, val_loss=6.210512525939941
loss: 6.188670066070556
+-----------+---------+
| Word      | Predict |
+-----------+---------+
| [CLS]     | [CLS]   |
| According | [SEP]   |
| to        | [SEP]   |
| this      | [SEP]   |
| article   | [SEP]   |
| ,         | [SEP]   |
| the       | [SEP]   |
| Taco      | [SEP]   |
| Bell      | [SEP]   |
| [UNK]     | [SEP]   |
| Sauce     | [SEP]   |
| [SEP]     | [SEP]   |
+-----------+---------+
batch_norm = True
loss: 11.481671836853028
+-------+------------+
| Word  | Predict    |
+-------+------------+
| [CLS] | Airy       |
| mo    | Fb         |
| ¿     | RJFlamingo |
| o     | RJFlamingo |
| no    | RJFlamingo |
| [UNK] | RJFlamingo |
| ¿     | RJFlamingo |
| [UNK] | RJFlamingo |
| &     | RJFlamingo

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

		step=300, train_loss=8.163247108459473, val_loss=10.113876968383789
		step=600, train_loss=6.417255878448486, val_loss=15.904628247070313
		step=900, train_loss=5.674330234527588, val_loss=19.049508041381834
		step=1200, train_loss=5.843593120574951, val_loss=21.089601528930665
loss: 24.01557347717285
+---------+----------+
| Word    | Predict  |
+---------+----------+
| [CLS]   | megashea |
| Finally | slpknt   |
| at      | slpknt   |
| home    | '        |
| with    | '        |
| my      | '        |
| honey   | '        |
| bunny   | [SEP]    |
| after   | [SEP]    |
| a       | [SEP]    |
| long    | .        |
| [SEP]   | .        |
+---------+----------+


Уменьшение размера словаря:

In [29]:
for min_count in min_count_params:
    print(f'min_count = {min_count}')
    word2idx, idx2word = word_dict(dataset_train, min_count=min_count)
    tokenizer = Tokenizer(word2idx, RegexpTokenizer('[a-zA-Z]+|[^\w\s]|\d+'))
    train_data_sent = tokenizer(dataset_train.values[:, 1])
    test_data_sent = tokenizer(dataset_test.values[:, 1])
    dataset_train_pt = TensorDataset(train_data_sent, train_data_sent)
    dataset_test_pt = TensorDataset(test_data_sent, test_data_sent)
    model = Autoencoder(vocab_dim=len(word2idx), emb_dim=10, latent_dim=10, hidden_dim=10)
    model.to(device)
    writer = SummaryWriter(log_dir=f'tensorboard3/min_count_{min_count}')
    call = callback(writer, dataset_test_pt, loss_function)
    check_model(64, dataset_test_pt, model, loss_function, idx2word)
    trainer(count_of_epoch=1,
            batch_size=64,
            dataset=dataset_train_pt,
            model=model,
            loss_function=loss_function,
            optimizer = optimizer,
            lr=0.001,
            callback = call)
    check_model(64, dataset_test_pt, model, loss_function, idx2word)

min_count = 1


  0%|          | 0/80000 [00:00<?, ?it/s]

loss: 11.510943992614745
+-------+---------+
| Word  | Predict |
+-------+---------+
| [CLS] | jhdrr   |
| #     | jhdrr   |
| [UNK] | jhdrr   |
| #     | jhdrr   |
| [UNK] | jhdrr   |
| 09    | jhdrr   |
| 35    | jhdrr   |
| .     | jhdrr   |
| [UNK] | jhdrr   |
| 14    | jhdrr   |
| .     | jhdrr   |
| [SEP] | jhdrr   |
+-------+---------+


epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

		step=300, train_loss=7.440906047821045, val_loss=7.312699252319336
		step=600, train_loss=6.278777599334717, val_loss=6.365197151947021
		step=900, train_loss=6.1684112548828125, val_loss=6.235853273773193
		step=1200, train_loss=6.075265407562256, val_loss=6.152376276397705
loss: 6.139908186340332
+---------+---------+
| Word    | Predict |
+---------+---------+
| [CLS]   | [CLS]   |
| Well    | @       |
| I       | [SEP]   |
| said    | [SEP]   |
| I       | [SEP]   |
| would   | [SEP]   |
| drag    | [SEP]   |
| the     | [SEP]   |
| fine    | [SEP]   |
| weather | [SEP]   |
| north   | [SEP]   |
| [SEP]   | [SEP]   |
+---------+---------+
min_count = 3


  0%|          | 0/80000 [00:00<?, ?it/s]

loss: 9.804461152648926
+---------+---------+
| Word    | Predict |
+---------+---------+
| [CLS]   | pickle  |
| ALWAYS  | pickle  |
| LOOKING | PEOPLE  |
| FOR     | PEOPLE  |
| [UNK]   | PEOPLE  |
| AND     | PEOPLE  |
| [UNK]   | jst     |
| [UNK]   | jst     |
| ,       | jst     |
| IF      | jst     |
| [UNK]   | jst     |
| [SEP]   | jst     |
+---------+---------+


epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

		step=300, train_loss=6.16902494430542, val_loss=6.088178916931152
		step=600, train_loss=5.598550796508789, val_loss=5.530441246795654
		step=900, train_loss=5.373997211456299, val_loss=5.438145085144043
		step=1200, train_loss=5.326599597930908, val_loss=5.389489937591553
loss: 5.3827854225158696
+-----------+---------+
| Word      | Predict |
+-----------+---------+
| [CLS]     | [CLS]   |
| Currently | [CLS]   |
| doing     | [SEP]   |
| research  | [SEP]   |
| for       | [SEP]   |
| a         | [SEP]   |
| school    | [SEP]   |
| project   | [SEP]   |
| Its       | [SEP]   |
| due       | [SEP]   |
| wednesday | [SEP]   |
| [SEP]     | [SEP]   |
+-----------+---------+
min_count = 5


  0%|          | 0/80000 [00:00<?, ?it/s]

loss: 9.40808381652832
+-------+---------+
| Word  | Predict |
+-------+---------+
| [CLS] | green   |
| @     | GO      |
| [UNK] | GO      |
| OK    | GO      |
| -     | GO      |
| gonna | GO      |
| give  | ie      |
| it    | ie      |
| a     | ie      |
| try   | ie      |
| on    | ie      |
| [SEP] | ie      |
+-------+---------+


epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

		step=300, train_loss=6.031227111816406, val_loss=5.978123084259034
		step=600, train_loss=5.399112701416016, val_loss=5.2325722648620605
		step=900, train_loss=5.28649377822876, val_loss=5.061709470367432
		step=1200, train_loss=4.977850914001465, val_loss=4.980846421813965
loss: 4.969283097076416
+----------+---------+
| Word     | Predict |
+----------+---------+
| [CLS]    | [CLS]   |
| @        | @       |
| [UNK]    | [UNK]   |
| 77       | [UNK]   |
| thats    | [UNK]   |
| great    | [UNK]   |
| !        | [UNK]   |
| !        | [UNK]   |
| !        | [UNK]   |
| whatever | [UNK]   |
| the      | [SEP]   |
| [SEP]    | [SEP]   |
+----------+---------+


## Обучение модели с наилучшими параметрами

После анализов результатов (в т.ч. графиков tensorboard) можно сделать вывод, что лучше выбирать модель с большим количеством и размером слоёв, dropout можно сказать не влияет на результат, batchnorm дал странный результат (похоже на переобучение, очень плохие результаты на тесте и даже на трейне сначала, потом разница небольшая), редко используемые слова лучше убрать из словаря. Протестируем модель с наибольшим (из экспериментов) размером и количеством слоёв, выключенным dropout и batchnorm, наименьшим размером словаря.

In [30]:
word2idx, idx2word = word_dict(dataset_train, min_count = 5)
tokenizer = Tokenizer(word2idx, RegexpTokenizer('[a-zA-Z]+|[^\w\s]|\d+'))
train_data_sent = tokenizer(dataset_train.values[:, 1])
test_data_sent = tokenizer(dataset_test.values[:, 1])
dataset_train_pt = TensorDataset(train_data_sent, train_data_sent)
dataset_test_pt = TensorDataset(test_data_sent, test_data_sent)
model = Autoencoder(vocab_dim=len(word2idx), emb_dim = 50, latent_dim = 50, hidden_dim = 50, num_layers = 7, dropout = 0, batch_norm = False)
model.to(device)
writer = SummaryWriter(log_dir=f'tensorboard3/final')
call = callback(writer, dataset_test_pt, loss_function)
check_model(64, dataset_test_pt, model, loss_function, idx2word)
trainer(count_of_epoch=1,
        batch_size=64,
        dataset=dataset_train_pt,
        model=model,
        loss_function=loss_function,
        optimizer = optimizer,
        lr=0.001,
        callback = call)
check_model(64, dataset_test_pt, model, loss_function, idx2word)

  0%|          | 0/80000 [00:00<?, ?it/s]

loss: 9.392846656799316
+-------+---------+
| Word  | Predict |
+-------+---------+
| [CLS] | doctors |
| haha  | tot     |
| ,     | tot     |
| didn  | tot     |
| '     | tot     |
| t     | tot     |
| post  | demon   |
| in    | demon   |
| a     | demon   |
| long  | demon   |
| time  | demon   |
| [SEP] | demon   |
+-------+---------+


epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

		step=300, train_loss=5.054401874542236, val_loss=4.968156016540528
		step=600, train_loss=4.913304805755615, val_loss=4.8078180587768555
		step=900, train_loss=4.804904460906982, val_loss=4.740937191009522
		step=1200, train_loss=4.738305568695068, val_loss=4.692846277618409
loss: 4.681375462341308
+---------+---------+
| Word    | Predict |
+---------+---------+
| [CLS]   | [CLS]   |
| yeah    | I       |
| ,       | I       |
| its     | .       |
| the     | .       |
| weekend | .       |
| [SEP]   | .       |
+---------+---------+


4.681375462341308

Да, эта модель дала абсолютно лучший результат.

In [31]:
def check_model_with_acc(batch_size, dataset, model, loss_function, idx2word):
    model.eval()
    batch_generator = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size)
    test_loss = 0
    total_correct = 0
    total_words = 0
    for it, (x_batch, y_batch) in enumerate(batch_generator):
        x_batch = x_batch.to(model.device)
        y_batch = y_batch.to(model.device)
        with torch.no_grad():
            output = model(x_batch)
        test_loss += loss_function(output.transpose(1, 2), y_batch).cpu().item() * len(x_batch)
        # Получаем предсказанные слова
        predicted_indices = output.argmax(dim=-1)  # Индексы предсказанных слов
        total_correct += (predicted_indices == y_batch).sum().item()  # Считаем правильные предсказания
        total_words += y_batch.numel()  # Общее количество слов в батче
    test_loss /= len(dataset)
    accuracy = total_correct / total_words  # Вычисляем accuracy
    print(f'loss: {test_loss}, accuracy: {accuracy:.4f}')  # Выводим loss и accuracy
    dataloader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)
    x, y = next(iter(dataloader))
    x = x.to(model.device)
    y = y.to(model.device)
    with torch.no_grad():
        outputs = model(x)
    one_x = x[0].cpu().numpy()
    one_output = outputs[0].argmax(dim=-1).cpu().numpy()
    words = [idx2word[idx] for idx in one_x]
    pred_words = [idx2word[idx] for idx in one_output]
    table = PrettyTable(["Word", "Predict"])
    table.align["Word"], table.align["Predict"] = "l", "l"
    for word, pred in zip(words, pred_words):
        if word != idx2word[word2idx['[PAD]']]:
            table.add_row([word, pred])
    print(table)
    return test_loss, accuracy  # Возвращаем также accuracy

Однако качество восстановления всё равно плохое. Скорее всего, для данной задачи нужно больше эпох при обучении и, возможно, больше исходных данных. А также имеет смысл попробовать ещё увеличить размер слоя. Но и так каждая модель обучалась где-то 40-45 минут, это долго (хотя всего 1 эпоха).

In [32]:
check_model_with_acc(64, dataset_test_pt, model, loss_function, idx2word)

loss: 4.681375462341308, accuracy: 0.2809
+-------+---------+
| Word  | Predict |
+-------+---------+
| [CLS] | [CLS]   |
| I     | I       |
| hate  | '       |
| this  | .       |
| part  | .       |
| right | .       |
| here  | .       |
| !     | .       |
| I     | .       |
| hate  | .       |
| the   | [SEP]   |
| [SEP] | [SEP]   |
+-------+---------+


(4.681375462341308, 0.28086666666666665)

Хотя при уменьшении размера словаря обучение сильно ускорилось. Дадим ещё один шанс модели, увеличим размер слоя и число эпох обучения.

In [33]:
model = Autoencoder(vocab_dim=len(word2idx), emb_dim = 100, latent_dim = 100, hidden_dim = 100, num_layers = 7, dropout = 0, batch_norm = False)
model.to(device)
writer = SummaryWriter(log_dir=f'tensorboard3/cheat')
call = callback(writer, dataset_test_pt, loss_function)
check_model_with_acc(64, dataset_test_pt, model, loss_function, idx2word)
trainer(count_of_epoch=5,
        batch_size=64,
        dataset=dataset_train_pt,
        model=model,
        loss_function=loss_function,
        optimizer = optimizer,
        lr=0.001,
        callback = call)
check_model_with_acc(64, dataset_test_pt, model, loss_function, idx2word)

loss: 9.391493713378907, accuracy: 0.0000
+----------+-----------+
| Word     | Predict   |
+----------+-----------+
| [CLS]    | possible  |
| I        | microsoft |
| was      | microsoft |
| thinking | microsoft |
| to       | microsoft |
| myself   | microsoft |
| &        | microsoft |
| quot     | microsoft |
| ;        | microsoft |
| wow      | microsoft |
| what     | microsoft |
| [SEP]    | microsoft |
+----------+-----------+


epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

		step=300, train_loss=5.027698516845703, val_loss=4.920947412109375
		step=600, train_loss=4.9747161865234375, val_loss=4.865446396636963
		step=900, train_loss=4.737959384918213, val_loss=4.84345210723877
		step=1200, train_loss=4.845438003540039, val_loss=4.831939554595947


  0%|          | 0/1250 [00:00<?, ?it/s]

		step=1500, train_loss=4.881037712097168, val_loss=4.735683513641358
		step=1800, train_loss=4.7061944007873535, val_loss=4.694440692138672
		step=2100, train_loss=4.5958428382873535, val_loss=4.6776361106872555
		step=2400, train_loss=4.552255153656006, val_loss=4.665281605529785


  0%|          | 0/1250 [00:00<?, ?it/s]

		step=2700, train_loss=4.675617218017578, val_loss=4.632002725982666
		step=3000, train_loss=4.673832893371582, val_loss=4.60349369430542
		step=3300, train_loss=4.731934070587158, val_loss=4.576621356201172
		step=3600, train_loss=4.607517242431641, val_loss=4.540748796844483


  0%|          | 0/1250 [00:00<?, ?it/s]

		step=3900, train_loss=4.375320911407471, val_loss=4.5430634971618655
		step=4200, train_loss=4.420837879180908, val_loss=4.507258138275146
		step=4500, train_loss=4.59522819519043, val_loss=4.493655236816406
		step=4800, train_loss=4.453329086303711, val_loss=4.483151457214356


  0%|          | 0/1250 [00:00<?, ?it/s]

		step=5100, train_loss=4.647826671600342, val_loss=4.460988728332519
		step=5400, train_loss=4.558731555938721, val_loss=4.447318857574463
		step=5700, train_loss=4.181212902069092, val_loss=4.425877815246582
		step=6000, train_loss=4.449232578277588, val_loss=4.399534207916259
loss: 4.389816265106202, accuracy: 0.3069
+---------+---------+
| Word    | Predict |
+---------+---------+
| [CLS]   | [CLS]   |
| [UNK]   | [UNK]   |
| -       | [UNK]   |
| Twitter | [UNK]   |
| for     | .       |
| [UNK]   | .       |
| users   | .       |
| ,       | .       |
| [UNK]   | .       |
| the     | .       |
| spam    | ,       |
| [SEP]   | [SEP]   |
+---------+---------+


(4.389816265106202, 0.3069375)

In [42]:
check_model_with_acc(64, dataset_test_pt, model, loss_function, idx2word)

loss: 4.389816265106202, accuracy: 0.3069
+-----------+---------+
| Word      | Predict |
+-----------+---------+
| [CLS]     | [CLS]   |
| I         | I       |
| have      | am      |
| only      | [UNK]   |
| two       | [UNK]   |
| (         | .       |
| 2         | .       |
| )         | .       |
| followers | .       |
| .         | .       |
| .         | .       |
| [SEP]     | [SEP]   |
+-----------+---------+


(4.389816265106202, 0.3069375)

К сожалению, результат всё же плохой, модель в основном выдаёт знаки препинания. Но что-то получается восстановить (I am вместо I have). Зависимость качества от параметров проследить удалось, странный только результат при batchnorm.