In [34]:
# Make sure a GPU is attached to this runtime (prints the driver/CUDA/GPU table).
!nvidia-smi

Sat Mar 13 20:09:08 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.56       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [35]:
# Import libraries
import os
import torch
from torchtext import legacy, data
import time
import random
import torch.nn as nn
import torch.optim as optim
import torchtext

In [36]:
# Fixed seed shared by the cells below (also used when splitting the dataset).
SEED = 1234
# Seed PyTorch's random number generator so weight initialisation is repeatable.
torch.manual_seed(SEED)
# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [37]:
# Mount Google Drive (Colab-only) so the dataset files are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
# Указываем путь
PATH = '/content/drive/My Drive/datasets/dz/'
!ls '/content/drive/My Drive/datasets/dz/'

11	     Content.jpg  GAN_Images.zip  test.json
archive.zip  dz_11	  Style.jpg	  train.json


In [39]:
# Extraction of the dataset archives into the dataset directory, kept commented out:
# the directory listing recorded in the previous cell already shows train.json and test.json.
# !unzip -q '/content/drive/My Drive/datasets/dz/11/test.json.zip' -d '/content/drive/My Drive/datasets/dz/'
# !unzip -q '/content/drive/My Drive/datasets/dz/11/train.json.zip' -d '/content/drive/My Drive/datasets/dz/'

In [43]:
# Field objects describe how each JSON column is processed:
# TEXT uses the Field defaults, LABEL is a label field stored as float
# (float targets are what the BCE-with-logits loss used later expects).
TEXT = legacy.data.Field()
LABEL = legacy.data.LabelField(dtype=torch.float)

# JSON key -> (attribute name on each example, field that processes it)
fields = {'text': ('text', TEXT), 'label': ('label', LABEL)}

# Read train.json / test.json from PATH.
train_data, test_data = legacy.data.TabularDataset.splits(
    path=PATH,
    train='train.json',
    test='test.json',
    format='json',
    fields=fields)

# Carve a validation set out of the training data.
# random.seed() returns None, so passing its result as random_state only worked
# through the side effect of seeding the global generator. Seed explicitly and
# hand over the resulting generator state instead.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

# Show one training example as a sanity check.
idx = 4
print("text :", " ".join(train_data[idx].text))
print("\nlabel :", train_data[idx].label)

text : I saw this movie for 2 reasons -- I like Gerard Butler and Christopher Plummer . Unfortunately , these poor men were forced to carry a pretty dumb movie . I liked the idea that Dracula is actually a reincarnation of Judas Iscariot , because it does explain his disdain for all things Christian , but there was so much camp that this idea was not realized as much as it could have been . I see this movie more as a way for the talented Gerard Butler to pay his dues before being truly recognized and a way for the legendary Christopher Plummer to remind the public ( me and the 5 other people who saw this film ) that he still exists . I actually enjoyed the special features on the DVD more than the movie itself .

label : neg


In [44]:
# Upper bound on the number of distinct tokens kept in the text vocabulary.
MAX_VOCAB_SIZE = 25000
# Vocabularies are built from the training split only.
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

# Vocabulary sizes; the recorded text size (25002) is MAX_VOCAB_SIZE plus the
# '<unk>' and '<pad>' entries that appear at indices 0 and 1 of TEXT.vocab.stoi.
print(len(TEXT.vocab))
print(len(LABEL.vocab))

25002
2


In [45]:
# Inspect the token -> index mapping of the text vocabulary.
# NOTE(review): this displays the whole mapping; consider showing only a slice.
TEXT.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7f75f9e485d0>>,
            {'<unk>': 0,
             '<pad>': 1,
             'the': 2,
             ',': 3,
             '.': 4,
             'a': 5,
             'and': 6,
             'of': 7,
             'to': 8,
             'is': 9,
             'in': 10,
             'I': 11,
             'it': 12,
             'that': 13,
             '"': 14,
             "'s": 15,
             'this': 16,
             '-': 17,
             '/><br': 18,
             'was': 19,
             'as': 20,
             'with': 21,
             'movie': 22,
             'for': 23,
             'film': 24,
             'The': 25,
             'but': 26,
             '(': 27,
             'on': 28,
             ')': 29,
             "n't": 30,
             'you': 31,
             'are': 32,
             'not': 33,
             'have': 34,
             'his': 35,
             'be': 36,
             'he': 37,
      

In [46]:
# Inspect the label -> index mapping (recorded output: 'neg' -> 0, 'pos' -> 1).
LABEL.vocab.stoi

defaultdict(None, {'neg': 0, 'pos': 1})

In [48]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 64
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"device : {device}")
# One iterator per split; batches are created directly on `device`.
# NOTE(review): no sort_key is supplied and sort=False, so presumably batches are
# not grouped by sequence length — confirm against the torchtext BucketIterator docs.
train_iterator, valid_iterator, test_iterator = legacy.data.BucketIterator.splits(
    (train_data,valid_data,test_data),
    batch_size = BATCH_SIZE,
    sort = False,
    device = device)

device : cuda


In [49]:
# Network definition

class RNN(nn.Module):
    """Embedding -> single-layer ``nn.RNN`` -> linear head.

    The final hidden state of the recurrent layer is fed to the linear layer,
    producing one raw (un-activated) output vector per sequence in the batch.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the recurrent hidden state.
        output_dim: number of outputs of the final linear layer.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        # Attribute names are part of the checkpoint format (state_dict keys).
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return raw outputs for a batch of token-index sequences.

        Args:
            text: integer tensor of shape ``[seq_len, batch]`` (the recurrent
                layer is created with ``batch_first`` left at its default).

        Returns:
            Tensor of shape ``[batch, output_dim]``.
        """
        embedded = self.embedding(text)
        # For a single-layer, unidirectional nn.RNN the final hidden state is by
        # construction the last time step of the output sequence, so no runtime
        # assertion is needed; checking it with torch.equal on every forward
        # pass would force a device-to-host synchronisation per batch.
        _, hidden = self.rnn(embedded)
        # hidden is [1, batch, hidden_dim]; drop the leading layer dimension.
        return self.fc(hidden.squeeze(0))

In [50]:
# Training hyperparameters and model/optimizer/loss construction
INPUT_DIM = len(TEXT.vocab)   # one embedding row per vocabulary entry
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1                # single logit for the binary label

# Build the network and move it to the selected device in one step.
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)

# Plain SGD over all model parameters.
# NOTE(review): the run recorded at the end of this notebook stays at ~0.693 loss /
# ~50% accuracy with these settings.
optimizer = optim.SGD(model.parameters(), lr=1e-3)

# Sigmoid + binary cross-entropy on raw logits, placed on the same device.
criterion = nn.BCEWithLogitsLoss().to(device)


In [51]:
def count_params(model):
    """Return the total number of elements across the model's trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the size of the model built in the earlier cell, with thousands separators.
print(f"No. of trainable parameters : {count_params(model):,}")

No. of trainable parameters : 2,592,105


In [52]:
# Accuracy metric
def bin_acc(preds, y):
    """Return the fraction of binary predictions that match the targets.

    Args:
        preds: tensor of raw logits (one per example).
        y: tensor of 0/1 targets with the same shape as ``preds``.

    Returns:
        0-dim float tensor with the share of correct predictions. An empty
        input yields ``nan``.
    """
    # Logit -> probability -> hard 0/1 decision.
    rounded = torch.round(torch.sigmoid(preds))
    correct = (rounded == y).float()
    # Tensor reduction instead of Python's builtin sum(), which would iterate
    # over the tensor element by element (one op per example on the GPU).
    return correct.mean()

# Training function
def train(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator`` and update ``model`` in place.

    Args:
        model: module called as ``model(batch.text)``; dimension 1 of its
            output is squeezed away before the loss is computed.
        iterator: sized iterable of batches exposing ``.text`` and ``.label``.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss called as ``criterion(predictions, batch.label)``.

    Returns:
        ``(mean_loss, mean_accuracy)`` over the batches, as Python floats.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0

    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = bin_acc(predictions, batch.label)

        loss.backward()
        optimizer.step()

        # Accumulate plain numbers: adding the loss tensor itself would keep
        # every batch's autograd history referenced for the whole epoch.
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# Evaluation function
def evaluate(model, iterator, criterion):
    """Compute mean loss and accuracy over ``iterator`` without updating the model.

    Args:
        model: module called as ``model(batch.text)``; dimension 1 of its
            output is squeezed away before the loss is computed.
        iterator: sized iterable of batches exposing ``.text`` and ``.label``.
        criterion: loss called as ``criterion(predictions, batch.label)``.

    Returns:
        ``(mean_loss, mean_accuracy)`` over the batches, as Python floats, so
        callers do not end up holding device tensors (e.g. as a best-loss marker).
    """
    epoch_loss = 0.0
    epoch_acc = 0.0

    model.eval()
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = bin_acc(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


# Elapsed-time helper
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    delta = end_time - start_time
    whole_minutes = int(delta / 60)
    leftover = delta - (whole_minutes * 60)
    return whole_minutes, int(leftover)

In [53]:
# Run training
N_EPOCHS = 5
# Lowest validation loss seen so far; starts at +inf so the first epoch always checkpoints.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    # Time one full epoch: training pass plus validation pass.
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # Save the weights whenever the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'simple_rnn.pt')
    

    # Per-epoch report (epoch numbers are printed 1-based).
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 17s
	Train Loss: 0.694 | Train Acc: 50.20%
	 Val. Loss: 0.693 |  Val. Acc: 49.39%
Epoch: 02 | Epoch Time: 0m 17s
	Train Loss: 0.693 | Train Acc: 49.58%
	 Val. Loss: 0.693 |  Val. Acc: 49.39%
Epoch: 03 | Epoch Time: 0m 17s
	Train Loss: 0.693 | Train Acc: 49.91%
	 Val. Loss: 0.693 |  Val. Acc: 50.91%
Epoch: 04 | Epoch Time: 0m 17s
	Train Loss: 0.693 | Train Acc: 49.81%
	 Val. Loss: 0.693 |  Val. Acc: 49.36%
Epoch: 05 | Epoch Time: 0m 16s
	Train Loss: 0.693 | Train Acc: 49.87%
	 Val. Loss: 0.693 |  Val. Acc: 50.90%
