In [None]:
import torch
from torchtext.legacy import data

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize = 'spacy', tokenizer_language = 'en_core_web_sm')
LABEL = data.LabelField(dtype = torch.float)

### IMDB DATASET

In [2]:
from torchtext.legacy import datasets

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100% 84.1M/84.1M [13:36<00:00, 103kB/s] 


In [13]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing  examples: {len(test_data)}')

Number of training examples: 17500
Number of testing  examples: 25000


In [4]:
print(vars(train_data.examples[0]))

{'text': ['I', 'would', 'not', 'like', 'to', 'comment', 'on', 'how', 'good', 'the', 'movie', 'was', 'or', 'what', 'were', 'the', 'flaws', 'as', 'I', 'am', 'not', 'a', 'professional', 'film', 'critic', 'and', 'I', 'do', 'not', 'have', 'enough', 'knowledge', 'of', 'making', 'movies', '.', 'What', 'i', 'do', 'know', 'is', 'that', 'making', 'this', 'kind', 'of', 'a', 'movie', 'in', 'your', 'very', 'first', 'shot', 'is', 'a', 'big', 'achievement', 'and', 'I', 'would', 'like', 'to', 'congratulate', 'the', 'Director', 'for', 'that', '.', 'However', ',', 'in', 'some', 'reviews', ',', 'that', 'i', 'have', 'read', ',', 'critics', 'have', 'complained', 'that', 'Hiralal', "'s", 'relationship', 'with', 'his', 'brothers', 'was', 'not', 'highlighted', ',', 'and', 'his', 'siblings', 'were', 'completely', 'erased', 'from', 'the', 'story', '.', 'Now', 'i', 'would', 'really', 'like', 'to', 'raise', 'a', 'point', 'here', 'that', 'as', 'the', 'name', 'of', 'the', 'movie', 'suggests', ',', 'it', 'is', 'not'

In [5]:
import random

train_data, valid_data = train_data.split(random_state = random.seed(SEED))

In [6]:
print(f'Number of training examples:   {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples:    {len(test_data)}')

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [7]:
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [8]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [9]:
print(TEXT.vocab.freqs.most_common(20))

[('the', 201598), (',', 191166), ('.', 164264), ('a', 108869), ('and', 108688), ('of', 99786), ('to', 92940), ('is', 75592), ('in', 60795), ('I', 53840), ('it', 53227), ('that', 48734), ('"', 44249), ("'s", 43318), ('this', 41980), ('-', 37242), ('/><br', 35396), ('was', 34638), ('as', 30354), ('with', 29730)]


In [10]:
print(LABEL.vocab.stoi)

defaultdict(None, {'neg': 0, 'pos': 1})


### LOAD DATA

In [11]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device)

### MODEL


In [15]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(
        self, 
        input_dim, 
        embedding_dim, 
        hidden_dim, 
        output_dim
    ):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)
        
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        return self.fc(hidden.squeeze(0))

In [16]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [17]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,592,105 trainable parameters


In [19]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [20]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

### TRAIN

In [23]:

def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [24]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [25]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 18s
	Train Loss: 0.694 | Train Acc: 50.28%
	 Val. Loss: 0.697 |  Val. Acc: 49.71%
Epoch: 02 | Epoch Time: 0m 18s
	Train Loss: 0.693 | Train Acc: 49.84%
	 Val. Loss: 0.697 |  Val. Acc: 49.99%
Epoch: 03 | Epoch Time: 0m 18s
	Train Loss: 0.693 | Train Acc: 50.08%
	 Val. Loss: 0.697 |  Val. Acc: 50.66%
Epoch: 04 | Epoch Time: 0m 18s
	Train Loss: 0.693 | Train Acc: 49.85%
	 Val. Loss: 0.697 |  Val. Acc: 49.57%
Epoch: 05 | Epoch Time: 0m 18s
	Train Loss: 0.693 | Train Acc: 50.19%
	 Val. Loss: 0.697 |  Val. Acc: 50.69%


In [26]:
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.709 | Test Acc: 47.88%


# NBoW : Natural Bag-of-Words
### model are stromg whem performimg sentiment analysis or text classification

In [None]:
import functools
import sys

import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm

seed = 0

torch.manual_seed(seed)
np.random.seed(seed)

train_data, test_data = datasets.load_dataset("imdb", split=["train", "test"])

In [31]:
train_data, test_data

(Dataset({
     features: ['text', 'label'],
     num_rows: 25000
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 25000
 }))

In [32]:
train_data.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None)}

In [33]:
train_data[0]

{'text': 'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!',
 'label': 1}

In [36]:
tokenizer = torchtext.data.utils.get_tokenizer("basic_english")

In [37]:
tokenizer("Hello world! How are you doing today? I'm doing fantastic!")

['hello',
 'world',
 '!',
 'how',
 'are',
 'you',
 'doing',
 'today',
 '?',
 'i',
 "'",
 'm',
 'doing',
 'fantastic',
 '!']

### CLEAN DATA

In [40]:
def tokenize_example(example, tokenizer, max_length):
    tokens = tokenizer(example['text'])[:max_length]
    return {'tokens':tokens}

max_length = 256

train_data = train_data.map(tokenize_example, fn_kwargs={'tokenizer': tokenizer, 
                                                         'max_length':max_length})
test_data  = test_data.map(tokenize_example,  fn_kwargs={'tokenizer': tokenizer, 
                                                         'max_length':max_length})

train_data



  0%|          | 0/25000 [00:00<?, ?ex/s]

  0%|          | 0/25000 [00:00<?, ?ex/s]

Dataset({
    features: ['text', 'label', 'tokens'],
    num_rows: 25000
})

In [41]:
train_data.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [42]:
train_data[0]['tokens'][:25]

['bromwell',
 'high',
 'is',
 'a',
 'cartoon',
 'comedy',
 '.',
 'it',
 'ran',
 'at',
 'the',
 'same',
 'time',
 'as',
 'some',
 'other',
 'programs',
 'about',
 'school',
 'life',
 ',',
 'such',
 'as',
 'teachers',
 '.']

In [43]:
test_size = 0.25 
train_valid_data = train_data.train_test_split(test_size=test_size)
train_data = train_valid_data['train']
valid_data = train_valid_data['test']

In [44]:
min_freq = 5 
special_tokens = ['<unk>', '<pad>']
vocab = torchtext.vocab.build_vocab_from_iterator(train_data['tokens'],
                                                 min_freq=min_freq,
                                                 specials=special_tokens)

In [47]:
vocab.get_itos()[:10]
unk_index = vocab['<unk>']
pad_index  = vocab['<pad>']
vocab.set_default_index(unk_index)

In [48]:
def numericalize_data(example, vocab):
    ids = [vocab[token] for token in example['tokens']]
    return {'ids': ids}

In [49]:
train_data = train_data.map(numericalize_data, fn_kwargs={'vocab': vocab})
valid_data = train_data.map(numericalize_data, fn_kwargs={'vocab': vocab})
test_data = train_data.map(numericalize_data, fn_kwargs={'vocab': vocab})

  0%|          | 0/18750 [00:00<?, ?ex/s]

  0%|          | 0/18750 [00:00<?, ?ex/s]

  0%|          | 0/18750 [00:00<?, ?ex/s]

In [50]:
train_data[0]

{'text': 'This documentary is at its best when it is simply showing the ayurvedic healers\' offices and treatment preparation. There is no denying the grinding poverty in India and desperation of even their wealthier clients. However, as an argument for ayurvedic medicine in general, this film fails miserably. Although Indian clients mention having seen "aleopathic" doctors, those doctors are not interviewed, and we have to take the vague statements of their patients at face value-- "the doctor said there was no cure," "the doctor said it was cancer" etc. Well, "no cure" doesn\'t mean "no treatment," and what type of cancer exactly does the patient have? The film is at its most feeble when showing ayurvedic practice in America. There it is reduced, apparently, to the stunning suggestion that having a high powered Wall Street job can make your stomach hurt.',
 'label': 0,
 'tokens': ['this',
  'documentary',
  'is',
  'at',
  'its',
  'best',
  'when',
  'it',
  'is',
  'simply',
  'sho

In [51]:
train_data = train_data.with_format(type='torch', columns=['ids', 'label'])
valid_data = valid_data.with_format(type='torch', columns=['ids', 'label'])
test_data = test_data.with_format(type='torch', columns=['ids', 'label'])

In [52]:
train_data[0]

{'label': tensor(0),
 'ids': tensor([   14,   627,    10,    37,   100,   125,    60,    11,    10,   361,
           834,     2,     0,     0,     9, 12187,     6,  2407,  9694,     3,
            46,    10,    66,  8861,     2, 16732,  3705,    13,  2360,     6,
          4374,     7,    69,    77,     0, 13332,     3,   190,     4,    19,
            41,  4597,    21,     0,  6574,    13,   822,     4,    14,    23,
           962,  3426,     3,   265,  1267, 13332,   798,   266,   111,     0,
          5592,     4,   157,  5592,    30,    29,  8351,     4,     6,    78,
            31,     8,   203,     2,  3400,  6614,     7,    77,  5229,    37,
           454,     0,     2,   937,   307,    46,    17,    66,  4845,     4,
             2,   937,   307,    11,    17,  5362,   487,     3,    82,     4,
            66,  4845,   173,     9,    28,   384,    66,  2407,     4,     6,
            55,   618,     7,  5362,   615,   135,     2,  3307,    31,    56,
             2,    23,  

In [53]:
def collate(batch, pad_idex):
    batch_ids = [i['ids'] for i in batch]
    batch_ids = nn.utils.rnn.pad_sequence(batch_ids, 
                                          padding_value=pad_index, 
                                          batch_first=True)
    batch_label = [i['label'] for i in batch]
    batch_label = torch.stack(batch_label)
    batch = {
        'ids':batch_ids,
        'label':batch_label
    }
    return batch

In [56]:
batch_size = 512 
colate = functools.partial(collate, pad_ind_dex=pad_index)
train_dataloader = torch.utils.data.DataLoader(train_data,
                                              batch_size=batch_size,
                                              collate_fn=collate,
                                              shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(valid_data, 
                                               batch_size=batch_size, 
                                               collate_fn=collate)
test_dataloader = torch.utils.data.DataLoader(test_data, 
                                              batch_size=batch_size, 
                                              collate_fn=collate)

# BIULD MODEL NBOW

In [63]:
class NBoW(nn.Module):
    def __init__(
        self,
        vocab_size,
        embedding_dim,
        output_dim,
        pad_idx
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,
                                     embedding_dim,
                                     padding_idx=pad_index)
        self.fc = nn.Linear(embedding_dim, output_dim)
        
    def forward(self, idx):
        embedded = self.embedding(ids)
        pooled = ebedded.mean(dim=1)
        prediction = self.fc(pooled)
        

In [64]:
vocab_size = len(vocab)
embedding_dim = 300
output_dim = len(train_data.unique('label'))

model = NBoW(vocab_size, embedding_dim, output_dim, pad_index)

In [65]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 6,458,402 trainable parameters


In [None]:
vectors = torchtext.vocab.FastText()
hello_vector = vectors.get_vecs_by_tokens('hello')

.vector_cache/wiki.en.vec:  73% 4.84G/6.60G [13:12:42<4:42:54, 103kB/s]  

In [None]:
hello_vector.shape


In [None]:
hello_vector

In [None]:
pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())

In [None]:
pretrained_embedding.shape

In [None]:
model.embedding.weight

In [None]:
pretrained_embedding

In [None]:
model.embedding.weight.data = pretrained_embedding

In [None]:
model.embedding.weight

In [None]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def train(dataloader, model, criterion, optimizer, device):
    model.train()
    epoch_losses = []
    epoch_accs = []

    for batch in tqdm.tqdm(dataloader, desc='training...', file=sys.stdout):
        ids = batch['ids'].to(device)
        label = batch['label'].to(device)
        prediction = model(ids)
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
        epoch_accs.append(accuracy.item())

    return epoch_losses, epoch_accs


def evaluate(dataloader, model, criterion, device):
    model.eval()
    epoch_losses = []
    epoch_accs = []

    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader, desc='evaluating...', file=sys.stdout):
            ids = batch['ids'].to(device)
            label = batch['label'].to(device)
            prediction = model(ids)
            loss = criterion(prediction, label)
            accuracy = get_accuracy(prediction, label)
            epoch_losses.append(loss.item())
            epoch_accs.append(accuracy.item())

    return epoch_losses, epoch_accs

In [None]:
def get_accuracy(prediction, label):
    batch_size, _ = prediction.shape
    predicted_classes = prediction.argmax(dim=-1)
    correct_predictions = predicted_classes.eq(label).sum()
    accuracy = correct_predictions / batch_size
    return accuracy

In [None]:
n_epochs = 10
best_valid_loss = float('inf')

train_losses = []
train_accs = []
valid_losses = []
valid_accs = []

for epoch in range(n_epochs):

    train_loss, train_acc = train(train_dataloader, model, criterion, optimizer, device)
    valid_loss, valid_acc = evaluate(valid_dataloader, model, criterion, device)

    train_losses.extend(train_loss)
    train_accs.extend(train_acc)
    valid_losses.extend(valid_loss)
    valid_accs.extend(valid_acc)
    
    epoch_train_loss = np.mean(train_loss)
    epoch_train_acc = np.mean(train_acc)
    epoch_valid_loss = np.mean(valid_loss)
    epoch_valid_acc = np.mean(valid_acc)
    
    if epoch_valid_loss < best_valid_loss:
        best_valid_loss = epoch_valid_loss
        torch.save(model.state_dict(), 'nbow.pt')
    
    print(f'epoch: {epoch+1}')
    print(f'train_loss: {epoch_train_loss:.3f}, train_acc: {epoch_train_acc:.3f}')
    print(f'valid_loss: {epoch_valid_loss:.3f}, valid_acc: {epoch_valid_acc:.3f}')

In [None]:
fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(1,1,1)
ax.plot(train_losses, label='train loss')
ax.plot(valid_losses, label='valid loss')
plt.legend()
ax.set_xlabel('updates')
ax.set_ylabel('loss');

In [None]:
fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(1,1,1)
ax.plot(train_accs, label='train accuracy')
ax.plot(valid_accs, label='valid accuracy')
plt.legend()
ax.set_xlabel('updates')
ax.set_ylabel('accuracy');

In [None]:
model.load_state_dict(torch.load('nbow.pt'))

test_loss, test_acc = evaluate(test_dataloader, model, criterion, device)

epoch_test_loss = np.mean(test_loss)
epoch_test_acc = np.mean(test_acc)

print(f'test_loss: {epoch_test_loss:.3f}, test_acc: {epoch_test_acc:.3f}')

In [None]:
def predict_sentiment(text, model, tokenizer, vocab, device):
    tokens = tokenizer(text)
    ids = [vocab[t] for t in tokens]
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)
    prediction = model(tensor).squeeze(dim=0)
    probability = torch.softmax(prediction, dim=-1)
    predicted_class = prediction.argmax(dim=-1).item()
    predicted_probability = probability[predicted_class].item()
    return predicted_class, predicted_probability

# SENTENCES TEST

In [None]:
text = "This film is terrible!"

predict_sentiment(text, model, tokenizer, vocab, device)

In [None]:
text = "This film is great!"

predict_sentiment(text, model, tokenizer, vocab, device)

In [None]:
text = "This film is not terrible, it's great!"

predict_sentiment(text, model, tokenizer, vocab, device)

In [None]:
text = "This film is not great, it's terrible!"

predict_sentiment(text, model, tokenizer, vocab, device)