## Data Processing

In [2]:
# For reproducibility
import random
import torch

SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
random_state = random.seed(SEED)
torch.cuda.empty_cache()

device = torch.device("cuda:1" if 
    torch.cuda.is_available() else "cpu")

from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('spacy', language='en_core_web_sm') # <1>


In [3]:
print(torch.cuda.current_device())

torch.cuda.set_device(1) 

0


In [4]:
from torchtext.datasets import IMDB
from torch.utils.data.dataset import random_split

train_iter, test_iter = IMDB(
    split=('train', 'test')) #<1>

train_dataset = list(train_iter) #<2>
test_dataset  = list(test_iter)

num_train = int(len(train_dataset) * 0.70)
train_data, valid_data = \
    random_split(train_dataset, 
        [num_train, 
         len(train_dataset) - num_train]) # <3>

check data 

In [5]:
print(len(train_data), len(valid_data), len(test_dataset))
# out:17500 7500 25000

data_index = 21
print(train_data[data_index][0])
# out: (your results may vary)
#   pos

print(train_data[data_index][1])
# out: (your results may vary)
# ['This', 'film', 'moved', 'me', 'beyond', 'comprehension', ...

17500 7500 25000
neg
I missed the beginning but I did see most of it. A friend got it on DVD in the cheap room at FYE.<br /><br />The skits are all very short, and yet most of them are still too long. The majority of them, they seem to have forgotten to have something funny! Quite a lot of racist/sexist/"homophobic" humor in it, skits based on stereotypes, or skits which use racist terms for people.<br /><br />I'm trying to remember anything I thought was funny in it, and I'm having trouble.... The logo for the Tunnel Vision network is a lipsticked mouth with an eyeball in it. The mouth opens and closes over the eye like eyelids. Kind of creepy.<br /><br />What a disappointment. Most of the actors went on to better things, and it's lucky this bomb didn't hold them back.


Use pre-trained embedding

In [6]:
from torchtext.vocab import GloVe
embed_len = 200
global_vectors = GloVe(name='6B', dim=embed_len)

In [7]:
embeddings = global_vectors.get_vecs_by_tokens(tokenizer("Hello, How are you?"), lower_case_backup=True)

embeddings

tensor([[ 0.2661,  0.2182, -0.1100,  ..., -0.1198, -0.1916, -0.1352],
        [ 0.1765,  0.2921, -0.0021,  ..., -0.2077, -0.2319, -0.1081],
        [ 0.1815,  0.2663,  0.0550,  ...,  0.5375,  0.3151,  0.0162],
        [ 0.0367,  0.1989, -0.0930,  ..., -0.0133, -0.0039,  0.7128],
        [ 0.8540,  0.5715, -0.0237,  ...,  0.3108, -0.2230,  0.2037],
        [ 0.3911,  0.4019, -0.1505,  ..., -0.0348,  0.0798,  0.5031]])

In [8]:
global_vectors.get_vecs_by_tokens("<BOS>")

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.])

In [9]:
# this is using the frequencies in the data 

# https://coderzcolumn.com/tutorials/artificial-intelligence/how-to-use-glove-embeddings-with-pytorch#Load-Glove-'42B'-Embeddings

from collections import Counter
from torchtext.vocab import vocab
 
# tokenizer(next(train_iter)[1])

# create vocabulatory
counter = Counter()
for (label, line) in train_iter:
    counter.update(tokenizer(line))

# be careful, lower case vocab to call function   
vocab = vocab(counter, min_freq=10, specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'))
vocab.set_default_index(vocab['<unk>'])  # default index for oov words

 



In [10]:
len(vocab), vocab.lookup_token(21), vocab['was']

(23404, 'was', 21)

set up embedding for the vocabulatory

In [11]:
# create embedding for dat

# embed_len = 100

glove_embedding_tensor = torch.zeros(len(vocab),  embed_len).to(device)
 
glove_embedding_tensor[0]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:1')

In [12]:

for i in range(len(vocab)):
    try: 
        glove_embedding_tensor[i] = global_vectors.get_vecs_by_tokens(vocab.lookup_token(i))
    #except KeyError as e:
    #    glove_embedding_tensor[i] = torch.normal(0, 1, size=(embed_len,))
    except KeyError:
        glove_embedding_tensor[i] = torch.normal(0, 1, size=(embed_len,))
   

In [13]:
#text_transform = lambda x: [global_vectors.get_vecs_by_tokens('<BOS>')] + [global_vectors.get_vecs_by_tokens(token) for token in tokenizer(x)] + [global_vectors.get_vecs_by_tokens('<EOS>')]

#t1 = text_transform("how are you?")
 
glove_embedding_tensor.shape,glove_embedding_tensor[1232:1234,]

(torch.Size([23404, 200]),
 tensor([[ 4.7241e-01,  1.3480e-01, -3.0800e-01, -3.8037e-01, -5.0223e-02,
          -1.7769e-01, -6.5435e-01,  7.7492e-02,  5.4810e-01,  4.0445e-01,
          -2.0210e-01,  5.2530e-01,  2.4836e-01, -1.8132e-01,  5.5969e-02,
           6.9195e-02, -5.0195e-02,  4.9772e-01,  1.6974e-01, -2.0894e-01,
           1.6659e-01,  2.4535e+00, -1.7690e-01,  1.8831e-01,  2.0600e-01,
          -4.1079e-01,  2.3757e-01, -8.8489e-03, -1.1169e-01, -2.4455e-01,
          -3.5881e-01, -3.0782e-01,  7.9376e-03, -9.7143e-03, -2.4621e-01,
          -2.7075e-01, -6.3403e-01, -1.5270e-01,  2.4267e-03,  3.7299e-01,
          -2.5578e-01,  7.2947e-02,  1.4815e-01,  3.8708e-01, -2.5071e-01,
           1.2053e-01,  8.2654e-01,  2.5076e-01, -8.6929e-02,  2.6227e-01,
           1.0826e-01, -1.0061e-01, -8.2954e-02,  3.2549e-01,  3.7349e-01,
          -1.2629e-01,  2.0674e-01, -4.6491e-01, -5.4386e-02,  3.6859e-02,
          -1.0246e-01, -4.4074e-02, -1.8846e-01,  1.5918e-01,  1.1041e-01

Setup data loader

In [14]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

text_transform = lambda x: [vocab['<BOS>']] + [vocab[token] for token in tokenizer(x)] + [vocab['<EOS>']]
label_transform = lambda x: 1 if x == 'pos' else 0
    
def collate_batch(batch):
   label_list, text_list = [], []
   for (_label, _text) in batch: # each represents one text
        label_list.append(label_transform(_label))
        processed_text = torch.tensor(text_transform(_text))
        text_list.append(processed_text)
   return torch.tensor(label_list, dtype=torch.long).to(device), pad_sequence(text_list,  padding_value=1.0).to(device) # not using batch_first=True,
 

In [15]:

BATCH_SIZE = 64
train_dataloader = DataLoader(dataset=train_data, 
                              batch_size=BATCH_SIZE, 
                              shuffle=True , 
                              collate_fn=collate_batch)
 
                  # collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_data, 
                  batch_size=BATCH_SIZE,
                  shuffle=True, 
                  collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, 
                  batch_size=BATCH_SIZE,
                  shuffle=True, 
                  collate_fn=collate_batch)

In [16]:
trainlabel, trainfeature = next(iter(train_dataloader)) 
trainfeature,trainlabel, len(trainfeature), len(trainlabel), trainfeature.shape, # len is by row


# 1182 is the longest length of the batch 


(tensor([[    1,     1,     1,  ...,     1,     1,     1],
         [    4, 22276,     4,  ...,   290,  1732,  1404],
         [  409, 20475,  2989,  ...,    58,    41,  1405],
         ...,
         [    1,     1,     1,  ...,     1,     1,     1],
         [    1,     1,     1,  ...,     1,     1,     1],
         [    1,     1,     1,  ...,     1,     1,     1]], device='cuda:1'),
 tensor([0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0,
         0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
         0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1], device='cuda:1'),
 1042,
 64,
 torch.Size([1042, 64]))

## Model Design

In [17]:
 # https://www.geeksforgeeks.org/pimport  
import torch.nn.functional as F

class FastText(torch.nn.Module):
    def __init__(self, 
                 vocab_size, 
                 embedding_dim, 
                 output_dim, 
                 pad_idx):
        super().__init__()
        # change to pretrained embedding 
        self.embedding = torch.nn.Embedding(
            vocab_size, 
            embedding_dim, 
            padding_idx=pad_idx)

        self.fc = torch.nn.Linear(embedding_dim, 
                            output_dim)
        
    def forward(self, text):
                
        embedded = self.embedding(text) 
        # text is the numeric indices of words 
        # the indices are be based on the pretrained dictionary 

        embedded = embedded.permute(1, 0, 2)
        pooled = F.avg_pool2d(
            embedded, 
            (embedded.shape[1], 1)).squeeze(1) 
        return self.fc(pooled)

In [18]:
model = FastText(
            vocab_size = len(vocab), 
            embedding_dim = embed_len, 
            output_dim = 1, 
            pad_idx = None)

## Train & Validation

In [19]:
model.embedding = torch.nn.Embedding.from_pretrained(glove_embedding_tensor,freeze=True)

In [20]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = torch.nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [21]:
for epoch in range(5):
  epoch_loss = 0
  epoch_acc = 0
  
  model.train()
  for label, text in train_dataloader:
      #print(label.dtype)
      #print(text)
      optimizer.zero_grad()
      predictions = model(text).squeeze(1)
      #print(predictions.dtype)
      loss = criterion(predictions, label.float())
      
      rounded_preds = torch.round(
          torch.sigmoid(predictions))
      correct = \
        (rounded_preds == label).float()
      acc = correct.sum() / len(correct)
      
      loss.backward()
      optimizer.step()
      epoch_loss += loss.item()
      epoch_acc += acc.item()

  print("Epoch %d Train: Loss: %.4f Acc: %.4f" %
          (epoch,
          epoch_loss / len(train_dataloader), 
          epoch_acc / len(train_dataloader)))

  epoch_loss = 0
  epoch_acc = 0
  model.eval()
  with torch.no_grad():
    for label, text in valid_dataloader:
      predictions = model(text).squeeze(1)
      loss = criterion(predictions, label.float())
      
      rounded_preds = torch.round(
          torch.sigmoid(predictions))
      correct = \
        (rounded_preds == label).float()
      acc = correct.sum() / len(correct)
      
      epoch_loss += loss.item()
      epoch_acc += acc.item()

  print("Epoch %d Valid: Loss: %.4f Acc: %.4f" %
          (epoch,
          epoch_loss / len(valid_dataloader), 
          epoch_acc / len(valid_dataloader)))
  
# out: (your results may vary)
# Epoch 0 Train: Loss: 0.6523 Acc: 0.7165
# Epoch 0 Valid: Loss: 0.5259 Acc: 0.7474
# Epoch 1 Train: Loss: 0.5935 Acc: 0.7765
# Epoch 1 Valid: Loss: 0.4571 Acc: 0.7933
# Epoch 2 Train: Loss: 0.5230 Acc: 0.8257
# Epoch 2 Valid: Loss: 0.4103 Acc: 0.8245
# Epoch 3 Train: Loss: 0.4559 Acc: 0.8598
# Epoch 3 Valid: Loss: 0.3828 Acc: 0.8549
# Epoch 4 Train: Loss: 0.4004 Acc: 0.8813
# Epoch 4 Valid: Loss: 0.3781 Acc: 0.8675

Epoch 0 Train: Loss: 0.6894 Acc: 0.5889
Epoch 0 Valid: Loss: 0.6850 Acc: 0.6302
Epoch 1 Train: Loss: 0.6810 Acc: 0.6505
Epoch 1 Valid: Loss: 0.6779 Acc: 0.6241
Epoch 2 Train: Loss: 0.6734 Acc: 0.6620
Epoch 2 Valid: Loss: 0.6708 Acc: 0.6701
Epoch 3 Train: Loss: 0.6662 Acc: 0.6777
Epoch 3 Valid: Loss: 0.6639 Acc: 0.6755
Epoch 4 Train: Loss: 0.6605 Acc: 0.6880
Epoch 4 Valid: Loss: 0.6587 Acc: 0.6758


# Testing & Deployment

In [22]:
test_loss = 0
test_acc = 0
model.eval() # <1>
with torch.no_grad(): # <1>
  for label, text in test_dataloader:
    predictions = model(text).squeeze(1)
    loss = criterion(predictions, label.float())
    
    rounded_preds = torch.round(
        torch.sigmoid(predictions))
    correct = \
      (rounded_preds == label).float()
    acc = correct.sum() / len(correct)

    test_loss += loss.item()
    test_acc += acc.item()

print("Test: Loss: %.4f Acc: %.4f" %
        (test_loss / len(test_dataloader), 
        test_acc / len(test_dataloader)))
# out: (your results will vary)
#   Test: Loss: 0.3821 Acc: 0.8599

Test: Loss: 0.6590 Acc: 0.6740


In [23]:
import spacy
nlp = spacy.load('en_core_web_sm')


text_pipeline = lambda x: [vocab[token] 
      for token in tokenizer(x)]


def predict_sentiment(model, sentence):
    model.eval()
    text = torch.tensor(text_pipeline(sentence)).unsqueeze(1).to(device)
    prediction = torch.sigmoid(model(text))
    return prediction.item()

sentiment = predict_sentiment(model, 
                  "Don't waste your time")
print(sentiment)
# out: 4.763594888613835e-34

sentiment = predict_sentiment(model, 
                  "You gotta see this movie!")
print(sentiment)
# out: 0.941755473613739

0.007512994576245546
0.03589097410440445


In [24]:
torch.save(model.state_dict(), 'fasttext-model.pt')

In [1]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()