In [1]:
#deal with tensors
import torch
import spacy
from torchtext.legacy import data

In [2]:
#Reproducing same results
SEED = 2019

#Torch
torch.manual_seed(SEED)

#Cuda algorithms
torch.backends.cudnn.deterministic = True


In [3]:
spacy.load("de_dep_news_trf")

<spacy.lang.de.German at 0x7f511ebb5640>

In [4]:
TEXT = data.Field(tokenize='spacy',batch_first=True,include_lengths=True)
LABEL = data.LabelField(dtype = torch.float)
fields = [(None, None),(None, None),(None, None),(None, None),(None, None),(None, None),
          ('label', LABEL),
          (None, None),(None, None),(None, None),(None, None),(None, None),(None, None),
          ('text',TEXT)]



In [12]:
# loading custom dataset
training_data=data.TabularDataset(path = 'protocols/labelled/only_lockdown.csv',format = 'csv',fields = fields,skip_header = True,
                                  csv_reader_params={'delimiter': '\t', 'quotechar': None})

In [20]:
positive = [e for e in training_data.examples if getattr(e, 'label') == '+']
negative = [e for e in training_data.examples if getattr(e, 'label') == '-']
neutral = [e for e in training_data.examples if getattr(e, 'label') == 'o']

In [22]:
import random
random.shuffle(positive)
random.shuffle(negative)
random.shuffle(neutral)
train_data = data.Dataset(positive[:] + negative[:90] + neutral[:90], fields)
valid_data = data.Dataset(negative[90:] + neutral[90:], fields)

# train_data, valid_data = training_data.split(split_ratio=0.7, random_state = random.seed(SEED))

In [23]:
#initialize glove embeddings
TEXT.build_vocab(training_data,min_freq=0,vectors = "glove.6B.100d")
LABEL.build_vocab(training_data)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))

#Word dictionary
print(TEXT.vocab.stoi)

Size of TEXT vocabulary: 3563
Size of LABEL vocabulary: 3
[(',', 1345), ('die', 549), ('.', 508), ('und', 454), ('der', 415), ('Lockdown', 380), ('\xa0', 310), ('in', 232), ('–', 211), ('den', 199)]
defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7f5113a17eb0>>, {'<unk>': 0, '<pad>': 1, ',': 2, 'die': 3, '.': 4, 'und': 5, 'der': 6, 'Lockdown': 7, '\xa0': 8, 'in': 9, '–': 10, 'den': 11, 'ist': 12, 'dass': 13, 'm': 14, 'das': 15, 'i': 16, 'auch': 17, 'für': 18, 'des': 19, 'es': 20, ':': 21, 'haben': 22, 'nicht': 23, 'hat': 24, 'wir': 25, 'zu': 26, 'von': 27, 'Lockdowns': 28, 'sind': 29, 'dem': 30, 'mit': 31, '-': 32, 'eine': 33, 'zweiten': 34, 'Sie': 35, 'einen': 36, 'ein': 37, 'noch': 38, 'jetzt': 39, 'wird': 40, 'sich': 41, 'aber': 42, 'wie': 43, 'ich': 44, 'werden': 45, 'vor': 46, 'sie': 47, 'um': 48, '„': 49, 'auf': 50, 'an': 51, 'Der': 52, 'als': 53, '“': 54, 'diese': 55, 'so': 56, 'Die': 57, 'durch': 58, 'wieder': 59, 'am': 60, 'schon': 61, 

In [7]:
print(len(training_data))

100000


In [24]:
#check whether cuda is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#set batch size
BATCH_SIZE = 16

#Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

In [25]:
i = 0
for x in valid_iterator:
    # for word in x.text:
    #     if 'counterfeit' in word:
    #         print(x.text)
    if i % 2000 == 0:
        print(i)
    i += 1
    if i > 3:
        break

0


In [26]:
import torch.nn as nn

class classifier(nn.Module):

    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout):

        #Constructor
        super().__init__()

        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        #lstm layer
        self.lstm = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout,
                           batch_first=True)

        #dense layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

        #activation function
        self.act = nn.Sigmoid()

    def forward(self, text, text_lengths):

        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]

        #packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)

        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]

        #concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)

        #hidden = [batch size, hid dim * num directions]
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs=self.act(dense_outputs)

        return outputs

In [27]:
#define hyperparameters
size_of_vocab = len(TEXT.vocab)
embedding_dim = 100
num_hidden_nodes = 32
num_output_nodes = 1
num_layers = 2
bidirection = True
dropout = 0.2

#instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers,
                   bidirectional = True, dropout = dropout)

In [28]:
#architecture
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

#Initialize the pretrained embedding
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

classifier(
  (embedding): Embedding(3563, 100)
  (lstm): LSTM(100, 32, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (act): Sigmoid()
)
The model has 415,757 trainable parameters
torch.Size([3563, 100])


In [29]:
import torch.optim as optim

#define optimizer and loss
optimizer = optim.Adam(model.parameters())
criterion = nn.BCELoss()

#define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    rounded_preds = torch.round(preds)

    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

#push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [30]:
def train(model, iterator, optimizer, criterion):

    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #set the model in training phase
    model.train()

    for batch in iterator:

        #resets the gradients after every batch
        optimizer.zero_grad()

        #retrieve text and no. of words
        text, text_lengths = batch.text

        #convert to 1D tensor
        predictions = model(text, text_lengths).squeeze()

        #compute the loss
        loss = criterion(predictions, batch.label)

        #compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)

        #backpropage the loss and compute the gradients
        loss.backward()

        #update the weights
        optimizer.step()

        #loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [31]:
def evaluate(model, iterator, criterion):

    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()

    #deactivates autograd
    with torch.no_grad():

        for batch in iterator:

            #retrieve text and no. of words
            text, text_lengths = batch.text

            #convert to 1d tensor
            predictions = model(text, text_lengths).squeeze()

            #compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [37]:
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    #train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

    #evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

	Train Loss: -33.560 | Train Acc: 66.44%
	 Val. Loss: 35.438 |  Val. Acc: 36.86%
	Train Loss: -33.560 | Train Acc: 66.44%
	 Val. Loss: 35.437 |  Val. Acc: 36.86%
	Train Loss: -33.560 | Train Acc: 66.44%
	 Val. Loss: 35.445 |  Val. Acc: 36.86%
	Train Loss: -33.560 | Train Acc: 66.44%
	 Val. Loss: 35.441 |  Val. Acc: 36.86%
	Train Loss: -33.560 | Train Acc: 66.44%
	 Val. Loss: 35.450 |  Val. Acc: 36.86%
	Train Loss: -33.560 | Train Acc: 66.44%
	 Val. Loss: 35.825 |  Val. Acc: 36.86%
	Train Loss: -33.560 | Train Acc: 66.44%
	 Val. Loss: 36.202 |  Val. Acc: 36.86%
	Train Loss: -33.560 | Train Acc: 66.44%
	 Val. Loss: 35.833 |  Val. Acc: 36.86%
	Train Loss: -33.560 | Train Acc: 66.44%
	 Val. Loss: 36.209 |  Val. Acc: 36.86%
	Train Loss: -33.560 | Train Acc: 66.44%
	 Val. Loss: 36.204 |  Val. Acc: 36.86%
	Train Loss: -33.560 | Train Acc: 66.44%
	 Val. Loss: 36.593 |  Val. Acc: 36.86%
	Train Loss: -33.560 | Train Acc: 66.44%
	 Val. Loss: 36.596 |  Val. Acc: 36.86%
	Train Loss: -33.560 | Train

In [25]:
torch.save(model.state_dict(), 'saved_weights/deep_classifier_03.pt')

In [21]:

#load weights
path='saved_weights/deep_classifier_03.pt'
model.load_state_dict(torch.load(path));
model.eval();

#inference
import spacy
nlp = spacy.load('en_core_web_sm')

def predict(model, sentence):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  #tokenize the sentence
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]          #convert to integer sequence
    length = [len(indexed)]                                    #compute no. of words
    tensor = torch.LongTensor(indexed).to(device)              #convert to tensor
    tensor = tensor.unsqueeze(1).T                             #reshape in form of batch,no. of words
    length_tensor = torch.LongTensor(length)                   #convert to tensor
    prediction = model(tensor, length_tensor)                  #prediction
    return prediction.item()

In [21]:
#make predictions
print(predict(model, "Is there discrimination against black men in the US?"))

#insincere question
print(predict(model, "Why do black men think they should be treated well?"))

0.803727924823761
0.9885077476501465
