In [1]:
import pandas as pd
import numpy as np
import torch
import re
import tqdm


from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import Iterator, BucketIterator

from nltk import tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [3]:
dataset=pd.read_csv('data/final_data.csv')


In [22]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,id,count,hate_speech,offensive_language,neither,class,tweet
0,0,0,3,0,0,1,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,1,3,0,1,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,2,3,0,1,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,3,0,1,0,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,4,6,0,1,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [69]:
a=dataset[dataset['neither']==1].tweet.values

In [70]:
a

array(["!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...",
       '" momma said no pussy cats inside my doghouse "',
       '"@Addicted2Guys: -SimplyAddictedToGuys http://t.co/1jL4hi8ZMF" woof woof hot scally lad',
       ...,
       'you know what they say, the early bird gets the worm. *puts gummy worms in your morning coffee*',
       "you've gone and broke the wrong heart baby, and drove me redneck crazy",
       '~~Ruffled | Ntac Eileen Dahlia - Beautiful color combination of pink, orange, yellow &amp; white. A Coll http://t.co/H0dYEBvnZB'],
      dtype=object)

In [6]:

def clean_tweet(tweet):
    tweet = re.sub("#", "",tweet) # Removing '#' from hashtags
    tweet = re.sub("RT", "",tweet) # Removing 'RT' from hashtags
    tweet = re.sub("[^a-zA-Z#]", " ",tweet) # Removing punctuation and special characters
    tweet = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',"<URL>", tweet)
    tweet = re.sub('http','',tweet)
    tweet = re.sub(" +", " ", tweet)
    tweet = tweet.lower()
    tweet = word_tokenize(tweet)
    return tweet
    return_tweet=[]
    for word in tweet:
        if word not in stop_words:
            return_tweet.append(word)
    return return_tweet

In [7]:
TEXT = Field(sequential=True, tokenize=clean_tweet, lower=True,batch_first=True)
LABEL = Field(sequential=False, use_vocab=False)


In [8]:
dataset_datafields = [("",None),("id",None),("count", None),
                      ("hate_speech", LABEL),("offensive_language", LABEL),
                      ("neither", LABEL),("label", None),("tweet",TEXT)
                    ]

data = TabularDataset(
        path="data/final_data.csv", # the root directory where the data lies
        format='csv',
        skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
        fields=dataset_datafields)
train,test,valid=data.split([0.8,0.1,0.1])

In [9]:
TEXT.build_vocab(train)


In [10]:
TEXT.vocab.freqs.most_common(10)


[('i', 7882),
 ('a', 7838),
 ('bitch', 6700),
 ('the', 5816),
 ('t', 5584),
 ('you', 5495),
 ('to', 4299),
 ('that', 3263),
 ('and', 3237),
 ('my', 2876)]

In [11]:
train_iter, val_iter = BucketIterator.splits(
        (train, valid), # we pass in the datasets we want the iterator to draw data from
        batch_sizes=(64, 64),
        device=device, # if you want to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.tweet), # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=False,
        repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)
test_iter = Iterator(test, batch_size=64, device=device, sort=False, sort_within_batch=False, repeat=False)

In [12]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x and y

    
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            if self.y_vars is not None: # we will concatenate y into a single tensor
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
            else:
                y = torch.zeros((1))

            yield (x, y)
    
    def __len__(self):
        return len(self.dl)

In [13]:

train_dl = BatchWrapper(train_iter, "tweet", ["hate_speech","offensive_language","neither"])
test_dl = BatchWrapper(test_iter, "tweet", ["hate_speech","offensive_language","neither"])
val_dl = BatchWrapper(val_iter, "tweet", ["hate_speech","offensive_language","neither"])


# MODEL

In [14]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F


In [55]:

class SelfAttention(nn.Module):
    def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):
        super(SelfAttention, self).__init__()

        """
        Arguments
        ---------
        batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator
        output_size : 2 = (pos, neg)
        hidden_sie : Size of the hidden_state of the LSTM
        vocab_size : Size of the vocabulary containing unique words
        embedding_length : Embeddding dimension of GloVe word embeddings
        weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 
        
        --------

        """

        self.batch_size = batch_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length
        self.weights = weights

        self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
        self.word_embeddings.weights = nn.Parameter(weights, requires_grad=False)
        self.dropout = 0.8
        self.bilstm = nn.LSTM(embedding_length, hidden_size, dropout=self.dropout, bidirectional=True)
        # We will use da = 350, r = 30 & penalization_coeff = 1 as per given in the self-attention original ICLR paper
        self.W_s1 = nn.Linear(2*hidden_size, 350)
        self.W_s2 = nn.Linear(350, 1)
        self.fc_layer = nn.Linear(1*2*hidden_size, 2000)
        self.label = nn.Linear(2000, output_size)

    def attention_net(self, lstm_output):

        """
        Now we will use self attention mechanism to produce a matrix embedding of the input sentence in which every row represents an
        encoding of the input sentence but giving an attention to a specific part of the sentence. We will use 30 such embedding of 
        the input sentence and then finally we will concatenate all the 30 sentence embedding vectors and connect it to a fully 
        connected layer of size 2000 which will be connected to the output layer of size 2 returning logits for our two classes i.e., 
        pos & neg.
        Arguments
        ---------
        lstm_output = A tensor containing hidden states corresponding to each time step of the LSTM network.
        ---------
        Returns : Final Attention weight matrix for all the 30 different sentence embedding in which each of 30 embeddings give
                  attention to different parts of the input sentence.
        Tensor size : lstm_output.size() = (batch_size, num_seq, 2*hidden_size)
                      attn_weight_matrix.size() = (batch_size, 30, num_seq)
        """
        attn_weight_matrix = self.W_s2(torch.tanh(self.W_s1(lstm_output)))
        attn_weight_matrix = attn_weight_matrix.permute(0, 2, 1)
        attn_weight_matrix = torch.softmax(attn_weight_matrix, dim=2)

        return attn_weight_matrix

    def forward(self, input_sentences, batch_size=None):

        """ 
        Parameters
        ----------
        input_sentence: input_sentence of shape = (batch_size, num_sequences)
        batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1)
        
        Returns
        -------
        Output of the linear layer containing logits for pos & neg class.
        
        """

        input = self.word_embeddings(input_sentences)
        input = input.permute(1, 0, 2)
        if batch_size is None:
            h_0 = Variable(torch.zeros(2, self.batch_size, self.hidden_size).cuda())
            c_0 = Variable(torch.zeros(2, self.batch_size, self.hidden_size).cuda())
        else:
            h_0 = Variable(torch.zeros(2, batch_size, self.hidden_size).cuda())
            c_0 = Variable(torch.zeros(2, batch_size, self.hidden_size).cuda())

        output, (h_n, c_n) = self.bilstm(input, (h_0, c_0))
        output = output.permute(1, 0, 2)

        attn_weight_matrix = self.attention_net(output)
        hidden_matrix = torch.bmm(attn_weight_matrix, output)

        # Let's now concatenate the hidden_matrix and connect it to the fully connected layer.
        fc_out = self.fc_layer(hidden_matrix.view(-1, hidden_matrix.size()[1]*hidden_matrix.size()[2]))
        logits = self.label(fc_out)
        # logits.size() = (batch_size, output_size)

        return logits,attn_weight_matrix

In [56]:
batch_size=64
num_classes=3
h_emb=500
emb_size=100
vocab_Size=len(TEXT.vocab)
vectors=TEXT.vocab.vectors
model = SelfAttention(batch_size, num_classes, h_emb, vocab_Size, emb_size, TEXT.vocab.vectors)
model.cuda()


  "num_layers={}".format(dropout, num_layers))


SelfAttention(
  (word_embeddings): Embedding(30244, 100)
  (bilstm): LSTM(100, 500, dropout=0.8, bidirectional=True)
  (W_s1): Linear(in_features=1000, out_features=350, bias=True)
  (W_s2): Linear(in_features=350, out_features=1, bias=True)
  (fc_layer): Linear(in_features=1000, out_features=2000, bias=True)
  (label): Linear(in_features=2000, out_features=3, bias=True)
)

In [57]:
def clip_gradient(model, clip_value):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)


In [59]:
opt = torch.optim.Adam(model.parameters(), lr=1e-2)
loss_func = nn.BCEWithLogitsLoss()
epochs=5

In [60]:
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() # turn on training mode
    for x,y in tqdm.tqdm(train_dl):
        text = x
        target = y

        opt.zero_grad()
        prediction,attentin_matrix = model(text,text.size()[0])
        loss = loss_func(prediction, target)
        loss.backward()
        clip_gradient(model, 1e-1)
        opt.step()
        running_loss += loss.data.item() * x.size(0)
        
    epoch_loss = running_loss / len(train)
    
    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for x,y in val_dl:
        preds,attentin_matrix = model(x,x.size()[0])
        loss = loss_func(preds, y)
        val_loss += loss.data.item() * x.size(0)

    val_loss /= len(valid)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

100%|██████████| 310/310 [00:03<00:00, 98.99it/s]
  3%|▎         | 8/310 [00:00<00:03, 76.38it/s]

Epoch: 1, Training Loss: 0.4417, Validation Loss: 0.2074


100%|██████████| 310/310 [00:03<00:00, 97.72it/s]
  3%|▎         | 8/310 [00:00<00:03, 76.42it/s]

Epoch: 2, Training Loss: 0.1640, Validation Loss: 0.2148


100%|██████████| 310/310 [00:03<00:00, 98.02it/s]
  3%|▎         | 8/310 [00:00<00:04, 74.98it/s]

Epoch: 3, Training Loss: 0.1075, Validation Loss: 0.3028


100%|██████████| 310/310 [00:03<00:00, 97.41it/s]
  3%|▎         | 8/310 [00:00<00:03, 75.82it/s]

Epoch: 4, Training Loss: 0.0640, Validation Loss: 0.3932


100%|██████████| 310/310 [00:03<00:00, 97.51it/s]


Epoch: 5, Training Loss: 0.0472, Validation Loss: 0.5649


In [61]:
accuracy=0
count=0
for x,y in test_dl:
    preds,attentin_matrix = model(x,x.size()[0])
    preds = F.softmax(preds,1)
    for idx,p in enumerate(preds):
        curr_pred=np.round(p.detach().tolist())
        curr_gt=np.round(y[idx].tolist())
        if np.array_equal(curr_pred,curr_gt):
            accuracy+=1
        count+=1


In [62]:
print(accuracy/count)

0.8329971762807584


In [52]:
len(test[0].tweet)

24

In [96]:
''' Let us now predict the sentiment on a single sentence just for the testing purpose. '''
test_sen1 = "man is indian"
test_sen1 = TEXT.preprocess(test_sen1)
sentence=test_sen1
test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]]


test_sen = np.asarray(test_sen1)
test_sen = torch.LongTensor(test_sen)
test_tensor = Variable(test_sen, volatile=True)
test_tensor = test_tensor.cuda()
model.eval()

output,attention_matrix = model(test_tensor, 1)
out = torch.softmax(output, 1)
print(out)
print(torch.argmax(out))

tensor([[3.3067e-04, 9.9960e-01, 7.4215e-05]], device='cuda:0',
       grad_fn=<SoftmaxBackward>)
tensor(1, device='cuda:0')


  # Remove the CWD from sys.path while we load stuff.


In [97]:
a=zip(sentence,attention_matrix[0][0])
for i in a:
    print(i)

('man', tensor(0.4518, device='cuda:0', grad_fn=<SelectBackward>))
('is', tensor(0.5477, device='cuda:0', grad_fn=<SelectBackward>))
('indian', tensor(0.0006, device='cuda:0', grad_fn=<SelectBackward>))
