In [41]:
import pandas as pd
import numpy as np
import torch
import re
import tqdm


from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import Iterator, BucketIterator

from nltk import tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

In [42]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [43]:
dataset=pd.read_csv('data/dataset.csv')

In [44]:
dataset

Unnamed: 0,id,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
5,5,3,1,2,0,1,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just..."
6,6,3,0,3,0,1,"!!!!!!""@__BrighterDays: I can not just sit up ..."
7,7,3,0,3,0,1,!!!!&#8220;@selfiequeenbri: cause I'm tired of...
8,8,3,0,3,0,1,""" &amp; you might not get ya bitch back &amp; ..."
9,9,3,1,2,0,1,""" @rhythmixx_ :hobbies include: fighting Maria..."


In [46]:
stop_words= set(stopwords.words('english'))

In [47]:

def clean_tweet(tweet):
    tweet = re.sub("#", "",tweet) # Removing '#' from hashtags
    tweet = re.sub("[^a-zA-Z#]", " ",tweet) # Removing punctuation and special characters
    tweet = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',"<URL>", tweet)
    tweet = re.sub('http','',tweet)
    tweet = re.sub(" +", " ", tweet)
    tweet = tweet.lower()
    tweet = word_tokenize(tweet)
    return tweet
    return_tweet=[]
    for word in tweet:
        if word not in stop_words:
            return_tweet.append(word)
    return return_tweet

In [48]:
TEXT = Field(sequential=True, tokenize=clean_tweet, lower=True)
LABEL = Field(sequential=False, use_vocab=False)


In [49]:
dataset_datafields = [("",None),("id",None),("count", None),
                      ("hate_speech", LABEL),("offensive_language", LABEL),
                      ("neither", LABEL),("label", None),("tweet",TEXT)
                    ]

data = TabularDataset(
        path="data/final_data.csv", # the root directory where the data lies
        format='csv',
        skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
        fields=dataset_datafields)
train,test,valid=data.split([0.8,0.1,0.1])

In [50]:
train[5].tweet

['rt',
 'papi',
 'chulo',
 'the',
 'fact',
 'that',
 'all',
 'y',
 'all',
 'hoes',
 'gettin',
 'pregnant',
 'wit',
 'no',
 'ring',
 'on',
 'ya',
 'finger']

In [21]:
TEXT.build_vocab(train)


In [24]:
TEXT.vocab.freqs.most_common(10)


[('i', 7808),
 ('a', 7750),
 ('bitch', 6691),
 ('rt', 6139),
 ('the', 5821),
 ('t', 5655),
 ('you', 5431),
 ('to', 4251),
 ('that', 3255),
 ('and', 3154)]

In [25]:

train_iter, val_iter = BucketIterator.splits(
        (train, valid), # we pass in the datasets we want the iterator to draw data from
        batch_sizes=(64, 64),
        device=device, # if you want to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.tweet), # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=False,
        repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)
test_iter = Iterator(test, batch_size=64, device=device, sort=False, sort_within_batch=False, repeat=False)

In [26]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x and y

    
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            if self.y_vars is not None: # we will concatenate y into a single tensor
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
            else:
                y = torch.zeros((1))

            yield (x, y)
    
    def __len__(self):
        return len(self.dl)

In [27]:

train_dl = BatchWrapper(train_iter, "tweet", ["hate_speech","offensive_language","neither"])
test_dl = BatchWrapper(test_iter, "tweet", ["hate_speech","offensive_language","neither"])
val_dl = BatchWrapper(val_iter, "tweet", ["hate_speech","offensive_language","neither"])


# MODEL

In [31]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [32]:
class SimpleBiLSTMBaseline(nn.Module):
    def __init__(self, hidden_dim, emb_dim=300,
                 spatial_dropout=0.05, recurrent_dropout=0.1, num_linear=1):
        super().__init__() # don't forget to call this!
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1, dropout=recurrent_dropout)
        self.linear_layers = []
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
        self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_dim, 3)
    
    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]
        for layer in self.linear_layers:
            feature = layer(feature)
        preds = self.predictor(feature)
        return preds

In [33]:
em_sz = 100
nh = 500
nl = 3
model = SimpleBiLSTMBaseline(nh, emb_dim=em_sz)
model = model.cuda()
model

  "num_layers={}".format(dropout, num_layers))


SimpleBiLSTMBaseline(
  (embedding): Embedding(30609, 100)
  (encoder): LSTM(100, 500, dropout=0.1)
  (linear_layers): ModuleList()
  (predictor): Linear(in_features=500, out_features=3, bias=True)
)

In [35]:
opt = optim.Adam(model.parameters(), lr=1e-2)
loss_func = nn.BCEWithLogitsLoss()
# loss_func = nn.CrossEntropyLoss()
epochs=10

In [38]:
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() # turn on training mode
    for x,y in tqdm.tqdm(train_dl): # thanks to our wrapper, we can intuitively iterate over our data!
        opt.zero_grad()
        
        preds = model(x)
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()
        
        running_loss += loss.data.item() * x.size(0)
        
    epoch_loss = running_loss / len(train)
    
    # calculate the validation loss for this epoch
    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for x,y in val_dl:
        preds = model(x)
        loss = loss_func(preds, y)
        val_loss += loss.data.item() * x.size(0)

    val_loss /= len(valid)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

100%|██████████| 310/310 [00:01<00:00, 155.31it/s]
  4%|▍         | 12/310 [00:00<00:02, 118.79it/s]

Epoch: 1, Training Loss: 0.1899, Validation Loss: 0.1011


100%|██████████| 310/310 [00:01<00:00, 155.97it/s]
  4%|▍         | 12/310 [00:00<00:02, 118.82it/s]

Epoch: 2, Training Loss: 0.1881, Validation Loss: 0.1009


100%|██████████| 310/310 [00:01<00:00, 158.52it/s]
  4%|▍         | 12/310 [00:00<00:02, 119.21it/s]

Epoch: 3, Training Loss: 0.1884, Validation Loss: 0.1118


100%|██████████| 310/310 [00:01<00:00, 157.53it/s]
  4%|▍         | 13/310 [00:00<00:02, 123.30it/s]

Epoch: 4, Training Loss: 0.1885, Validation Loss: 0.1025


100%|██████████| 310/310 [00:01<00:00, 157.57it/s]
  4%|▍         | 12/310 [00:00<00:02, 117.15it/s]

Epoch: 5, Training Loss: 0.1348, Validation Loss: 0.0706


100%|██████████| 310/310 [00:01<00:00, 155.58it/s]
  4%|▍         | 12/310 [00:00<00:02, 117.73it/s]

Epoch: 6, Training Loss: 0.1051, Validation Loss: 0.0692


100%|██████████| 310/310 [00:01<00:00, 160.78it/s]
  9%|▉         | 28/310 [00:00<00:02, 129.14it/s]

Epoch: 7, Training Loss: 0.0899, Validation Loss: 0.0715


100%|██████████| 310/310 [00:01<00:00, 159.17it/s]
  4%|▍         | 12/310 [00:00<00:02, 119.46it/s]

Epoch: 8, Training Loss: 0.0780, Validation Loss: 0.0694


100%|██████████| 310/310 [00:01<00:00, 157.87it/s]
  4%|▍         | 13/310 [00:00<00:02, 123.29it/s]

Epoch: 9, Training Loss: 0.0680, Validation Loss: 0.0685


100%|██████████| 310/310 [00:01<00:00, 158.47it/s]


Epoch: 10, Training Loss: 0.0599, Validation Loss: 0.0678


In [63]:
accuracy=0
count=0
for x,y in test_dl:
    
    preds = model(x)
    preds = F.softmax(preds,1)
    for idx,p in enumerate(preds):
        curr_pred=np.round(p.detach().tolist())
        curr_gt=np.round(y[idx].tolist())
        if np.array_equal(curr_pred,curr_gt):
            accuracy+=1
        count+=1


In [64]:
print(accuracy/count)

0.8745461879790238
