In [1]:
# Regular Library
import numpy as np
import pandas as pd
import json
# NLP library
from nltk.tokenize import word_tokenize
# DL library
import torch
from torch import nn,optim
from torch.utils.data import TensorDataset,DataLoader
import torch.nn.functional as F 

device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [2]:
df = pd.read_csv("assets/data/data_analyzed_df.csv")
del df['Unnamed: 0']
df.head(5)

Unnamed: 0,id,label,tweet,tidy_tweet,hashtag,word_count,char_count,avg_word,stopwords,hashtags
0,1,0.0,@user when a father is dysfunctional and is s...,dysfunctional selfish drags kids dysfunction #run,run,21,102,4.555556,10,1
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks #lyft credit cause offer wheelchair van...,lyft disapointed getthanked,22,122,5.315789,5,3
2,3,0.0,bihday your majesty,majesty,,5,21,5.666667,1,0
3,4,0.0,#model i love u take with u all the time in ...,#model,model,17,86,4.928571,5,1
4,5,0.0,factsguide: society now #motivation,factsguide society #motivation,motivation,8,39,8.0,1,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49159 entries, 0 to 49158
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          49159 non-null  int64  
 1   label       31962 non-null  float64
 2   tweet       49159 non-null  object 
 3   tidy_tweet  48810 non-null  object 
 4   hashtag     35894 non-null  object 
 5   word_count  49159 non-null  int64  
 6   char_count  49159 non-null  int64  
 7   avg_word    49159 non-null  float64
 8   stopwords   49159 non-null  int64  
 9   hashtags    49159 non-null  int64  
dtypes: float64(2), int64(5), object(3)
memory usage: 3.8+ MB


In [4]:
df=df[['label','tidy_tweet']]
df.head(2)

Unnamed: 0,label,tidy_tweet
0,0.0,dysfunctional selfish drags kids dysfunction #run
1,0.0,thanks #lyft credit cause offer wheelchair van...


In [5]:
df=df.dropna()

In [6]:
df.isna().sum()

label         0
tidy_tweet    0
dtype: int64

In [7]:
df.label=df.label.astype('int')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31751 entries, 0 to 31961
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   label       31751 non-null  int32 
 1   tidy_tweet  31751 non-null  object
dtypes: int32(1), object(1)
memory usage: 620.1+ KB


In [9]:
df['tidy_tweet'] = df['tidy_tweet'].str.replace("[^a-zA-Z]"," ")
df.head(4)

  df['tidy_tweet'] = df['tidy_tweet'].str.replace("[^a-zA-Z]"," ")


Unnamed: 0,label,tidy_tweet
0,0,dysfunctional selfish drags kids dysfunction run
1,0,thanks lyft credit cause offer wheelchair van...
2,0,majesty
3,0,model


In [10]:
def split_words_reviews(dataframe):
    clean_text=list(dataframe['tidy_tweet'].values)
    tokenized=[word_tokenize(x) for x in clean_text]
    all_text=[]
    for tokens in tokenized:
        for token in tokens:
            all_text.append(token)
    return tokenized,set(all_text)
tidy_tweet,vocabs=split_words_reviews(df)

In [11]:
tidy_tweet[12]

['daddy', 'gettingfed']

In [12]:
for u in vocabs:
    print(u)

fuhered
cadiz
ulverstontri
graceful
prior
filmmaker
slimmingworld
readingterminalmarket
juneinstaspank
hormones
footfalls
claimed
outofcontrol
liye
twofaced
packyourshitobama
investigators
paulryan
snowden
villagewellness
hayley
lorealpro
penang
beverlyhills
goodnightahur
mayma
course
childhoodwisdom
bestanden
weiter
delicias
hoti
coat
sunnyday
harianja
grownassman
pple
veganfood
renewal
lingoliviawu
plaintain
ruthless
pleasestoptheviolence
gave
lethal
iomtt
lyf
tropical
timetodecide
allowing
percent
arubaweddings
cannon
probab
thelastship
discogs
businesses
database
altwaystoheal
practices
debutnovel
inbox
ethnocentrism
herewecome
individual
japonesas
legendofzelda
prayerchangesthings
loseweightnow
englandrussia
booski
suffer
bruh
imcheap
firearms
goodson
videoshoot
diploma
tragedy
vocals
reefers
bihdayfemikuti
summerinnyc
starin
feelgoodmusic
musicvideo
graders
yetta
rims
brunettes
ownit
absorbed
theaby
instrumental
baseboll
enhances
sleepovers
whatayear
agitator
sdcc
macbook
amanikn

In [13]:
def create_dict(words):
    word_to_int_dict={w:i+1 for i,w in enumerate(words)}
    int_to_word_dict={i:w for w,i in word_to_int_dict.items()}
    return word_to_int_dict,int_to_word_dict

word_to_int_dict,int_to_word_dict=create_dict(vocabs)

In [14]:
with open('assets/data/word_to_int_dict.json', 'w') as fp:
    json.dump(word_to_int_dict, fp)

In [15]:
print(np.max([len(x) for x in tidy_tweet]))
print(np.mean([len(x) for x in tidy_tweet]))

21
6.308368240370382


In [16]:
def pad_text(tweets,seq_len):
    tweets_pad=[]
    for tweet in tweets:
        if len(tweet)>=seq_len:
            tweets_pad.append(tweet[:seq_len])
        else:
            tweets_pad.append(['pad']*(seq_len-len(tweet))+tweet)
    return np.array(tweets_pad)
tweets_pad=pad_text(tidy_tweet,seq_len=12)


In [17]:
int_to_word_dict[0]='pad'
word_to_int_dict['pad']=0

In [18]:
encoded_sentences = np.array([[word_to_int_dict[word] for word in tweet] for tweet in tweets_pad])


In [19]:
encoded_sentences[0]

array([    0,     0,     0,     0,     0,     0, 14961,   366, 30695,
       20944, 33106,  3208])

In [20]:
class HateDetectLSTM(nn.Module):
    def __init__(self,vocab_size,embd_dim,n_hidden,n_output,n_layers):
        super(HateDetectLSTM,self).__init__()

        self.vocab_size=vocab_size
        self.n_layers=n_layers
        self.n_hidden=n_hidden

        self.embedding=nn.Embedding(vocab_size,embd_dim)
        self.lstm=nn.LSTM(embd_dim,n_hidden,n_layers,batch_first=True,dropout=0.6)
        self.dropout=nn.Dropout(0.6)
        self.fc=nn.Linear(n_hidden,n_output)
        self.sigmoid=nn.Sigmoid()

    def forward(self,x):
        embd_words=self.embedding(x)
        out,hidden=self.lstm(embd_words)
        out=self.dropout(out)
        out=out.contiguous().view(-1,self.n_hidden)
        fc_out=self.fc(out)
        sigmoid=self.sigmoid(fc_out)
        sigmoid=sigmoid.view(x.size(0),-1)
        sigmoid_last=sigmoid[:,-1]
        return sigmoid_last,hidden
    
    def init_hidden(self,bs,device):
        device=device
        weights=next(self.parameters()).data
        hidden=(weights.new(self.n_layers,bs, self.n_hidden).zero_().to(device),\
                weights.new(self.n_layers,bs, self.n_hidden).zero_().to(device))
        return hidden

In [21]:
vocab_size=len(word_to_int_dict)
embd_dim=50
n_hidden=100
n_output=1
n_layers=2

model=HateDetectLSTM(vocab_size=vocab_size,embd_dim=embd_dim,n_hidden=n_hidden,n_output=n_output,n_layers=n_layers)
model

HateDetectLSTM(
  (embedding): Embedding(36197, 50)
  (lstm): LSTM(50, 100, num_layers=2, batch_first=True, dropout=0.6)
  (dropout): Dropout(p=0.6, inplace=False)
  (fc): Linear(in_features=100, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [22]:
labels=np.array([x for x in df.label.values])

In [23]:
train_size=int(len(encoded_sentences)*0.8)
valid_size=int(len(encoded_sentences)*0.9)

In [24]:
train_size

25400

In [25]:
valid_size

28575

In [26]:
X_train,y_train=torch.Tensor(encoded_sentences[:train_size]).long(),torch.Tensor(labels[:train_size]).long()
X_val,y_val=torch.Tensor(encoded_sentences[train_size:valid_size]).long(),torch.Tensor(labels[train_size:valid_size]).long()
X_test,y_test=torch.Tensor(encoded_sentences[valid_size:]).long(),torch.Tensor(labels[valid_size:]).long()

train_data=TensorDataset(X_train,y_train)
valid_data=TensorDataset(X_val,y_val)
test_data=TensorDataset(X_test,y_test)

batch_size = 1

train_dl = DataLoader(train_data, batch_size = batch_size, shuffle = True)
valid_dl = DataLoader(valid_data, batch_size = batch_size, shuffle = True)
test_dl = DataLoader(test_data, batch_size = batch_size, shuffle = True)

In [27]:
len(train_dl)

25400

In [28]:
print_every = 6000
step = 0
n_epochs = 5
clip = 5  
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)

In [29]:
model=model.to(device)

In [30]:
best_loss=float('inf')
best_loss

inf

In [31]:
for epoch in range(n_epochs):
    hidden=model.init_hidden(batch_size,device)

    for inputs,targets in train_dl:
        step+=1
        inputs=torch.autograd.Variable(inputs.to(device))
        targets=torch.autograd.Variable(targets.to(device))
        model.zero_grad()
        output,hidden=model(inputs)
        loss=criterion(output,targets.float())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(),clip)
        optimizer.step()

        if (step%print_every)==0:
            model.eval()
            val_losses=[]
            for inputs_v,targets_v in valid_dl:
                inputs_v=torch.autograd.Variable(inputs_v.to(device))
                targets_v=torch.autograd.Variable(targets_v.to(device))
                output,hidden=model(inputs_v)
                loss_v=criterion(output,targets_v.float())
                val_losses.append(loss_v.cpu().item())
            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.cpu().item()),
                  "Validation Loss: {:.4f}".format(np.mean(val_losses)))
            if best_loss>np.mean(val_losses):
                best_loss=np.mean(val_losses)
                torch.save(model.state_dict(), f'assets/model/lstm_model_{epoch}.pkl')
            model.train()




Epoch: 1/5 Step: 6000 Training Loss: 0.0084 Validation Loss: 0.2299
Epoch: 1/5 Step: 12000 Training Loss: 0.2240 Validation Loss: 0.1732
Epoch: 1/5 Step: 18000 Training Loss: 0.0586 Validation Loss: 0.1704
Epoch: 1/5 Step: 24000 Training Loss: 0.0034 Validation Loss: 0.1612
Epoch: 2/5 Step: 30000 Training Loss: 0.0056 Validation Loss: 0.1795
Epoch: 2/5 Step: 36000 Training Loss: 0.0011 Validation Loss: 0.1692
Epoch: 2/5 Step: 42000 Training Loss: 0.0197 Validation Loss: 0.1481
Epoch: 2/5 Step: 48000 Training Loss: 0.1096 Validation Loss: 0.1450
Epoch: 3/5 Step: 54000 Training Loss: 0.0002 Validation Loss: 0.1700
Epoch: 3/5 Step: 60000 Training Loss: 0.0079 Validation Loss: 0.2204
Epoch: 3/5 Step: 66000 Training Loss: 0.0008 Validation Loss: 0.1804
Epoch: 3/5 Step: 72000 Training Loss: 0.0009 Validation Loss: 0.1593
Epoch: 4/5 Step: 78000 Training Loss: 0.0000 Validation Loss: 0.2017
Epoch: 4/5 Step: 84000 Training Loss: 0.0000 Validation Loss: 0.1973
Epoch: 4/5 Step: 90000 Training Los

In [38]:
model = HateDetectLSTM(vocab_size=vocab_size,embd_dim=embd_dim,n_hidden=n_hidden,n_output=n_output,n_layers=n_layers)
model.load_state_dict(torch.load('assets/model/lstm_model_1.pkl'))
model=model.cpu()
model.eval()
test_losses = []
num_correct = 0
y_pred=[]
for inputs, targets in test_dl:

    test_output, test_h = model(inputs)
    loss = criterion(test_output, targets.float())
    test_losses.append(loss.item())
    
    preds = torch.round(test_output.squeeze())
    y_pred.append(preds.detach().item())
    correct_tensor = preds.eq(targets.float().view_as(preds))
    correct = np.squeeze(correct_tensor.numpy())
    num_correct += np.sum(correct)
    
print("Test Loss: {:.4f}".format(np.mean(test_losses)))
print("Test Accuracy: {:.2f}".format(num_correct/len(test_dl.dataset)))    

Test Loss: 0.1512
Test Accuracy: 0.95


In [40]:
y_pred=np.array(y_pred)

In [41]:
y_test=y_test.numpy()

In [43]:
from sklearn.metrics import confusion_matrix,classification_report
print(confusion_matrix(y_pred,y_test))
print(classification_report(y_pred,y_test))

[[2824  209]
 [ 125   18]]
              precision    recall  f1-score   support

         0.0       0.96      0.93      0.94      3033
         1.0       0.08      0.13      0.10       143

    accuracy                           0.89      3176
   macro avg       0.52      0.53      0.52      3176
weighted avg       0.92      0.89      0.91      3176

