<a href="https://colab.research.google.com/github/thayeylolu/cyberbullying/blob/lstm/notebooks/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torchtext==0.10.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.10.0
  Downloading torchtext-0.10.0-cp37-cp37m-manylinux1_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 15.4 MB/s 
Collecting torch==1.9.0
  Downloading torch-1.9.0-cp37-cp37m-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 2.8 kB/s 


In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [None]:
import torch
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from torch import nn, save, load
from torch.optim import Adam
# from torchtext.legacy import data
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torch.nn.functional as F

In [None]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
from sklearn.model_selection import train_test_split

warnings.filterwarnings(action="ignore")
%matplotlib inline

In [None]:
import torch
import pandas as pd

# nlp library of Pytorch
from torchtext.legacy import data

import warnings as wrn
wrn.filterwarnings('ignore')

ModuleNotFoundError: ignored

In [None]:
# Seed PyTorch's random number generator so runs are repeatable.
SEED = 2021

torch.manual_seed(SEED)
# The documented determinism switch lives on the cuDNN backend
# (torch.backends.cudnn); torch.backends.cuda has no such setting, so
# assigning to it had no effect.
torch.backends.cudnn.deterministic = True

# Data Preprocessing
PyTorch offers a convenient way of preprocessing text data: **torchtext**. Although it can seem unstable and hard to use for newcomers, it has nice features and is easy to work with once you know the basics.



In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def encode_label(label):
    """Map a cyberbullying category name to its integer class id.

    The five named categories receive ids 0-4 in the order listed below;
    any other value falls through to id 5.
    """
    known_labels = (
        'not_cyberbullying',
        'gender',
        'religion',
        'other_cyberbullying',
        'age',
    )
    for class_id, name in enumerate(known_labels):
        if label == name:
            return class_id
    return 5

In [None]:
# Path of the training CSV on the mounted Google Drive.
url = "/content/drive/MyDrive/NLP/train_data.csv"
# Alternative: a copy of the CSV in the current working directory.
# url = "train_data.csv"


In [None]:
# Load the CSV and drop every row that contains a missing value.
df = pd.read_csv(url)
df = df.dropna()

# Strip runs of digits from the text. `regex=True` is passed explicitly
# because pandas >= 2.0 treats the pattern as a literal string by default,
# which would leave the digits untouched; the raw string avoids the
# invalid-escape warning that "\d" triggers in a normal string literal.
df['clean_txt_emoji'] = df['clean_txt_emoji'].str.replace(r'\d+', '', regex=True)

# Result is discarded here; only useful when run as the last line of a cell.
df['clean_txt_emoji'].sample(3)
df['clean_txt_emoji'] = df['clean_txt_emoji'].str.lower()

# Integer-encode the category names and keep only the two columns used below.
df['label'] = df['cyberbullying_type'].apply(encode_label)
df = df[['label', 'clean_txt_emoji']]

# 70% train, then the remaining 30% is halved into test and validation.
train, test_valid = train_test_split(df, train_size=0.70, random_state=3040)
test, valid = train_test_split(test_valid, train_size=.50, random_state=3040)
# Result is discarded here; only useful when run as the last line of a cell.
test['label'].value_counts()

# Plain Python lists of texts and labels for each split.
X_train, Y_train = train['clean_txt_emoji'].to_list(), train['label'].to_list()
X_test, Y_test = test['clean_txt_emoji'].to_list(), test['label'].to_list()
X_valid, Y_valid = valid['clean_txt_emoji'].to_list(), valid['label'].to_list()

# (label, text) pairs for each split.
train_dat = list(zip(Y_train, X_train))
valid_dat = list(zip(Y_valid, X_valid))
test_dat = list(zip(Y_test, X_test))

In [None]:
# TEXT describes the free-text column: spaCy tokenisation, batch dimension
# first, and sequence lengths returned alongside the padded token ids.
TEXT = data.Field(
    tokenize='spacy',
    batch_first=True,
    include_lengths=True,
)

# LABEL describes the target column; labels are produced as float tensors.
LABEL = data.LabelField(
    dtype=torch.float,
    batch_first=True,
)

In [None]:
# (attribute name, Field) pairs handed to TabularDataset. The names are the
# attributes under which each column appears on examples and batches.
# NOTE(review): presumably matched to the CSV columns by position - confirm
# that the file's column order is text first, label second.
fields = [('clean_txt_emoji',TEXT),("cyberbullying_type",LABEL)]

In [None]:
# Parse the CSV at `url` into a torchtext dataset, skipping the header row.
training_data = data.TabularDataset(
    path=url,
    format="csv",
    fields=fields,
    skip_header=True,
)

# Show the attributes of the first parsed example as a sanity check.
print(vars(training_data.examples[0]))

In [None]:
import random

# Train/validation split (75% / 25%).
# random.seed() returns None, so its result is not a usable `random_state`.
# Seed the generator first, then pass the state tuple that split() documents
# as its expected value.
random.seed(SEED)
train_data, valid_data = training_data.split(split_ratio=0.75,
                                             random_state=random.getstate())


In [None]:
# Build the token -> integer vocabulary from the training split only,
# with a minimum token frequency of 5.
TEXT.build_vocab(train_data, min_freq=5)

# Build the label vocabulary from the same split.
LABEL.build_vocab(train_data)

In [None]:
# Number of entries in the text vocabulary built from the training split.
print("Size of text vocab:",len(TEXT.vocab))

In [None]:
# Number of distinct labels found in the training split.
print("Size of label vocab:",len(LABEL.vocab))

In [None]:
# The ten most frequent tokens together with their counts.
TEXT.vocab.freqs.most_common(10)

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so
# the notebook does not fail on a CPU-only runtime.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

BATCH_SIZE = 64

# BucketIterator batches examples of similar length together, which reduces
# the number of padding tokens needed per batch.
train_iterator, validation_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    # Sort by token count. The attribute must be the name given to the text
    # field in `fields` ('clean_txt_emoji'); examples have no `.text`.
    sort_key=lambda x: len(x.clean_txt_emoji),
    # Keep every batch sorted by `sort_key`: the model packs sequences with
    # pack_padded_sequence's default enforce_sorted=True.
    sort_within_batch=True,
    device=device,
)

# RNN Network
Now we'll use PyTorch to build an LSTM network to classify each text by its cyberbullying type.

In [None]:
# Pytorch's nn module has lots of useful feature
import torch.nn as nn

class LSTMNet(nn.Module):
    """Text classifier: embedding -> (bi)directional LSTM -> linear -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM (per direction).
        output_dim: Number of units in the final linear layer.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM also reads the sequence backwards.
        dropout: Dropout probability passed to ``nn.LSTM``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        # Embedding layer converts integer sequences to vector sequences.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layer processes the vector sequences.
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout,
            batch_first=True,
        )

        # The classifier input is one final hidden state per direction, so
        # its width depends on whether the LSTM is bidirectional.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        # Squashes each output unit into (0, 1).
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, text_lengths):
        """Return sigmoid outputs of shape (batch, output_dim).

        Args:
            text: Padded token ids with the batch dimension first.
            text_lengths: Unpadded length of every sequence in ``text``,
                sorted in decreasing order (``pack_padded_sequence`` is
                called with its default ``enforce_sorted=True``).
        """
        embedded = self.embedding(text)

        # Packing lets the LSTM skip the padding positions; the lengths are
        # moved to the CPU because pack_padded_sequence expects them there.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.cpu(), batch_first=True
        )

        _, (hidden_state, _) = self.lstm(packed_embedded)

        if self.lstm.bidirectional:
            # Last layer's final forward and backward hidden states.
            hidden = torch.cat((hidden_state[-2, :, :], hidden_state[-1, :, :]), dim=1)
        else:
            # Only one direction: the last layer's final hidden state.
            hidden = hidden_state[-1, :, :]

        return self.sigmoid(self.fc(hidden))
    

* Our model class is ready, let's declare hyperparameters

In [None]:
# Hyperparameters passed to LSTMNet, in constructor order.
SIZE_OF_VOCAB = len(TEXT.vocab)  # embedding rows: one per vocabulary entry
EMBEDDING_DIM = 100  # size of each token embedding
NUM_HIDDEN_NODES = 64  # LSTM hidden size
# NOTE(review): a single output unit models a binary target, while
# encode_label defines six class ids - confirm this is intended.
NUM_OUTPUT_NODES = 1
NUM_LAYERS = 2  # stacked LSTM layers
BIDIRECTION = True  # run the LSTM in both directions
DROPOUT = 0.2  # dropout probability handed to nn.LSTM

# Training
Now let's create our model instance, optimizer and loss function

In [None]:
# Build the network; keyword arguments make the hyperparameter mapping explicit.
model = LSTMNet(
    vocab_size=SIZE_OF_VOCAB,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=NUM_HIDDEN_NODES,
    output_dim=NUM_OUTPUT_NODES,
    n_layers=NUM_LAYERS,
    bidirectional=BIDIRECTION,
    dropout=DROPOUT,
)

In [None]:
import torch.optim as optim

# Move the model to the selected device, then create Adam over its parameters.
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Binary cross-entropy computed on probabilities, placed on the same device.
criterion = nn.BCELoss().to(device)

In [None]:
# Display the model's layer structure as the cell output.
model

In [None]:
# We'll use this helper to compute accuracy
def binary_accuracy(preds, y):
    """Return the fraction of rounded predictions that equal the targets.

    Args:
        preds: Tensor of predicted values; rounded with ``torch.round``
            before the comparison.
        y: Tensor of target values, broadcast-compatible with ``preds``.

    Returns:
        A 0-dim float tensor holding the mean of the element-wise matches.
    """
    rounded_preds = torch.round(preds)
    correct = (rounded_preds == y).float()
    # mean() rather than sum() / len(): len() raises on a 0-dim tensor (what
    # a single prediction becomes after squeeze()) and counts only the first
    # dimension of a multi-dimensional one.
    return correct.mean()

In [None]:
def train(model, iterator, optimizer, criterion,
          text_field='clean_txt_emoji', label_field='cyberbullying_type'):
    """Run one training epoch and return the mean loss and mean accuracy.

    Args:
        model: Module called as ``model(text, text_lengths)``.
        iterator: Sized iterable of batches.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss called as ``criterion(predictions, labels)``.
        text_field: Batch attribute holding the ``(text, lengths)`` pair.
        label_field: Batch attribute holding the target tensor.

    The two attribute names default to the names used in the notebook's
    ``fields`` list; batches carry no ``.text`` / ``.type`` attributes.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the batches.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0

    # Enable training-time behaviour such as dropout.
    model.train()

    for batch in iterator:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        text, text_lengths = getattr(batch, text_field)
        labels = getattr(batch, label_field)

        # Drop only the output dimension. A bare squeeze() would also drop
        # the batch dimension of a one-example batch and break the loss.
        predictions = model(text, text_lengths).squeeze(1)

        loss = criterion(predictions, labels)
        loss.backward()

        acc = binary_accuracy(predictions, labels)

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)
        

* We also need a function to evaluate the model

In [None]:
def evaluate(model, iterator, criterion,
             text_field='clean_txt_emoji', label_field='cyberbullying_type'):
    """Evaluate the model on every batch; return mean loss and mean accuracy.

    Args:
        model: Module called as ``model(text, text_lengths)``.
        iterator: Sized iterable of batches.
        criterion: Loss called as ``criterion(predictions, labels)``.
        text_field: Batch attribute holding the ``(text, lengths)`` pair.
        label_field: Batch attribute holding the target tensor.

    The two attribute names default to the names used in the notebook's
    ``fields`` list; batches carry no ``.text`` / ``.type`` attributes.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the batches.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0

    # Switch off training-only behaviour such as dropout.
    model.eval()

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = getattr(batch, text_field)
            labels = getattr(batch, label_field)

            # Drop only the output dimension. A bare squeeze() would also
            # drop the batch dimension of a one-example batch.
            predictions = model(text, text_lengths).squeeze(1)

            loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

* Let's train the model

In [None]:
EPOCH_NUMBER = 15
for epoch in range(1, EPOCH_NUMBER + 1):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

    valid_loss, valid_acc = evaluate(model, validation_iterator, criterion)

    # Label each block of statistics with its epoch number so the log can be
    # read back; the loop variable was previously unused.
    print(f'Epoch: {epoch:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    print()

# Conclusion
It's really fun to work with PyTorch. I don't know why, but learning and using PyTorch after the Keras API, working with a lower-level API and seeing how things work in deep learning, is awesome.

Thanks for your attention. I'm waiting for your upvotes&questions.
