In this notebook, I attempt to use Recurrent Neural Networks (RNNs), Gated Recurrent Units (GRUs) and Long-Short Term Memory (LSTM) to conduct classification

In [1]:
import os
import torch
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

from torch import nn
from torch.utils.data import Dataset, DataLoader
import torchvision

torch.utils.data.datapipes.utils.common.DILL_AVAILABLE = torch.utils._import_utils.dill_available()
import torchtext
import torchdata.datapipes as dp
import torchtext.transforms as T
from torchtext.vocab import build_vocab_from_iterator

import spacy
spacy.load('en_core_web_sm')
# from torchvision import datasets, transforms

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<spacy.lang.en.English at 0x7ca21348af20>

Reading in the data

In [2]:
train_df = pd.read_csv('data/DL/DL_train.csv')
train_df.head()

Unnamed: 0,text,humor
0,watch this swimmer disappear into winter storm...,False
1,"they laughed at reagan, too: trump's ideas wil...",False
2,"hey, are you cold? go over to the corner, it i...",True
3,cannot get a standing desk? these are almost a...,False
4,want to hear a joke about my penis? never mind...,True


In [3]:
test_df = pd.read_csv('data/DL/DL_test.csv')
test_df.head()

Unnamed: 0,text,humor
0,thought up a reddit joke today. when is a tria...,True
1,how much do pirates pay for corn? a buck an ear!,True
2,hillary clinton sent her book to every gop can...,False
3,italian unions lambast new museum boss for wor...,False
4,life below the ocean’s surface wholly depends ...,False


Setting up the hyperparameters

In [20]:
INPUT_SIZE = 784
HIDDEN_SIZE = 300
NUM_CLASSES = 2
NUM_EPOCHS = 2
BATCH_SIZE = 64
LEARNING_RATE = 0.001

# RNN class

In [4]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftMax(dim=1)
    
    def forward(self, input_tensor, hidden_tensor):
        combined = torch.cat((input_tensor, hidden_tensor), 1)

        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def init_hidden(self):
        return torch.zeros(1, self.hidden_size)

# Preparing the dataset

The custom dataset should inherit Dataset and override the following methods:
1. \_\_len\_\_ so that len(dataset) returns the size of the dataset.
2. \_\_getitem\_\_ to support the indexing such that dataset[i] can be used to get the ith sample.

Need to further preprocess the text data before we can convert the data into a tensor: https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial

Also, preprocessing the text data: https://medium.com/@theDrewDag/convert-texts-into-tensors-for-deep-learning-74b0cf48d416

In [5]:
eng = spacy.load("en_core_web_sm") # Load the English model to tokenize English text

train_data_pipe = dp.iter.IterableWrapper(['data/DL/DL_train.csv']) # creating an iterable of filenames
train_data_pipe = dp.iter.FileOpener(train_data_pipe, mode='rb') # pass the iterable to FileOpener which then opens the file in read mode
train_data_pipe = train_data_pipe.parse_csv(skip_lines=1, delimiter=',', as_tuple=True) # parse the file, which again returns an iterable of tuples representing each rows of the csv file

In [6]:
for sample in train_data_pipe:
    print(sample)
    break

('watch this swimmer disappear into winter storm jonas', 'False')


In [7]:
# function to tokenize the text
def engTokenize(text):
    """
    Tokenize an English text and return a list of tokens
    """
    return [token.text for token in eng.tokenizer(text)]

In [8]:
print(engTokenize("Have a good day!!!")) # the function is working as expected

['Have', 'a', 'good', 'day', '!', '!', '!']


In [9]:
def getTokens(data_iter):
    for text, humor in data_iter:
        yield engTokenize(text)

In [10]:
# <sos> for start of sentence
# <eos> for end of sentence
# <unk> for unknown words. An example of unknown word is the one skipped because of min_freq=2
# <pad> is the padding token. While training, a model we mostly train in batches. In a batch, there can be sentences of different length.
# So, we pad the shorter sentences with <pad> token to make length of all sequences in the batch equal.

train_vocab = build_vocab_from_iterator(
    getTokens(train_data_pipe),
    min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True
)

# if some words are not in the vocabulary, we will use <unk> instead of that unknown word
train_vocab.set_default_index(train_vocab['<unk>'])

In [11]:
# similarly, we build the vocab for the test data

test_data_pipe = dp.iter.IterableWrapper(['data/DL/DL_test.csv']) # creating an iterable of filenames
test_data_pipe = dp.iter.FileOpener(test_data_pipe, mode='rb') # pass the iterable to FileOpener which then opens the file in read mode
test_data_pipe = test_data_pipe.parse_csv(skip_lines=1, delimiter=',', as_tuple=True) # parse the file, which again returns an iterable of tuples representing each rows of the csv file

test_vocab = build_vocab_from_iterator(
    getTokens(test_data_pipe),
    min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True
)

# if some words are not in the vocabulary, we will use <unk> instead of that unknown word
test_vocab.set_default_index(train_vocab['<unk>'])

In [13]:
# train_vocab.get_itos() returns a list with tokens at index based on vocabulary
print(train_vocab.get_itos()[:9])
print(test_vocab.get_itos()[:9])

['<pad>', '<sos>', '<eos>', '<unk>', 'the', 'a', '.', '?', 'to']
['<pad>', '<sos>', '<eos>', '<unk>', 'the', 'a', '.', '?', 'to']


In [14]:
# after building the vocabulary, we will convert our sentences into corresponding indices
def getTransform(vocab):
    """
    Create transforms based on given vocabulary. The returned transform is applied to sequence
    of tokens.
    """
    text_transform = T.Sequential(
        ## converts the sentences to indices based on given vocabulary
        T.VocabTransform(vocab=vocab),
        ## Add <sos> at beginning of each sentence. 1 because the index for <sos> in vocabulary is
        # 1 as seen in previous section
        T.AddToken(1, begin=True),
        ## Add <eos> at beginning of each sentence. 2 because the index for <eos> in vocabulary is
        # 2 as seen in previous section
        T.AddToken(2, begin=False)
    )
    return text_transform

In [15]:
# try the above function on a random sentence

temp_list = list(train_data_pipe)
some_sentence = temp_list[798][0]
print("Some sentence=", end="")
print(some_sentence)
transformed_sentence = getTransform(train_vocab)(engTokenize(some_sentence))
print("Transformed sentence=", end="")
print(transformed_sentence)
index_to_string = test_vocab.get_itos()
for index in transformed_sentence:
    print(index_to_string[index], end=" ")

Some sentence=how to keep your feet in good shape this summer
Transformed sentence=[1, 32, 8, 227, 35, 969, 14, 116, 2520, 53, 427, 2]
<sos> how to cross your skin in should survive will summer <eos> 

In [16]:
def applyTransform(sequence_pair):
    """
    Apply transforms to sequence of tokens in a sequence pair
    """

    return (
        getTransform(train_vocab)(engTokenize(sequence_pair[0])),
        sequence_pair[1]
    )

In [17]:
train_data_pipe = train_data_pipe.map(applyTransform) ## Apply the function to each element in the iterator
temp_list = list(train_data_pipe)
print(temp_list[0])

([1, 205, 53, 7860, 6221, 86, 791, 1882, 7533, 2], 'False')


In [18]:
test_data_pipe = test_data_pipe.map(applyTransform) ## Apply the function to each element in the iterator
temp_list = list(test_data_pipe)
print(temp_list[0])

([1, 425, 65, 5, 558, 114, 194, 6, 49, 9, 5, 5170, 5, 30083, 7, 49, 21, 2428, 2], 'True')


In [19]:
# While working for sequence to sequence models, it is recommended to keep the length of sequences in a batch similar.
# For that we will use bucketbatch function of data_pipe.

def sortBucket(bucket):
    """
    Function to sort a given bucket. Here, we want to sort based on the length of
    source and target sequence.
    """
    return sorted(bucket, key=lambda x: len(x[0]))

In [21]:
# apply the bucket batch function on train data
train_data_pipe = train_data_pipe.bucketbatch(
    batch_size=BATCH_SIZE,
    batch_num=5,
    bucket_num=1,
    use_in_batch_shuffle=False,
    sort_key=sortBucket
)

print(list(train_data_pipe)[0])

[([1, 538, 1753, 3787, 235, 4152, 10, 68, 1009, 3631, 113, 2679, 2], 'False'), ([1, 126, 183, 20, 51, 958, 23, 4, 119, 41, 56, 14653, 2], 'False'), ([1, 87, 1624, 8329, 23, 1531, 11363, 10, 3962, 6164, 7284, 315, 2], 'False'), ([1, 1410, 204, 1714, 18, 145, 156, 41, 56, 20, 151, 11, 2], 'False'), ([1, 591, 1219, 733, 237, 7119, 8, 863, 5862, 14714, 703, 1896, 2], 'False'), ([1, 3005, 1004, 1376, 9978, 13, 9529, 13, 1955, 107, 111, 1574, 2], 'False'), ([1, 707, 953, 342, 42, 4, 127, 795, 1753, 10, 5706, 388, 2], 'False'), ([1, 5, 1307, 1698, 1285, 6139, 9, 5, 5717, 465, 3190, 6, 2], 'True'), ([1, 16, 19, 6798, 232, 25, 89, 349, 7, 3460, 2933, 28, 2], 'True'), ([1, 409, 2003, 96, 34, 5, 784, 70, 14, 7274, 213, 524, 2], 'False'), ([1, 692, 11, 304, 45, 798, 212, 10, 129, 9, 67, 6, 2], 'True'), ([1, 1282, 47, 23, 6736, 8, 187, 656, 3634, 2384, 23, 1293, 2], 'False'), ([1, 1073, 3414, 2918, 313, 84, 56, 24, 4184, 130, 22, 233, 4461, 2], 'False'), ([1, 3044, 3851, 2272, 137, 8, 5, 234, 17, 2

In [24]:
# convert the data the form: ((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))
def separateSourceTarget(sequence_pairs):
    """
    input of form: `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`
    output of form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`
    """
    sources,targets = zip(*sequence_pairs)
    return sources,targets

train_data_pipe = train_data_pipe.map(separateSourceTarget)
print(list(train_data_pipe)[0])

(([1, 638, 3484, 10519, 476, 16322, 215, 12662, 217, 2], [1, 4, 5345, 7463, 2840, 11, 6753, 122, 44, 2], [1, 12592, 7603, 5760, 14, 5, 10888, 1222, 2867, 2], [1, 15437, 19452, 342, 2512, 14, 4456, 6271, 554, 2], [1, 2229, 1008, 2572, 23, 890, 2807, 1152, 4599, 2], [1, 1324, 17, 23619, 3062, 6440, 208, 42, 9632, 2], [1, 4, 248, 1099, 15, 12589, 35, 218, 318, 2], [1, 1459, 609, 15, 27, 233, 17, 27, 108, 2], [1, 1094, 690, 2083, 2765, 77, 475, 3024, 2068, 2], [1, 4, 2527, 873, 1840, 4374, 62, 3022, 2624, 2], [1, 1084, 8608, 463, 1129, 23, 5, 6160, 4417, 2], [1, 1615, 2577, 5885, 36, 26, 362, 2163, 2640, 2], [1, 3005, 2980, 183, 187, 20261, 124, 1293, 5533, 2], [1, 35, 158, 357, 9967, 34, 147, 784, 218, 1838, 2], [1, 966, 28, 4, 1444, 1740, 3064, 9, 20, 1060, 2], [1, 246, 156, 2963, 55, 48, 398, 2182, 675, 57, 2], [1, 16, 19, 11, 47, 5, 10407, 3306, 7, 3, 2], [1, 49, 381, 9, 4, 118, 4368, 78, 71, 659, 2], [1, 16, 19, 11, 47, 5, 1073, 834, 7, 1723, 2], [1, 1690, 3, 2836, 23900, 14, 26614, 1

In [32]:
# add padding to the sentences
def applyPadding(pair_of_sequences):
    """
    Convert sequences to tensors and apply padding
    """
    return (T.ToTensor(0)(list(pair_of_sequences[0])), T.ToTensor(0)(list(pair_of_sequences[1])))
## `T.ToTensor(0)` returns a transform that converts the sequence to `torch.tensor` and also applies
# padding. Here, `0` is passed to the constructor to specify the index of the `<pad>` token in the
# vocabulary.
train_data_pipe = train_data_pipe.map(applyPadding)

Continue from the section on making batches with bucket batch: https://pytorch.org/tutorials/beginner/torchtext_custom_dataset_tutorial.html

In [30]:
def preprocess_text(text: str) -> str:
    """This utility function sanitizes a string by:
    - removing links
    - removing special characters
    - removing numbers
    - removing stopwords
    - transforming in lowercase
    - removing excessive whitespaces
    Args:
        text (str): the input text you want to clean
    Returns:
        str: the cleaned list of tokens
    """

    # remove links
    text = re.sub(r"http\S+", "", text)
    # remove special chars and numbers
    text = re.sub("[^A-Za-z]+", " ", text)
    # remove stopwords
    # 1. tokenize
    tokens = nltk.word_tokenize(text)
    # 2. check if stopword
    tokens = [w.lower() for w in tokens if not w in stopwords.words("english")]
    return tokens

In [31]:
# creation of vocabulary

def get_vocab(training_corpus):

    # add special characters
    # padding, end of line, unknown term
    vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2} 

    # build vocab from training corpus
    for item in training_corpus: 
        processed_text = preprocess_text(item) # apply preprocessing on each text
        for word in processed_text: # for each word in tokens
          if word not in vocab: # if word not in vocab
              vocab[word] = len(vocab) # create an entry in the vocab equal to the term, and its value is the length of the vocab 
    return vocab


In [34]:
vocab = get_vocab(train_df['text'])

In [36]:
# Function to convert the text data into a tensor
def text_to_tensor(text: str, vocab_dict: dict, unk_token='__UNK__', verbose=False):
    '''
    Arguments: 
        text - string containing the text
        vocab_dict - term vocabulary
        unk_token - special char used to map the unknown items
        verbose - print debug messages
    Returns:
        tensor_l - a tensor containing the indices of our terms
    '''     
    word_l = preprocess_text(text)
    
    if verbose:
        print("List of words in our text:")
        print(word_l)
        
    # initialize empty tensor
    tensor_l = [] 
    
    # take the __UNK__ value fro mthe vocabulary 
    unk_ID = vocab_dict[unk_token]
    
    if verbose:
        print(f"UNK has a value of {unk_ID}")
        
    # for each word in the lsit:
    for word in word_l:
        # take the index
        # if the word is not in vocab_dict, then assign UNK
        word_ID = vocab_dict.get(word, unk_ID)
        # append to tensor list
        tensor_l.append(word_ID)

    return tensor_l

In [41]:
train_df['tokenized'] = train_df['text'].apply(lambda x: text_to_tensor(x, vocab))
train_df.head()

Unnamed: 0,text,humor,tokenized
0,watch this swimmer disappear into winter storm...,False,"[3, 4, 5, 6, 7, 8]"
1,"they laughed at reagan, too: trump's ideas wil...",False,"[9, 10, 11, 12, 13, 14, 15]"
2,"hey, are you cold? go over to the corner, it i...",True,"[16, 17, 18, 19, 20]"
3,cannot get a standing desk? these are almost a...,False,"[21, 22, 23, 24, 25]"
4,want to hear a joke about my penis? never mind...,True,"[26, 27, 28, 29, 30, 31, 32]"


In [42]:
test_df['tokenized'] = test_df['text'].apply(lambda x: text_to_tensor(x, vocab))
test_df.head()

Unnamed: 0,text,humor,tokenized
0,thought up a reddit joke today. when is a tria...,True,"[2594, 4644, 28, 500, 6231, 25956, 11826]"
1,how much do pirates pay for corn? a buck an ear!,True,"[565, 2773, 1609, 450, 7213, 2974]"
2,hillary clinton sent her book to every gop can...,False,"[1156, 1157, 3507, 337, 602, 3108, 547, 2140, 98]"
3,italian unions lambast new museum boss for wor...,False,"[1874, 5630, 9196, 74, 3002, 6879, 999, 1901]"
4,life below the ocean’s surface wholly depends ...,False,"[150, 4423, 7975, 36527, 6120, 415]"


# Dataset class

In [43]:
class CustomDataset(Dataset):
    def __init__(self, df):
        # data loading
        self.x = torch.from_numpy(np.array(df['tokenized']))
        self.y = torch.from_numpy(np.array(df['humor'])) # converts the dataframe into a tensor

    # this method needs an index
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    
    def __len__(self):
        return len(self.df)

In [44]:
train_dataset = CustomDataset(train_df)
first = train_dataset[0]
features, labels = first
print(features, labels)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.

In [20]:
train_dataset = CustomDataset(train_df)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)

test_dataset = CustomDataset(test_df)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [21]:
for X, y in test_dataloader:
    print(f"Text: {X}")
    print(f"Value: {y}")
    break

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found object