In this notebook, I attempt to use Recurrent Neural Networks (RNNs), Gated Recurrent Units (GRUs) and Long-Short Term Memory (LSTM) to conduct classification

In [29]:
import os
import torch
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

from torch import nn
from torch.utils.data import Dataset, DataLoader
import torchvision
# from torchvision import datasets, transforms

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Reading in the data

In [3]:
train_df = pd.read_csv('data/DL/DL_train.csv')
train_df.head()

Unnamed: 0,text,humor
0,watch this swimmer disappear into winter storm...,False
1,"they laughed at reagan, too: trump's ideas wil...",False
2,"hey, are you cold? go over to the corner, it i...",True
3,cannot get a standing desk? these are almost a...,False
4,want to hear a joke about my penis? never mind...,True


In [4]:
test_df = pd.read_csv('data/DL/DL_test.csv')
test_df.head()

Unnamed: 0,text,humor
0,thought up a reddit joke today. when is a tria...,True
1,how much do pirates pay for corn? a buck an ear!,True
2,hillary clinton sent her book to every gop can...,False
3,italian unions lambast new museum boss for wor...,False
4,life below the ocean’s surface wholly depends ...,False


Setting up the hyperparameters

In [6]:
INPUT_SIZE = 784
HIDDEN_SIZE = 300
NUM_CLASSES = 2
NUM_EPOCHS = 2
BATCH_SIZE = 64
LEARNING_RATE = 0.001

# RNN class

In [5]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftMax(dim=1)
    
    def forward(self, input_tensor, hidden_tensor):
        combined = torch.cat((input_tensor, hidden_tensor), 1)

        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def init_hidden(self):
        return torch.zeros(1, self.hidden_size)

# Preparing the dataset

The custom dataset should inherit Dataset and override the following methods:
1. \_\_len\_\_ so that len(dataset) returns the size of the dataset.
2. \_\_getitem\_\_ to support the indexing such that dataset[i] can be used to get the ith sample.

Need to further preprocess the text data before we can convert the data into a tensor: https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial

Also, preprocessing the text data: https://medium.com/@theDrewDag/convert-texts-into-tensors-for-deep-learning-74b0cf48d416

In [30]:
def preprocess_text(text: str) -> str:
    """This utility function sanitizes a string by:
    - removing links
    - removing special characters
    - removing numbers
    - removing stopwords
    - transforming in lowercase
    - removing excessive whitespaces
    Args:
        text (str): the input text you want to clean
    Returns:
        str: the cleaned list of tokens
    """

    # remove links
    text = re.sub(r"http\S+", "", text)
    # remove special chars and numbers
    text = re.sub("[^A-Za-z]+", " ", text)
    # remove stopwords
    # 1. tokenize
    tokens = nltk.word_tokenize(text)
    # 2. check if stopword
    tokens = [w.lower() for w in tokens if not w in stopwords.words("english")]
    return tokens

In [31]:
# creation of vocavulary

def get_vocab(training_corpus):

    # add special characters
    # padding, end of line, unknown term
    vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2} 

    # build vocab from training corpus
    for item in training_corpus: 
        processed_text = preprocess_text(item) # apply preprocessing on each text
        for word in processed_text: # for each word in tokens
          if word not in vocab: # if word not in vocab
              vocab[word] = len(vocab) # create an entry in the vocab equal to the term, and its value is the length of the vocab 
    return vocab


In [34]:
vocab = get_vocab(train_df['text'])

In [36]:
# Function to convert the text data into a tensor
def text_to_tensor(text: str, vocab_dict: dict, unk_token='__UNK__', verbose=False):
    '''
    Arguments: 
        text - string containing the text
        vocab_dict - term vocabulary
        unk_token - special char used to map the unknown items
        verbose - print debug messages
    Returns:
        tensor_l - a tensor containing the indices of our terms
    '''     
    word_l = preprocess_text(text)
    
    if verbose:
        print("List of words in our text:")
        print(word_l)
        
    # initialize empty tensor
    tensor_l = [] 
    
    # take the __UNK__ value fro mthe vocabulary 
    unk_ID = vocab_dict[unk_token]
    
    if verbose:
        print(f"UNK has a value of {unk_ID}")
        
    # for each word in the lsit:
    for word in word_l:
        # take the index
        # if the word is not in vocab_dict, then assign UNK
        word_ID = vocab_dict.get(word, unk_ID)
        # append to tensor list
        tensor_l.append(word_ID)

    return tensor_l

In [41]:
train_df['tokenized'] = train_df['text'].apply(lambda x: text_to_tensor(x, vocab))
train_df.head()

Unnamed: 0,text,humor,tokenized
0,watch this swimmer disappear into winter storm...,False,"[3, 4, 5, 6, 7, 8]"
1,"they laughed at reagan, too: trump's ideas wil...",False,"[9, 10, 11, 12, 13, 14, 15]"
2,"hey, are you cold? go over to the corner, it i...",True,"[16, 17, 18, 19, 20]"
3,cannot get a standing desk? these are almost a...,False,"[21, 22, 23, 24, 25]"
4,want to hear a joke about my penis? never mind...,True,"[26, 27, 28, 29, 30, 31, 32]"


In [42]:
test_df['tokenized'] = test_df['text'].apply(lambda x: text_to_tensor(x, vocab))
test_df.head()

Unnamed: 0,text,humor,tokenized
0,thought up a reddit joke today. when is a tria...,True,"[2594, 4644, 28, 500, 6231, 25956, 11826]"
1,how much do pirates pay for corn? a buck an ear!,True,"[565, 2773, 1609, 450, 7213, 2974]"
2,hillary clinton sent her book to every gop can...,False,"[1156, 1157, 3507, 337, 602, 3108, 547, 2140, 98]"
3,italian unions lambast new museum boss for wor...,False,"[1874, 5630, 9196, 74, 3002, 6879, 999, 1901]"
4,life below the ocean’s surface wholly depends ...,False,"[150, 4423, 7975, 36527, 6120, 415]"


# Dataset class

In [43]:
class CustomDataset(Dataset):
    def __init__(self, df):
        # data loading
        self.x = torch.from_numpy(np.array(df['tokenized']))
        self.y = torch.from_numpy(np.array(df['humor'])) # converts the dataframe into a tensor

    # this method needs an index
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    
    def __len__(self):
        return len(self.df)

In [44]:
train_dataset = CustomDataset(train_df)
first = train_dataset[0]
features, labels = first
print(features, labels)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.

In [20]:
train_dataset = CustomDataset(train_df)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)

test_dataset = CustomDataset(test_df)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [21]:
for X, y in test_dataloader:
    print(f"Text: {X}")
    print(f"Value: {y}")
    break

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found object