# Introduction

Here are the steps we will take in this notebook to train our RNN

1. Import Libraries
2. Prepare Dataset
3. Create RNN Model
4. hidden layer dimension is 100
5. number of hidden layer is 1
6. Instantiate Model
7. Instantiate Loss (Cross entropy loss)
8. Instantiate Optimizer (SGD Optimizer or Adam optimizer)
9. Traning the Model
10. Prediction

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.autograd import Variable
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator



# Reading data

In [2]:
train_df = pd.read_csv('data/ML/ML_train.csv')
test_df = pd.read_csv('data/ML/ML_test.csv')

# Creation of validation set

In [3]:
X_train, X_valid, y_train, y_valid = train_test_split(train_df['text'], train_df['humor'], test_size=0.1, random_state=42)

Conversion of labels into numpy arrays

In [4]:
# convert our train target values into a numpy array
train_targets_np = y_train.values
train_targets_np

array([False,  True, False, ..., False,  True,  True])

In [5]:
# convert validation target values into numpy array
valid_targets_np = y_valid.values
valid_targets_np

array([False,  True, False, ..., False,  True, False])

In [6]:
# convert test target values into a numpy array
test_targets_np = test_df['humor'].values
test_targets_np

array([ True,  True, False, ...,  True, False, False])

Conversion of features into numpy arrays

In [7]:
# convert the features column (text) into numpy arrays
train_features_numpy = X_train.values
train_features_numpy

array(['donald trump position get behind',
       'die going kill could live without',
       'man faced unimaginable suffering wrote definitive book happiness',
       ..., 'satya nadella said woman asking raise',
       'hey girl fitness well fitness cock mouth',
       'neighbor vampire stabbed heart wooden stake died'], dtype=object)

In [8]:
# do the same for the validation set
valid_features_numpy = X_valid.values
valid_features_numpy

array(['heroic food truck coming rescue california fire victim',
       'gay friend must businessmen keep talking partner',
       'jada pinkett smith share relationship advice facebook', ...,
       'happens mix world greatest drug booze',
       'general patton thanksgiving gave tank',
       'blink 182 album rereleased cassette tape'], dtype=object)

In [9]:
# do the same for the test set
test_features_numpy = test_df.loc[:, test_df.columns != 'humor'].values
test_features_numpy
#  = np.array([t[0] for t in test_features_numpy])
# test_features_numpy

array([['thought reddit joke today triangle rectangle fails'],
       ['much pirate pay corn buck ear'],
       ['hillary clinton sent book every gop candidateexcept one'],
       ...,
       ['best drug sex birth control'],
       ['fit link tony hortons rule gym success first yoga class'],
       ['positive prognosis climate negotiation']], dtype=object)

In [10]:
# checking that the data was split correctly
print(f"Train features length: {len(train_features_numpy)}")
print(f"Train targets length: {len(train_targets_np)}")
print(f"Validation features length: {len(valid_features_numpy)}")
print(f"Validation targets length: {len(valid_targets_np)}")
print(f"Test features length: {len(test_features_numpy)}")
print(f"Test targets length: {len(test_targets_np)}")

Train features length: 144000
Train targets length: 144000
Validation features length: 16000
Validation targets length: 16000
Test features length: 40000
Test targets length: 40000


# Convert text into tokens, and numericalize the text

Tokenize the text using a tokenizer

In [25]:
tokenizer = get_tokenizer("basic_english")

def tokenize(sent, tokenizer, max_length):
    tokens = tokenizer(sent)[:max_length]
    length = len(tokens)
    return {"tokens": tokens, "length": length}

In [26]:
max_length = 100

X_train = [tokenize(x, tokenizer, max_length) for x in train_features_numpy]
X_train

[{'tokens': ['donald', 'trump', 'position', 'get', 'behind'], 'length': 5},
 {'tokens': ['die', 'going', 'kill', 'could', 'live', 'without'], 'length': 6},
 {'tokens': ['man',
   'faced',
   'unimaginable',
   'suffering',
   'wrote',
   'definitive',
   'book',
   'happiness'],
  'length': 8},
 {'tokens': ['alex',
   'poythress',
   'always',
   'score',
   'point',
   'prefers',
   'amazing',
   'gifs'],
  'length': 8},
 {'tokens': ['watch', 'kid', 'get', 'hilariously', 'honest', 'thankful'],
  'length': 6},
 {'tokens': ['message',
   'kentucky',
   'clerk',
   'kim',
   'davis',
   'girl',
   'take',
   'romper'],
  'length': 8},
 {'tokens': ['hear', 'deaf', 'guy', 'could', 'sing', 'ya', 'well'],
  'length': 7},
 {'tokens': ['university',
   'leader',
   'showed',
   'college',
   'failure',
   'sexual',
   'assault',
   'case'],
  'length': 8},
 {'tokens': ['computer', 'said', 'hello', 'think', 'dell'], 'length': 5},
 {'tokens': ['call', 'fat', 'pirate', 'vast', 'matey'], 'length':

In [27]:
X_valid = np.array([tokenize(x, tokenizer, max_length) for x in valid_features_numpy])
X_valid

array([{'tokens': ['heroic', 'food', 'truck', 'coming', 'rescue', 'california', 'fire', 'victim'], 'length': 8},
       {'tokens': ['gay', 'friend', 'must', 'businessmen', 'keep', 'talking', 'partner'], 'length': 7},
       {'tokens': ['jada', 'pinkett', 'smith', 'share', 'relationship', 'advice', 'facebook'], 'length': 7},
       ...,
       {'tokens': ['happens', 'mix', 'world', 'greatest', 'drug', 'booze'], 'length': 6},
       {'tokens': ['general', 'patton', 'thanksgiving', 'gave', 'tank'], 'length': 5},
       {'tokens': ['blink', '182', 'album', 'rereleased', 'cassette', 'tape'], 'length': 6}],
      dtype=object)

In [28]:
X_test = np.array([tokenize(x[0], tokenizer, max_length) for x in test_features_numpy])
X_test

array([{'tokens': ['thought', 'reddit', 'joke', 'today', 'triangle', 'rectangle', 'fails'], 'length': 7},
       {'tokens': ['much', 'pirate', 'pay', 'corn', 'buck', 'ear'], 'length': 6},
       {'tokens': ['hillary', 'clinton', 'sent', 'book', 'every', 'gop', 'candidateexcept', 'one'], 'length': 8},
       ...,
       {'tokens': ['best', 'drug', 'sex', 'birth', 'control'], 'length': 5},
       {'tokens': ['fit', 'link', 'tony', 'hortons', 'rule', 'gym', 'success', 'first', 'yoga', 'class'], 'length': 10},
       {'tokens': ['positive', 'prognosis', 'climate', 'negotiation'], 'length': 4}],
      dtype=object)

Build a vocabulary to numericalize the text data on

In [36]:
min_freq = 5
special_tokens = ["<unk>", "<pad>"] # <pad> to pad sequences when they are not long enough

vocab = build_vocab_from_iterator(
    [x['tokens'] for x in X_train], # get only the tokens of each element
    min_freq=min_freq,
    specials=special_tokens,
)

In [37]:
vocab.get_itos()

['<unk>',
 '<pad>',
 'like',
 'call',
 'say',
 'get',
 'trump',
 'new',
 'photo',
 'one',
 'woman',
 'people',
 'make',
 'know',
 'day',
 'would',
 'man',
 'time',
 'go',
 'want',
 'joke',
 'video',
 'cannot',
 'take',
 'year',
 'donald',
 'thing',
 'way',
 'could',
 'black',
 'got',
 'best',
 'life',
 'kid',
 'never',
 'girl',
 'first',
 'two',
 'guy',
 'love',
 'look',
 'good',
 'world',
 'hear',
 'show',
 'really',
 'think',
 'need',
 'tell',
 'see',
 'back',
 'dog',
 'baby',
 'white',
 'difference',
 'always',
 'gay',
 'right',
 'u',
 '5',
 'find',
 'going',
 'sex',
 '10',
 'friend',
 'house',
 'work',
 'walk',
 'give',
 'bar',
 'ever',
 'many',
 'come',
 'favorite',
 'child',
 'mom',
 'food',
 'home',
 'wife',
 'american',
 'someone',
 'week',
 'stop',
 'change',
 'yous',
 'knock',
 'men',
 'still',
 'last',
 'bad',
 'called',
 'school',
 'keep',
 'little',
 'help',
 'name',
 'may',
 'today',
 'big',
 '2',
 'clinton',
 'star',
 'watch',
 'much',
 'said',
 'every',
 'put',
 'eat',


In [38]:
# get indices of unknown index andpad index
unk_index = vocab["<unk>"]
pad_index = vocab["<pad>"]

In [40]:
vocab.set_default_index(unk_index) # so that if some words are not in the vocabulary, we will use <unk> instead of that unknown word

Numericalizing the text data

In [None]:
# experiment with the other lstm notebook on colab when free