# 1. Import Modules

In [1]:
import numpy as np
import torch

import torch.nn.functional as F

from torch import nn
from torch.utils.data import TensorDataset, DataLoader

import helper
import problem_unittests as tests

# 2. Explore the Data

In [2]:
data_dir = './data/Seinfeld_Scripts.txt'
text = helper.load_data(data_dir)

In [3]:
view_line_range = (0, 10)

print('Dataset Stats')
print('Roughly the number of unique words: {}'.format(len({word: None for word in text.split()})))

lines = text.split('\n')
print('Number of lines: {}'.format(len(lines)))
word_count_line = [len(line.split()) for line in lines]
print('Average number of words in each line: {}'.format(np.average(word_count_line)))

print()
print('The lines {} to {}:'.format(*view_line_range))
print('\n'.join(text.split('\n')[view_line_range[0]:view_line_range[1]]))

Dataset Stats
Roughly the number of unique words: 46367
Number of lines: 109233
Average number of words in each line: 5.544240293684143

The lines 0 to 10:
jerry: do you know what this is all about? do you know, why were here? to be out, this is out...and out is one of the single most enjoyable experiences of life. people...did you ever hear people talking about we should go out? this is what theyre talking about...this whole thing, were all out now, no one is home. not one person here is home, were all out! there are people trying to find us, they dont know where we are. (on an imaginary phone) did you ring?, i cant find him. where did he go? he didnt tell me where he was going. he must have gone out. you wanna go out you get ready, you pick out the clothes, right? you take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation...then youre standing around, what do you do? you go we gotta be getting back. once youre out, you wanna get back! y

# 3. Implement Pre-processing Functions

### Lookup Table

In [4]:
def create_lookup_tables(text):
    """
    Create lookup tables for vocabulary
    :param text: The text of tv scripts split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    
    vocab_to_int = dict()
    int_to_vocab = dict()
    
    sorted_word_set = sorted(set(text))
    
    for i, word in enumerate(sorted_word_set):
        vocab_to_int[word] = i
        int_to_vocab[i] = word
    
    return (vocab_to_int, int_to_vocab)

In [5]:
tests.test_create_lookup_tables(create_lookup_tables)

Tests Passed


### Tokenize Punctuation

In [6]:
def token_lookup():
    """
    Generate a dict to turn punctuation into a token.
    :return: Tokenized dictionary where the key is the punctuation and the value is the token
    """
    token_dict = dict()
    
    punctuation_list = [
        '.', 
        ',', 
        '"', 
        ';', 
        '!', 
        '?', 
        '(', 
        ')',
        '-',
        '\n'
    ]
    
    token_list = [
        '||Period||', 
        '||Comma||', 
        '||Quotation_Mark||', 
        '||Semicolon||', 
        '||Exclamation_Mark||', 
        '||Question_Mark||', 
        '||Left_Parentheses||', 
        '||Right_Parentheses||', 
        '||Dash||', 
        '||Return||'
    ]
    
    for (punctuation, token) in zip(punctuation_list, token_list):
        token_dict[punctuation] = token
        
    return token_dict

In [7]:
tests.test_tokenize(token_lookup)

Tests Passed


## Pre-process & Save Data

In [8]:
helper.preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)

# **************** #1 Check Point ****************

In [9]:
int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()

### Check Access to GPU

In [10]:
train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
    print('No GPU found. Please use a GPU to train your neural network.')

No GPU found. Please use a GPU to train your neural network.


### Batch Input Data

In [11]:
def batch_data(words, sequence_length, batch_size):
    """
    Batch the neural network data using DataLoader
    :param words: The word ids of the TV scripts
    :param sequence_length: The sequence length of each batch
    :param batch_size: The size of each batch; the number of sequences in a batch
    :return: DataLoader with batched data
    """
    
    ###
    
    feature_stack = None
    
    target_arr = np.array([])
    
    words_length = len(words)
    
    arr = np.arange(words_length)
    np.random.shuffle(arr)
    
    for i in range(batch_size):
        
        start_i = arr[i] if i < words_length else np.random.choice(range(words_length))
        
        end_i = start_i + sequence_length
        
        feature_list = words[start_i:end_i] if end_i < words_length else words[start_i:words_length+1]
        
        remaing_length = sequence_length - len(feature_list)
        
        remaing_list = words[0:remaing_length]
        
        feature_list = np.concatenate((feature_list, remaing_list))
        
        target_element = words[end_i] if end_i < words_length else words[end_i % words_length]
        
        feature_stack = np.vstack((feature_stack, feature_list)) if i > 0 else feature_list
        
        target_arr = np.append(target_arr, target_element)
        
    ###
    
    feature_tensors = torch.from_numpy(feature_stack)
    target_tensors = torch.from_numpy(target_arr)
    
    ###
    
    data = TensorDataset(feature_tensors, target_tensors)
    data_loader = torch.utils.data.DataLoader(data, 
                                          batch_size=batch_size)
        
        
    return data_loader

In [12]:
test_text = range(10)
t_loader = batch_data(test_text, sequence_length=5, batch_size=13)

data_iter = iter(t_loader)
sample_x, sample_y = data_iter.next()

print(sample_x.shape)
print(sample_x)
print()
print(sample_y.shape)
print(sample_y)

torch.Size([13, 5])
tensor([[4., 5., 6., 7., 8.],
        [7., 8., 9., 0., 1.],
        [3., 4., 5., 6., 7.],
        [1., 2., 3., 4., 5.],
        [0., 1., 2., 3., 4.],
        [5., 6., 7., 8., 9.],
        [8., 9., 0., 1., 2.],
        [2., 3., 4., 5., 6.],
        [9., 0., 1., 2., 3.],
        [6., 7., 8., 9., 0.],
        [1., 2., 3., 4., 5.],
        [4., 5., 6., 7., 8.],
        [4., 5., 6., 7., 8.]], dtype=torch.float64)

torch.Size([13])
tensor([9., 2., 8., 6., 5., 0., 3., 7., 4., 1., 6., 9., 9.],
       dtype=torch.float64)
