## 2. Data Preparation

### 2.8 Load Text - Kafka on the Shore by Haruki Murakami

In [1]:
# define function to load entire text file into memory and return it.
def load_doc(filename):
    file = open(filename, 'r', 
                encoding='utf-8')    # open the file as read only
    text = file.read()               # read all the text
    file.close()                     # close the file
    return text

In [2]:
# load document, Text_Cleaned.txt 
in_filename = '../data/kots_text.txt'
doc = load_doc(in_filename)

#preview first 250 characters 
print(doc[:250])                 

“Cash isn’t the only thing I take from my father’s study when I leave home. I take a small, old gold lighter—I like the design and feel of it—and a folding knife with a really sharp blade. Made to skin deer, it has a five-inch blade and a nice heft. 


### 2.9 Clean Text

In [3]:
# import libraries
import re
import string

In [4]:
# define function to convert document into tokens
def clean_doc(doc):
    # replace '-' with ' ' 
    doc = doc.replace('-', ' ')
    # split doc into tokens by space
    tokens = doc.split()           
    # filter punctuated chars
    re_punc = re.compile('[%s]'% re.escape(string.punctuation)) 
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]  
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()] 
    # make lower case
    tokens = [word.lower() for word in tokens]
    return tokens

In [5]:
# convert document into tokens
tokens = clean_doc(doc)
# print first 200 tokens
print(tokens[:200])

['the', 'only', 'thing', 'i', 'take', 'from', 'my', 'study', 'when', 'i', 'leave', 'home', 'i', 'take', 'a', 'small', 'old', 'gold', 'like', 'the', 'design', 'and', 'feel', 'of', 'a', 'folding', 'knife', 'with', 'a', 'really', 'sharp', 'blade', 'made', 'to', 'skin', 'deer', 'it', 'has', 'a', 'five', 'inch', 'blade', 'and', 'a', 'nice', 'heft', 'probably', 'something', 'he', 'bought', 'on', 'one', 'of', 'his', 'trips', 'abroad', 'i', 'also', 'take', 'a', 'sturdy', 'bright', 'pocket', 'flashlight', 'out', 'of', 'a', 'drawer', 'plus', 'sky', 'blue', 'revo', 'sunglasses', 'to', 'disguise', 'my', 'age', 'i', 'think', 'about', 'taking', 'my', 'favorite', 'sea', 'dweller', 'oyster', 'rolex', 'a', 'beautiful', 'watch', 'but', 'something', 'flashy', 'will', 'only', 'attract', 'attention', 'my', 'cheap', 'plastic', 'casio', 'watch', 'with', 'an', 'alarm', 'and', 'stopwatch', 'will', 'do', 'just', 'fine', 'and', 'might', 'actually', 'be', 'more', 'useful', 'reluctantly', 'i', 'return', 'the', 'ro

In [6]:
# print total number of tokens 
print('Total number of tokens: ', len(tokens))
# convert tokens into a set and print the number of unique tokens
print('Number of unique tokens: ', len(set(tokens)))

Total number of tokens:  661
Number of unique tokens:  328


### 2.10 Save Clean Text

#### Sequence of 50 + 1 Words

In [7]:
# organize token list into sequences
length = 50+1
sequences = list()

for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[i-length:i]
    #convert into a line
    line = ' '.join(seq)
    # store each line in sequences
    sequences.append(line)
    
# print total number of sequences
print('Total Sequences: %d' % len(sequences))

Total Sequences: 610


In [8]:
# save tokens to file, one dialog per line
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [9]:
# save sequences to file
out_filename = '../data/Text_Sequences_50_kots.txt'
save_doc(sequences, out_filename)

#### Sequence of 100 + 1 Words

In [10]:
# organize token list into sequences
length = 100+1
sequences = list()

for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[i-length:i]
    #convert into a line
    line = ' '.join(seq)
    # store each line in sequences
    sequences.append(line)
    
# print total number of sequences
print('Total Sequences: %d' % len(sequences))

Total Sequences: 560


In [11]:
# save sequences to file
out_filename = '../data/Text_Sequences_100_kots.txt'
save_doc(sequences, out_filename)