## 2. Data Preparation

### 2.5 Load Text - The Castle by Frank Kafka

In [2]:
# define function to load entire text file into memory and return it.
def load_doc(filename):
    file = open(filename, 'r', 
                encoding='utf-8')    # open the file as read only
    text = file.read()               # read all the text
    file.close()                     # close the file
    return text

In [3]:
# load document, Text_Cleaned.txt 
in_filename = '../data/castle_text.txt'
doc = load_doc(in_filename)

#preview first 250 characters 
print(doc[:250])                 

It was late evening when K. arrived. The village lay under deep
snow. There was no sign of the Castle hill, fog and darkness
surrounded it, not even the faintest gleam of light suggested the
large Castle. K. stood a long time on the wooden bridge tha


### 2.6 Clean Text

In [4]:
# import libraries
import re
import string

In [5]:
# define function to convert document into tokens
def clean_doc(doc):
    # replace '-' with ' ' 
    doc = doc.replace('-', ' ')
    # split doc into tokens by space
    tokens = doc.split()           
    # filter punctuated chars
    re_punc = re.compile('[%s]'% re.escape(string.punctuation)) 
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]  
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()] 
    # make lower case
    tokens = [word.lower() for word in tokens]
    return tokens

In [6]:
# convert document into tokens
tokens = clean_doc(doc)
# print first 200 tokens
print(tokens[:200])

['it', 'was', 'late', 'evening', 'when', 'k', 'arrived', 'the', 'village', 'lay', 'under', 'deep', 'snow', 'there', 'was', 'no', 'sign', 'of', 'the', 'castle', 'hill', 'fog', 'and', 'darkness', 'surrounded', 'it', 'not', 'even', 'the', 'faintest', 'gleam', 'of', 'light', 'suggested', 'the', 'large', 'castle', 'k', 'stood', 'a', 'long', 'time', 'on', 'the', 'wooden', 'bridge', 'that', 'leads', 'from', 'the', 'main', 'road', 'to', 'the', 'village', 'gazing', 'upward', 'into', 'the', 'seeming', 'emptiness', 'then', 'he', 'went', 'looking', 'for', 'a', 'nights', 'lodging', 'at', 'the', 'inn', 'they', 'were', 'still', 'awake', 'the', 'landlord', 'had', 'no', 'room', 'available', 'but', 'extremely', 'surprised', 'and', 'confused', 'by', 'the', 'latecomer', 'he', 'was', 'willing', 'to', 'let', 'k', 'sleep', 'on', 'a', 'straw', 'mattress', 'in', 'the', 'taproom', 'k', 'agreed', 'to', 'this', 'a', 'few', 'peasants', 'were', 'still', 'sitting', 'over', 'beer', 'but', 'he', 'did', 'not', 'want', 

In [7]:
# print total number of tokens 
print('Total number of tokens: ', len(tokens))
# convert tokens into a set and print the number of unique tokens
print('Number of unique tokens: ', len(set(tokens)))

Total number of tokens:  1052
Number of unique tokens:  428


### 2.7 Save Clean Text

#### Sequence of 50 + 1 Words

In [8]:
# organize token list into sequences
length = 50+1
sequences = list()

for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[i-length:i]
    #convert into a line
    line = ' '.join(seq)
    # store each line in sequences
    sequences.append(line)
    
# print total number of sequences
print('Total Sequences: %d' % len(sequences))

Total Sequences: 1001


In [9]:
# save tokens to file, one dialog per line
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [10]:
# save sequences to file
out_filename = '../data/Text_Sequences_50_castle.txt'
save_doc(sequences, out_filename)

#### Sequence of 100 + 1 Words

In [11]:
# organize token list into sequences
length = 100+1
sequences = list()

for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[i-length:i]
    #convert into a line
    line = ' '.join(seq)
    # store each line in sequences
    sequences.append(line)
    
# print total number of sequences
print('Total Sequences: %d' % len(sequences))

Total Sequences: 951


In [12]:
# save sequences to file
out_filename = '../data/Text_Sequences_100_castle.txt'
save_doc(sequences, out_filename)