In [1]:
import string, re

In [2]:
# load doc into memory
def load_data(filename):
    """Return the full contents of ``filename`` as a single string.

    The file is opened read-only and decoded as UTF-8; it is closed
    automatically when the ``with`` block exits.
    """
    with open(filename, mode='r', encoding='utf8') as source:
        return source.read()

# clean text
def clean_text(t):
    """Normalise raw tweet text and split it into lowercase alphabetic tokens.

    Steps, in order: lowercase; drop quoted material; drop URLs; collapse
    whitespace; expand ``&amp;`` and a few contractions; drop @mentions and
    #tags; strip every character that is not a-z or a space; split on
    whitespace.

    Args:
        t: the raw text (may span many lines).

    Returns:
        A list of word tokens containing only the letters a-z.
    """
    # to lower
    t = t.lower()
    # normalise the typographic apostrophe so the contraction rules below
    # also fire on "won’t", "they’ll", ... instead of yielding "wont"
    t = t.replace('’', "'")
    # remove quotes: a '"@user ...' quoted tweet is dropped up to end of line
    t = re.sub(r'"@.*', '', t)
    # ... and so is any line entirely wrapped in curly quotes. MULTILINE is
    # required: without it ^/$ anchor only at the start/end of the whole
    # document, so nothing is removed from multi-line input.
    t = re.sub(r'^“.*”$', '', t, flags=re.MULTILINE)
    # remove URLs (http or https)
    t = re.sub(r'https?://\S*', '', t)
    t = re.sub(r'pic\.twitter\.com/\S*', '', t)
    # turn every whitespace run (newlines and tabs included) into one space;
    # must happen before the special-character strip, which would otherwise
    # delete tabs/newlines and glue neighbouring words together
    t = re.sub(r'\s+', ' ', t)
    # replace '&amp;' with 'and'
    t = t.replace('&amp;', 'and')
    # expand contractions; the irregular forms go before the generic "n't"
    # rule, which would otherwise produce "wo not" / "ca not"
    t = t.replace("'ll", ' will')
    t = t.replace("won't", 'will not')
    t = t.replace("can't", 'can not')
    t = t.replace("n't", ' not')
    # remove @mention
    t = re.sub(r'@[A-Za-z0-9_]+', '', t)
    # remove #tag
    t = re.sub(r'#[A-Za-z0-9_]+', '', t)
    # remove special characters (anything that is not a letter or a space)
    t = re.sub(r'[^a-zA-Z ]', '', t)
    # split into tokens by white space; split() already treats runs of
    # spaces as one separator, so no extra collapsing pass is needed
    return t.split()

# save lines to file, one entry per line
def save_doc(lines, filename):
    """Write ``lines`` to ``filename``, joined by newlines.

    Args:
        lines: iterable of strings; each becomes one line of the file.
        filename: destination path; an existing file is overwritten.

    No trailing newline is written after the last entry. The file is
    encoded as UTF-8 so it round-trips with ``load_data``, and the
    ``with`` block guarantees the handle is closed even if the write fails.
    """
    data = '\n'.join(lines)
    with open(filename, 'w', encoding='utf8') as f:
        f.write(data)

In [3]:
# load document
in_filename = 'data/trump_raw_text.txt'
doc = load_data(in_filename)

# clean document
words = clean_text(doc)
print(words[:200])
print('Total words: %d' % len(words))
print('Unique words: %d' % len(set(words)))

# organize into sequences of tokens: each line holds 50 + 1 consecutive words
length = 50 + 1
# Slide a window of `length` tokens over the corpus, one token at a time.
# The upper bound is len(words) + 1 so that the last window -- the one that
# ends on the final token -- is included; stopping at len(words) silently
# drops it (and yields nothing at all when len(words) == length).
sequences = [' '.join(words[i - length:i])
             for i in range(length, len(words) + 1)]
print('Total Sequences: %d' % len(sequences))

# save sequences to file
out_filename = 'data/trump_sequences.txt'
save_doc(sequences, out_filename)

['great', 'to', 'be', 'with', 'our', 'wonderful', 'men', 'and', 'women', 'of', 'the', 'what', 'a', 'job', 'they', 'are', 'doing', 'the', 'problem', 'is', 'not', 'the', 'very', 'talented', 'lowflying', 'helicopter', 'pilots', 'wanting', 'to', 'save', 'our', 'city', 'the', 'problem', 'is', 'the', 'arsonists', 'looters', 'criminals', 'and', 'anarchists', 'wanting', 'to', 'destroy', 'it', 'and', 'our', 'country', 'michael', 'is', 'tough', 'smart', 'and', 'loves', 'our', 'country', 'this', 'has', 'been', 'a', 'big', 'battle', 'in', 'congress', 'for', 'years', 'thank', 'you', 'to', 'our', 'great', 'republican', 'senate', 'congratulations', 'to', 'michael', 'pack', 'nobody', 'has', 'any', 'idea', 'what', 'a', 'big', 'victory', 'this', 'is', 'for', 'america', 'why', 'because', 'he', 'is', 'going', 'to', 'be', 'running', 'the', 'voice', 'of', 'america', 'and', 'everything', 'associated', 'with', 'it', 'to', 'the', 'united', 'states', 'we', 'have', 'now', 'brought', 'more', 'than', 'american', '