### Get text from internet

In [1]:
import re
import string

import numpy as np
import requests

In [2]:
# Download the raw book text from Project Gutenberg.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error instead of letting
# an error page flow into the text-cleaning steps.
book = requests.get('https://www.gutenberg.org/files/35/35-0.txt', timeout=30)
book.raise_for_status()

# The file is UTF-8. If the server omits the charset, requests falls back to
# ISO-8859-1 for text responses, which turns curly quotes and dashes into
# mojibake such as 'â\x80\x9c'. Pin the encoding so .text always decodes correctly.
book.encoding = 'utf-8'
text = book.text

# quick sanity check: type, length, and the first 2000 characters
print(type(text))
print(len(text))
print(text[:2000])

<class 'str'>
182973
*** START OF THE PROJECT GUTENBERG EBOOK 35 ***




The Time Machine

An Invention

by H. G. Wells


CONTENTS

 I Introduction
 II The Machine
 III The Time Traveller Returns
 IV Time Travelling
 V In the Golden Age
 VI The Sunset of Mankind
 VII A Sudden Shock
 VIII Explanation
 IX The Morlocks
 X When Night Came
 XI The Palace of Green Porcelain
 XII In the Darkness
 XIII The Trap of the White Sphinx
 XIV The Further Vision
 XV The Time Traveller’s Return
 XVI After the Story
 Epilogue




 I.
 Introduction


The Time Traveller (for so it will be convenient to speak of him) was
expounding a recondite matter to us. His pale grey eyes shone and
twinkled, and his usually pale face was flushed and animated. The fire
burnt brightly, and the soft radiance of the incandescent lights in the
lilies of silver caught the bubbles that flashed and passed in our
glasses. Our chairs, being his patents, embraced and caressed us rather

In [3]:
# literal character sequences to replace with a space
# (the 'â\x80..' entries are UTF-8 punctuation that was mis-decoded as latin-1)
strings2replace = [
                 '\r\n\r\nâ\x80\x9c', # new paragraph
                 'â\x80\x9c',         # open quote
                 'â\x80\x9d',         # close quote
                 '\r\n',              # new line
                 'â\x80\x94',         # em dash
                 'â\x80\x99',         # single apostrophe
                 'â\x80\x98',         # single quote
                 '_',                 # underscore, used for stressing
                 ]

# e.g., 'â\x80\x9d'.encode('latin1').decode('utf8')
# These are fixed strings, not patterns, so plain str.replace is used: a future
# entry containing a regex metacharacter (e.g. '(' or '*') cannot be
# misinterpreted or raise a compile error.
for str2match in strings2replace:
  text = text.replace(str2match,' ')

# remove non-ASCII characters (each run collapses to a single space)
text = re.sub(r'[^\x00-\x7F]+', ' ', text)

# remove numbers
text = re.sub(r'\d+','',text)

# and make everything lower-case
text = text.lower()

# let's have a look!
text[:2000]

'*** start of the project gutenberg ebook  ***     the time machine  an invention  by h. g. wells   contents   i introduction  ii the machine  iii the time traveller returns  iv time travelling  v in the golden age  vi the sunset of mankind  vii a sudden shock  viii explanation  ix the morlocks  x when night came  xi the palace of green porcelain  xii in the darkness  xiii the trap of the white sphinx  xiv the further vision  xv the time traveller s return  xvi after the story  epilogue      i.  introduction   the time traveller (for so it will be convenient to speak of him) was expounding a recondite matter to us. his pale grey eyes shone and twinkled, and his usually pale face was flushed and animated. the fire burnt brightly, and the soft radiance of the incandescent lights in the lilies of silver caught the bubbles that flashed and passed in our glasses. our chairs, being his patents, embraced and caressed us rather than submitted to be sat upon, and there was that luxurious after-

### Parse text into words

In [4]:
# split on any run of punctuation and/or whitespace
print(string.punctuation)

# re.escape is required: string.punctuation contains ], \, ^ and -, all of which
# are special inside a [...] character class. Unescaped, the '\' only escapes the
# ']' that follows it (so a backslash in the text is never a delimiter) and
# ',-.' is parsed as a range rather than three literal characters.
puncts4re = fr'[{re.escape(string.punctuation)}\s]+'

# Tokens produced by the split contain no whitespace, so the only clean-up
# needed is dropping empty strings and single-character words.
words = [item for item in re.split(puncts4re,text) if len(item)>1]

# let's have a look!
words[:50]

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


['start',
 'of',
 'the',
 'project',
 'gutenberg',
 'ebook',
 'the',
 'time',
 'machine',
 'an',
 'invention',
 'by',
 'wells',
 'contents',
 'introduction',
 'ii',
 'the',
 'machine',
 'iii',
 'the',
 'time',
 'traveller',
 'returns',
 'iv',
 'time',
 'travelling',
 'in',
 'the',
 'golden',
 'age',
 'vi',
 'the',
 'sunset',
 'of',
 'mankind',
 'vii',
 'sudden',
 'shock',
 'viii',
 'explanation',
 'ix',
 'the',
 'morlocks',
 'when',
 'night',
 'came',
 'xi',
 'the',
 'palace',
 'of']

In [5]:
# build the vocabulary: every distinct word, in alphabetical order
vocab = list(set(words))
vocab.sort()

# sizes of the corpus and of the vocabulary, reused in later cells
nWords, nLex = len(words), len(vocab)

print(f'{nWords} words')
print(f' {nLex} unique tokens')

30698 words
 4589 unique tokens


### Create Tokens Dictionary

In [6]:
# two lookup tables over the sorted vocab:
# index -> word, and its inverse word -> index
idx2word = dict(enumerate(vocab))
word2idx = {word:idx for idx,word in idx2word.items()}

# spot check: show every 87th (word, index) pair among the first 10000
sampled_pairs = list(word2idx.items())[0:10000:87]
for pair in sampled_pairs:
  print(pair)

('abandon', 0)
('aimlessly', 87)
('apologise', 174)
('attained', 261)
('behaved', 348)
('both', 435)
('can', 522)
('cheerfully', 609)
('coat', 696)
('contents', 783)
('culminating', 870)
('delay', 957)
('dimness', 1044)
('dragging', 1131)
('edition', 1218)
('everywhere', 1305)
('facilities', 1392)
('find', 1479)
('footfall', 1566)
('furnishing', 1653)
('gold', 1740)
('hallo', 1827)
('high', 1914)
('ideas', 2001)
('inextinguishable', 2088)
('invest', 2175)
('lamp', 2262)
('likewise', 2349)
('manhood', 2436)
('minerals', 2523)
('mysteries', 2610)
('novelty', 2697)
('outbreaks', 2784)
('paws', 2871)
('plato', 2958)
('previously', 3045)
('questionings', 3132)
('reflecting', 3219)
('return', 3306)
('sandals', 3393)
('senses', 3480)
('shrinking', 3567)
('slit', 3654)
('special', 3741)
('stick', 3828)
('sudden', 3915)
('tap', 4002)
('thrice', 4089)
('treat', 4176)
('unfrozen', 4263)
('vertical', 4350)
('wearisome', 4437)
('wonderful', 4524)


In [7]:
# encoder function (list-comprehension lookup, returned as a NumPy array)
def encoder(words,encode_dict):
  """Translate a sequence of words into their integer indices.

  Args:
    words: sequence of words to encode.
    encode_dict: mapping from word to integer index.

  Returns:
    1-D NumPy integer array holding one index per word, in input order.

  Raises:
    KeyError: if a word is not a key of encode_dict.
  """
  # look up every word first, then convert to an integer array in one step
  token_ids = [encode_dict[word] for word in words]
  return np.array(token_ids,dtype=int)


# the inverse operation: indices back to text
def decoder(idxs,decode_dict):
  """Turn a sequence of indices back into a space-separated string of words.

  Args:
    idxs: iterable of indices to decode.
    decode_dict: mapping from index to word.

  Returns:
    The decoded words joined by single spaces ('' for empty input).

  Raises:
    KeyError: if an index is not a key of decode_dict.
  """
  decoded_words = (decode_dict[token] for token in idxs)
  return ' '.join(decoded_words)


In [8]:
# smoke-test the encoder: three known words -> their vocab indices
sample_ids = encoder(['the','time','machine'],word2idx)
print(sample_ids)

# smoke-test the decoder: three arbitrary indices -> their words
sample_text = decoder([1,3,10],idx2word)
print(sample_text)

[4042 4109 2416]
abandoned abnormally absent
