In [1]:
import string

import numpy as np
import torch
torch.set_printoptions(edgeitems=2, precision=2, linewidth=75)

In [4]:
with open('../../data/p1ch4/jane-austen/1342-0.txt', encoding='utf8') as f:
    text = f.read()

In [5]:
lines = text.split('\n')

In [6]:
line = lines[200]
line

'“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'

In [8]:
letter_t = torch.zeros(len(line), 128)

In [9]:
for i, letter in enumerate(line.lower().strip()):
    letter_index = ord(letter) if ord(letter) < 128 else 0
    letter_t[i][letter_index] = 1

In [10]:
letter_t

tensor([[1., 0.,  ..., 0., 0.],
        [0., 0.,  ..., 0., 0.],
        ...,
        [0., 0.,  ..., 0., 0.],
        [0., 0.,  ..., 0., 0.]])

In [11]:
def clean_words(input_str):
    punctuation = '.,;:"!?“”_-'
    word_list = input_str.lower().replace('\n', ' ').split()
    word_list = [word.strip(punctuation) for word in word_list]
    return word_list

In [12]:
words_in_line = clean_words(line)

In [13]:
words_in_line

['impossible',
 'mr',
 'bennet',
 'impossible',
 'when',
 'i',
 'am',
 'not',
 'acquainted',
 'with',
 'him']

In [14]:
word_list = sorted(set(clean_words(text)))

In [15]:
word_list

['',
 '#1342]',
 '$5,000)',
 "'_she",
 "'after",
 "'ah",
 "'as-is'",
 "'bingley",
 "'had",
 "'having",
 "'i",
 "'keep",
 "'lady",
 "'lately",
 "'lydia",
 "'mr",
 "'my",
 "'oh",
 "'s",
 "'this",
 "'tis",
 "'violently",
 "'yes,'",
 "'you",
 '($1',
 '(801)',
 '(a)',
 '(an',
 '(and',
 '(any',
 '(available',
 '(b)',
 '(by',
 '(c)',
 '(comparatively',
 '(does',
 '(for',
 '(glancing',
 '(if',
 '(lady',
 '(like',
 '(most',
 '(my',
 '(or',
 '(trademark/copyright)',
 '(unasked',
 '(what',
 '(who',
 '(www.gutenberg.org)',
 '(“the',
 '*',
 '***',
 '*****',
 '1',
 '1.a',
 '1.b',
 '1.c',
 '1.d',
 '1.e',
 '1.e.1',
 '1.e.2',
 '1.e.3',
 '1.e.4',
 '1.e.5',
 '1.e.6',
 '1.e.7',
 '1.e.8',
 '1.e.9',
 '1.f',
 '1.f.1',
 '1.f.2',
 '1.f.3',
 '1.f.4',
 '1.f.5',
 '1.f.6',
 '10',
 '11',
 '12',
 '13',
 '1342-0.txt',
 '1342-0.zip',
 '14',
 '15',
 '1500',
 '15th',
 '16',
 '17',
 '18',
 '18th',
 '19',
 '1998',
 '2',
 '20',
 '20%',
 '2001',
 '2008',
 '2018',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '26th',
 '27',
 '2

In [16]:
word2index_dict = {word: i for (i, word) in enumerate(word_list)}

In [17]:
word2index_dict

{'': 0,
 '#1342]': 1,
 '$5,000)': 2,
 "'_she": 3,
 "'after": 4,
 "'ah": 5,
 "'as-is'": 6,
 "'bingley": 7,
 "'had": 8,
 "'having": 9,
 "'i": 10,
 "'keep": 11,
 "'lady": 12,
 "'lately": 13,
 "'lydia": 14,
 "'mr": 15,
 "'my": 16,
 "'oh": 17,
 "'s": 18,
 "'this": 19,
 "'tis": 20,
 "'violently": 21,
 "'yes,'": 22,
 "'you": 23,
 '($1': 24,
 '(801)': 25,
 '(a)': 26,
 '(an': 27,
 '(and': 28,
 '(any': 29,
 '(available': 30,
 '(b)': 31,
 '(by': 32,
 '(c)': 33,
 '(comparatively': 34,
 '(does': 35,
 '(for': 36,
 '(glancing': 37,
 '(if': 38,
 '(lady': 39,
 '(like': 40,
 '(most': 41,
 '(my': 42,
 '(or': 43,
 '(trademark/copyright)': 44,
 '(unasked': 45,
 '(what': 46,
 '(who': 47,
 '(www.gutenberg.org)': 48,
 '(“the': 49,
 '*': 50,
 '***': 51,
 '*****': 52,
 '1': 53,
 '1.a': 54,
 '1.b': 55,
 '1.c': 56,
 '1.d': 57,
 '1.e': 58,
 '1.e.1': 59,
 '1.e.2': 60,
 '1.e.3': 61,
 '1.e.4': 62,
 '1.e.5': 63,
 '1.e.6': 64,
 '1.e.7': 65,
 '1.e.8': 66,
 '1.e.9': 67,
 '1.f': 68,
 '1.f.1': 69,
 '1.f.2': 70,
 '1.f.3': 7

In [18]:
word_t = torch.zeros(len(words_in_line), len(word2index_dict))

In [19]:
for i, word in enumerate(words_in_line):
    word_index = word2index_dict[word]
    word_t[i][word_index] = 1
    print('{:2} {:4} {}'.format(i, word_index, word))

 0 3394 impossible
 1 4305 mr
 2  813 bennet
 3 3394 impossible
 4 7078 when
 5 3315 i
 6  415 am
 7 4436 not
 8  239 acquainted
 9 7148 with
10 3215 him
