[View in Colaboratory](https://colab.research.google.com/github/ysterin/deep-pointing/blob/wikisource-text/deep_pointing_keras.ipynb)

In [0]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once per notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)



In [9]:
file_id = '1t4zss4j8GkqkPEKIBXpYD6brm2G3Y8DG'
downloaded = drive.CreateFile({'id': file_id})
print('Downloaded content "{}"'.format(downloaded.GetContentFile('wikisource.zip')))

Downloaded content "None"


In [0]:
!unzip wikisource.zip
!ls

In [8]:
import os
from random import shuffle


TypeError: ignored

In [0]:
from random import shuffle
import os
from os import path
text = ''
files_list = os.listdir('./wikisource')
shuffle(files_list)
for f in files_list:
  with open("./wikisource/"+f, 'rb') as file:
    text += file.read().decode('utf-8')

In [0]:
import re

from collections import Counter
text = re.sub(chr(64288), chr(1506), text)
text = re.sub(chr(64291), chr(1492), text)
text = re.sub(chr(1469), '', text)
text = re.sub(chr(1472), '', text)
taam_pattern = '[' + ''.join([chr(i) for i in range(1425, 1456)]) + ']'
text = re.sub(taam_pattern, '', text)
counter = Counter(text)
for c in counter:
  if counter[c] < 500:
    text = text.replace(c, ' ')

In [0]:
from collections import Counter
from pprint import pprint
print(len(text))
print(len(counter))
for k, v in sorted(list(counter.items()), key=lambda a:a[0]):
  print(ord(k), k, v)

In [0]:
import numpy as np
ids = [i for i, c in enumerate(text) if ord(c)==8212]

for i in np.random.choice(ids, 30):
  print(text[i-20: i+20])
  

In [0]:
# http://pytorch.org/
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.3.0.post4-{platform}-linux_x86_64.whl torchvision
import torch

In [14]:
len(text)

64061639

In [15]:

text[:100], text[-100:]

('\n /ניקוד\n\n\nוַיְהִי דְבַר יְהוָה אֶל יֵהוּא בֶן חֲנָנִי עַל בַּעְשָׁא לֵאמֹר.\nיַעַן אֲשֶׁר הֲרִימֹתִי',
 'ָ וַתְּחַלְלֶהָ. \nוְלֹא תַעֲלֶה בְמַעֲלֹת עַל מִזְבְּחִי אֲשֶׁר לֹא תִגָּלֶה עֶרְוָתְךָ עָלָיו. \n\n\n\n')

In [17]:
import re
chars = sorted(set(text))
pointing_chars = [c for c in chars if 1455<ord(c)<1468 or ord(c) in [1479, 65533]]
non_pointing_chars = [c for c in chars if not (1455<ord(c)<1480 or ord(c) == 65533) ]
pointing_chars, non_pointing_chars
special_chars = ['.', '[',']','}', '{', '(', ')', '\\', '\/']
np_chars_pattern = ['\\'+c for c in non_pointing_chars if c in special_chars] + [c for c in non_pointing_chars if c not in special_chars]
print(np_chars_pattern)
np_pattern = '[' + ''.join(np_chars_pattern) + ']'
np_pattern

['\\(', '\\)', '\\.', '\\[', '\\]', '\n', ' ', '!', '"', "'", ',', '-', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'א', 'ב', 'ג', 'ד', 'ה', 'ו', 'ז', 'ח', 'ט', 'י', 'ך', 'כ', 'ל', 'ם', 'מ', 'ן', 'נ', 'ס', 'ע', 'ף', 'פ', 'ץ', 'צ', 'ק', 'ר', 'ש', 'ת', '׳', '״', '–', '—']


'[\\(\\)\\.\\[\\]\n !"\',-/0123456789:;=?אבגדהוזחטיךכלםמןנסעףפץצקרשת׳״–—]'

In [18]:
unpointed_text = ''.join(re.findall(np_pattern, text))
pointings = re.split(np_pattern, text)[1:]
print(len(unpointed_text), len(pointings))
len(''.join([''.join(z) for z in zip(pointings, unpointed_text)])) == len(text)
_text = ''.join([''.join(z) for z in zip(unpointed_text, pointings)])
for i in range(len(text)):
  if _text[i] != text[i]:
    print(i, text[i], _text[i])
    

38219017 38219017


In [0]:
# Create & upload a text file.
uploaded = drive.CreateFile({'title': 'wikisource_text.txt'})
uploaded.SetContentString(text)
uploaded.Upload()
print('Uploaded file with ID {}'.format(uploaded.get('id')))

In [0]:
Counter([len(p) for p in pointings])

In [0]:
ps2ids = {p:i for i, p in enumerate(pointing_chars)}
cs2ids = {c:i for i, c in enumerate(non_pointing_chars)}
ids2ps = {i:p for i, p in enumerate(pointing_chars)}
ids2cs = {i:c for i, c in enumerate(non_pointing_chars)}

In [20]:
from keras.utils import to_categorical
import numpy as np
from keras import utils
vocab_size = len(cs2ids)
X = np.asarray([cs2ids[c] for c in unpointed_text])
#X = utils.to_categorical(X)
N_points = len(pointing_chars)
N = len(unpointed_text)
y = np.zeros(N, dtype=np.int32)
for i in range(N):
  for j in range(N_points):
    if ids2ps[j] in pointings[i]:
      y[i] = j+1
      break
      
print(y[:10])
print(X[:10])
y = to_categorical(y)

Using TensorFlow backend.


[0 0 0 0 0 0 0 0 0 0]
[ 0  1 10 43 36 50 32 30  0  0]


In [14]:
np.sum(y, axis=0)/N

array([4.38975602e-01, 9.61286367e-02, 2.68766724e-03, 1.46530718e-02,
       4.71780841e-04, 7.10740415e-02, 3.63907057e-02, 4.08467334e-02,
       8.04724517e-02, 8.67580137e-02, 4.39818481e-02, 2.28158668e-05,
       2.56142119e-03, 1.74441954e-04, 8.22365473e-05])

In [0]:
from numpy.random import permutation
from keras.utils import Sequence, to_categorical
class batchSeq(Sequence):
    def __init__(self, X, y, length, stride, batch_size, randlen=True, shuffle = True):
      self._N = X.shape[0]
      self.X = X
      self.y = y
      self.length = length
      self.stride = stride
      self.bs = batch_size
      self.index = 0
      self.randlen = randlen
      self.shuffle = shuffle
    
    def __len__(self):
      return ((self._N-self.length)//self.stride)//self.bs

    def __getitem__(self, idx):
      length = self.length
#       if np.random.rand(1)[0] < 0.05:
#         length = length//2
      if self.randlen: length = np.random.randint(length-10, length+10)
      bx = [self.X[self.stride*(self.bs*idx+i):self.stride*(self.bs*idx+i)+length] for i in range(self.bs)]
      by = [self.y[self.stride*(self.bs*idx+i):self.stride*(self.bs*idx+i)+length] for i in range(self.bs)]
      return np.asarray(bx), np.asarray(by)
    
    def __iter__(self):
      self.index = 0
      if self.shuffle: self.permute = permutation(self.__len__())
      return self
    
    def __next__(self):
      if self.index < self.__len__():
        self.index += 1
        if self.shuffle: return self.__getitem__(self.permute[self.index-1])
        else: return self.__getitem__[self.index-1]
      else:
        raise StopIteration

In [0]:
val_id = int(X.shape[0]*0.1)
test_id = int(X.shape[0]*0.8)
X_val, X_trn, X_test = X[:val_id], X[val_id:test_id], X[test_id:]
y_val, y_trn, y_test = y[:val_id], y[val_id:test_id], y[test_id:]

In [58]:
X_val.shape

(3821901,)

In [36]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print('GPU device not found')
print('Found GPU at: {}'.format(device_name))
from keras.models import Sequential
from keras.regularizers import Regularizer, l2, l1
from keras.layers import Dense, Activation, Input, Dropout, GaussianNoise, concatenate, Reshape
from keras.layers import LSTM, SimpleRNN, Bidirectional, GRU, CuDNNLSTM, CuDNNGRU, Embedding
from keras.optimizers import RMSprop, Nadam, SGD
from keras.models import Model
from keras.layers import ActivityRegularization, Masking, TimeDistributed, Concatenate, Multiply, Add
from keras.callbacks import TerminateOnNaN
if device_name == '/device:GPU:0':
  lstm = CuDNNLSTM 
else:
  lstm = LSTM
lstm

Found GPU at: /device:GPU:0


keras.layers.cudnn_recurrent.CuDNNLSTM

In [0]:
def create_model_func(nhidden=128, bptt=60):
  output_dim = y.shape[-1]
  inp = Input(shape=(None,))
  emb = Embedding(vocab_size, nhidden)(inp)
  rnn1 = Bidirectional(lstm(nhidden, return_sequences = True))(emb)
  rnn2 = Bidirectional(lstm(nhidden, return_sequences = True))(rnn1)
  add1 = Add()([rnn1, rnn2])
  rnn3 = Bidirectional(lstm(nhidden, return_sequences = True))(add1)
  add2 = Add()([add1, rnn3])
  rnn4 = Bidirectional(lstm(nhidden, return_sequences = True))(add2)
  add3 = Add()([add2, rnn4])
  dense = Dense(output_dim, activation='softmax')(add3)
  
  return Model(inputs=inp, outputs=dense)
  

In [0]:
def create_model():
  output_dim = y.shape[-1]
  model = Sequential()
  model.add(Embedding(vocab_size, 256))
  model.add(Bidirectional(lstm(128, return_sequences=True)))
  model.add(Bidirectional(lstm(128, return_sequences=True)))
  model.add(Bidirectional(lstm(128, return_sequences=True)))
  model.add(Bidirectional(lstm(128, return_sequences=True)))
  model.add(Dense(output_dim, activation='softmax'))
  #model.summary()
  return model


In [136]:
bptt = 60

lr = 1e-5
i = 0
seq = batchSeq(X_trn, y_trn, bptt, 20, 64)
while lr<3e-2:
  model = create_model_func(nhidden=512)
  opt = RMSprop(lr)
  model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
  bx, by = seq[i]
  print(bx.shape)
  model.train_on_batch(bx, by)
  bx, by = seq[i+1]
  loss = model.train_on_batch(bx, by)
  print(lr, loss)
  lr *= 2
  i += 0


(64, 54)
1e-05 [2.6848588, 0.3927557]
(64, 62)
2e-05 [2.6449108, 0.5]
(64, 57)
4e-05 [2.5874763, 0.49804688]
(64, 65)
8e-05 [2.4971118, 0.49783653]
(64, 62)
0.00016 [2.3067336, 0.5035377]
(64, 59)
0.00032 [1.9925966, 0.49783653]
(64, 67)
0.00064 [1.971943, 0.5015121]
(64, 64)
0.00128 [6.756238, 0.5031829]
(64, 56)
0.00256 [8.067442, 0.49947917]
(64, 67)
0.00512 [8.002027, 0.5035377]
(64, 50)
0.01024 [8.007746, 0.5031829]
(64, 63)
0.02048 [8.050791, 0.5005123]


In [0]:
X_trn.shape

In [138]:
bptt = 60
seq = batchSeq(X_trn, y_trn, bptt, bptt, 64)
val = batchSeq(X_val, y_val, bptt, bptt, 64)
model = create_model_func(nhidden=512)
model.summary()
opt = RMSprop(0.0003)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model.fit_generator(seq, validation_data=val, steps_per_epoch=2000, epochs=10)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_50 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
embedding_82 (Embedding)        (None, None, 512)    29696       input_50[0][0]                   
__________________________________________________________________________________________________
bidirectional_256 (Bidirectiona (None, None, 1024)   4202496     embedding_82[0][0]               
__________________________________________________________________________________________________
bidirectional_257 (Bidirectiona (None, None, 1024)   6299648     bidirectional_256[0][0]          
__________________________________________________________________________________________________
add_127 (A



Epoch 2/10
 326/2000 [===>..........................] - ETA: 11:49 - loss: 0.2462 - acc: 0.9196



Epoch 3/10
 327/2000 [===>..........................] - ETA: 11:54 - loss: 0.1674 - acc: 0.9460



Epoch 4/10
 327/2000 [===>..........................] - ETA: 11:58 - loss: 0.1249 - acc: 0.9597



Epoch 5/10
 327/2000 [===>..........................] - ETA: 11:52 - loss: 0.1137 - acc: 0.9634



Epoch 6/10
 327/2000 [===>..........................] - ETA: 11:50 - loss: 0.1027 - acc: 0.9669



Epoch 7/10
 327/2000 [===>..........................] - ETA: 11:52 - loss: 0.0969 - acc: 0.9686



Epoch 8/10
 327/2000 [===>..........................] - ETA: 11:51 - loss: 0.0796 - acc: 0.9739



Epoch 9/10
 327/2000 [===>..........................] - ETA: 12:01 - loss: 0.0725 - acc: 0.9763



Epoch 10/10
 326/2000 [===>..........................] - ETA: 11:49 - loss: 0.0755 - acc: 0.9752





<keras.callbacks.History at 0x7f63eb22aef0>

In [0]:
model.save("my_model.h5")
model.summary()
opt = RMSprop(0.0003)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model.fit_generator(seq, validation_data=val, steps_per_epoch=2000, epochs=2)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_50 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
embedding_82 (Embedding)        (None, None, 512)    29696       input_50[0][0]                   
__________________________________________________________________________________________________
bidirectional_256 (Bidirectiona (None, None, 1024)   4202496     embedding_82[0][0]               
__________________________________________________________________________________________________
bidirectional_257 (Bidirectiona (None, None, 1024)   6299648     bidirectional_256[0][0]          
__________________________________________________________________________________________________
add_127 (A



Epoch 2/2
 327/2000 [===>..........................] - ETA: 11:57 - loss: 0.0603 - acc: 0.9800



In [0]:
model.save("my_model_512.h5")
#test = batchSeq(X_test, y_test, bptt, bptt, 128)
#model.evaluate_generator(test)

In [1]:
test = batchSeq(X_test, y_test, bptt, bptt, 64, randlen=False)
i = 0
n = len(test)
accuracy = np.zeros((n, bptt))
for bx, by in test:
  prediction = model.predict_on_batch(bx)
  #print(prediction.shape, by.shape)
  predicts = np.argmax(prediction, axis=-1)
  truth = np.argmax(by, axis=-1)
  eq = np.equal(predicts, truth)
  accuracy[i,:] = np.mean(eq, axis=0)
  #accuracy += acc/n
  i += 1
  if i==n:
    break
np.mean(accuracy, axis=0)

NameError: ignored

In [144]:
np.mean(accuracy[:,5:-5])

0.9835456972361809

In [134]:
stride = 10
bs = 128
test_seq = batchSeq(X_test, y_test, bptt, stride, bs, randlen=False)
n = len(test)
N = n*bs*(bptt//stride)
predictions = np.zeros((N, 15))
truth = np.zeros((N, 15))
accuracy = np.zeros((n, bptt))
for i in range(n):
  bx, by = test_seq[i]
  prediction = model.predict_on_batch(bx)
  #print(prediction.shape, by.shape)
  for j in range(bptt):
    for k in range(bs):
      idx = (bs*i+k)*(bptt//stride)+j
      if 0<=idx<N: 
        predictions[idx, :] += prediction[k, j, :]/(bptt//stride)
        truth[idx, : ] += by[k, j, :]/(bptt//stride)
  #predicts = np.argmax(prediction, axis=-1)
  #truth = np.argmax(by, axis=-1)
predicts = np.argmax(predictions, axis=-1)
print(predictions[20:30])
print(predicts[20:30])
truth = np.argmax(truth, axis=-1)
eq = np.equal(predicts, truth)
acc = np.mean(eq)
acc

[[1.66440375e-01 1.66555998e-01 5.45430667e-07 1.37127699e-06
  1.35857382e-08 1.73643918e-05 1.79739603e-06 1.13637903e-05
  1.66734772e-01 2.87334701e-04 1.66614780e-01 8.57745926e-10
  1.62097910e-07 7.25320541e-07 4.62454226e-08]
 [1.66666554e-01 1.66099045e-01 2.87560062e-05 5.01873289e-04
  1.04844120e-06 1.66607004e-01 1.12395787e-05 1.66320229e-01
  3.96618167e-04 4.76814796e-06 2.78298442e-05 3.41668239e-08
  6.65284809e-08 1.31906920e-06 2.86523779e-07]
 [3.33411851e-01 1.66162859e-01 4.92438056e-06 1.26863552e-04
  3.07985881e-07 1.15845475e-04 6.37416362e-06 2.65019825e-05
  1.66571734e-01 9.97469544e-05 1.24080060e-04 2.70862569e-08
  9.66325904e-06 5.50666297e-06 4.09186421e-07]
 [5.00067218e-01 6.44284815e-06 6.93440639e-09 8.35946053e-08
  5.71389556e-09 1.11877667e-06 9.28912774e-05 7.85073762e-07
  1.12680068e-06 3.68234111e-08 1.66495026e-01 8.63286394e-10
  1.83451153e-06 7.57978273e-08 7.84488829e-09]
 [3.37699311e-01 1.66803725e-01 7.81902245e-05 3.75425302e-04
  

0.9800748534338358

In [20]:
opt = RMSprop(0.0002)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model.fit_generator(seq, validation_data=val, epochs=2)

Epoch 1/2

Epoch 2/2
 327/4478 [=>............................] - ETA: 33:13 - loss: 0.0678 - acc: 0.9772





<keras.callbacks.History at 0x7fefcd6f4e10>

In [2]:
from keras import models
model = models.load_model("my_model.h5")


Using TensorFlow backend.


OSError: ignored