In [3]:
import os
from tqdm import tqdm
import pickle
import numpy as np
import h5py

In [4]:
# Root directory of the corpus.
data_dir = "./data"

# Epoch labels. The order matters: the position of a label in this list is
# used later as its slot in the one-hot epoch part of the feature vector.
epochs = [
    'średniowiecze',
    'współczesność',
    'modernizm',
    'romantyzm',
    'barok',
    'oświecenie',
    'renesans',
    'pozytywizm',
    'dwudziestolecie',
    'starożytność',
]


In [5]:
sizes = ! du ./data
epoch_sizes = {}
for s in sizes:
    size = s.split()[0]
    name = s.split()[1].split('/')[-1]
    epoch_sizes[name] = int(size)
epoch_sizes

{'barok': 7520,
 'data': 478860,
 'dwudziestolecie': 30784,
 'modernizm': 46736,
 'nie': 1376,
 'oświecenie': 11556,
 'pozytywizm': 68124,
 'renesans': 12992,
 'romantyzm': 41012,
 'starożytność': 4172,
 'współczesność': 17560,
 'średniowiecze': 1800}

In [6]:
# Turn directory sizes into extra sampling strides: each size is divided by
# the smallest one, and twice the natural log of that ratio, rounded down,
# becomes the skip for that directory (0 for the smallest).
m = min(epoch_sizes.values())
skips = np.log(np.array(list(epoch_sizes.values())) / m) * 2
epoch_skips = {
    label: int(np.floor(extra))
    for label, extra in zip(epoch_sizes, skips)
}
epoch_skips

{'barok': 3,
 'data': 11,
 'dwudziestolecie': 6,
 'modernizm': 7,
 'nie': 0,
 'oświecenie': 4,
 'pozytywizm': 7,
 'renesans': 4,
 'romantyzm': 6,
 'starożytność': 2,
 'współczesność': 5,
 'średniowiecze': 0}

In [7]:
!cd data; cat */*.txt > big_blob.txt
with open('./data/big_blob.txt', 'r', encoding="utf-8") as blob:
    b = blob.read()
    chars = set(b)
    freq = {c: 0 for c in chars}
    for c in list(b):
        freq[c] +=1
        
    del b
    
freq_sorted = sorted(freq.items(), key=lambda x: x[1])
freq_filtered = [x[0] for x in freq_sorted if x[1] > 1000]
print("".join(sorted(freq_filtered)))
freq_filtered += ["NULL"]

char_idx = {c: i for i, c in enumerate(sorted(freq_filtered))}
pickle.dump(char_idx, open('charmap.pickle','wb'))


 !"'()*,-./0123456789:;<>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz«­»ÓßàäèéêóöüĄąćčėĘęįŁłŃńŚśŠšūųŹźŻżž–—’”„…


In [8]:
def process_text_with_epochs(files_dir, all_epochs, epoch, char_idx, epoch_skips, hdf_path ="./dataset.h5f", seq_maxlen=25, redun_step=3):
    """One-hot encode the text files of one epoch and append them to an HDF5 file.

    Every regular file in ``files_dir/epoch`` is read, its last 1236
    characters are dropped, and the rest is cut into windows of
    ``seq_maxlen`` characters taken every ``redun_step + epoch_skips[epoch]``
    characters. Each window becomes one sample of dataset ``X`` (per time
    step: a one-hot character plus a one-hot epoch marker) and the character
    right after the window becomes the matching one-hot row of dataset ``Y``.

    Parameters
    ----------
    files_dir : str
        Directory containing one sub-directory per epoch.
    all_epochs : list of str
        All epoch labels. Its length sets the width of the epoch part of the
        feature vector and the position of ``epoch`` in it selects the slot.
    epoch : str
        Label (and sub-directory name) of the epoch to process.
    char_idx : dict
        Maps a character to its column index. Characters that are not in the
        map use the index stored under the key ``'NULL'``.
        Assumes the indices are 0..len(char_idx)-1 -- TODO confirm for maps
        not built by this notebook.
    epoch_skips : dict
        Maps an epoch label to the extra stride added to ``redun_step``.
    hdf_path : str
        HDF5 file to write. It is opened in append mode, but datasets ``X``
        and ``Y`` are always created, so h5py raises an error if the file
        already contains them (e.g. when the function is re-run).
    seq_maxlen : int
        Number of characters per window.
    redun_step : int
        Base distance between the starts of consecutive windows.

    Returns
    -------
    tuple
        ``(X.shape, Y.shape)`` after all files have been appended.

    Raises
    ------
    ValueError
        If ``epoch`` is not in ``all_epochs``.
    KeyError
        If ``epoch`` is missing from ``epoch_skips``, or an unmapped
        character is met and ``char_idx`` has no ``'NULL'`` entry.
    """
    def map_char(char):
        # Unknown characters share the 'NULL' slot; looked up lazily so a
        # map that covers every character does not need the entry.
        idx = char_idx.get(char)
        return char_idx['NULL'] if idx is None else idx

    len_chars = len(char_idx)
    # Use the all_epochs argument rather than a global list, so the function
    # also works in a worker process or a module without that global.
    vector_len = len_chars + len(all_epochs)
    epoch_column = len_chars + all_epochs.index(epoch)
    # Effective stride: larger epochs are sampled more sparsely.
    step = redun_step + epoch_skips[epoch]

    with h5py.File(hdf_path, "a") as hdf5_file:
        Xt = hdf5_file.create_dataset('X', (0, seq_maxlen, vector_len),
                                      maxshape=(None, seq_maxlen, vector_len),
                                      dtype='bool')

        Yt = hdf5_file.create_dataset('Y', (0, len_chars),
                                      maxshape=(None, len_chars),
                                      dtype='bool')

        print("Processing: ",epoch, 'step=',step)
        epoch_path = os.path.join(files_dir, epoch)
        txt_files = [os.path.join(epoch_path, f) for f in os.listdir(epoch_path) if os.path.isfile(os.path.join(epoch_path, f))]

        for file in tqdm(txt_files):
            # Decode explicitly as UTF-8 (the corpus is read as UTF-8 when
            # the character map is built) instead of the locale default.
            with open(file, 'rt', encoding="utf-8") as txt:
                string = txt.read()
            string = string[:-1236] #removing footnote

            # Window start offsets; the stride is `step`, not `redun_step`,
            # so that the per-epoch skip is actually applied.
            starts = range(0, len(string) - seq_maxlen, step)
            n_samples = len(starts)
            if n_samples == 0:
                # Text too short for a single window. Without this guard the
                # slice [-0:] below would address the whole dataset.
                continue

            x = np.zeros((n_samples, seq_maxlen, vector_len), dtype=bool)
            y = np.zeros((n_samples, len_chars), dtype=bool)
            # The epoch marker is set on every time step of every sample.
            x[:, :, epoch_column] = True
            for i, start in enumerate(starts):
                for t, char in enumerate(string[start: start + seq_maxlen]):
                    x[i, t, map_char(char)] = True
                # Target: the character that follows the window.
                y[i, map_char(string[start + seq_maxlen])] = True

            # Grow both datasets by this file's samples and write them into
            # the freshly added tail.
            Xt.resize(Xt.shape[0] + n_samples, axis=0)
            Xt[-n_samples:] = x

            Yt.resize(Yt.shape[0] + n_samples, axis=0)
            Yt[-n_samples:] = y

        shapes = (Xt.shape, Yt.shape)
        print(shapes)

    return shapes

In [9]:
#shapes = process_text_with_epochs(data_dir, epochs, 'średniowiecze', char_idx, epoch_skips)

  0%|          | 0/10 [00:00<?, ?it/s]

Processing:  średniowiecze step= 3


100%|██████████| 10/10 [00:19<00:00,  1.95s/it]


In [10]:
#shapes

((550534, 25, 134), (550534, 124))

In [18]:
from multiprocessing import Pool

# One task per epoch; each worker writes its own "dataset_<epoch>.h5f" file.
pool = Pool()

# Keep every AsyncResult: apply_async stores an exception raised in a worker
# inside the result object, and it is lost unless get() is called on it.
async_results = []
for epoch in epochs:
    async_result = pool.apply_async(process_text_with_epochs, [data_dir, epochs, epoch,
                                                               char_idx, epoch_skips,
                                                               "dataset_"+epoch+".h5f",
                                                               25, 3])
    async_results.append((epoch, async_result))

pool.close()
pool.join()

# Report failed epochs instead of dropping their errors silently. All epochs
# are checked, so one failure does not hide the others.
for epoch, async_result in async_results:
    try:
        async_result.get()
    except Exception as exc:
        print("Failed: ", epoch, repr(exc))


Processing:  współczesność step= 8
Processing:  romantyzm step= 9
Processing:  barok step= 6
Processing:  renesans step= 7
Processing:  nie step= 3
Processing:  średniowiecze step= 3
Processing:  oświecenie step= 7


  0%|          | 0/10 [00:00<?, ?it/s]]

Processing:  modernizm step= 10


  2%|▏         | 16/830 [00:06<05:52,  2.31it/s]]





Process ForkPoolWorker-11:
Process ForkPoolWorker-14:
Process ForkPoolWorker-15:
Process ForkPoolWorker-12:
Process ForkPoolWorker-9:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwarg