/
data.py
67 lines (52 loc) · 1.82 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# Directory prefixes. They are concatenated directly with file names
# elsewhere in this module, hence the trailing slash.
SMS_PATH = 'data/sms/'
MADURAI_PATH = 'data/madurai/'

# Raw input text files for each dataset.
SMS_FILENAME = 'data/sms/sms.txt'
MADURAI_FILENAME = 'data/madurai/sample.txt'
import csv
import numpy as np
import pickle as pkl
def read_lines_sms(filename):
    """Return the last tab-separated field of every record in *filename*.

    The file is parsed with the ``csv`` module using a tab delimiter.

    Args:
        filename: path of the tab-separated text file to read.

    Returns:
        A list of strings, one per non-empty record, each being the final
        column of that record.
    """
    # newline='' is what the csv module expects, so that it can handle
    # line endings (including ones embedded in quoted fields) itself.
    with open(filename, 'r', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        # A blank line parses to an empty row; indexing it with [-1] would
        # raise IndexError, so such rows are skipped.
        return [row[-1] for row in reader if row]
def read_lines(filename):
    """Read *filename* as text and split its content on newline characters.

    The split is a plain ``str.split``, so a file that ends with a newline
    produces a trailing empty string in the result.

    Args:
        filename: path of the text file to read.

    Returns:
        A list of strings, one per newline-separated segment of the file.
    """
    with open(filename) as handle:
        content = handle.read()
    return content.split('\n')
def index_(lines):
    """Build the character vocabulary for *lines*.

    The lines are joined with newline characters first, so the newline
    itself becomes part of the vocabulary whenever there is more than one
    line.

    Args:
        lines: iterable of strings.

    Returns:
        A tuple ``(vocab, ch2idx)`` where ``vocab`` is the list of distinct
        characters and ``ch2idx`` maps each character to its position in
        ``vocab``.
    """
    # Iteration order of a set of strings is not stable between interpreter
    # runs (string hashing is randomized), so sort to make the character
    # ids reproducible.
    vocab = sorted(set('\n'.join(lines)))
    ch2idx = {ch: idx for idx, ch in enumerate(vocab)}
    return vocab, ch2idx
def to_array(lines, seqlen, ch2idx):
    """Encode *lines* as fixed-length input/target index sequences.

    The lines are joined with newline characters into one string, each
    character is mapped through ``ch2idx``, and the result is cut into
    consecutive rows of ``seqlen`` characters. Row ``i`` of ``Y`` is row
    ``i`` of ``X`` shifted one character to the right.

    Args:
        lines: iterable of strings.
        seqlen: number of characters per row; must be at least 1.
        ch2idx: mapping from character to integer index.

    Returns:
        A tuple ``(X, Y)`` of float64 arrays, both of shape
        ``(data_len, seqlen)``.

    Raises:
        ValueError: if ``seqlen`` is smaller than 1.
        KeyError: if a character that is encoded is missing from ``ch2idx``.
    """
    if seqlen < 1:
        raise ValueError('seqlen must be a positive integer')
    # combine into one string
    raw_data = '\n'.join(lines)
    num_chars = len(raw_data)
    # Every target row needs one character beyond the end of its input row,
    # so the last character is reserved for the final target. Without this,
    # a text whose length is an exact multiple of seqlen leaves the last
    # target row one character short.
    data_len = max(num_chars - 1, 0) // seqlen
    used = data_len * seqlen
    # Encode once; float64 matches what np.zeros produced here before.
    encoded = np.array([ch2idx[ch] for ch in raw_data[:used + 1]],
                       dtype=np.float64)
    # copy() so X and Y do not share memory with each other.
    X = encoded[:used].reshape(data_len, seqlen).copy()
    Y = encoded[1:used + 1].reshape(data_len, seqlen).copy()
    return X, Y
def process_data(path, filename, seqlen=10):
    """Index the text in *filename* and write the results under *path*.

    Three files are written: ``idx_x.npy`` and ``idx_y.npy`` hold the
    arrays returned by ``to_array``, and ``metadata.pkl`` holds a pickled
    dict with the keys ``'idx2ch'`` and ``'ch2idx'``.

    Args:
        path: prefix prepended verbatim to each output file name, so a
            directory needs its trailing separator.
        filename: text file to read.
        seqlen: sequence length forwarded to ``to_array``.
    """
    text_lines = read_lines(filename)
    vocab, char_to_index = index_(text_lines)
    inputs, targets = to_array(text_lines, seqlen, char_to_index)

    # index arrays
    np.save(path + 'idx_x.npy', inputs)
    np.save(path + 'idx_y.npy', targets)

    # lookup tables
    with open(path + 'metadata.pkl', 'wb') as out:
        metadata = {'idx2ch': vocab, 'ch2idx': char_to_index}
        pkl.dump(metadata, out)
# When run as a script, build the index files for the Madurai dataset.
if __name__ == '__main__':
    process_data(MADURAI_PATH, MADURAI_FILENAME)
def load_data(path):
    """Load the index arrays and lookup tables stored under *path*.

    Reads ``metadata.pkl``, ``idx_x.npy`` and ``idx_y.npy``, each located
    by prepending *path* verbatim to the file name.

    Args:
        path: prefix of the stored files; a directory needs its trailing
            separator.

    Returns:
        A tuple ``(X, Y, idx2ch, ch2idx)``: the two loaded arrays followed
        by the ``'idx2ch'`` and ``'ch2idx'`` entries of the pickled
        metadata.
    """
    # lookup tables first
    # NOTE: unpickling executes code from the file; only load trusted data.
    with open(path + 'metadata.pkl', 'rb') as src:
        meta = pkl.load(src)

    # then the index arrays
    inputs = np.load(path + 'idx_x.npy')
    targets = np.load(path + 'idx_y.npy')

    return inputs, targets, meta['idx2ch'], meta['ch2idx']