# https://deeplearningcourses.com/c/deep-learning-recurrent-neural-networks-in-python
# https://udemy.com/deep-learning-recurrent-neural-networks-in-python
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future
import numpy as np
import string
import os
import sys
import operator
from nltk import pos_tag, word_tokenize
from datetime import datetime

def init_weight(Mi, Mo):
    # draw weights with variance 1/(Mi + Mo), in the spirit of
    # Glorot/Xavier initialization
    return np.random.randn(Mi, Mo) / np.sqrt(Mi + Mo)
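
# A minimal usage sketch (not part of the original file): init_weight just
# returns an (Mi, Mo) matrix of small random values; 300 and 100 are
# arbitrary example sizes.
def _demo_init_weight():
    W = init_weight(300, 100)
    print("W shape:", W.shape)   # (300, 100)
    print("W std:", W.std())     # roughly 1/sqrt(400) = 0.05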

def all_parity_pairs(nbit):
    # pad the total number of samples (Ntotal) up to a multiple of 100,
    # presumably so it divides evenly into batches of 100; the extra
    # samples simply repeat earlier bit patterns via ii % N
    N = 2**nbit
    remainder = 100 - (N % 100)
    Ntotal = N + remainder
    X = np.zeros((Ntotal, nbit))
    Y = np.zeros(Ntotal)
    for ii in range(Ntotal):
        i = ii % N
        # now generate the ith sample: the binary representation of i
        for j in range(nbit):
            if i % (2**(j+1)) != 0:
                i -= 2**j
                X[ii,j] = 1
        # the target is the parity (XOR) of the bits
        Y[ii] = X[ii].sum() % 2
    return X, Y
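
# A minimal usage sketch (not part of the original file); nbit=12 is an
# arbitrary example value.
def _demo_all_parity_pairs():
    X, Y = all_parity_pairs(12)
    # 2**12 = 4096 distinct bit patterns, padded up to 4100 samples
    print("X shape:", X.shape)  # (4100, 12)
    print("Y shape:", Y.shape)  # (4100,)
    # each target is the parity of the corresponding row of bits
    assert np.all(Y == X.sum(axis=1) % 2)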

def all_parity_pairs_with_sequence_labels(nbit):
    X, Y = all_parity_pairs(nbit)
    N, t = X.shape

    # we want every time step to have a label:
    # Y_t[n,i] is the parity of the first i+1 bits (the running XOR)
    Y_t = np.zeros(X.shape, dtype=np.int32)
    for n in range(N):
        ones_count = 0
        for i in range(t):
            if X[n,i] == 1:
                ones_count += 1
            if ones_count % 2 == 1:
                Y_t[n,i] = 1

    X = X.reshape(N, t, 1).astype(np.float32)
    return X, Y_t
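
# A minimal usage sketch (not part of the original file): the sequence
# version returns one input per time step and a running-parity label per
# time step; nbit=12 is again an arbitrary example value.
def _demo_all_parity_pairs_with_sequence_labels():
    X, Y_t = all_parity_pairs_with_sequence_labels(12)
    print("X shape:", X.shape)      # (4100, 12, 1) -- N x T x D with D=1
    print("Y_t shape:", Y_t.shape)  # (4100, 12)    -- one label per time step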

# unfortunately, str.translate works differently in Python 2 and Python 3
def remove_punctuation_2(s):
    return s.translate(None, string.punctuation)

def remove_punctuation_3(s):
    return s.translate(str.maketrans('', '', string.punctuation))

if sys.version.startswith('2'):
    remove_punctuation = remove_punctuation_2
else:
    remove_punctuation = remove_punctuation_3
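
# A minimal usage sketch (not part of the original file).
def _demo_remove_punctuation():
    print(remove_punctuation("Whose woods these are, I think I know."))
    # -> "Whose woods these are I think I know"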

def get_robert_frost():
    # load the Robert Frost poems (one poem line per "sentence") as sequences
    # of word indices; indices 0 and 1 are reserved for START and END tokens
    word2idx = {'START': 0, 'END': 1}
    current_idx = 2
    sentences = []
    for line in open('../hmm_class/robert_frost.txt'):
        line = line.strip()
        if line:
            tokens = remove_punctuation(line.lower()).split()
            sentence = []
            for t in tokens:
                if t not in word2idx:
                    word2idx[t] = current_idx
                    current_idx += 1
                idx = word2idx[t]
                sentence.append(idx)
            sentences.append(sentence)
    return sentences, word2idx
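
# A minimal usage sketch (not part of the original file). It assumes the
# robert_frost.txt file from the HMM class lives at ../hmm_class/.
def _demo_get_robert_frost():
    sentences, word2idx = get_robert_frost()
    print("number of lines:", len(sentences))
    print("vocabulary size:", len(word2idx))  # includes START (0) and END (1)
    print("first line as word indices:", sentences[0])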

def my_tokenizer(s):
    s = remove_punctuation(s)
    s = s.lower() # downcase
    return s.split()
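
# A minimal usage sketch (not part of the original file).
def _demo_my_tokenizer():
    print(my_tokenizer("The woods are lovely, dark and deep."))
    # -> ['the', 'woods', 'are', 'lovely', 'dark', 'and', 'deep']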

def get_wikipedia_data(n_files, n_vocab, by_paragraph=False):
    prefix = '../large_files/'

    if not os.path.exists(prefix):
        print("Are you sure you've downloaded, converted, and placed the Wikipedia data into the proper folder?")
        print("I'm looking for a folder called large_files, adjacent to the class folder, but it does not exist.")
        print("Please download the data from https://dumps.wikimedia.org/")
        print("Quitting...")
        sys.exit()

    input_files = [f for f in os.listdir(prefix) if f.startswith('enwiki') and f.endswith('txt')]

    if len(input_files) == 0:
        print("Looks like you don't have any data files, or they're in the wrong location.")
        print("Please download the data from https://dumps.wikimedia.org/")
        print("Quitting...")
        sys.exit()

    # return variables
    sentences = []
    word2idx = {'START': 0, 'END': 1}
    idx2word = ['START', 'END']
    current_idx = 2
    # give START and END infinite counts so they always survive the vocab cut
    word_idx_count = {0: float('inf'), 1: float('inf')}

    if n_files is not None:
        input_files = input_files[:n_files]

    for f in input_files:
        print("reading:", f)
        for line in open(prefix + f):
            line = line.strip()
            # don't count headers, structured data, lists, etc...
            if line and line[0] not in ('[', '*', '-', '|', '=', '{', '}'):
                if by_paragraph:
                    sentence_lines = [line]
                else:
                    sentence_lines = line.split('. ')
                for sentence in sentence_lines:
                    tokens = my_tokenizer(sentence)
                    for t in tokens:
                        if t not in word2idx:
                            word2idx[t] = current_idx
                            idx2word.append(t)
                            current_idx += 1
                        idx = word2idx[t]
                        word_idx_count[idx] = word_idx_count.get(idx, 0) + 1
                    sentence_by_idx = [word2idx[t] for t in tokens]
                    sentences.append(sentence_by_idx)

    # restrict vocab size to the n_vocab most frequent words
    sorted_word_idx_count = sorted(word_idx_count.items(), key=operator.itemgetter(1), reverse=True)
    word2idx_small = {}
    new_idx = 0
    idx_new_idx_map = {}
    for idx, count in sorted_word_idx_count[:n_vocab]:
        word = idx2word[idx]
        print(word, count)
        word2idx_small[word] = new_idx
        idx_new_idx_map[idx] = new_idx
        new_idx += 1
    # let 'unknown' be the last token
    word2idx_small['UNKNOWN'] = new_idx
    unknown = new_idx

    assert 'START' in word2idx_small
    assert 'END' in word2idx_small
    assert 'king' in word2idx_small
    assert 'queen' in word2idx_small
    assert 'man' in word2idx_small
    assert 'woman' in word2idx_small

    # map old idx to new idx, dropping single-word sentences
    sentences_small = []
    for sentence in sentences:
        if len(sentence) > 1:
            new_sentence = [idx_new_idx_map[idx] if idx in idx_new_idx_map else unknown for idx in sentence]
            sentences_small.append(new_sentence)

    return sentences_small, word2idx_small
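
# A minimal usage sketch (not part of the original file). It assumes
# pre-processed enwiki*.txt dumps already sit in ../large_files/; n_files=1
# and n_vocab=2000 are arbitrary example values.
def _demo_get_wikipedia_data():
    sentences, word2idx = get_wikipedia_data(n_files=1, n_vocab=2000)
    print("number of sentences:", len(sentences))
    print("vocabulary size:", len(word2idx))  # n_vocab + 1 for UNKNOWN
    print("index of UNKNOWN:", word2idx['UNKNOWN'])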

def get_tags(s):
    # return just the part-of-speech tags for the words in s
    tuples = pos_tag(word_tokenize(s))
    return [y for x, y in tuples]
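
# A minimal usage sketch (not part of the original file). Requires the NLTK
# 'punkt' tokenizer and 'averaged_perceptron_tagger' models; if they are
# missing, run nltk.download(...) first.
def _demo_get_tags():
    print(get_tags("Two roads diverged in a yellow wood"))
    # -> something like ['CD', 'NNS', 'VBD', 'IN', 'DT', 'JJ', 'NN']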

def get_poetry_classifier_data(samples_per_class, load_cached=True, save_cached=True):
    datafile = 'poetry_classifier_data.npz'
    if load_cached and os.path.exists(datafile):
        # allow_pickle is required because X is stored as an object array
        # of variable-length sequences
        npz = np.load(datafile, allow_pickle=True)
        X = npz['arr_0']
        Y = npz['arr_1']
        V = int(npz['arr_2'])
        return X, Y, V

    word2idx = {}
    current_idx = 0
    X = []
    Y = []
    for fn, label in zip(('../hmm_class/edgar_allan_poe.txt', '../hmm_class/robert_frost.txt'), (0, 1)):
        count = 0
        for line in open(fn):
            line = line.rstrip()
            if line:
                print(line)
                # tokens = remove_punctuation(line.lower()).split()
                tokens = get_tags(line)
                if len(tokens) > 1:
                    # scan doesn't work nicely here, technically could fix...
                    for token in tokens:
                        if token not in word2idx:
                            word2idx[token] = current_idx
                            current_idx += 1
                    sequence = np.array([word2idx[w] for w in tokens])
                    X.append(sequence)
                    Y.append(label)
                    count += 1
                    print(count)
                    # quit early because the tokenizer is very slow
                    if count >= samples_per_class:
                        break
    if save_cached:
        # save the ragged list of sequences as an object array
        np.savez(datafile, np.array(X, dtype=object), np.array(Y), current_idx)
    return X, Y, current_idx
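
# A minimal usage sketch (not part of the original file). It assumes the Poe
# and Frost text files from the HMM class live at ../hmm_class/; 500 samples
# per class is an arbitrary example value.
def _demo_get_poetry_classifier_data():
    X, Y, V = get_poetry_classifier_data(samples_per_class=500)
    print("number of sequences:", len(X))  # up to 2 * 500
    print("number of labels:", len(Y))
    print("vocabulary (POS tag) size:", V)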

def get_stock_data():
    input_files = os.listdir('stock_data')
    min_length = 2000

    # first find the latest start date, so that every time series can begin
    # at the same time; the CSVs are assumed to be newest-first, so the last
    # line of each file holds that symbol's earliest date
    max_min_date = datetime(2000, 1, 1)
    line_counts = {}
    for f in input_files:
        n = 0
        for line in open('stock_data/%s' % f):
            n += 1
        line_counts[f] = n
        if n > min_length:
            # else we'll ignore this symbol, too little data
            # print('stock_data/%s' % f, 'num lines:', n)
            date = line.split(',')[0]
            date = datetime.strptime(date, '%Y-%m-%d')
            if date > max_min_date:
                max_min_date = date

    print("max min date:", max_min_date)

    # now collect the data, going back no further than max_min_date
    all_binary_targets = []
    all_prices = []
    for f in input_files:
        if line_counts[f] > min_length:
            prices = []
            binary_targets = []
            first = True
            last_price = 0
            for line in open('stock_data/%s' % f):
                if first:
                    # skip the header row
                    first = False
                    continue
                date, price = line.split(',')[:2]
                date = datetime.strptime(date, '%Y-%m-%d')
                if date < max_min_date:
                    break
                price = float(price)
                prices.append(price)
                # 1 if this price is higher than the previously read one, else 0
                target = 1 if last_price < price else 0
                binary_targets.append(target)
                last_price = price
            all_prices.append(prices)
            all_binary_targets.append(binary_targets)

    # D = number of symbols
    # T = length of series
    return np.array(all_prices).T, np.array(all_binary_targets).T # make it T x D
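
# A minimal usage sketch (not part of the original file). It assumes a
# stock_data/ folder of CSV files, each with a header row and newest-first
# rows of "date,price,..." with dates formatted YYYY-MM-DD.
def _demo_get_stock_data():
    prices, binary_targets = get_stock_data()
    print("prices shape:", prices.shape)           # T x D
    print("targets shape:", binary_targets.shape)  # T x D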