-
Notifications
You must be signed in to change notification settings - Fork 3
/
dataset.py
67 lines (58 loc) · 2.13 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# -*- coding: utf-8 -*-
import numpy as np
from collections import defaultdict
import linecache
import codecs
import random
# --- Processing Data : corpus ---
"""
Class taking 1. paths to text and 2. corresponding tag files as inputs, to create a 'sampler' that will output
the necessary information for the network, sentence by sentence:
n-grams of characters indexes
index of the tag
length of each word
position of the characters that start each word
word indexes
Sentences are shuffled
The last input indicates the level of detail for the tags: 'ups' is the simplest, 'pos' is Part-of-Speech, 'mph' contains all the morphosyntactic information.
"""
class Dataset():
    """Load a whitespace-tokenised corpus with its tag file and serve batches.

    The data file and the tag file are read in lockstep, one sentence per
    line. Every kept sentence is stored twice: as a list of word indexes in
    ``self.wid`` and as an ``int32`` array of tag indexes in ``self.y``.
    Sentences are visited in a random order that is reshuffled at the start
    of every pass over the data.
    """

    def __init__(self, path_to_data_file, path_to_output_file, wordvocab, outvocab, batch_size = 1):
        """Read both files and index every sentence.

        Args:
            path_to_data_file: path to the text file, one sentence per line,
                tokens separated by whitespace.
            path_to_output_file: path to the tag file, aligned line by line
                and token by token with the data file.
            wordvocab: mapping from lower-cased word to integer index; words
                missing from it are mapped to 0.
            outvocab: mapping from tag to integer index; tags missing from it
                are mapped to 0.
            batch_size: number of sentences yielded per batch by ``sampler``.

        Raises:
            AssertionError: if a kept line does not have the same number of
                words and tags.
        """
        self.infile = path_to_data_file
        self.outputfile = path_to_output_file
        self.wordvocab = wordvocab
        self.outvocab = outvocab
        self.batch_size = batch_size
        self.y = list()
        self.wid = list()
        with open(self.infile) as data_file, open(self.outputfile) as tags_file:
            # zip stops at the end of the shorter file.
            for line_d, line_t in zip(data_file, tags_file):
                words = [w.lower() for w in line_d.strip().split()]  # lower case
                # Lines with zero or one token are skipped entirely.
                if len(words) <= 1:
                    continue
                labels = line_t.strip().split()
                assert len(words) == len(labels), (
                    "word/tag count mismatch: %d words vs %d tags"
                    % (len(words), len(labels)))
                self.wid.append([self.wordvocab.get(w, 0) for w in words])
                pids = [self.outvocab.get(p, 0) for p in labels]
                self.y.append(np.asarray(pids, dtype='int32'))
        assert(len(self.wid) == len(self.y))
        self.cpt = 0
        self.tot = len(self.wid)
        # random.shuffle works in place and needs a mutable sequence; on
        # Python 3 a bare range() is immutable, so materialise it as a list.
        self.ids = list(range(self.tot))
        random.shuffle(self.ids)

    def sampler(self):
        """Yield ``(idlist, ylist)`` batches forever.

        ``idlist`` holds ``batch_size`` lists of word indexes and ``ylist``
        the matching ``int32`` arrays of tag indexes. When fewer than
        ``batch_size`` sentences remain in the current pass, the remainder is
        dropped, the order is reshuffled and a new pass starts.

        Raises:
            IndexError: on the first batch if ``batch_size`` is larger than
                the number of loaded sentences (including an empty corpus).
        """
        while True:
            if self.cpt + self.batch_size > self.tot:
                self.cpt = 0
                random.shuffle(self.ids)
            ylist = list()
            idlist = list()
            for _ in range(self.batch_size):
                self.cpt += 1
                sentence = self.ids[self.cpt - 1]
                ylist.append(self.y[sentence])
                idlist.append(self.wid[sentence])
            yield idlist, ylist