Skip to content

Commit

Permalink
fix bugs in file iterator and update compute features
Browse files Browse the repository at this point in the history
  • Loading branch information
GreatYYX committed Apr 21, 2017
1 parent 8fef832 commit 125daf3
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 106 deletions.
208 changes: 121 additions & 87 deletions rltk/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,9 +242,6 @@ def load_feature_configuration(self, name, file_path):
.. code-block:: javascript
{
// the id_path for id field. The id should be a string or number.
// only need one element if json dicts have the same structure.
"id_path": ["id", "index"],
// default value for missing result value.
"missing_value_default": 0,
// ignore or exception.
Expand Down Expand Up @@ -319,14 +316,6 @@ def load_feature_configuration(self, name, file_path):
logger.setLevel(log_level)
config['logging'] = logger_name # replace json object to logger name

# id path (pre-compiled)
if 'id_path' not in config or len(config['id_path']) == 0:
raise ValueError('Missing value of id_path')
if len(config['id_path']) >= 1:
config['id_path'][0] = parse(config['id_path'][0])
if len(config['id_path']) == 2:
config['id_path'][1] = parse(config['id_path'][1])

# features (pre-compiled)
if 'features' not in config:
raise ValueError('Missing value of features')
Expand Down Expand Up @@ -365,7 +354,7 @@ def load_feature_configuration(self, name, file_path):
item['data'] = config
self._rs_dict[name] = item

def compute_feature_vector(self, obj1, obj2, name):
def _compute_feature_vector(self, obj1, obj2, name):
"""
Compute feature vector for two objects.
Expand Down Expand Up @@ -437,84 +426,129 @@ def compute_feature_vector(self, obj1, obj2, name):
vector.append(config['missing_value_default'])

# return vector
try:
# id path
matches1 = config['id_path'][0].find(obj1)
id1 = [match.value for match in matches1]
if len(id1) == 0:
raise ValueError('Missing id in Object1')
matches2 = None
if len(config['id_path']) > 1:
matches2 = config['id_path'][1].find(obj2)
else:
matches2 = config['id_path'][0].find(obj2)
id2 = [match.value for match in matches2]
if len(id2) == 0:
raise ValueError('Missing id in Object2')
ret_dict = {
'id': [id1[0], id2[0]],
'feature_vector': vector
}
return ret_dict
except Exception as e:
logger.error('[{0}] {1}'.format(name, e.message))
if config['error_handling'] == 'exception':
raise e
else: # ignore
pass
return vector

def featurize_ground_truth(self, feature_file_path, ground_truth_file_path, output_file_path=None):
def compute_labeled_features(self, iter1, label_file_path, feature_config_name, feature_output_path, iter2=None):
    """
    Compute feature vectors for the record pairs listed in a label file.

    Args:
        iter1 (FileIterator): File iterator 1.
        label_file_path (str): Json line file of labeled pairs. Each json object
            contains an `id` field (array of two ids) and a `label` field.
        feature_config_name (str): Name of a loaded feature configuration.
        feature_output_path (str): Output json line file; each line contains
            `id`, `feature_vector` and `label`.
        iter2 (FileIterator, optional): File iterator 2. If None, pairs are drawn
            from iter1 itself (each record is compared against the records after
            it). Defaults to None.
    """
    self._has_resource(feature_config_name, 'feature_configuration')

    # index labels as {id1: {id2: label}} for O(1) pair lookup
    labels = {}
    with open(self._get_abs_path(label_file_path), 'r') as f:
        for line in f:
            j = json.loads(line)
            id1, id2 = j['id'][0], j['id'][1]
            # bug fix: store the label itself; the old
            # labels.get(id2, j['label']) could wrongly store a nested dict
            # whenever id2 happened to also be a first-level key
            labels.setdefault(id1, {})[id2] = j['label']

    with open(self._get_abs_path(feature_output_path), 'w') as output:
        for id1, value1 in iter1:
            if id1 not in labels:
                continue
            # if iter2 exists, it always iterates from the start;
            # else continue from the element after id1 (a FileIterator copy
            # resumes at the current position) — mirrors compute_features
            curr_iter2 = iter1.copy() if iter2 is None else iter2.copy()
            for id2, value2 in curr_iter2:
                if id2 not in labels[id1]:
                    continue
                v = self._compute_feature_vector(value1, value2, feature_config_name)
                ret_dict = {
                    'id': [id1, id2],
                    'feature_vector': v,
                    'label': labels[id1][id2]
                }
                output.write(json.dumps(ret_dict))
                output.write('\n')

def compute_features(self, iter1, feature_config_name, feature_output_path, iter2=None, blocking_path=None):
    """
    Compute feature vectors for record pairs.

    Args:
        iter1 (FileIterator): File iterator 1.
        feature_config_name (str): Name of a loaded feature configuration.
        feature_output_path (str): Output json line file; each line contains
            `id` and `feature_vector`.
        iter2 (FileIterator, optional): File iterator 2. If None, pairs are drawn
            from iter1 itself (each record is compared against the records after
            it). Defaults to None.
        blocking_path (str, optional): Json line file where each line is a json
            object mapping an id from iter1 to the list of candidate ids from
            iter2. If given, only pairs inside a block are compared.
            Defaults to None.
    """
    self._has_resource(feature_config_name, 'feature_configuration')

    blocking = {}
    if blocking_path is not None:
        with open(self._get_abs_path(blocking_path), 'r') as f:
            for line in f:
                j = json.loads(line)
                # bug fix: iterating a dict yields keys only; use items()
                # to get (key, candidate-list) pairs
                for k, v in j.items():
                    blocking[k] = set(v)

    # if there's no blocking, compare all the possible pairs, O(n^2)
    with open(self._get_abs_path(feature_output_path), 'w') as output:
        for id1, value1 in iter1:
            if blocking_path is not None and id1 not in blocking:
                continue
            # if iter2 exists, it always iterates from the start;
            # else continue from the element after id1 — a FileIterator copy
            # already resumes at the current position.
            # bug fix: next(iter1.copy()) consumed one element and returned
            # that element (a tuple) instead of the iterator itself
            curr_iter2 = iter1.copy() if iter2 is None else iter2.copy()
            for id2, value2 in curr_iter2:
                if blocking_path is not None and id2 not in blocking[id1]:
                    continue
                v = self._compute_feature_vector(value1, value2, feature_config_name)
                ret_dict = {
                    'id': [id1, id2],
                    'feature_vector': v
                }
                output.write(json.dumps(ret_dict))
                output.write('\n')

# def featurize_ground_truth(self, feature_file_path, ground_truth_file_path, output_file_path=None):
# """
# Featurize the ground truth by feature vector.
#
# Args:
# feature_file_path (str): Json line file of feature vector dicts. \
# Each json object should contains a field of id with the array of two elements.
# ground_truth_file_path (str): Json line file of ground truth.\
# Each json object should contains a field of id with the array of two elements. \
# It also need to contains a field named `label` for ground truth.
# output_file_path (str, optional): If it is None, the featurized ground truth will print to STDOUT. \
# Defaults to None.
# """
# def hashed_id(ids):
# if len(ids) != 2:
# raise ValueError('Incorrect number of id')
# ids = sorted(ids)
#
# # in order to solve the collision in hashing differentiate types of data
# # and to keep just one level comparison of hash key,
# # add fixed length of type mark first
# # here str != unicode (maybe it needs to compare on their base class basestring)
# return '{0}-{1}-{2}-{3}'\
# .format(type(ids[0]).__name__, type(ids[1]).__name__, str(ids[0]), str(ids[1]))
#
# # read ground truth into memory
# ground_truth = dict()
# with open(self._get_abs_path(ground_truth_file_path), 'r') as f:
# for line in f:
# data = json.loads(line)
# k, v = hashed_id(data['id']), data['label']
# ground_truth[k] = v
#
# # featurize feature file
# if output_file_path is None:
# with open(self._get_abs_path(feature_file_path), 'r') as f:
# for line in f:
# data = json.loads(line)
# k = hashed_id(data['id'])
# if k in ground_truth:
# data['label'] = ground_truth[k]
# print data
# else:
# with open(self._get_abs_path(feature_file_path), 'r') as f:
# with open(self._get_abs_path(output_file_path), 'w') as out:
# for line in f:
# data = json.loads(line)
# k = hashed_id(data['id'])
# if k in ground_truth:
# data['label'] = ground_truth[k]
# out.write(json.dumps(data))
# out.write('\n')

def train_classifier(self, featurized_ground_truth, config):
"""
Expand Down Expand Up @@ -570,10 +604,10 @@ def get_file_iterator(self, file_path, *args, **kwargs):
Args:
file_path (str): File path.
type (str): It can be `json_line`, `text`, `csv`. \
For `json_line` file, `id_path` and `value_path` should also be set. \
For `json_line` file, `id_path` should also be set. \
For `text` file, id will be auto generated.
For `csv` file, `id_column` and `value_columns` (list) should be set. \
if there's no header, please set `field_names` (list).
For `csv` file, `id_column` should be set. \
if there's no header, please set `column_names` (list).
Returns:
str, list: id, value list. \
If the extracted id is a int, it will be convert to string with a 'int-' prefix,\
Expand Down
32 changes: 13 additions & 19 deletions rltk/file_iterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class FileIterator(object):
_type = None
_kwargs = {}
_file_handler = None
_count = 0
_line_count = 0

def __init__(self, file_path, type='text', **kwargs):
self._file_path = file_path
Expand All @@ -23,16 +23,14 @@ def __init__(self, file_path, type='text', **kwargs):
if type == 'json_line':
# pre-compile json path, raise exception if not exists
self._id_path_parser = parse(kwargs['id_path'])
self._value_path_parser = parse(kwargs['value_path'])
elif type == 'csv':
self._id_column = kwargs['id_column'] # raise exception if not exists
self._value_columns = kwargs['value_columns']
delimiter = kwargs['delimiter'] if 'delimiter' in kwargs else ','
quotechar = kwargs['quotechar'] if 'quotechar' in kwargs else '"'
quote_char = kwargs['quote_char'] if 'quote_char' in kwargs else '"'
quoting = kwargs['quoting'] if 'quoting' in kwargs else csv.QUOTE_MINIMAL
field_names = kwargs['field_names'] if 'field_names' in kwargs else None
column_names = kwargs['column_names'] if 'column_names' in kwargs else None
self._csv_reader = csv.DictReader(
self._file_handler, delimiter=delimiter, quotechar=quotechar, quoting=quoting, fieldnames=field_names)
self._file_handler, delimiter=delimiter, quotechar=quote_char, quoting=quoting, fieldnames=column_names)
else: # text
self._id_prefix = hashlib.md5(file_path).hexdigest()[:6]

Expand All @@ -45,9 +43,10 @@ def __copy__(self):
# self._file_handler, new_iter = itertools.tee(self._file_handler)

new_iter = FileIterator(self._file_path, self._type, **self._kwargs)
for _ in new_iter:
if new_iter._count == self._count:
break
if self._line_count > 0:
for _ in new_iter:
if new_iter._line_count == self._line_count:
break
return new_iter

def copy(self):
Expand All @@ -69,23 +68,18 @@ def next(self):
if len(extracted_id) == 0:
raise ValueError('Can\'t find id in json line file by id_path')
oid = extracted_id[0]

matches = self._value_path_parser.find(line)
extracted_value = [match.value for match in matches]
if len(extracted_value) == 0:
raise ValueError('Can\'t find value in json line file by value_path')
value = extracted_value
value = line
elif self._type == 'csv':
line = self._csv_reader.next()
oid = line[self._id_column]
value = [line[k] for k in self._value_columns]
value = line
else: # text
line = self._file_handler.next()
line = line.strip('\n')
oid = self._id_prefix + '-' + str(self._count)
value = [line]
oid = self._id_prefix + '-' + str(self._line_count)
value = {'id': oid, 'text': line}

self._count += 1
self._line_count += 1
# id should be a string
if isinstance(oid, int):
oid = 'int-' + str(oid)
Expand Down

0 comments on commit 125daf3

Please sign in to comment.