merge readfile in load_df_corpus
GreatYYX committed Mar 2, 2017
1 parent 0893ce1 commit 0bd3229
Showing 1 changed file with 26 additions and 40 deletions.

rltk/core.py
@@ -67,8 +67,17 @@ def load_df_corpus(self, name, file_path, file_type='text', mode='append', json_path=None):
         >>> tk.load_df_corpus('B1', 'df_corpus_1.txt', file_type='text', mode='replace')
         >>> tk.load_df_corpus('B2', 'jl_file_1.jsonl', file_type='json_lines', json_path='desc[*]')
         """
+        def count_for_token(tokens_):
+            for t in tokens_:
+                if t not in item['data']:
+                    item['data'][t] = 0
+                item['data'][t] += 1
+
         self._check_valid_resource(name, 'df_corpus')
 
+        if file_type not in ('text', 'json_lines'):
+            raise ValueError('Unsupported file type')
+
         # get original dict item for appending
         # or create / replace it to a new one
         item = {
@@ -77,61 +86,38 @@ def load_df_corpus(self, name, file_path, file_type='text', mode='append', json_path=None):
             'doc_size': 0
         } if not (mode == 'update' and name in self._rs_dict) else self._rs_dict[name]
 
-        if file_type == 'text':
-            with open(file_path, 'r') as f:
-                for line in f:
-                    token = line.rstrip('\n').split(' ')
-                    if len(token) == 0: # empty line or error in format
-                        continue
-                    token = set(token)
-
-                    # count for token
-                    for t in token:
-                        if t not in item['data']:
-                            item['data'][t] = 0
-                        item['data'][t] += 1
-
-                    # count for docs
-                    item['doc_size'] += 1
-
-            self._rs_dict[name] = item
-
-        elif file_type == 'json_lines':
-
-            if json_path is None:
-                raise ValueError('Invalid json path')
-
-            with open(file_path) as f:
-                for line in f:
-                    line = line.rstrip('\n')
-                    line = json.loads(line)
-                    doc_parts = [match.value for match in parse(json_path).find(line)]
-
-                    if doc_parts is None:
-                        continue
-
-                    # count
-                    crf_tokenizer = CrfTokenizer()
-                    for part in doc_parts:
-                        if not isinstance(part, basestring):
-                            raise TypeError('json_path must points to a string')
-
-                        token = crf_tokenizer.tokenize(part)
-
-                        # count for token
-                        for t in token:
-                            if t not in item['data']:
-                                item['data'][t] = 0
-                            item['data'][t] += 1
-
-                    # count for docs
-                    item['doc_size'] += 1
-
-            self._rs_dict[name] = item
-
-
-        else:
-            raise ValueError('Unsupported file type')
+        with open(file_path) as f:
+            for line in f:
+                line = line.rstrip('\n')
+
+                if file_type == 'text':
+                    tokens = line.split(' ')
+                    if len(tokens) == 0:
+                        continue
+                    tokens = set(tokens)
+                    count_for_token(tokens)
+
+                elif file_type == 'json_lines':
+                    if json_path is None:
+                        raise ValueError('Invalid json path')
+
+                    crf_tokenizer = CrfTokenizer()
+                    line = json.loads(line)
+                    doc_parts = [match.value for match in parse(json_path).find(line)]
+
+                    if doc_parts is None:
+                        continue
+
+                    for part in doc_parts:
+                        if not isinstance(part, basestring):
+                            raise TypeError('json_path must points to a string')
+                        tokens = crf_tokenizer.tokenize(part)
+                        count_for_token(tokens)
+
+                # count for docs (each line is a doc)
+                item['doc_size'] += 1
+
+        self._rs_dict[name] = item
 
     def hamming_distance(self, s1, s2):
         """
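The json_lines branch does the same counting but first extracts the document text from each JSON record with a JSONPath expression (the parse(json_path).find(line) call in the diff) and tokenizes it with rltk's CrfTokenizer. A rough equivalent, assuming parse comes from the jsonpath-rw package and substituting a plain whitespace split for CrfTokenizer:

    # Rough equivalent of the json_lines branch. Assumptions: parse() is
    # jsonpath_rw's; str.split stands in for rltk's CrfTokenizer, so tokens
    # within a part are not deduplicated, matching the branch above.
    import json
    from jsonpath_rw import parse

    def df_from_json_lines(file_path, json_path):
        df = {}
        doc_size = 0
        expr = parse(json_path)  # compile the JSONPath once
        with open(file_path) as f:
            for line in f:
                record = json.loads(line)
                doc_parts = [m.value for m in expr.find(record)]
                for part in doc_parts:
                    for t in part.split(' '):  # stand-in tokenizer
                        df[t] = df.get(t, 0) + 1
                doc_size += 1  # each JSON line is one document
        return df, doc_size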
