Skip to content

Commit

Permalink
cleanups
Browse files Browse the repository at this point in the history
  • Loading branch information
undertherain committed Jan 22, 2019
1 parent 7e66ffb commit 654f5c7
Show file tree
Hide file tree
Showing 6 changed files with 24 additions and 18 deletions.
6 changes: 3 additions & 3 deletions vecto/benchmarks/relation_extraction/relation_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ def getPrecision(pred_test, yTest, targetLabel):
targetLabelCount = 0
correctTargetLabelCount = 0

for idx in range(len(pred_test)):
if pred_test[idx] == targetLabel:
for idx, prediction in enumerate(pred_test):
if prediction == targetLabel:
targetLabelCount += 1

if pred_test[idx] == yTest[idx]:
if prediction == yTest[idx]:
correctTargetLabelCount += 1

if correctTargetLabelCount == 0:
Expand Down
2 changes: 1 addition & 1 deletion vecto/benchmarks/text_classification/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Text classification benchmark.
"""Text classification benchmark.
One of the pre-defined models is trained to convergence
to predict labels for text fragments in a provided dataset.
Expand Down
11 changes: 2 additions & 9 deletions vecto/benchmarks/text_classification/nlp_utils.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,12 @@
import collections
import io
# import collections
# import io

import numpy

import chainer
from chainer.backends import cuda


def split_text(text, char_based=False):
    """Tokenize *text*.

    When char_based is true, each character becomes a token; otherwise
    the text is split on runs of whitespace (str.split default).
    """
    return list(text) if char_based else text.split()


def normalize_text(text):
    """Return *text* with surrounding whitespace removed and lowercased."""
    trimmed = text.strip()
    return trimmed.lower()

Expand Down
12 changes: 10 additions & 2 deletions vecto/benchmarks/text_classification/text_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from vecto.benchmarks.text_classification import nets
from vecto.benchmarks.text_classification import text_datasets
from vecto.benchmarks.text_classification import nlp_utils
from vecto.corpus.tokenization import word_tokenize_txt
from ..base import Benchmark


Expand Down Expand Up @@ -45,7 +46,11 @@ def predict(model, sentence):
model, vocab, setup = model
sentence = sentence.strip()
text = nlp_utils.normalize_text(sentence)
words = nlp_utils.split_text(text, char_based=setup['char_based'])
# words = nlp_utils.split_text(text, char_based=setup['char_based'])
if setup['char_based']:
words = list(text)
else:
words = word_tokenize_txt(text)
xs = nlp_utils.transform_to_array([words], vocab, with_label=False)
xs = nlp_utils.convert_seq(xs, device=-1, with_label=False) # todo use GPU
with chainer.using_config('train', False), chainer.no_backprop_mode():
Expand All @@ -61,7 +66,10 @@ def get_vectors(model, sentences):
for sentence in sentences:
sentence = sentence.strip()
text = nlp_utils.normalize_text(sentence)
words = nlp_utils.split_text(text, char_based=setup['char_based'])
if setup['char_based']:
words = list(text)
else:
words = word_tokenize_txt(text)
xs = nlp_utils.transform_to_array([words], vocab, with_label=False)
xs = nlp_utils.convert_seq(xs, device=-1, with_label=False) # todo use GPU
with chainer.using_config('train', False), chainer.no_backprop_mode():
Expand Down
9 changes: 7 additions & 2 deletions vecto/benchmarks/text_classification/text_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
import chainer

from vecto.benchmarks.text_classification.nlp_utils import normalize_text
from vecto.benchmarks.text_classification.nlp_utils import split_text
from vecto.corpus.tokenization import word_tokenize_txt
# from vecto.benchmarks.text_classification.nlp_utils import split_text

# TODO: use vecto.corpus
from vecto.benchmarks.text_classification.nlp_utils import transform_to_array
Expand All @@ -30,7 +31,11 @@ def read_lines_separated(path, shrink=1, char_based=False):
continue
label, text = l.strip().split(None, 1)
label = int(label) % 2 # TODO: don't do this, implement shift
tokens = split_text(normalize_text(text), char_based)
text = normalize_text(text)
if char_based:
tokens = list(text)
else:
tokens = word_tokenize_txt(text)
dataset.append((tokens, label))
return dataset

Expand Down
2 changes: 1 addition & 1 deletion vecto/benchmarks/visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def df_from_dir(path):
full_path = os.path.join(dirpath, filename)
try:
dfs.append(df_from_file(full_path))
except KeyError as e:
except KeyError:
logger.warning(f"error reading {full_path}")
dframe = pandas.concat(dfs, sort=True)
# print(dframe["experiment_setup.task"])
Expand Down

0 comments on commit 654f5c7

Please sign in to comment.