delete cython in utils.py without the need for Build Tools for Visual Studio
taishi-i committed Jun 19, 2020
1 parent 256b0e8 commit 23b8138
Showing 7 changed files with 69 additions and 124 deletions.
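The change converts nagisa/utils.pyx into pure Python, so installing nagisa no longer needs a C compiler such as Build Tools for Visual Studio on Windows. The same pattern is applied throughout the file: a Cython cpdef with static type declarations becomes a plain def. A minimal sketch of that pattern, using get_unigram from the diff below:

# Before (Cython, compiled at install time):
#   cpdef list get_unigram(unicode text):
#       cdef unicode uni
#       return [uni for uni in text]

# After (pure Python, no build step):
def get_unigram(text):
    return [uni for uni in text]

print(get_unigram(u'nagisa'))  # ['n', 'a', 'g', 'i', 's', 'a']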
1 change: 0 additions & 1 deletion MANIFEST.in 100644 → 100755
@@ -1,2 +1 @@
recursive-include nagisa/data *
recursive-include nagisa *.pyx
2 changes: 1 addition & 1 deletion nagisa/__init__.py
@@ -1,4 +1,4 @@
import utils
from nagisa import utils
from nagisa.tagger import Tagger
from nagisa.train import fit

2 changes: 1 addition & 1 deletion nagisa/prepro.py
@@ -6,7 +6,7 @@

import numpy as np

import utils
from nagisa import utils

OOV = utils.OOV
PAD = utils.PAD
2 changes: 1 addition & 1 deletion nagisa/tagger.py
@@ -5,8 +5,8 @@
import os
import re
import sys
import utils
import nagisa.model as model
from nagisa import utils

base = os.path.dirname(os.path.abspath(__file__))
sys.path.append(base)
3 changes: 2 additions & 1 deletion nagisa/train.py
@@ -7,10 +7,11 @@
import logging
from collections import OrderedDict

import utils
import model
import prepro
import mecab_system_eval

from nagisa import utils
from tagger import Tagger

logging.basicConfig(level=logging.INFO, format='%(message)s')
124 changes: 49 additions & 75 deletions nagisa/utils.pyx → nagisa/utils.py 100644 → 100755
@@ -11,64 +11,61 @@

from six.moves import cPickle

reload(sys)
# reload(sys)
if sys.version_info.major == 2:
    sys.setdefaultencoding('utf-8')

cdef unicode __OOV = u'oov'
cdef unicode __PAD = u'pad'
__OOV = u'oov'
__PAD = u'pad'

OOV = __OOV
PAD = __PAD

_hiragana = re.compile(u'[\u3040-\u309F]')
_katakana = re.compile(u'[\u30A1-\u30FA]')
_kanji = re.compile(u'[\u4e00-\u9fa5]')
_alpha = re.compile(u'[a-zA-Z]')
_numeric = re.compile(u'[0-9]')
_kanji = re.compile(u'[\u4e00-\u9fa5]')
_alpha = re.compile(u'[a-zA-Z]')
_numeric = re.compile(u'[0-9]')


cpdef unicode utf8rstrip(text):
    if type(text) != unicode:
def utf8rstrip(text):
    if type(text) != str:
        return unicode(text.rstrip(), 'utf-8')
    else:
        return text.rstrip()


cpdef unicode normalize(unicode text):
def normalize(text):
    return unicodedata.normalize('NFKC', text)


cpdef unicode preprocess(text):
def preprocess(text):
    text = utf8rstrip(text)
    text = normalize(text)
    text = text.replace('　', ' ')
    return text


cpdef unicode preprocess_without_rstrip(text):
    if type(text) != unicode:
def preprocess_without_rstrip(text):
    if type(text) != str:
        text = unicode(text, 'utf-8')
    text = normalize(text)
    text = text.replace('　', ' ')
    return text


cpdef list get_unigram(unicode text):
    cdef unicode uni
def get_unigram(text):
    return [uni for uni in text]


cpdef list get_bigram(unicode text):
    cdef:
        int i
        int length_text = len(text)
        unicode end_symbol = u'<E>'
def get_bigram(text):
    length_text = len(text)
    end_symbol = u'<E>'
    return [text[i]+end_symbol if i == length_text-1 else text[i:i+2]
            for i in range(length_text)]


cpdef int get_chartype(unicode character):
def get_chartype(character):
    if _hiragana.search(character):
        return 0
    elif _katakana.search(character):
@@ -83,14 +80,9 @@
        return 5


cpdef list get_words_starting_at_i(unicode text, dict dictionary):
    cdef:
        int i
        int j
        int length_text = len(text)
        list subwords
        list words_starting_at_i = []
        unicode sub
def get_words_starting_at_i(text, dictionary):
    length_text = len(text)
    words_starting_at_i = []

    for i in range(length_text):
        subwords = []
@@ -104,13 +96,9 @@
    return words_starting_at_i


cpdef list get_words_ending_at_i(unicode text, dict dictionary):
    cdef:
        int i
        int j
        int length_text = len(text)
        list subwords
        list words_ending_at_i = []
def get_words_ending_at_i(text, dictionary):
    length_text = len(text)
    words_ending_at_i = []

    text = text[::-1]
    for i in range(length_text):
@@ -125,31 +113,31 @@
    return words_ending_at_i[::-1]


cpdef list conv_tokens_to_ids(list words, dict word2id):
    cdef unicode word
    return [word2id[word] if word in word2id else word2id[__OOV] for word in words]
def conv_tokens_to_ids(words, word2id):
    return [word2id[word] if word in word2id else word2id[__OOV]
            for word in words]


cpdef list context_window(list l, int win, int pad_id=1):
    cdef:
        int length_l = len(l)
def context_window(l, win, pad_id=1):
    length_l = len(l)

    assert (win % 2) == 1
    assert win >=1
    assert win >= 1
    lpadded = int(win/2) * [pad_id] + l + int(win/2) * [pad_id]
    out = [lpadded[i:i+win] for i in range(length_l)]
    assert len(out) == len(l)
    return out


cpdef list feature_extraction(unicode text, dict uni2id, dict bi2id,
                              dict dictionary, int window_size):
def feature_extraction(text, uni2id, bi2id, dictionary, window_size):
    # character-level features
    unigrams = get_unigram(text)
    bigrams = get_bigram(text)
    uids = context_window(conv_tokens_to_ids(unigrams, uni2id), window_size)
    bids = context_window(conv_tokens_to_ids(bigrams, bi2id), window_size)
    cids = context_window([get_chartype(uni) for uni in unigrams], window_size, pad_id=6)
    cids = context_window(
        [get_chartype(uni) for uni in unigrams], window_size, pad_id=6
    )

    # word-level features
    wids_s = get_words_starting_at_i(text, dictionary)
@@ -159,22 +147,19 @@
    return features


cpdef dict load_dictionary(dict_path):
    cdef dict word_dict = {__OOV:0, __PAD:1}
def load_dictionary(dict_path):
    word_dict = {__OOV: 0, __PAD: 1}
    with open(dict_path, 'r') as words:
        for word in words:
            word = utf8rstrip(word)
            if not word in word_dict:
            if word not in word_dict:
                word_dict[word] = len(word_dict)
    return word_dict


cpdef list make_tags_as_bmes(unicode text):
    cdef:
        int i
        int len_word
        list tags = []
        list words = text.split(u' ')
def make_tags_as_bmes(text):
    tags = []
    words = text.split(u' ')
    for word in words:
        len_word = len(word)
        if len_word < 2:
@@ -193,12 +178,9 @@
    return tags


cpdef list segmenter_for_bmes(unicode chars, list tags):
    cdef:
        int tag
        list words = []
        unicode partical_word = u''
        unicode character
def segmenter_for_bmes(chars, tags):
    words = []
    partical_word = u''

    assert len(chars) == len(tags)
    for character, tag in zip(chars, tags):
@@ -213,25 +195,21 @@
    return words


cpdef dump_data(data, fn):
def dump_data(data, fn):
    with gzip.open(fn, 'wb') as gf:
        cPickle.dump(data, gf, protocol=2)
        gf.close()


cpdef load_data(fn):
def load_data(fn):
    with gzip.open(fn, 'rb') as gf:
        return cPickle.load(gf)


cpdef list np_viterbi(trans, observations):
    cdef:
        int idx, best_tag_id
        list bptrs_t, vvars_t, backpointer, indice, best_path

def np_viterbi(trans, observations):
    for_expr = np.array([-1e10]*6)
    for_expr[4] = 0 # sp_s = 4
    indice = [0,1,2,3,4,5]
    for_expr[4] = 0  # sp_s = 4
    indice = [0, 1, 2, 3, 4, 5]
    backpointer = []

    for obs in observations:
@@ -245,7 +223,7 @@
        for_expr = np.array(vvars_t) + obs
        backpointer.append(bptrs_t)

    terminal_expr = for_expr + trans[5] # sp_e = 5
    terminal_expr = for_expr + trans[5]  # sp_e = 5
    best_tag_id = np.argmax(terminal_expr)
    best_path = [best_tag_id]

@@ -258,11 +236,7 @@
    return best_path


cpdef load_file(filename, delimiter='\t', newline='EOS'):
    cdef:
        list X, Y, words, tags
        unicode word, tag

def load_file(filename, delimiter='\t', newline='EOS'):
    X = []
    Y = []
    words = []
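The converted helpers keep the behavior of the Cython versions, just without static typing. A short sketch of two of the functions above, with results worked out by hand:

from nagisa import utils

# get_bigram pairs each character with its successor and closes the last
# position with the '<E>' end symbol.
print(utils.get_bigram(u'猫だ'))  # ['猫だ', 'だ<E>']

# context_window pads both ends with pad_id and slides a window of size win
# (win must be odd, per the asserts above).
print(utils.context_window([3, 4, 5], 3, pad_id=1))
# [[1, 3, 4], [3, 4, 5], [4, 5, 1]]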
59 changes: 15 additions & 44 deletions setup.py 100644 → 100755
@@ -5,7 +5,6 @@
import sys

from setuptools import setup
from setuptools.extension import Extension

readme = 'README.md'

@@ -38,49 +37,21 @@
]


class defer_cythonize(list):
    def __init__(self, callback):
        self._list, self.callback = None, callback

    def c_list(self):
        if self._list is None:
            self._list = self.callback()
        return self._list

    def __iter__(self):
        for elem in self.c_list():
            yield elem

    def __getitem__(self, ii):
        return self.c_list()[ii]

    def __len__(self):
        return len(self.c_list())

def extensions():
    from Cython.Build import cythonize
    import numpy
    extensions = [Extension('utils',
                            ['nagisa/utils.pyx'],
                            include_dirs = [numpy.get_include()])]
    return cythonize(extensions)

setup(
    name = 'nagisa',
    name='nagisa',
    packages=['nagisa'],
    author = 'Taishi Ikeda',
    author_email = 'taishi.ikeda.0323@gmail.com',
    version = '0.2.6',
    description = 'A Japanese tokenizer based on recurrent neural networks',
    long_description = long_description,
    url = 'https://github.com/taishi-i/nagisa',
    download_url = 'https://github.com/taishi-i/nagisa/archive/0.2.6.tar.gz',
    license = 'MIT License',
    platforms = 'Unix',
    setup_requires=['six', 'cython', 'numpy',],
    install_requires = ['six', 'numpy','DyNet'],
    classifiers = classifiers,
    include_package_data = True,
    test_suite = 'test.nagisa_test.suite',
    ext_modules = defer_cythonize(extensions)
    author='Taishi Ikeda',
    author_email='taishi.ikeda.0323@gmail.com',
    version='0.2.6',
    description='A Japanese tokenizer based on recurrent neural networks',
    long_description=long_description,
    url='https://github.com/taishi-i/nagisa',
    download_url='https://github.com/taishi-i/nagisa/archive/0.2.6.tar.gz',
    license='MIT License',
    platforms='Unix',
    setup_requires=['six'],
    install_requires=['six', 'numpy', 'DyNet'],
    classifiers=classifiers,
    include_package_data=True,
    test_suite='test.nagisa_test.suite'
)
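With the Extension/cythonize machinery gone and cython and numpy dropped from setup_requires, a plain pip install builds no C extension for nagisa's own code (DyNet and numpy remain runtime dependencies). A hedged usage sketch — nagisa.tagging is the entry point documented in the project README, not something added by this commit:

# Assumes nagisa 0.2.6 installed via: pip install nagisa
import nagisa

# Tokenize and POS-tag a Japanese sentence.
tokens = nagisa.tagging('Pythonで簡単に使えるツールです')
print(tokens.words)    # surface forms
print(tokens.postags)  # part-of-speech tags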
