delete cython in utils.py without the need for Build Tools for Visual Studio
taishi-i committed Jun 19, 2020
1 parent 256b0e8 commit 23b8138
Showing 7 changed files with 69 additions and 124 deletions.
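The change converts nagisa/utils.pyx into pure Python, so installing nagisa no longer needs a C compiler such as Build Tools for Visual Studio on Windows. The same pattern is applied throughout the file: a Cython cpdef with static type declarations becomes a plain def. A minimal sketch of that pattern, using get_unigram from the diff below:

# Before (Cython, compiled at install time):
#   cpdef list get_unigram(unicode text):
#       cdef unicode uni
#       return [uni for uni in text]

# After (pure Python, no build step):
def get_unigram(text):
    return [uni for uni in text]

print(get_unigram(u'nagisa'))  # ['n', 'a', 'g', 'i', 's', 'a']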
1 change: 0 additions & 1 deletion MANIFEST.in 100644 → 100755
@@ -1,2 +1 @@
recursive-include nagisa/data *
recursive-include nagisa *.pyx
2 changes: 1 addition & 1 deletion nagisa/__init__.py
@@ -1,4 +1,4 @@
import utils
from nagisa import utils
from nagisa.tagger import Tagger
from nagisa.train import fit

2 changes: 1 addition & 1 deletion nagisa/prepro.py
@@ -6,7 +6,7 @@

import numpy as np

import utils
from nagisa import utils

OOV = utils.OOV
PAD = utils.PAD
2 changes: 1 addition & 1 deletion nagisa/tagger.py
@@ -5,8 +5,8 @@
import os
import re
import sys
import utils
import nagisa.model as model
from nagisa import utils

base = os.path.dirname(os.path.abspath(__file__))
sys.path.append(base)
3 changes: 2 additions & 1 deletion nagisa/train.py
@@ -7,10 +7,11 @@
import logging
from collections import OrderedDict

import utils
import model
import prepro
import mecab_system_eval

from nagisa import utils
from tagger import Tagger

logging.basicConfig(level=logging.INFO, format='%(message)s')
124 changes: 49 additions & 75 deletions nagisa/utils.pyx → nagisa/utils.py 100644 → 100755
@@ -11,64 +11,61 @@

from six.moves import cPickle

reload(sys)
# reload(sys)
if sys.version_info.major == 2:
    sys.setdefaultencoding('utf-8')

cdef unicode __OOV = u'oov'
cdef unicode __PAD = u'pad'
__OOV = u'oov'
__PAD = u'pad'

OOV = __OOV
PAD = __PAD

_hiragana = re.compile(u'[\u3040-\u309F]')
_katakana = re.compile(u'[\u30A1-\u30FA]')
_kanji = re.compile(u'[\u4e00-\u9fa5]')
_alpha = re.compile(u'[a-zA-Z]')
_numeric = re.compile(u'[0-9]')
_kanji = re.compile(u'[\u4e00-\u9fa5]')
_alpha = re.compile(u'[a-zA-Z]')
_numeric = re.compile(u'[0-9]')


cpdef unicode utf8rstrip(text):
    if type(text) != unicode:
def utf8rstrip(text):
    if type(text) != str:
        return unicode(text.rstrip(), 'utf-8')
    else:
        return text.rstrip()


cpdef unicode normalize(unicode text):
def normalize(text):
    return unicodedata.normalize('NFKC', text)


cpdef unicode preprocess(text):
def preprocess(text):
    text = utf8rstrip(text)
    text = normalize(text)
    text = text.replace('　', ' ')
    return text


cpdef unicode preprocess_without_rstrip(text):
    if type(text) != unicode:
def preprocess_without_rstrip(text):
    if type(text) != str:
        text = unicode(text, 'utf-8')
    text = normalize(text)
    text = text.replace('　', ' ')
    return text


cpdef list get_unigram(unicode text):
    cdef unicode uni
def get_unigram(text):
    return [uni for uni in text]


cpdef list get_bigram(unicode text):
    cdef:
        int i
        int length_text = len(text)
        unicode end_symbol = u'<E>'
def get_bigram(text):
    length_text = len(text)
    end_symbol = u'<E>'
    return [text[i]+end_symbol if i == length_text-1 else text[i:i+2]
            for i in range(length_text)]


cpdef int get_chartype(unicode character):
def get_chartype(character):
    if _hiragana.search(character):
        return 0
    elif _katakana.search(character):
@@ -83,14 +80,9 @@
        return 5


cpdef list get_words_starting_at_i(unicode text, dict dictionary):
    cdef:
        int i
        int j
        int length_text = len(text)
        list subwords
        list words_starting_at_i = []
        unicode sub
def get_words_starting_at_i(text, dictionary):
    length_text = len(text)
    words_starting_at_i = []

    for i in range(length_text):
        subwords = []
@@ -104,13 +96,9 @@
    return words_starting_at_i


cpdef list get_words_ending_at_i(unicode text, dict dictionary):
    cdef:
        int i
        int j
        int length_text = len(text)
        list subwords
        list words_ending_at_i = []
def get_words_ending_at_i(text, dictionary):
    length_text = len(text)
    words_ending_at_i = []

    text = text[::-1]
    for i in range(length_text):
@@ -125,31 +113,31 @@
    return words_ending_at_i[::-1]


cpdef list conv_tokens_to_ids(list words, dict word2id):
    cdef unicode word
    return [word2id[word] if word in word2id else word2id[__OOV] for word in words]
def conv_tokens_to_ids(words, word2id):
    return [word2id[word] if word in word2id else word2id[__OOV]
            for word in words]


cpdef list context_window(list l, int win, int pad_id=1):
    cdef:
        int length_l = len(l)
def context_window(l, win, pad_id=1):
    length_l = len(l)

    assert (win % 2) == 1
    assert win >=1
    assert win >= 1
    lpadded = int(win/2) * [pad_id] + l + int(win/2) * [pad_id]
    out = [lpadded[i:i+win] for i in range(length_l)]
    assert len(out) == len(l)
    return out


cpdef list feature_extraction(unicode text, dict uni2id, dict bi2id,
                              dict dictionary, int window_size):
def feature_extraction(text, uni2id, bi2id, dictionary, window_size):
    # character-level features
    unigrams = get_unigram(text)
    bigrams = get_bigram(text)
    uids = context_window(conv_tokens_to_ids(unigrams, uni2id), window_size)
    bids = context_window(conv_tokens_to_ids(bigrams, bi2id), window_size)
    cids = context_window([get_chartype(uni) for uni in unigrams], window_size, pad_id=6)
    cids = context_window(
        [get_chartype(uni) for uni in unigrams], window_size, pad_id=6
    )

    # word-level features
    wids_s = get_words_starting_at_i(text, dictionary)
@@ -159,22 +147,19 @@
    return features


cpdef dict load_dictionary(dict_path):
    cdef dict word_dict = {__OOV:0, __PAD:1}
def load_dictionary(dict_path):
    word_dict = {__OOV: 0, __PAD: 1}
    with open(dict_path, 'r') as words:
        for word in words:
            word = utf8rstrip(word)
            if not word in word_dict:
            if word not in word_dict:
                word_dict[word] = len(word_dict)
    return word_dict


cpdef list make_tags_as_bmes(unicode text):
    cdef:
        int i
        int len_word
        list tags = []
        list words = text.split(u' ')
def make_tags_as_bmes(text):
    tags = []
    words = text.split(u' ')
    for word in words:
        len_word = len(word)
        if len_word < 2:
@@ -193,12 +178,9 @@
    return tags


cpdef list segmenter_for_bmes(unicode chars, list tags):
    cdef:
        int tag
        list words = []
        unicode partical_word = u''
        unicode character
def segmenter_for_bmes(chars, tags):
    words = []
    partical_word = u''

    assert len(chars) == len(tags)
    for character, tag in zip(chars, tags):
@@ -213,25 +195,21 @@
    return words


cpdef dump_data(data, fn):
def dump_data(data, fn):
    with gzip.open(fn, 'wb') as gf:
        cPickle.dump(data, gf, protocol=2)
        gf.close()


cpdef load_data(fn):
def load_data(fn):
    with gzip.open(fn, 'rb') as gf:
        return cPickle.load(gf)


cpdef list np_viterbi(trans, observations):
    cdef:
        int idx, best_tag_id
        list bptrs_t, vvars_t, backpointer, indice, best_path

def np_viterbi(trans, observations):
    for_expr = np.array([-1e10]*6)
    for_expr[4] = 0 # sp_s = 4
    indice = [0,1,2,3,4,5]
    for_expr[4] = 0  # sp_s = 4
    indice = [0, 1, 2, 3, 4, 5]
    backpointer = []

    for obs in observations:
@@ -245,7 +223,7 @@
        for_expr = np.array(vvars_t) + obs
        backpointer.append(bptrs_t)

    terminal_expr = for_expr + trans[5] # sp_e = 5
    terminal_expr = for_expr + trans[5]  # sp_e = 5
    best_tag_id = np.argmax(terminal_expr)
    best_path = [best_tag_id]

@@ -258,11 +236,7 @@
    return best_path


cpdef load_file(filename, delimiter='\t', newline='EOS'):
    cdef:
        list X, Y, words, tags
        unicode word, tag

def load_file(filename, delimiter='\t', newline='EOS'):
    X = []
    Y = []
    words = []
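The converted helpers keep the behavior of the Cython versions, just without static typing. A short sketch of two of the functions above, with results worked out by hand:

from nagisa import utils

# get_bigram pairs each character with its successor and closes the last
# position with the '<E>' end symbol.
print(utils.get_bigram(u'猫だ'))  # ['猫だ', 'だ<E>']

# context_window pads both ends with pad_id and slides a window of size win
# (win must be odd, per the asserts above).
print(utils.context_window([3, 4, 5], 3, pad_id=1))
# [[1, 3, 4], [3, 4, 5], [4, 5, 1]]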
59 changes: 15 additions & 44 deletions setup.py 100644 → 100755
@@ -5,7 +5,6 @@
import sys

from setuptools import setup
from setuptools.extension import Extension

readme = 'README.md'

@@ -38,49 +37,21 @@
]


class defer_cythonize(list):
    def __init__(self, callback):
        self._list, self.callback = None, callback

    def c_list(self):
        if self._list is None:
            self._list = self.callback()
        return self._list

    def __iter__(self):
        for elem in self.c_list():
            yield elem

    def __getitem__(self, ii):
        return self.c_list()[ii]

    def __len__(self):
        return len(self.c_list())

def extensions():
    from Cython.Build import cythonize
    import numpy
    extensions = [Extension('utils',
                            ['nagisa/utils.pyx'],
                            include_dirs = [numpy.get_include()])]
    return cythonize(extensions)

setup(
    name = 'nagisa',
    name='nagisa',
    packages=['nagisa'],
    author = 'Taishi Ikeda',
    author_email = 'taishi.ikeda.0323@gmail.com',
    version = '0.2.6',
    description = 'A Japanese tokenizer based on recurrent neural networks',
    long_description = long_description,
    url = 'https://github.com/taishi-i/nagisa',
    download_url = 'https://github.com/taishi-i/nagisa/archive/0.2.6.tar.gz',
    license = 'MIT License',
    platforms = 'Unix',
    setup_requires=['six', 'cython', 'numpy',],
    install_requires = ['six', 'numpy','DyNet'],
    classifiers = classifiers,
    include_package_data = True,
    test_suite = 'test.nagisa_test.suite',
    ext_modules = defer_cythonize(extensions)
    author='Taishi Ikeda',
    author_email='taishi.ikeda.0323@gmail.com',
    version='0.2.6',
    description='A Japanese tokenizer based on recurrent neural networks',
    long_description=long_description,
    url='https://github.com/taishi-i/nagisa',
    download_url='https://github.com/taishi-i/nagisa/archive/0.2.6.tar.gz',
    license='MIT License',
    platforms='Unix',
    setup_requires=['six'],
    install_requires=['six', 'numpy', 'DyNet'],
    classifiers=classifiers,
    include_package_data=True,
    test_suite='test.nagisa_test.suite'
)
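With the Extension/cythonize machinery gone and cython and numpy dropped from setup_requires, a plain pip install builds no C extension for nagisa's own code (DyNet and numpy remain runtime dependencies). A hedged usage sketch — nagisa.tagging is the entry point documented in the project README, not something added by this commit:

# Assumes nagisa 0.2.6 installed via: pip install nagisa
import nagisa

# Tokenize and POS-tag a Japanese sentence.
tokens = nagisa.tagging('Pythonで簡単に使えるツールです')
print(tokens.words)    # surface forms
print(tokens.postags)  # part-of-speech tags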
