<a href="https://colab.research.google.com/github/yarak001/machine_learning_common/blob/main/TextVectorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf

In [None]:
text_dataset = tf.data.Dataset.from_tensor_slices(["foo", "bar", "baz"])
max_features = 5000  # Maximum vocab size.
max_len = 10  # Sequence length to pad the outputs to.

# Create the layer.
vectorize_layer = tf.keras.layers.TextVectorization(
 max_tokens=max_features,
 output_mode='int',
 output_sequence_length=max_len)

vectorize_layer = tf.keras.layers.TextVectorization(
 max_tokens=max_features,
 output_mode='multi_hot'
 )

# Now that the vocab layer has been created, call `adapt` on the text-only
# dataset to create the vocabulary. You don't have to batch, but for large
# datasets this means we're not keeping spare copies of the dataset.
vectorize_layer.adapt(text_dataset.batch(64))

# Create the model that uses the vectorize text layer
model = tf.keras.models.Sequential()

# Start by creating an explicit input layer. It needs to have a shape of
# (1,) (because we need to guarantee that there is exactly one string
# input per batch), and the dtype needs to be 'string'.
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))

# The first layer in our model is the vectorization layer. After this
# layer, we have a tensor of shape (batch_size, max_len) containing vocab
# indices.
model.add(vectorize_layer)

# Now, the model can map strings to integers, and you can add an embedding
# layer to map these integers to learned embeddings.
input_data = [["foo qux bar"], ["qux baz"]]
model.predict(input_data)

ValueError: ignored

In [None]:
vectorize_layer.get_vocabulary()

['', '[UNK]', 'foo', 'baz', 'bar']

In [None]:
vocab_data = ["earth", "wind", "and", "fire"]
max_len = 50  # Sequence length to pad the outputs to.

# Create the layer, passing the vocab directly. You can also pass the
# vocabulary arg a path to a file containing one vocabulary word per
# line.
vectorize_layer = tf.keras.layers.TextVectorization(
 max_tokens=max_features,
 output_mode='int',
 output_sequence_length=max_len,
 vocabulary=vocab_data)

# Because we've passed the vocabulary directly, we don't need to adapt
# the layer - the vocabulary is already set. The vocabulary contains the
# padding token ('') and OOV token ('[UNK]') as well as the passed tokens.
vectorize_layer.get_vocabulary()

['', '[UNK]', 'earth', 'wind', 'and', 'fire']

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
dataset = [
    "An apparatus and method for networking the keyboard / graphics array monitor / mouse of the multiple computer and managing with the single key board / graphics array monitor / mouse.",
    "A manufacturing method of プシコース using プシコース epimerization enzyme and this",
    "* 123 Schizandra &#x000B7; back mixing fermented wine and a manufacturing method thereof."
]

text_dataset = tf.data.Dataset.from_tensor_slices(dataset)

In [None]:
import re
import string
import sys
from nltk.corpus import stopwords


def clean_doc_tokens(doc):

    doc = doc.lower()
    tokens = doc.split()
    # re_punc = re.compile('[%s]' % re.escape(string.punctuation))

    # 정규식 pattern 객체 생성
    # printable = digits + ascii_letters + punctuation + whitespace
    re_print = re.compile('[^%s]' % re.escape(string.printable))

    # 구두점 ''로 교체
    table = str.maketrans('', '', string.punctuation)
    tokens = [word.translate(table) for word in tokens]
    # printable이 아닌것들 ''로 교체
    tokens_sentence = [re_print.sub('', w) for w in tokens]
    # alphabet으로 이뤄진 token만 남김
    tokens = [word for word in tokens_sentence if word.isalpha()]

    # 불용어(stopword) 제거
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    #길이가 2이상이 token만 남김

    tokens_keyword = [word for word in tokens if len(word) > 1]

    return ' '.join(tokens_sentence), ' '.join(tokens_keyword)

for data in dataset:
    sentence, keyword = clean_doc_tokens(data)
    print(f'sentence: {sentence} \r\n keyword: {keyword}')

sentence: an apparatus and method for networking the keyboard  graphics array monitor  mouse of the multiple computer and managing with the single key board  graphics array monitor  mouse 
 keyword: apparatus method networking keyboard graphics array monitor mouse multiple computer managing single key board graphics array monitor mouse
sentence: a manufacturing method of  using  epimerization enzyme and this 
 keyword: manufacturing method using epimerization enzyme
sentence:  123 schizandra x000b7 back mixing fermented wine and a manufacturing method thereof 
 keyword: schizandra back mixing fermented wine manufacturing method thereof


In [None]:
# TextVectorize Layer의 standardize인자에서 사용
# @tf.keras.utils.register_keras_serializable()
def sentence_standardization(input_string):
    tokens = tf.strings.lower(input_string, encoding='utf-8')  # 소문자로
    tokens = tf.strings.regex_replace(tokens, '[%s]' % re.escape(string.punctuation), ' ')  # 구두점 제거
    tokens = tf.strings.regex_replace(tokens, '[^%s]' % re.escape(string.printable), ' ')  # printable이 아닌것들 ''로 제거
    tokens = tf.strings.regex_replace(tokens, b'\s+', ' ')  # 연속된 공백을 하나로
    tokens = tf.strings.strip(tokens)  # 앞, 뒤 공백 제거
    return tokens

# TextVectorize Layer의 standardize인자에서 사용
# @tf.keras.utils.register_keras_serializable()
def keywords_standardization(input_string):
    tokens = tf.strings.lower(input_string, encoding='utf-8')  # 소문자로
    tokens = tf.strings.regex_replace(tokens, '[%s]' % re.escape(string.punctuation), ' ')  # 구두점 제거
    tokens = tf.strings.regex_replace(tokens, '[^%s]' % re.escape(string.printable), ' ')  # printable이 아닌것들 ''로 제거
    tokens = tf.strings.regex_replace(tokens, b'\s+', ' ')  # 연속된 공백을 하나로

    tokens = tf.strings.regex_replace(tokens, '[^a-zA-Z]', ' ')  # alphabet이 아닌 것 제거
    tokens = tf.strings.regex_replace(tokens, r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*', ' ')  # 불용어 제거
    tokens = tf.strings.regex_replace(tokens, r'\b\w{1}\b', '')  # 길이가 2미안은 제거
    tokens = tf.strings.regex_replace(tokens, b'\s+', ' ')  # 연속된 공백을 하나로

    tokens = tf.strings.strip(tokens)  # 앞, 뒤 공백 제거
    return tokens


In [None]:
for data in dataset:
    sentence = sentence_standardization(data)
    keyword = keywords_standardization(data)
    print(f"{'*'* 30} \r\n origin: {data} \r\n sentence: {sentence} \r\n keyword: {keyword} \r\n{'*'* 30}")

****************************** 
 origin: An apparatus and method for networking the keyboard / graphics array monitor / mouse of the multiple computer and managing with the single key board / graphics array monitor / mouse. 
 sentence: b'an apparatus and method for networking the keyboard graphics array monitor mouse of the multiple computer and managing with the single key board graphics array monitor mouse' 
 keyword: b'apparatus method networking keyboard graphics array monitor mouse multiple computer managing single key board graphics array monitor mouse' 
******************************
****************************** 
 origin: A manufacturing method of プシコース using プシコース epimerization enzyme and this 
 sentence: b'a manufacturing method of using epimerization enzyme and this' 
 keyword: b'manufacturing method using epimerization enzyme' 
******************************
****************************** 
 origin: * 123 Schizandra &#x000B7; back mixing fermented wine and a manufa

In [None]:
max_features = 5000  # Maximum vocab size.
max_len = 50  # Sequence length to pad the outputs to.

vectorize_layer_int = tf.keras.layers.TextVectorization(
 standardize = sentence_standardization,
 max_tokens=max_features,
 output_mode='int',
 output_sequence_length=max_len)

vectorize_layer_multi_hot = tf.keras.layers.TextVectorization(
 max_tokens=max_features,
 output_mode='multi_hot'
 )

In [None]:
vectorize_layer_int.adapt(text_dataset.batch(1))
# vectorize_layer_multi_hot.adapt(text_dataset.batch(1))



In [None]:
model = tf.keras.models.Sequential()
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer_int)
predict = model.predict(['i love you'])
predict

array([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]])

In [None]:
vectorizer_int_vocab = vectorize_layer_int.get_vocabulary(include_special_tokens=True)
vectorizer_int_vocab

['',
 '[UNK]',
 'and',
 'the',
 'method',
 'of',
 'mouse',
 'monitor',
 'manufacturing',
 'graphics',
 'array',
 'a',
 'x000b7',
 'with',
 'wine',
 'using',
 'this',
 'thereof',
 'single',
 'schizandra',
 'networking',
 'multiple',
 'mixing',
 'managing',
 'keyboard',
 'key',
 'for',
 'fermented',
 'epimerization',
 'enzyme',
 'computer',
 'board',
 'back',
 'apparatus',
 'an',
 '123']

In [None]:
[' '.join([vectorizer_int_vocab[i] for i in item]) for item in predict]

['[UNK] [UNK] [UNK]                                               ',
 '[UNK] [UNK] [UNK]                                               ',
 '[UNK] [UNK] [UNK]                                               ',
 '[UNK] [UNK] [UNK]                                               ']

In [None]:
tf.strings.regex_replace("**121safa** afdasfa 123", "\w*\d\w*", "")

<tf.Tensor: shape=(), dtype=string, numpy=b'**** afdasfa '>

In [None]:
r'|'.join(stopwords.words('english'))

"i|me|my|myself|we|our|ours|ourselves|you|you're|you've|you'll|you'd|your|yours|yourself|yourselves|he|him|his|himself|she|she's|her|hers|herself|it|it's|its|itself|they|them|their|theirs|themselves|what|which|who|whom|this|that|that'll|these|those|am|is|are|was|were|be|been|being|have|has|had|having|do|does|did|doing|a|an|the|and|but|if|or|because|as|until|while|of|at|by|for|with|about|against|between|into|through|during|before|after|above|below|to|from|up|down|in|out|on|off|over|under|again|further|then|once|here|there|when|where|why|how|all|any|both|each|few|more|most|other|some|such|no|nor|not|only|own|same|so|than|too|very|s|t|can|will|just|don|don't|should|should've|now|d|ll|m|o|re|ve|y|ain|aren|aren't|couldn|couldn't|didn|didn't|doesn|doesn't|hadn|hadn't|hasn|hasn't|haven|haven't|isn|isn't|ma|mightn|mightn't|mustn|mustn't|needn|needn't|shan|shan't|shouldn|shouldn't|wasn|wasn't|weren|weren't|won|won't|wouldn|wouldn't"

In [None]:
import token
from IPython.core.displayhook import tokenize
from keras_preprocessing.text import Tokenizer

dataset = ['l love you.', 'you love me', 'I hate he', 'he loves you']

tokenizer = Tokenizer()
tokenizer.fit_on_sequences(dataset)

print(tokenizer.document_count)
print(tokenizer.word_counts)

4
OrderedDict()
