In [1]:
import tensorflow
from tensorflow.keras.layers import Dense, Embedding, Conv1D, MaxPooling1D, Dropout, Input, Reshape
from tensorflow.keras.regularizers import l2
import yaml

def loss_wrapper(weights_loss):
    def custom_loss(y_true, y_pred):
        return tf.nn.softmax_cross_entropy_with_logits()
def get_yamldict(yaml_path):
    with open(yaml_path, "r", encoding="utf-8") as yaml_file:
        yaml_dict = yaml.load(yaml_file, Loader=yaml.Loader)
    return yaml_dict
def getModel(word_seq_len, char_seq_len, ngram_seq_len, chars_dict_len, word_dict_len, ngram_dict_len, reg_lambda,
             emb_dim):
    ########### char-level embedding, conv layer #############
    char_input = Input(shape=(char_seq_len), name="char_input")
    print("shape of char_input: ", char_input.shape)
    emb_char = Embedding(chars_dict_len, emb_dim)(char_input)
    print("char-Embedding: ", emb_char.shape)
    pooled_result = []
    for filter_size in [3, 4, 5, 6]:
        char_conv_output = Conv1D(filters=256, kernel_size=filter_size, strides=1, activation="relu", padding="VALID")(
            emb_char)
        print("char-convOutput : ", char_conv_output.shape)
        max_pooled_output = MaxPooling1D(pool_size=char_seq_len  - filter_size + 1, strides=1, padding="VALID")(
            char_conv_output)
        print("char - maxPool shape: ", max_pooled_output.shape)
        pooled_result.append(max_pooled_output)
    num_filters_total = 256 * 4
    filter_pooled = tensorflow.concat(pooled_result, axis=1)
    print("4 filter output pooled: ", filter_pooled.shape)
    char_flat = tensorflow.reshape(filter_pooled, [-1, num_filters_total])
    char_drop_out = Dropout(0.5)(char_flat)
    ######### word-level embedding, conv_layer ###########
    input_word = Input(shape=(word_seq_len), name="word_input")
    input_ngram = Input(shape=(word_seq_len * ngram_seq_len), name="ngram_input")
    reshaping_layer = Reshape((word_seq_len, ngram_seq_len), input_shape=(word_seq_len * ngram_seq_len,))
    reshepd_ngram = reshaping_layer(input_ngram)
    # input_ngram_padded = Input(shape=(200,20,32,), name="ngram_padded_input")
    emb_word = Embedding(word_dict_len + 1, emb_dim)(input_word)
    print("word-embedding: ", emb_word.shape)
    emb_ngram = Embedding(ngram_dict_len + 1, emb_dim)(reshepd_ngram)
    emb_ngram_sum = tensorflow.reduce_sum(emb_ngram, 2)
    tot_word_emb = tensorflow.add(emb_ngram_sum, emb_word)
    word_pooled_result = []
    for filter_size in [3, 4, 5, 6]:
        layer_name = "word_layer_fsize_" + str(filter_size)
        word_conv_output = Conv1D(filters=256, kernel_size=filter_size, strides=1, activation="relu", padding="VALID",
                                  name=layer_name)(tot_word_emb)
        max_pooled_output_word = MaxPooling1D(pool_size=word_seq_len - filter_size + 1, strides=1, padding="VALID")(
            word_conv_output)
        word_pooled_result.append(max_pooled_output_word)
    word_filter_pooled = tensorflow.concat(word_pooled_result, axis=1)
    word_flat = tensorflow.reshape(word_filter_pooled, [-1, num_filters_total])
    print("word_flat shape: ", word_flat.shape)
    word_drop_out = Dropout(0.5)(word_flat)
    ############# Fully connected Layer ##########
    char_dense_layer = Dense(512, input_shape=(num_filters_total,), kernel_initializer="glorot_uniform",
                             kernel_regularizer=l2(reg_lambda))(char_drop_out)
    word_dense_layer = Dense(512, input_shape=(num_filters_total,), kernel_initializer="glorot_uniform",
                             kernel_regularizer=l2(reg_lambda))(word_drop_out)
    conv_final_output = tensorflow.concat([char_dense_layer, word_dense_layer], 1)
    ############### Dense layer - before_softmax #######################
    d0 = Dense(512, input_shape=(1024,), kernel_initializer="glorot_uniform", kernel_regularizer=l2(reg_lambda))(
        conv_final_output)
    d1 = Dense(256, input_shape=(512,), kernel_initializer="glorot_uniform", kernel_regularizer=l2(reg_lambda))(d0)
    d2 = Dense(128, input_shape=(256,), kernel_initializer="glorot_uniform", kernel_regularizer=l2(reg_lambda))(d1)
    score = Dense(2, input_shape=(128,), kernel_initializer="glorot_uniform", kernel_regularizer=l2(reg_lambda))(d2)
    ############### Score & Predictions & Probability ##############
    predictions = tensorflow.argmax(score, 1, name="predictions")
    prob = tensorflow.nn.softmax(score, name="prob")
    model = tensorflow.keras.Model([char_input, input_word, input_ngram], prob)
    return model

def input_fn(path, maxWordPerUrl, maxCharPerUrl, maxNgramPerUrl, shuffle_buffer_size, batch_size):
    dataset = tensorflow.data.TFRecordDataset(path)
    feature_map = {
        "engineeredChar": tensorflow.io.FixedLenFeature((maxCharPerUrl,), tensorflow.int64),
        "engineeredWord": tensorflow.io.FixedLenFeature((maxWordPerUrl,), tensorflow.int64),
        "engineeredNgram": tensorflow.io.FixedLenFeature((maxWordPerUrl * maxNgramPerUrl,), tensorflow.int64),
        "label": tensorflow.io.FixedLenFeature([], tensorflow.int64)
    }
    def _parse_fn(record, feature_map):
        example = tensorflow.io.parse_single_example(serialized=record, features=feature_map)
        onehot_label = tensorflow.cast(tensorflow.one_hot(tensorflow.where(tensorflow.equal(example["label"], 1), 1, 0), depth=2), dtype=tensorflow.int64)
        return {"char_input": example["engineeredChar"], "word_input": example["engineeredWord"],
                "ngram_input": example["engineeredNgram"]}, onehot_label
    dataset = dataset.map(lambda record: _parse_fn(record, feature_map))
    return dataset.shuffle(shuffle_buffer_size).batch(batch_size)
def read_tfrecord(path, maxWordPerUrl, maxCharPerUrl, maxNgramPerUrl, read_batch_size):
    dataset = tensorflow.data.TFRecordDataset(path)
    feature_map = {
        "engineeredChar": tensorflow.io.FixedLenFeature((maxCharPerUrl,), tensorflow.int64),
        "engineeredWord": tensorflow.io.FixedLenFeature((maxWordPerUrl,), tensorflow.int64),
        "engineeredNgram": tensorflow.io.FixedLenFeature((maxWordPerUrl * maxNgramPerUrl,), tensorflow.int64),
        "label": tensorflow.io.FixedLenFeature([], tensorflow.int64)
    }
    def _parse_fn(record, feature_map):
        example = tensorflow.io.parse_single_example(serialized=record, features=feature_map)
        onehot_label = tensorflow.cast(tensorflow.one_hot(tensorflow.where(tensorflow.equal(example["label"], 1), 1, 0), depth=2), dtype=tensorflow.int64)
        return {"char_input": example["engineeredChar"], "word_input": example["engineeredWord"],
                "ngram_input": example["engineeredNgram"]}, onehot_label
    dataset = dataset.map(lambda record: _parse_fn(record, feature_map))
    return dataset.batch(read_batch_size)
 
def input_small_fn(path, maxWordPerUrl, maxCharPerUrl, maxNgramPerUrl):
    dataset = tensorflow.data.TFRecordDataset(path)
    feature_map = {
        "engineeredChar": tensorflow.io.FixedLenFeature((maxCharPerUrl,), tensorflow.int64),
        "engineeredWord": tensorflow.io.FixedLenFeature((maxWordPerUrl,), tensorflow.int64),
        "engineeredNgram": tensorflow.io.FixedLenFeature((maxWordPerUrl * maxNgramPerUrl,), tensorflow.int64),
        "label": tensorflow.io.FixedLenFeature([], tensorflow.int64)
    }
    def _parse_fn(record, feature_map):
        example = tensorflow.io.parse_single_example(serialized=record, features=feature_map)
        onehot_label = tensorflow.cast(tensorflow.one_hot(tensorflow.where(tensorflow.equal(example["label"], 1), 1, 0), depth=2), dtype=tensorflow.int64)
        return {"char_input": example["engineeredChar"], "word_input": example["engineeredWord"],
                "ngram_input": example["engineeredNgram"]}, onehot_label
    dataset = dataset.map(lambda record: _parse_fn(record, feature_map))
    return dataset.take(100000).shuffle(10000).batch(1000)
 
def get_label_and_urlstring(path):
    dataset = tensorflow.data.TFRecordDataset(path)
    feature_map = {
        "url": tensorflow.io.FixedLenFeature([], tensorflow.string),
        "label":tensorflow.io.FixedLenFeature([], tensorflow.int64),
    }
    def _parse_fn(record, feature_map):
        example = tensorflow.io.parse_single_example(serialized=record, features=feature_map)
        one_hot_label = tensorflow.cast(tensorflow.one_hot(tensorflow.where(tensorflow.equal(example["label"], 1),1,0), depth=2), dtype=tensorflow.int64)
        return {"url": example["url"], "label": one_hot_label}
    
    return dataset.map(lambda record: _parse_fn(record, feature_map))
    
    
def get_label_url_source(path):
    dataset = tensorflow.data.TFRecordDataset(path)
    feature_map = {
        "url": tensorflow.io.FixedLenFeature([], tensorflow.string),
        "label":tensorflow.io.FixedLenFeature([], tensorflow.int64),
        "urlSource":tensorflow.io.FixedLenFeature([], tensorflow.string)
    }
    def _parse_fn(record, feature_map):
        example = tensorflow.io.parse_single_example(serialized=record, features=feature_map)
        one_hot_label = tensorflow.cast(tensorflow.one_hot(tensorflow.where(tensorflow.equal(example["label"], 1),1,0), depth=2), dtype=tensorflow.int64)
        return {"url": example["url"], "label": one_hot_label, "urlSource": example["urlSource"]}
    
    dataset = dataset.map(lambda record: _parse_fn(record, feature_map))
    return dataset

In [2]:
import pandas as pd 

In [3]:
pd.read_parquet("/home/youngjai/sampleData/ailab/workspace/youngjai_kwon/pml-data/engineered-data/totalSet/2020-11-09/small_dictionary/wordTokenIndex")

Unnamed: 0,wordToken,Index
0,"""",1
1,"""""",2
2,"""""""",3
3,',4
4,''''''''''''''''''''''''''''''''''''''''''''''...,5
...,...,...
10501,驱动精灵@,10502
10502,로,10503
10503,으,10504
10504,을,10505


In [4]:
pd.read_parquet("/home/youngjai/sampleData/ailab/workspace/youngjai_kwon/pml-data/engineered-data/totalSet/2020-11-09/small_dictionary/charTokenIndex")

Unnamed: 0,charToken,Index
0,,1
1,,2
2,,3
3,,4
4,,5
...,...,...
1399,２,1400
1400,４,1401
1401,：,1402
1402,�,1403


In [5]:
word_seq_len = 200
char_seq_len = 200
ngram_seq_len = 20
chars_dict_len = 1406 
word_dict_len = 10507
ngram_dict_len = 1406
reg_lambda = 0 
emb_dim = 32

In [12]:
model = getModel(word_seq_len, char_seq_len, ngram_seq_len, chars_dict_len, word_dict_len, ngram_dict_len, reg_lambda, emb_dim)

shape of char_input:  (None, 200)
char-Embedding:  (None, 200, 32)
char-convOutput :  (None, 198, 256)
char - maxPool shape:  (None, 1, 256)
char-convOutput :  (None, 197, 256)
char - maxPool shape:  (None, 1, 256)
char-convOutput :  (None, 196, 256)
char - maxPool shape:  (None, 1, 256)
char-convOutput :  (None, 195, 256)
char - maxPool shape:  (None, 1, 256)
4 filter output pooled:  (None, 4, 256)
word-embedding:  (None, 200, 32)
word_flat shape:  (None, 1024)


In [7]:
train_base_dir = "/home/youngjai/sampleData/ailab/workspace/youngjai_kwon/pml-data/engineered-data/totalSet/2020-11-09/small_splitted/train.tfrecord/part-r-"
train_data_dir = [train_base_dir+str(idx).zfill(5) for idx in range(5)]

val_base_dir =  "/home/youngjai/sampleData/ailab/workspace/youngjai_kwon/pml-data/engineered-data/totalSet/2020-11-09/small_splitted/test.tfrecord/part-r-"
val_data_dir = [val_base_dir+str(idx).zfill(5) for idx in range(5)]

oop_base_dir = "/home/youngjai/sampleData/ailab/workspace/youngjai_kwon/pml-data/engineered-data/totalSet/2020-11-09/small_oop/tfrecords"
oop_data_dir = [oop_base_dir + str(idx).zfill(5) for idx in range(5)]

In [8]:
tr_dataset = input_fn(train_data_dir, word_seq_len, char_seq_len, ngram_seq_len, 10000, 256)
val_dataset = input_fn(val_data_dir, word_seq_len, char_seq_len, ngram_seq_len, 10000, 256)
oop_dataset = input_fn(oop_data_dir, word_seq_len, char_seq_len, ngram_seq_len, 10000, 256)



In [9]:
val_dataset.unbatch().take(3)

<TakeDataset shapes: ({char_input: (200,), word_input: (200,), ngram_input: (4000,)}, (2,)), types: ({char_input: tf.int64, word_input: tf.int64, ngram_input: tf.int64}, tf.int64)>

In [10]:
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

In [11]:
model.fit(tr_dataset, epochs=3, validation_data=val_dataset)

Epoch 1/3
     15/Unknown - 8s 507ms/step - loss: 1.2896 - accuracy: 0.5862

KeyboardInterrupt: 

In [17]:
model.save_weights("/home/youngjai/sampleData/ailab/workspace/youngjai_kwon/pml-data/model/2020-11-09/small_70k")

In [14]:
model = getModel(word_seq_len, char_seq_len, ngram_seq_len, chars_dict_len, word_dict_len, ngram_dict_len, reg_lambda, emb_dim)
model.load_weights("/home/youngjai/sampleData/ailab/workspace/youngjai_kwon/pml-data/model/2020-11-09/small_70k")

shape of char_input:  (None, 200)
char-Embedding:  (None, 200, 32)
char-convOutput :  (None, 198, 256)
char - maxPool shape:  (None, 1, 256)
char-convOutput :  (None, 197, 256)
char - maxPool shape:  (None, 1, 256)
char-convOutput :  (None, 196, 256)
char - maxPool shape:  (None, 1, 256)
char-convOutput :  (None, 195, 256)
char - maxPool shape:  (None, 1, 256)
4 filter output pooled:  (None, 4, 256)
word-embedding:  (None, 200, 32)
word_flat shape:  (None, 1024)


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f0fb8450668>

In [21]:
layers_wt = model.get_layer("word_layer_fsize_3").weights


In [26]:
def get_shape(lst, shape=()):
    from collections.abc import Sequence
    """
    returns the shape of nested lists similarly to numpy's shape.

    :param lst: the nested list
    :param shape: the shape up to the current recursion depth
    :return: the shape including the current depth
            (finally this will be the full depth)
    """

    if not isinstance(lst, Sequence):
        # base case
        return shape

    # peek ahead and assure all lists in the next depth
    # have the same length
    if isinstance(lst[0], Sequence):
        l = len(lst[0])
        if not all(len(item) == l for item in lst):
            msg = 'not all lists have the same length'
            raise ValueError(msg)

    shape += (len(lst), )

    # recurse
    shape = get_shape(lst[0], shape)

    return shape

In [27]:
get_shape(model.get_layer("word_layer_fsize_3").get_weights())

(2,)

In [28]:
get_shape(model.get_layer("word_layer_fsize_3").weights)

(2,)

In [31]:
val_dataset

<BatchDataset shapes: ({char_input: (None, 200), word_input: (None, 200), ngram_input: (None, 4000)}, (None, 2)), types: ({char_input: tf.int64, word_input: tf.int64, ngram_input: tf.int64}, tf.int64)>

In [45]:
model.inputs

[<tf.Tensor 'char_input:0' shape=(None, 200) dtype=float32>,
 <tf.Tensor 'word_input:0' shape=(None, 200) dtype=float32>,
 <tf.Tensor 'ngram_input:0' shape=(None, 4000) dtype=float32>]

In [46]:
model.get_layer("word_layer_fsize_3").output

<tf.Tensor 'word_layer_fsize_3/Identity:0' shape=(None, 198, 256) dtype=float32>

In [48]:
grad_model = tf.keras.models.Model(
            [model.inputs], [model.get_layer(LAYER_NAME).output, model.output]
)

In [52]:
grads = tf.GradientTape().gradient(model.output[:,0], layers_wt)[0]

RuntimeError: GradientTape.gradient can only be called once on non-persistent tapes.

결국, 전체 output에 대해서, featuremap이 얼마나 바뀌었는지를 본다. 
output 을 / feature map으로 ...       
       
최적화 대상이 되는 식을 tf.gradient 안으로 넣는다. 


In [None]:

for feature_map in feature_maps
    with tf.GradientTape() as tape:
        y_c = self.logits_layer[:,_class_idx]
    tf.gradients(y_c, feature_map)[0]


In [35]:
from model_class import URLNet

url_class = URLNet(word_seq_len, char_seq_len, ngram_seq_len, chars_dict_len, word_dict_len, ngram_dict_len, reg_lambda, emb_dim)

In [36]:
urlnet = url_class.build()

TypeError: 'int' object is not iterable

In [28]:
urlnet.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

AttributeError: 'function' object has no attribute 'compile'