In [1]:
# Read CSVs
import load_data as ld

# To load train and test data
# No need to repeat loading data if just need to change training parameters
in_height = 64
in_width = 64
num_rows = 1000

inputs = ld.read_data('train.csv', in_height, in_width, nrows=num_rows)
labels = ld.read_label('train_label.csv', nrows=num_rows)
x_predict = ld.read_data('test.csv', in_height, in_width, nrows=num_rows)

./data/train.csv  - data shape =  (1000, 4096)
./data/train_label.csv  - label shape =  (1000,)
./data/test.csv  - data shape =  (1000, 4096)


In [2]:
# split inputs for training and testing
import numpy as np

train_ratio = 0.8

np.random.seed(0)
mask = np.random.rand(inputs.shape[0]) <= train_ratio

x_train = inputs[mask]
y_train = labels[mask]
x_test = inputs[~mask]
y_test = labels[~mask]

print("x_train.shape", x_train.shape)
print("y_train.shape", y_train.shape)
print("x_test.shape", x_test.shape)
print("y_test.shape", y_test.shape)

x_train.shape (797, 4096)
y_train.shape (797,)
x_test.shape (203, 4096)
y_test.shape (203,)


In [3]:
import tensorflow as tf
from conv_net import conv_net

learning_rate = 0.0001
num_classes = 2  # total classes (0 or 1)
dropout = 0.25  # Dropout, probability to drop a unit


def model_fn(features, labels, mode):
    # Build the neural network
    # Because Dropout have different behavior at training and prediction time, we
    # need to create 2 distinct computation graphs that still share the same weights.
    logits_train = conv_net(features, num_classes, dropout, reuse=False,
                            is_training=True)
    logits_test = conv_net(features, num_classes, dropout, reuse=True,
                           is_training=False)

    # Predictions
    pred_classes = tf.argmax(logits_test, axis=1)
    pred_probas = tf.nn.softmax(logits_test)

    # If prediction mode, early return
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions=pred_probas)

    # Define loss and optimizer
    loss_op = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits_train, labels=tf.cast(labels, dtype=tf.int32)))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(loss_op,
                                  global_step=tf.train.get_global_step())

    # Evaluate the accuracy of the model
    acc_op = tf.metrics.accuracy(labels=labels, predictions=pred_classes)

    print("Current accuracy of model", acc_op)

    # TF Estimators requires to return a EstimatorSpec, that specify
    # the different ops for training, evaluating, ...
    estim_specs = tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=pred_classes,
        loss=loss_op,
        train_op=train_op,
        eval_metric_ops={'accuracy': acc_op})

    return estim_specs

In [4]:
# Train the model
import tensorflow as tf
import csv

train_epoch = 5
batch_size = 128
num_steps = None
ckpt_steps = 10
max_ckpt = 50

rc = tf.estimator.RunConfig(model_dir = "./model", keep_checkpoint_max=max_ckpt, save_checkpoints_steps=ckpt_steps)
model = tf.estimator.Estimator(model_fn, config=rc)
# model = tf.estimator.Estimator(model_fn)

# Define the input function for training
input_fn = tf.estimator.inputs.numpy_input_fn(
    x= {'file': x_train}, y=y_train,
    batch_size=batch_size, num_epochs=train_epoch, shuffle=True)
# Train the Model
model.train(input_fn, steps=num_steps)

INFO:tensorflow:Using config: {'_model_dir': './model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 10, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 50, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11973ea20>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Use the retry module or similar alternatives.
Current accuracy of model (<tf.Tensor 'accuracy/value:0' shape=() dtype=float32>, <tf.Tensor 'accuracy/update_op:0' shape=() dtype=float32>)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tenso

<tensorflow.python.estimator.estimator.Estimator at 0x11973e9e8>

In [5]:
# Evaluate the Model
# Define the input function for evaluating
input_fn = tf.estimator.inputs.numpy_input_fn(
    x={'file': x_test}, y=y_test,
    batch_size=batch_size, shuffle=False)

# Use the Estimator 'evaluate' method
e = model.evaluate(input_fn)
total_steps = e['global_step']
print("global_step:", e['global_step'])
print('accuracy = ', e['accuracy'], "loss = ", e['loss'])

INFO:tensorflow:Calling model_fn.
Current accuracy of model (<tf.Tensor 'accuracy/value:0' shape=() dtype=float32>, <tf.Tensor 'accuracy/update_op:0' shape=() dtype=float32>)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-04-05-16:32:43
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./model/model.ckpt-32
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-04-05-16:32:44
INFO:tensorflow:Saving dict for global step 32: accuracy = 0.8669951, global_step = 32, loss = 1.0535719
global_step: 32
accuracy =  0.8669951 loss =  1.0535719


In [8]:
# Evaluate checkpoints
import pandas as pd

print('total_steps = ', total_steps)

total_ckpts = total_steps//ckpt_steps
eval_results = np.zeros((total_ckpts, 3))

for i in range(0, total_ckpts):
    j = (i + 1) * ckpt_steps + 1
    ckpt_path = './model/model.ckpt-' + str(j)
    print(ckpt_path)
    e = model.evaluate(input_fn, checkpoint_path=ckpt_path)
    eval_results[i,:] = [j, e['accuracy'], e['loss']]
    
df = pd.DataFrame(eval_results)
header = ["step","accuracy", "loss"]
df.to_csv('./data/eval_ckpts.csv', header=header, index=None)

total_steps =  32
./model/model.ckpt-11
INFO:tensorflow:Calling model_fn.
Current accuracy of model (<tf.Tensor 'accuracy/value:0' shape=() dtype=float32>, <tf.Tensor 'accuracy/update_op:0' shape=() dtype=float32>)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-04-05-16:33:54
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./model/model.ckpt-11
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-04-05-16:33:55
INFO:tensorflow:Saving dict for global step 11: accuracy = 0.6896552, global_step = 11, loss = 3.9306483
./model/model.ckpt-21
INFO:tensorflow:Calling model_fn.
Current accuracy of model (<tf.Tensor 'accuracy/value:0' shape=() dtype=float32>, <tf.Tensor 'accuracy/update_op:0' shape=() dtype=float32>)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-04-05-16:33:56
INFO:tensorflow:Graph was finalized.
INFO:te

In [9]:
# Predict
predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={'file': x_predict},
    batch_size=batch_size, num_epochs=1, shuffle=False)

results = model.predict(input_fn=predict_input_fn)

i = 0
with open('result.csv', 'w') as csvfile:
    csv_writer = csv.writer(csvfile,)
    csv_writer.writerow(["sample_id", "malware"])
    for result in results:
        csv_writer.writerow([i, result[1]])
        i = i+1

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./model/model.ckpt-32
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
