In [1]:
import argparse
import logging
import os
import pickle
import sys
import datetime

from  mpvae_replica import MODEL
from evals import compute_metrics

import theano
import theano.tensor as T
import lasagne

import dagshub
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

from common.tools import *
import common.tools as ct



In [2]:
l2_coeff=1.0
nll_coeff=0.1
c_coeff=200.
weight_regularizer=1e-4
latent_dim=16
random_seed=42
learning_rate_start=1e-3
lr_decay_ratio=0.5
lr_decay_times=3
max_epoch=15
batch_size=32
keep_prob=0.33

In [3]:
# parser = argparse.ArgumentParser()
# parser.add_argument("GRAPH_VER", help="version of the graph you want regex to label your CSV with", type=str)
# parser.add_argument("DATASET_PATH", help="path to your input CSV", type=str)
# args = parser.parse_args()

# GRAPH_VER = args.GRAPH_VER
# DATASET_PATH = args.DATASET_PATH

# CODE_COLUMN = "code_block"
# TARGET_COLUMN = "graph_vertex_id"

In [4]:
GRAPH_VER = "7"
DATASET_PATH = "../data/markup_data.csv"

MODEL_DIR = "../models/semi_vae_graph_v{}.sav".format(GRAPH_VER)
TFIDF_DIR = "../models/tfidf_semi_vae_graph_v{}.pickle".format(GRAPH_VER)
SUMMARY_DIR = "../models/vae_summary/"

CODE_COLUMN = "code_block"
TARGET_COLUMN = "graph_vertex_id"
RESUME = False


# ------------

df = load_data(DATASET_PATH)
label_dim = int(np.max(df[TARGET_COLUMN].unique()) - np.min(df[TARGET_COLUMN].unique()) + 1)

kfold_params = {
    "n_splits": 15,
    "random_state": random_seed,
    "shuffle": True,
}

data_meta = {
    "DATASET_PATH": DATASET_PATH,
    "nrows": df.shape[0],
    "label": get_graph_vertices(GRAPH_VER),
    "model": MODEL_DIR,
    "script_dir": "nl2ml" + os.path.abspath('').split("nl2ml",1)[1] ,
}


vertices parsed: ['Hypothesis', 'Environment', 'Data_Extraction', 'EDA', 'Data_Transform', 'Model_Train', 'Model_Evaluation', 'Hyperparam_Tuning', 'Vizualization', 'Data_Export', 'Model_Deploy', 'Other']


In [5]:
df_train, df_test = train_test_split(df, test_size=0.3)
vect_text_train = ct.tfidf_fit_transform(df_train[CODE_COLUMN], {"smooth_idf": True,}, TFIDF_DIR)
vect_text_test = ct.tfidf_transform(df_test[CODE_COLUMN], {"smooth_idf": True,}, TFIDF_DIR)

target_train = np.array(df_train[TARGET_COLUMN])
target_test = np.array(df_test[TARGET_COLUMN])
feat_train = np.array(df_train[CODE_COLUMN])
feat_test = np.array(df_test[CODE_COLUMN])
feat_dim = vect_text_train.shape[1]

In [6]:
session_config = tf.compat.v1.ConfigProto()
session_config.gpu_options.allow_growth = True
sess = tf.compat.v1.Session(config=session_config)

In [7]:
model_params = {
    "is_training":True, 
    "label_dim":1, 
    "feat_dim":feat_dim, 
    "n_train_sample":1000, 
    "n_test_sample":10,        
    "l2_coeff":l2_coeff,        
    "nll_coeff":nll_coeff,            
    "c_coeff":c_coeff,       
    "weight_regularizer":weight_regularizer,
    "latent_dim":latent_dim,         
    "random_seed":random_seed
}

In [8]:
model = MODEL(**model_params)



---- Tensor("Mean_2:0", shape=(), dtype=float32)
Instructions for updating:
Use `tf.cast` instead.
---- Tensor("Mean_2:0", shape=(), dtype=float32)
---- Tensor("Mean_6:0", shape=(), dtype=float32)


In [9]:
global_step = tf.Variable(0, name='global_step', trainable=False)

one_epoch_iter = df_train.shape[0] / 32

learning_rate_params = { 
    "learning_rate":learning_rate_start,
    "global_step":global_step,
    "decay_steps":df_train.shape[0] / batch_size * (max_epoch / lr_decay_times), 
    "decay_rate":lr_decay_ratio, 
    "staircase":True,
}

In [10]:
learning_rate = tf.compat.v1.train.exponential_decay(**learning_rate_params)
    #log the learning rate 

In [11]:
tf.compat.v1.summary.scalar('learning_rate', learning_rate)

<tf.Tensor 'learning_rate:0' shape=() dtype=string>

In [12]:
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)
reset_optimizer_op = tf.compat.v1.variables_initializer(optimizer.variables())

In [13]:
var_x_encoder = tf.compat.v1.trainable_variables('feat_encoder')
update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    if RESUME:
        train_op = optimizer.minimize(model.total_loss, 
                                      var_list = var_x_encoder, 
                                      global_step = global_step)
    else:
        train_op = optimizer.minimize(model.total_loss, 
                                      global_step = global_step)

In [14]:
param_setting = "lr-{}_lr-decay_{:.2f}_lr-times_{:.1f}_nll-{:.2f}_l2-{:.2f}_c-{:.2f}".format(
    learning_rate_start, 
    lr_decay_ratio, 
    lr_decay_times, 
    nll_coeff, 
    l2_coeff, 
    c_coeff)

create_path(SUMMARY_DIR+param_setting)

In [15]:
merged_summary = tf.compat.v1.summary.merge_all() # gather all summary nodes together
summary_writer = tf.compat.v1.summary.FileWriter(SUMMARY_DIR+param_setting+"/",
                                                 sess.graph) 

In [16]:
sess.run(tf.compat.v1.global_variables_initializer()) 
# initialize the global variables in tensorflow
saver = tf.compat.v1.train.Saver(max_to_keep=1) 
    #initializae the model saver

In [17]:
def MakeSummary(name, value):
    """Creates a tf.Summary proto with the given name and value."""
    summary = tf.compat.v1.Summary()
    val = summary.value.add()
    val.tag = str(name)
    val.simple_value = float(value)
    return summary

In [25]:
def train_step(sess, model, merged_summary, summary_writer, input_label, input_feat, train_op, global_step):
    feed_dict={}
    feed_dict[model.input_feat]=input_feat
    feed_dict[model.input_label]=input_label
    feed_dict[model.keep_prob]=keep_prob
    
#     print(type(global_step))
#     print(type(model.c_loss))
#     print(type(model.nll_loss))
#     print(type(model.total_loss))
#     print(type(merged_summary))
#     print(type(model.indiv_prob))
#     print(tf.shape(model.c_loss))
#     print(tf.shape(model.nll_loss))
#     print(tf.shape(model.total_loss))
#     print(tf.shape(merged_summary))
#     print(tf.shape(model.indiv_prob))
#     print(type(input_feat))
#     print(type(input_label))
#     print(type(keep_prob))

    temp, step, c_loss, c_loss_x, nll_loss, nll_loss_x, l2_loss, kl_loss, total_loss, summary, indiv_prob = \
    sess.run([train_op, global_step, model.c_loss, 
              model.c_loss_x, model.nll_loss, model.nll_loss_x, 
              model.l2_loss, model.kl_loss, model.total_loss, 
              merged_summary, model.indiv_prob], feed_dict)

    train_metrics = compute_metrics(indiv_prob, input_label, 0.5, all_metrics=False)
    macro_f1, micro_f1 = train_metrics['maF1'], train_metrics['miF1']

    summary_writer.add_summary(MakeSummary('train/nll_loss', nll_loss),step)
    summary_writer.add_summary(MakeSummary('train/l2_loss', l2_loss),step)
    summary_writer.add_summary(MakeSummary('train/total_loss', total_loss),step)
    summary_writer.add_summary(MakeSummary('train/macro_f1', macro_f1),step)
    summary_writer.add_summary(MakeSummary('train/micro_f1', micro_f1),step)
    print(macro_f1, micro_f1)

    return indiv_prob, nll_loss, nll_loss_x, kl_loss, total_loss, macro_f1, micro_f1


In [19]:
train_idx = np.array(list(range(target_train.shape[0])))

In [26]:
smooth_nll_loss=0.0 # label encoder decoder cross entropy loss
smooth_nll_loss_x=0.0 # feature encoder decoder cross entropy lossre
smooth_micro_f1 = 0.0 # micro_f1 score
smooth_kl_loss = 0.0
smooth_total_loss = 0.0
smooth_macro_f1 = 0.0
smooth_micro_f1 = 0.0

best_macro_f1 = 0.0 # best macro f1 for ckpt selection in validation
best_micro_f1 = 0.0 # best micro f1 for ckpt selection in validation
best_acc = 0.0 # best subset acc for ckpt selction in validation


check_freq=12

temp_label=[]
temp_indiv_prob=[]


for one_epoch in range(max_epoch):
    print('epoch '+str(one_epoch+1)+' starts!')
    np.random.shuffle(train_idx) # random shuffle the training indices

    for i in range(int(len(train_idx)/float(batch_size))):
        start = i*batch_size
        end = (i+1)*batch_size
    #             input_feat = get_data.get_feat(data,train_idx[start:end]) # get the NLCD features 
    #             input_label = get_data.get_label(data,train_idx[start:end]) # get the prediction labels 
        input_feat = vect_text_train[train_idx[start:end]].toarray()
        input_label = np.expand_dims(target_train[train_idx[start:end]], axis=1)
        #train the model for one step and log the training loss
        indiv_prob, nll_loss, nll_loss_x, kl_loss, total_loss, macro_f1, micro_f1 = \
        train_step(sess, model, merged_summary, summary_writer, input_label,input_feat, train_op, global_step)

        break
        smooth_nll_loss += nll_loss
        smooth_nll_loss_x += nll_loss_x
        smooth_macro_f1 += macro_f1
        smooth_micro_f1 += micro_f1
        smooth_kl_loss += kl_loss
        smooth_total_loss += total_loss

#         temp_label.append(input_label) #log the labels
#         temp_indiv_prob.append(indiv_prob) #log the individual prediction of the probability on each label

        current_step = sess.run(global_step) #get the value of global_step
        lr = sess.run(learning_rate)
        summary_writer.add_summary(MakeSummary('learning_rate', lr), current_step)

        if current_step % check_freq==0: #summarize the current training status and print them out
            nll_loss = smooth_nll_loss / float(check_freq)
            nll_loss_x = smooth_nll_loss_x / float(check_freq)
            kl_loss = smooth_kl_loss / float(check_freq)
            total_loss = smooth_total_loss / float(check_freq)
            macro_f1 = smooth_macro_f1 / float(check_freq)
            micro_f1 = smooth_micro_f1 / float(check_freq)

#             temp_indiv_prob = np.reshape(np.array(temp_indiv_prob), (-1))
#             temp_label = np.reshape(np.array(temp_label), (-1))

#             temp_indiv_prob = np.reshape(temp_indiv_prob,(-1, label_dim))
#             temp_label = np.reshape(temp_label,(-1, label_dim))

            time_str = datetime.datetime.now().isoformat()
            print("step=%d  %s\nlr=%.6f\nmacro_f1=%.6f, micro_f1=%.6f\nnll_loss=%.6f\tnll_loss_x=%.6f\nkl_loss=%.6f\ntotal_loss=%.6f\n" % (current_step, 
                time_str, lr, macro_f1, micro_f1, 
                nll_loss*nll_coeff, nll_loss_x*nll_coeff,
                kl_loss, total_loss))

#             temp_indiv_prob=[]
#             temp_label=[]

            smooth_nll_loss = 0.0
            smooth_nll_loss_x = 0.0
            smooth_kl_loss = 0.0
            smooth_total_loss = 0.0
            smooth_macro_f1 = 0.0
            smooth_micro_f1 = 0.0

            print("--------------------------------")

epoch 1 starts!
nan nan
epoch 2 starts!
nan nan
epoch 3 starts!
nan nan
epoch 4 starts!
nan nan
epoch 5 starts!
nan nan
epoch 6 starts!
nan nan
epoch 7 starts!
nan nan
epoch 8 starts!
nan nan
epoch 9 starts!
nan nan
epoch 10 starts!
nan nan
epoch 11 starts!
nan nan
epoch 12 starts!
nan nan
epoch 13 starts!
nan nan
epoch 14 starts!
nan nan
epoch 15 starts!
nan nan


In [21]:
# metrics_path = os.path.join(EXPERIMENT_DATA_PATH, "metrics.csv")
# params_path = os.path.join(EXPERIMENT_DATA_PATH, "params.yml")
# with dagshub.dagshub_logger(metrics_path=metrics_path, hparams_path=params_path) as logger:
#     print("selecting hyperparameters")
#     tfidf_params, svm_params, bagging_params, metrics = select_hyperparams(df, kfold_params, TFIDF_DIR, MODEL_DIR)
#     print("logging the results")
#     logger.log_hyperparams({"data": data_meta})
#     logger.log_hyperparams({"tfidf": tfidf_params})
#     logger.log_hyperparams({"bagging": bagging_params})
#     logger.log_hyperparams({"model": svm_params})
#     logger.log_hyperparams({"kfold": kfold_params})
#     logger.log_metrics(metrics)
# print("finished")
