In [1]:
import tensorflow as tf
import numpy as np
import functools
import glob
import pandas as pd
import time
from sklearn.metrics import roc_auc_score

from collections import defaultdict

In [2]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

get_available_gpus()

['/gpu:0', '/gpu:1']

In [3]:
def lazy_property(function):
    attribute = '_' + function.__name__

    @property
    @functools.wraps(function)
    def wrapper(self):
        if not hasattr(self, attribute):
            setattr(self, attribute, function(self))
        return getattr(self, attribute)
    return wrapper

In [4]:
class RnnClassifierModel:
    def __init__(self, data, target, vocab_size,
                 dropout=0.5, num_hidden=100, num_layers=2, emb_dim=256, num_classes=2):
        self.data = data
        self.target = target
        self.dropout = dropout
        self._num_layers = num_layers
        self._num_hidden = num_hidden
        self._num_layers = num_layers
        self._emb_dim = emb_dim
        self._vocab_size = vocab_size
        self._num_classes = num_classes
        
    @staticmethod
    def _weight_and_bias(in_size, out_size):
        weight = tf.truncated_normal([in_size, out_size], stddev=0.01)
        bias = tf.constant(0.1, shape=[out_size])
        return tf.Variable(weight), tf.Variable(bias)
    
    @staticmethod
    def _embedding_matrix(vocab_size, emb_dim):
        return tf.Variable(tf.random_uniform([vocab_size + 1, emb_dim]), name="embedding")
        
    @lazy_property
    def hidden_states(self):
        embedding_matrix = self._embedding_matrix(self._vocab_size, self._emb_dim)
        emb_vector = tf.nn.embedding_lookup(embedding_matrix, self.data)
        network = tf.nn.rnn_cell.GRUCell(self._num_hidden)
        network = tf.nn.rnn_cell.DropoutWrapper(
            network, output_keep_prob=self.dropout)
        network = tf.nn.rnn_cell.MultiRNNCell([network] * self._num_layers)
        output, _ = tf.nn.dynamic_rnn(network, emb_vector, dtype=tf.float32, swap_memory=True)
        # Select last output.
        output = tf.transpose(output, [1, 0, 2])
        return output
        
    @lazy_property
    def prediction(self):
        output = self.hidden_states
        last = output[-1, :, :]
        # Softmax layer.
        softmax_weights, softmax_bias = self._weight_and_bias(self._num_hidden, self._num_classes)
        probs = tf.nn.softmax(tf.matmul(last, softmax_weights) + softmax_bias)
        return probs
    
    @lazy_property
    def cost(self):
        probs = self.prediction
        logits = tf.log(probs)
        return tf.nn.sparse_softmax_cross_entropy_with_logits(logits, self.target)

    @lazy_property
    def optimize(self):
        optimizer = tf.train.AdamOptimizer()
        return optimizer.minimize(self.cost)

    @lazy_property
    def error(self):
        mistakes = tf.not_equal(
            tf.cast(self.target, tf.int64), tf.argmax(self.prediction, 1))
        return tf.reduce_mean(tf.cast(mistakes, tf.float32))

In [5]:
def build_dictionary(filenames):
    index = 1
    dictionary = {}

    for filename in filenames:
        with open(filename) as in_file:
            for row in in_file:
                words = (row.split(',')[1]).split()
                for word in words:
                    if word not in dictionary:
                        dictionary[word] = index
                        index += 1
    return dictionary

In [6]:
! wc -l ./train_data/train.csv

63609 ./train_data/train.csv


In [7]:
filenames = glob.glob("./train_data/*csv")
dictionary = build_dictionary(filenames)
embedding_dimension = 256

num_epochs = 100
batch_size = 256
num_objects = 63609

def preprocess(example, table):
    return table.lookup(tf.string_split(example).values)
    
def read_my_file_format(filename_queue):
    reader = tf.TextLineReader()
    key, record_string = reader.read(filename_queue)
    record_defaults = [[1], [''], [1]]
    dialog_id, dialog, label = tf.decode_csv(record_string, record_defaults=record_defaults)
    dialog = tf.pack([dialog])
    processed_example = preprocess(dialog, table)
    return processed_example, label

def input_pipeline(filenames, batch_size, num_epochs=None):
    filename_queue = tf.train.string_input_producer(
        filenames, num_epochs=num_epochs, shuffle=True)
    example, label = read_my_file_format(filename_queue)
    capacity = 100
    example_batch, label_batch = tf.train.batch(
        [example, label], 
        batch_size=batch_size, 
        capacity=capacity,
        dynamic_pad=True,
        allow_smaller_final_batch=True,
    )
    return example_batch, label_batch

In [8]:
print(len(dictionary))

13719


### Training step

In [None]:
keys = tf.constant(list(dictionary.keys()))
values = tf.constant(list(dictionary.values()), dtype=tf.int64)
table = tf.contrib.lookup.HashTable(
            tf.contrib.lookup.KeyValueTensorInitializer(keys, values), -1)

inputs, target = input_pipeline(filenames, batch_size=batch_size)

with tf.device("/gpu:1"):
    model = RnnClassifierModel(inputs, target, vocab_size=len(dictionary), num_classes=2)
    training_opt = model.optimize

    init_op = tf.global_variables_initializer()

saver = tf.train.Saver()

with tf.Session() as sess:
    table.init.run()
    sess.run(init_op)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    
    for i in range(num_epochs):
        start_time = time.time()
        epoch_prediction = np.array([])
        epoch_true_targets = np.array([])
        for _ in range(num_objects // batch_size + 1):
            start_time = time.time()
            sess.run(training_opt)
            batch_prediction = sess.run(model.prediction)[:, 1]
            batch_true_target = sess.run(target)
            epoch_prediction = np.concatenate([epoch_prediction, batch_prediction], axis=0)
            epoch_true_targets = np.concatenate([epoch_true_targets, batch_true_target], axis=0)
            #print("Batch error: {}".format(batch_error))
            #print("Batch running time: {}".format(time.time() - start_time))
            #epoch_error.append(batch_error)
        print("Epoch roc-auc: {}".format(roc_auc_score(epoch_true_targets, epoch_prediction)))
        print("Epoch {} is finished for {} sec".format(i, time.time() - start_time))
    coord.request_stop()
    coord.join(threads)
    save_path = saver.save(sess, "./tmp/model.ckpt")
    print("Model saved in file: %s" % save_path)

Epoch roc-auc: 0.5014070224141467
Epoch 0 is finished for 3.2983884811401367 sec
Epoch roc-auc: 0.49635059830778316
Epoch 1 is finished for 3.6774682998657227 sec
Epoch roc-auc: 0.4967166411467241
Epoch 2 is finished for 4.512049198150635 sec
Epoch roc-auc: 0.49811618703898247
Epoch 3 is finished for 3.7752153873443604 sec
Epoch roc-auc: 0.4987432818720713
Epoch 4 is finished for 3.421198844909668 sec
Epoch roc-auc: 0.49875175043347075
Epoch 5 is finished for 3.034984588623047 sec
Epoch roc-auc: 0.49473489477195565
Epoch 6 is finished for 5.296198606491089 sec
Epoch roc-auc: 0.5011181410470276
Epoch 7 is finished for 6.576495170593262 sec
Epoch roc-auc: 0.4981695725994325
Epoch 8 is finished for 3.0298824310302734 sec
Epoch roc-auc: 0.4995887668907453
Epoch 12 is finished for 2.973670721054077 sec


In [None]:
print('ads')

In [10]:
print("kek")

kek


In [None]:
test_X = tf.placeholder(dtype=tf.int64, shape=[1, None])

inputs, _ = input_pipeline(["./data/file0.csv", "./data/file1.csv"], 
                                batch_size=batch_size, )

model.data = inputs
prediction_op = model.prediction
saver = tf.train.Saver()
ckpt = tf.train.get_checkpoint_state("./tmp/")


with tf.Session() as sess:
    saver.restore(sess, ckpt.model_checkpoint_path)
    #sess.run(init_op)
    table.init.run()
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    res = sess.run(prediction_op, feed_dict={test_X: [[1, 2]]})
    print(res)
    coord.request_stop()
    coord.join(threads)