In [None]:
from cat2vec import Options, Cat2Vec
import tensorflow as tf
from sample_encoding import *
from utility import load_data
import random
import numpy as np
import pandas as pd
from time import time
from sklearn import metrics

def generate_discriminant_ctr_batch(data, Y, opts):
    """Yield an endless stream of fixed-size training batches.

    Samples are consumed in order and the read position wraps around to the
    start of ``data`` once the end is reached, so the generator never stops.

    Args:
        data: indexable collection of samples; each sample is a sliceable
            sequence of numeric ids (a list or a 1-D array).
        Y: indexable collection of class indices, aligned with ``data``.
        opts: object exposing ``batch_size``, ``sequence_length`` and
            ``num_classes``.

    Yields:
        ``(batch, labels)`` where ``batch`` is a float array of shape
        ``(batch_size, sequence_length)`` holding each sample truncated to
        ``sequence_length`` and right-padded with zeros, and ``labels`` is a
        float array of shape ``(batch_size, num_classes)`` with a single 1 per
        row at the sample's class index.
    """
    data_index = 0
    while True:
        # Zero-filled arrays make the right-padding implicit and work for both
        # list rows and ndarray rows (list concatenation only worked for lists).
        batch = np.zeros((opts.batch_size, opts.sequence_length))
        labels = np.zeros((opts.batch_size, opts.num_classes))
        # range() instead of the Python 2-only xrange(); batch sizes are small.
        for i in range(opts.batch_size):
            sample = data[data_index][0:opts.sequence_length]
            batch[i, :len(sample)] = sample
            labels[i, Y[data_index]] = 1.
            data_index = (data_index + 1) % len(data)
        yield batch, labels
        
    
def generate_discriminant_test_batch(data, Y, opts):
    """Yield every full batch of ``data`` exactly once, in order.

    Only ``len(data) // opts.batch_size`` batches are produced; trailing
    samples that do not fill a whole batch are dropped.

    Args:
        data: indexable collection of samples; each sample is a sliceable
            sequence of numeric ids (a list or a 1-D array).
        Y: indexable collection of class indices, aligned with ``data``.
        opts: object exposing ``batch_size``, ``sequence_length`` and
            ``num_classes``.

    Yields:
        ``(batch, labels)`` where ``batch`` is a float array of shape
        ``(batch_size, sequence_length)`` holding each sample truncated to
        ``sequence_length`` and right-padded with zeros, and ``labels`` is a
        float array of shape ``(batch_size, num_classes)`` with a single 1 per
        row at the sample's class index.
    """
    data_index = 0
    print('Total testing batch number',len(data)//opts.batch_size)
    for _ in range(len(data)//opts.batch_size):
        # Zero-filled arrays make the right-padding implicit and work for both
        # list rows and ndarray rows (list concatenation only worked for lists).
        batch = np.zeros((opts.batch_size, opts.sequence_length))
        labels = np.zeros((opts.batch_size, opts.num_classes))
        # range() instead of the Python 2-only xrange(); batch sizes are small.
        for i in range(opts.batch_size):
            sample = data[data_index][0:opts.sequence_length]
            batch[i, :len(sample)] = sample
            labels[i, Y[data_index]] = 1.
            data_index = data_index + 1
        yield batch, labels
        

class DiscriminantCTR(Cat2Vec):
    """Softmax classifier over embedded categorical sequences.

    The embedding table is either randomly initialised or seeded from a
    pre-trained matrix. The graph, the training loop and an AUC evaluation
    helper all live on this class.
    """

    def __init__(self, options, session, cate2id, id2cate, pre_trained_emb=None, trainable=True, pre_trained_path=None):
        """Store the embedding configuration, then run the base initialiser.

        Args:
            options: option object forwarded to ``Cat2Vec``.
            session: TensorFlow session forwarded to ``Cat2Vec``.
            cate2id: forwarded positionally to ``Cat2Vec.__init__``.
            id2cate: forwarded positionally to ``Cat2Vec.__init__``.
            pre_trained_emb: optional embedding matrix used to initialise the
                embedding variable.
            trainable: whether a pre-trained embedding variable is trainable.
            pre_trained_path: optional CSV path; when given, the matrix loaded
                from it replaces ``pre_trained_emb``.
        """
        # Honour a matrix passed in directly; it used to be silently discarded.
        if pre_trained_emb is not None:
            pre_trained_emb = np.asarray(pre_trained_emb, dtype=np.float32)
        self.pre_trained_emb = pre_trained_emb
        self.trainable = trainable
        if pre_trained_path is not None:
            self.load_pre_trained(pre_trained_path)
        # The attributes above are set first because build_graph() reads them.
        # NOTE(review): presumably Cat2Vec.__init__ calls build_graph() - confirm.
        Cat2Vec.__init__(self, options, session, cate2id, id2cate)

    def load_pre_trained(self, path):
        """Load a header-less, comma-separated float matrix into ``pre_trained_emb``."""
        self.pre_trained_emb = np.array(pd.read_csv(path, sep=',',header=None),dtype=np.float32)
        print('pre-trained shape',self.pre_trained_emb.shape)

    def build_graph(self):
        """Build the model graph and initialise all variables.

        Creates the ``train_inputs`` / ``train_labels`` placeholders and sets
        ``embeddings``, ``probs``, ``predictions``, ``loss``, ``accuracy`` and
        ``train_operator`` on the instance.

        Raises:
            Exception: if ``pre_trained_emb`` does not have shape
                ``(vocabulary_size, embedding_size)``.
        """
        opts = self._options
        # NOTE(review): get_batch_pair_indices, sample_encoding and weight_bias
        # are not defined in this notebook; presumably they come from the
        # `sample_encoding` star import - confirm.
        first_indices, second_indices = \
            get_batch_pair_indices(opts.batch_size, opts.sequence_length)
        self.train_inputs = tf.placeholder(tf.int32,
                                           shape=[opts.batch_size,
                                                  opts.sequence_length])
        self.train_labels = tf.placeholder(tf.int32, shape=[opts.batch_size,
                                                            opts.num_classes])
        l2_loss = tf.constant(0.0)
        with tf.device('/cpu:0'):
            if self.pre_trained_emb is None:
                self.embeddings = tf.Variable(tf.random_normal([opts.vocabulary_size,
                                  opts.embedding_size],
                                 stddev=1.0 / np.sqrt(opts.embedding_size)
                                 ))
            else:
                if self.pre_trained_emb.shape == (opts.vocabulary_size, opts.embedding_size):
                    self.embeddings = tf.get_variable(name="embeddings",
                                                      shape=[opts.vocabulary_size, opts.embedding_size],
                                                      dtype=tf.float32,
                                                      initializer=tf.constant_initializer(self.pre_trained_emb),
                                                      trainable=self.trainable)
                    print('Inited by pre-trained embeddings')
                else:
                    print('pre_trained_emb shape', self.pre_trained_emb.shape )
                    print('vocabulary_size,embedding_size',(opts.vocabulary_size, opts.embedding_size))
                    raise Exception('Error', 'pre_trained_emb size mismatch')
            embed = tf.nn.embedding_lookup(self.embeddings, self.train_inputs)
            encoded = sample_encoding(embed, opts.interaction_times,
                                      opts.batch_size, opts.sequence_length,
                                      opts.sequence_length, first_indices,
                                      second_indices, opts.gate_type,
                                      opts.norm_type)
            # The classifier sees the encoded features concatenated with the
            # flattened raw embeddings.
            encoded = tf.concat(1,[encoded,tf.reshape(embed,[opts.batch_size,-1])])
            with tf.name_scope("output"):
                encoded_size = encoded.get_shape().as_list()[1]
                print(encoded.get_shape().as_list())
                W, b = weight_bias([encoded_size, opts.num_classes], [
                                   opts.num_classes], bias_init=0.)
                # Only the output layer is L2-regularised.
                l2_loss += tf.nn.l2_loss(W)
                l2_loss += tf.nn.l2_loss(b)
                scores = tf.matmul(encoded, W) + b
                self.probs = tf.nn.softmax(scores)
                self.predictions = tf.argmax(scores, 1, name="predictions")

            with tf.name_scope("loss"):
                losses = tf.nn.softmax_cross_entropy_with_logits(
                    scores, tf.to_float(self.train_labels))
                self.loss = tf.reduce_mean(losses) + opts.l2_reg_lambda * l2_loss

            with tf.name_scope("accuracy"):
                correct_predictions = tf.equal(
                    self.predictions, tf.argmax(self.train_labels, 1))
                self.accuracy = tf.reduce_mean(
                    tf.cast(correct_predictions, "float"), name="accuracy")
            # The optimised (and reported) loss is clipped to [-50, 50].
            self.loss = tf.clip_by_value(self.loss,-50,50)
            optimizer = \
                tf.train.GradientDescentOptimizer(opts.learning_rate)

            self.train_operator = \
                optimizer.minimize(self.loss,
                                   gate_gradients=optimizer.GATE_NONE)
        tf.initialize_all_variables().run()
        print("Initialized")

    def eval_auc(self, test_data, test_Y):
        """Score the full test batches and print AUC and mean loss.

        Samples are truncated to ``sequence_length`` and right-padded with
        zeros, the same layout the batch generators in this notebook feed
        during training. Trailing samples that do not fill a whole batch are
        skipped.

        Args:
            test_data: indexable collection of samples (sliceable id sequences).
            test_Y: indexable collection of class indices aligned with
                ``test_data``.

        Returns:
            The ROC AUC computed from the class-1 probability, or ``None`` when
            there are fewer test samples than one batch.
        """
        opts = self._options
        batch_num = len(test_data)//opts.batch_size
        print('Total testing batch number', batch_num)
        if batch_num == 0:
            # Nothing to score; avoids an empty ROC input and a division by zero.
            print('Fewer test samples than one batch, skipping evaluation')
            return None
        probs = []
        y = []
        losses = 0.
        data_index = 0
        for _ in range(batch_num):
            # Fresh zero arrays per batch so padding never carries stale values.
            batch = np.zeros((opts.batch_size, opts.sequence_length))
            labels = np.zeros((opts.batch_size, opts.num_classes))
            for i in range(opts.batch_size):
                sample = test_data[data_index][0:opts.sequence_length]
                # Pad on the right to match the training generators; this
                # method used to pad on the left.
                batch[i, :len(sample)] = sample
                labels[i, test_Y[data_index]] = 1.
                data_index = data_index + 1
            feed_dict = {self.train_inputs: batch,
                         self.train_labels: labels}
            prob, loss = self._session.run([self.probs, self.loss],
                                           feed_dict=feed_dict)
            probs.extend(prob)
            losses += loss
            y.extend(np.argmax(labels, axis=1))
        # Column 1 of the softmax output is used as the positive-class score.
        probs = [p[1] for p in probs]
        fpr, tpr, thresholds = metrics.roc_curve(y, probs, pos_label=1)
        auc = metrics.auc(fpr, tpr)
        # self.loss is the clipped, L2-regularised loss, averaged over batches.
        print('AUC:', auc, 'avg log loss:', losses * 1. / batch_num)
        return auc

    def train(self, batch_generator, num_steps, test_data,test_Y):
        """Run ``num_steps`` optimisation steps with periodic reporting.

        Every 500 steps (including step 0) the running loss, accuracy and
        per-step time are printed and ``eval_auc`` is run on the test data.

        Args:
            batch_generator: iterator yielding ``(inputs, labels)`` batches.
            num_steps: number of training steps to run.
            test_data: samples passed to ``eval_auc``.
            test_Y: labels passed to ``eval_auc``.
        """
        report_every = 500
        losses = 0.
        acc = 0.
        start = time()
        for step in range(num_steps):
            # next() works on Python 2 and 3; the .next() method is Python 2 only.
            batch_inputs, batch_labels = next(batch_generator)
            feed_dict = {self.train_inputs: batch_inputs,
                         self.train_labels: batch_labels}
            _, loss, accuracy = self._session.run([self.train_operator,
                                                   self.loss,
                                                   self.accuracy],
                                                  feed_dict=feed_dict)
            losses += loss
            acc += accuracy
            if step % report_every == 0:
                t = time()-start
                if step > 0:
                    # Step 0 reports a single batch; later reports are averages.
                    losses /= report_every
                    t /= report_every
                    acc /= report_every
                print("Average loss at step ", step, ": ",
                      losses , ' accuracy: ', acc, 'time', t)
                losses = 0.
                acc = 0.

                print('Eval at step ', step)
                self.eval_auc(test_data, test_Y)
                print('Eval at Done ', step)
                # Restart the timer after evaluation so that the next per-step
                # timing measures training only.
                start = time()
                


In [None]:
# Each line of the training file is one sample: a comma-separated list of
# integer ids. Lines may have different lengths.
# The context manager closes the file handle, which used to be left open.
with open('./data/ipinyou/training_not_aligned.csv','r') as f:
    X = [[int(x) for x in line.strip().split(',')] for line in f]
# Labels are read as a single column and flattened to a 1-D array.
Y = np.array(pd.read_csv('./data/ipinyou/labels.csv',sep=',',header=None))
Y = Y.reshape(len(Y))
print('X length',len(X),'Y length', len(Y),Y[0:10])
# Every later cell indexes X and Y in lockstep, so fail early on a mismatch.
assert len(X) == len(Y), 'samples and labels are not aligned'

In [None]:
# sum([len(x) for x in X])/len(X)
# sum([1 for x in X if len(x)<=32])

In [None]:
# Class balance of the full label vector: zero labels are negatives and every
# other label counts as a positive.
neg_num = np.count_nonzero(Y == 0)
pos_num = len(Y) - neg_num
print('positive samples in training:',pos_num)
print('negative samples in training:',neg_num)

In [None]:
# train_test_split lives in sklearn.model_selection in current scikit-learn;
# the old sklearn.cross_validation module is kept as a fallback for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# Hold out 10% of the samples for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

In [None]:
def balance_data(X,y,ratio=0.5,random_seed=1337):
    """Down-sample negatives to a target positive/negative ratio.

    The data is shuffled, then every sample labelled 1 is kept together with
    the first ``int(pos_num / ratio)`` samples labelled 0, where ``pos_num``
    is the number of non-zero labels. The kept samples are shuffled again
    before being returned. Labels are truncated to integers before being
    compared against 1 and 0.

    Args:
        X: NumPy array of samples (the first axis indexes samples); an object
            array of lists works for variable-length samples.
        y: array of labels aligned with ``X``.
        ratio: desired positives / negatives ratio; 0.5 keeps up to two
            negatives per positive. Must be positive.
        random_seed: seed for NumPy's global random generator.

    Returns:
        ``(X_balanced, y_balanced)``, the selected samples and their labels.

    Raises:
        ValueError: if ``ratio`` is not positive.
    """
    if ratio <= 0:
        raise ValueError('ratio must be positive, got %r' % (ratio,))
    # Seed with the caller's value; the seed used to be hard-coded to 1337.
    np.random.seed(random_seed)
    shuffle_indices = np.random.permutation(np.arange(len(X)))
    X = X[shuffle_indices]
    y = np.asarray(y)[shuffle_indices]
    pos_num = np.count_nonzero(y)
    neg_num = int(pos_num / ratio)
    int_labels = y.astype(int)
    pos_positions = np.flatnonzero(int_labels == 1)[:pos_num]
    neg_positions = np.flatnonzero(int_labels == 0)[:neg_num]
    pos_count = len(pos_positions)
    neg_count = len(neg_positions)
    # Sorting restores the shuffled encounter order of the kept samples.
    keep = np.sort(np.concatenate([pos_positions, neg_positions]))
    keep = keep[np.random.permutation(np.arange(len(keep)))]
    # Selecting by index keeps X's own array layout, so ragged samples are not
    # re-packed through np.array().
    y_balanced = y[keep]
    X_balanced = X[keep]
    if np.count_nonzero(y_balanced) != pos_num:
        # Only reachable when some non-zero labels are not equal to 1.
        print('error')
    print('re-balanced data positive/negative: ',pos_count,'/',neg_count)
    return X_balanced, y_balanced

# Samples have different lengths, so the container is built as an explicit
# object array; recent NumPy refuses to infer one from ragged lists.
X_train_balanced,y_train_balanced =  balance_data(np.array(X_train, dtype=object),np.array(y_train))

In [None]:
# Each row of the dictionary file is used as (category, id): column 0 is the
# category value and column 1 its integer id.
reverse_dictionary_raw = np.array(pd.read_csv('./data/ipinyou/reverse_dictionary_not_aligned.csv', sep=',', header=None))
reverse_dictionary = {}
dictionary = {}
for item in reverse_dictionary_raw:
    reverse_dictionary[int(item[1])] = item[0]
    dictionary[item[0]] = int(item[1])
# The vocabulary size is the number of distinct categories. A stray max-id
# check that sat outside the loop was removed: it only looked at the last row,
# its result was overwritten here, and it failed on an empty file.
vocabulary_size = len(dictionary.keys())
print('vocabulary_size: ',vocabulary_size)
id2cate = reverse_dictionary
cate2id = dictionary

In [None]:
opts = Options()
print('Loading data...')

# Problem size
opts.vocabulary_size = vocabulary_size
opts.sequence_length = 22
opts.embedding_size = 32
opts.batch_size = 32

# Encoder settings
opts.interaction_times = 1
opts.gate_type = 'p_norm'
opts.norm_type = 'l2'

# Optimisation settings
opts.learning_rate = 0.1
opts.l2_reg_lambda = 1e-5

# Training batches come from the re-balanced training split.
batch_generator = generate_discriminant_ctr_batch(X_train_balanced,y_train_balanced,opts)
print('Building graph')
# Use pre_trained_path = None to start from randomly initialised embeddings.
pre_trained_path = './data/ipinyou/pre_trained_embs_1272384_discri_32_shuffle.csv.csv'
with tf.Graph().as_default(), tf.Session() as session:
    # NOTE(review): the constructor names its 3rd/4th parameters
    # (cate2id, id2cate) but receives (id2cate, cate2id) here. Both are passed
    # on positionally to Cat2Vec, so check against Cat2Vec's signature before
    # changing the order.
    discr_ctr = DiscriminantCTR(
        opts, session, id2cate, cate2id,
        pre_trained_emb=None,
        trainable=True,
        pre_trained_path=pre_trained_path)
    print('Training model')
    # Evaluation runs on the held-out split every 500 steps.
    discr_ctr.train(batch_generator, 10001,X_test, y_test)