In [None]:
# Number of iterations
T = 1000

# Eta in the paper
learning_rate = 0.01

# Let w = weight matrix
# Let f_w = neural network, input: single feature vector, output: score

# Let f_w(x[i][j]) = score of 

# Given query q[i], the ranking function f_w can generate a score list z[i]
# Let z[i](f_w) = [ f_w(x[i][1]), f_w(x[i][2]), f_w(x[i][3]), ... f_w(x[i][n]) ]

# Probability of documents 1 .. K is:
# P_z[i](f_w) ()

In [None]:
class PermutationGeneratorModel(object):
    """Wrapper for Tensorflow model graph for generating permutations of feature vector lists"""
    
    def __init__(self, num_inputs):
        # create graph
        self._num_inputs = num_imputs
        
    def run_train_step(self, sess, list_batch, judgments, list_lens, loss_weights):
        to_return = [self._train_op, self._summaries, self._loss, self.global_step]
        return sess.run(to_return,
                       feed_dict{self._lists: list_batch,
                                 self._list_lens: list_lens})
    
    def run_eval_step(self, sess, list_batch, judgments, list_lens, loss_weights):
        to_return = [self._summaries, self._loss, self.global_step]
        return sess.run(to_return,
                       feed_dict{self._lists: list_batch,
                                 self._judgments: judgments,
                                 self._list_lens: list_lens,
                                 self._loss_weights: loss_weights})
    
    def run_predict_step(self, sess, list_batch, judgments, list_lens, loss_weights):
        to_return = [self._outputs, self.global_step]
        return sess.run(to_return,
                       feed_dict={self._lists: list_batch,
                                 self._judgments: judgments,
                                 self._list_lens: list_lens,
                                 self._loss_weights: loss_weights})    

    def _add_placeholders(self):
        """Inputs which will be fed to the graph"""
        hps = self._hps
        self._lists = tf.placeholder(tf.float32,
                                     [hps.batch_size, hps.max_list_length],
                                     name='lists')
        self._judgments: tf.placeholder(tf.int32,
                                       [hps.batch_size, hps.max_list_length],
                                       name='judgments')
        self._list_lens = tf.placeholder(tf.int32, [hps.batch_size], name='list_lengths')
        self._loss_weights = tf.placeholder(tf.float32, 
                                            [hps.batch_size, hps.max_list_length]
                                            name='loss_weights')
        
    def _linear_layer(shape, inputs):
        initial_weight = tf.truncated_normal(shape, stddev=0.1)
        initial_bias = tf.constant(0.1, shape=shape)
        
        weight = tf.Variable(initial_weight)
        bias = tf.Variable(initial_bias)
        return weight * inputs + bias
        
    def _add_permutation_generator(self):
        hps = self._hps
        
        with tf.variable_scope('generator'):
            lists = self._lists
            judgments = self._judgments
            list_lens = self._list_lens
            loss_weights = self._loss_weights
            
            inputs = lists
            for i in xrange(hps.layers):
                with tf.variable_scope('layer_%d' % i), 
                     tf.device(self._next_device):
                    linear_layer = _linear_layer([hps.batch_size, hps.max_input_length], inputs)
                    outputs = tf.nn.relu(linear_layer)
                    inputs = outputs

In [3]:
"""Information Retrieval metrics
Useful Resources:
http://www.cs.utexas.edu/~mooney/ir-course/slides/Evaluation.ppt
http://www.nii.ac.jp/TechReports/05-014E.pdf
http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
http://hal.archives-ouvertes.fr/docs/00/72/67/60/PDF/07-busa-fekete.pdf
Learning to Rank for Information Retrieval (Tie-Yan Liu)
"""
import numpy as np


def mean_reciprocal_rank(rs):
    """Score is reciprocal of the rank of the first relevant item
    First element is 'rank 1'.  Relevance is binary (nonzero is relevant).
    Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank
    >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.61111111111111105
    >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]])
    >>> mean_reciprocal_rank(rs)
    0.5
    >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.75
    Args:
        rs: Iterator of relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        Mean reciprocal rank
    """
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])


def r_precision(r):
    """Score is precision after all relevant documents have been retrieved
    Relevance is binary (nonzero is relevant).
    >>> r = [0, 0, 1]
    >>> r_precision(r)
    0.33333333333333331
    >>> r = [0, 1, 0]
    >>> r_precision(r)
    0.5
    >>> r = [1, 0, 0]
    >>> r_precision(r)
    1.0
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        R Precision
    """
    r = np.asarray(r) != 0
    z = r.nonzero()[0]
    if not z.size:
        return 0.
    return np.mean(r[:z[-1] + 1])


def precision_at_k(r, k):
    """Score is precision @ k
    Relevance is binary (nonzero is relevant).
    >>> r = [0, 0, 1]
    >>> precision_at_k(r, 1)
    0.0
    >>> precision_at_k(r, 2)
    0.0
    >>> precision_at_k(r, 3)
    0.33333333333333331
    >>> precision_at_k(r, 4)
    Traceback (most recent call last):
        File "<stdin>", line 1, in ?
    ValueError: Relevance score length < k
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        Precision @ k
    Raises:
        ValueError: len(r) must be >= k
    """
    assert k >= 1
    r = np.asarray(r)[:k] != 0
    if r.size != k:
        raise ValueError('Relevance score length < k')
    return np.mean(r)


def average_precision(r):
    """Score is average precision (area under PR curve)
    Relevance is binary (nonzero is relevant).
    >>> r = [1, 1, 0, 1, 0, 1, 0, 0, 0, 1]
    >>> delta_r = 1. / sum(r)
    >>> sum([sum(r[:x + 1]) / (x + 1.) * delta_r for x, y in enumerate(r) if y])
    0.7833333333333333
    >>> average_precision(r)
    0.78333333333333333
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        Average precision
    """
    r = np.asarray(r) != 0
    out = [precision_at_k(r, k + 1) for k in range(r.size) if r[k]]
    if not out:
        return 0.
    return np.mean(out)


def mean_average_precision(rs):
    """Score is mean average precision
    Relevance is binary (nonzero is relevant).
    >>> rs = [[1, 1, 0, 1, 0, 1, 0, 0, 0, 1]]
    >>> mean_average_precision(rs)
    0.78333333333333333
    >>> rs = [[1, 1, 0, 1, 0, 1, 0, 0, 0, 1], [0]]
    >>> mean_average_precision(rs)
    0.39166666666666666
    Args:
        rs: Iterator of relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        Mean average precision
    """
    return np.mean([average_precision(r) for r in rs])


def dcg_at_k(r, k, method=0):
    """Score is discounted cumulative gain (dcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> dcg_at_k(r, 1)
    3.0
    >>> dcg_at_k(r, 1, method=1)
    3.0
    >>> dcg_at_k(r, 2)
    5.0
    >>> dcg_at_k(r, 2, method=1)
    4.2618595071429155
    >>> dcg_at_k(r, 10)
    9.6051177391888114
    >>> dcg_at_k(r, 11)
    9.6051177391888114
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Discounted cumulative gain
    """
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    """Score is normalized discounted cumulative gain (ndcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> ndcg_at_k(r, 1)
    1.0
    >>> r = [2, 1, 2, 0]
    >>> ndcg_at_k(r, 4)
    0.9203032077642922
    >>> ndcg_at_k(r, 4, method=1)
    0.96519546960144276
    >>> ndcg_at_k([0], 1)
    0.0
    >>> ndcg_at_k([1], 2)
    1.0
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Normalized discounted cumulative gain
    """
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max

In [None]:
import tensorflow as tf
from collections import namedtuple
from itertools import izip
from math import log
from tabulate import tabulate
import random

tf.reset_default_graph()

hps = namedtuple('Params',
    'batch_size, max_list_length, vector_size, max_value, d_rnn_dim, d_relu_dim, g_layer2_dim, g_relu_dim'
)

hps.batch_size = 10
hps.max_list_length = 128
hps.d_rnn_dim = 64
hps.d_relu_dim = 8
hps.g_layer2_dim = 100
hps.g_relu_dim = 10

mu, sigma = 0, 1
vectors_train, scores_train, _ = group_into_batches(X_train, y_train, qid_train, hps.batch_size, hps.max_list_length, hps.vector_size)
vectors_devel, scores_devel, _ = group_into_batches(X_devel, y_devel, qid_train, hps.batch_size, hps.max_list_length, hps.vector_size)
hps.max_value = scores_train.max()
hps.vector_size = 136

def clamp_as_probability(input):
    return tf.maximum(tf.minimum(input, .99), .01)

def make_decider(rolled_vectors, rolled_scores):
    _, indices = tf.nn.top_k(rolled_scores, k=hps.max_list_length, sorted=True, name="sort")
    
    rows = []
    for i in xrange(hps.batch_size):
        row = tf.fill([hps.max_list_length, 1], i)
        rows.append(row)
    indices_dim_1 = tf.stack(rows)
    print "Indices", indices
    print "Indices dim 1", indices_dim_1
    
    slice_indices = tf.concat(2, [indices_dim_1, tf.expand_dims(indices, -1)], name="full_indices")
    print "Slice indices", slice_indices
    #print slice_indices
    
    #print rolled_scores
    # Unroll vectors and scores
    sorted_rolled_scores = tf.gather_nd(rolled_scores, slice_indices, 'sorted_scores')
    print "Sorted rolled scores", sorted_rolled_scores
    #print sorted_rolled_scores
    
    print "Rolled vectors", rolled_vectors
    sorted_rolled_vectors = tf.gather_nd(rolled_vectors, slice_indices, 'sorted_vectors')
    print "Sorted rolled vectors", sorted_rolled_vectors
    #print sorted_rolled_vectors

    expanded_sorted_rolled_scores = tf.expand_dims(sorted_rolled_scores, -1)
    print "Expanded sorted rolled vectors", expanded_sorted_rolled_scores
    rolled_input = tf.concat(2, [sorted_rolled_vectors, expanded_sorted_rolled_scores], name='rolled_input')
    print "Rolled input", rolled_input
    unrolled_input = tf.unstack(rolled_input, axis=1)
    
    gru = tf.nn.rnn_cell.GRUCell(hps.d_rnn_dim)
    outputs, state = tf.nn.rnn(gru, unrolled_input, scope="rnn", dtype=tf.float32)
    
    last_rnn_output = outputs[-1]
    
    w1=tf.get_variable("d_w0", [hps.d_rnn_dim, hps.d_relu_dim], initializer=tf.random_uniform_initializer())
    b1=tf.get_variable("d_b0", [hps.d_relu_dim], initializer=tf.constant_initializer(0.1))
    w2=tf.get_variable("d_w1", [hps.d_relu_dim, 1], initializer=tf.random_normal_initializer())
    b2=tf.get_variable("d_b1", [1], initializer=tf.constant_initializer(0.0))
    
    layer2 = tf.nn.relu(tf.batch_matmul(last_rnn_output, w1) + b1)
    layer3 = tf.sigmoid(tf.batch_matmul(layer2, w2) + b2)
    return clamp_as_probability(layer3)

# [BATCH_SIZE] x [LIST_LENGTH] x [VECTOR_SIZE]
with tf.variable_scope("GAN"):
    z_seeds = tf.placeholder(tf.float32,
                            [hps.batch_size, 1],
                            name='z_seed')
    z_rolled_vectors = tf.placeholder(tf.float32,
                                      [hps.batch_size, hps.max_list_length, hps.vector_size],
                                      name='z_rolled_vectors')
    with tf.variable_scope("generator") as scope:
        w1=tf.get_variable("g_w0", [hps.vector_size + 1, hps.g_relu_dim], initializer=tf.random_uniform_initializer())
        b1=tf.get_variable("g_b0", [hps.g_relu_dim], initializer=tf.constant_initializer(0.1))
        w2=tf.get_variable("g_w1", [hps.g_relu_dim, hps.g_layer_2_dim], initializer=tf.random_normal_initializer())
        b2=tf.get_variable("g_b1", [hps.g_layer_2_dim], initializer=tf.constant_initializer(0.1))
        w2=tf.get_variable("g_w2", [hps.g_layer_2_dim, 1], initializer=tf.random_normal_initializer())
        b2=tf.get_variable("g_b2", [1], initializer=tf.constant_initializer(0.1))
    
        # List of [BATCH_SIZE] x [VECTOR_SIZE]
        z_vectors = tf.unstack(z_rolled_vectors, axis=1)
        z_scores = []
        for z_vector in z_vectors:
            #print z_vector
            #print z_seeds
            z_vector_with_seed = tf.concat(1, [z_vector, z_seeds])
            g_layer_1 = tf.nn.relu(tf.batch_matmul(z_vector_with_seed, w1) + b1)
            g_layer_2 = tf.nn.relu(tf.batch_matmul(g_layer_1, w2) + b2)
            z_score = tf.sigmoid(tf.batch_matmul(g_layer_1, w3) + b3)
            z_score = clamp_as_probability(z_score) * max_value
            z_score = tf.squeeze(z_score)
            z_scores.append(z_score)

        z_scores = tf.stack(z_scores, axis=1, name='z_scores')
        theta_g = [v for v in tf.global_variables() if v.name.startswith(scope.name)]

    with tf.variable_scope("decider") as scope:
        x_vectors = tf.placeholder(tf.float32,
                   [hps.batch_size, hps.max_list_length, hps.vector_size],
                   name='x_vectors')
        x_scores = tf.placeholder(tf.float32,
                            [hps.batch_size, hps.max_list_length],
                            name='x_scores')
        
        D1 = make_decider(x_vectors, x_scores)
        
        scope.reuse_variables()
        
        D2 = make_decider(z_rolled_vectors, z_scores)
        
        theta_d = [v for v in tf.global_variables() if v.name.startswith(scope.name)]
       
  
    step_d = tf.Variable(0)
    obj_d = tf.reduce_mean(tf.log(D1) + tf.log(1 - D2))
    opt_d = tf.train.GradientDescentOptimizer(0.01).minimize(1 - obj_d, global_step=step_d, var_list=theta_d)

    step_g = tf.Variable(0)
    obj_g = tf.reduce_mean(tf.log(D2))
    opt_g = tf.train.GradientDescentOptimizer(0.01).minimize(1 - obj_g, global_step=step_g, var_list=theta_g)

    init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
   
    def evaluate_list(estimated_scores, true_scores):
        #print "Estimated", len(estimated_scores), "==", estimated_scores
        #print "True", len(true_scores), "==", true_scores
        rs = [x[1] for x in sorted(izip(estimated_scores, true_scores), key=lambda x: -x[0])]
        #print rs
        ndcg_10  = ndcg_at_k(rs, 10, method=1)
        ndcg_100 = ndcg_at_k(rs, 100, method=1)
        p_10 = precision_at_k(rs, 10)
        m_ap = average_precision(rs)
        return [ndcg_10, ndcg_100, p_10, m_ap]
    
    def evaluate_lists(guess_lists, gold_lists):
        evaluations = []
        for guess, gold in izip(guess_lists, gold_lists):
            evaluations.append(evaluate_list(guess, gold))
        return np.mean(evaluations, axis=0)    
    
    # Algorithm 1, GoodFellow et al. 2014
    for epoch in range(10000):
        d1s = []
        d1_losses = []
        d2s = []
        d2_losses = []
        train_guess_scores = []
        train_gold_scores = []
        start = timer()
        for i in xrange(0, len(vectors_train) - 3, 3):
            _x_vectors = np.asarray(vectors_train[i])
            _x_scores = np.asarray(scores_train[i])
            _z_vectors = np.asarray(vectors_train[i + 1])
            _z_seeds = np.random.normal(mu, sigma, (hps.batch_size, 1))
            d1, d2, o1, _ = sess.run([D1, D2, obj_d, opt_d], {x_vectors: _x_vectors, x_scores: _x_scores, z_rolled_vectors: _z_vectors, z_seeds: _z_seeds})
            d1s.append(np.mean(d1))
            d1_losses.append(np.mean(1 - o1))
            _z_vectors = np.asarray(vectors_train[i + 2])
            _z_seeds = np.random.normal(mu, sigma, (hps.batch_size, 1))
            d2, scores, o2, _ = sess.run([D2, z_scores, obj_g, opt_g], {x_vectors: _x_vectors, x_scores: _x_scores, z_rolled_vectors: _z_vectors, z_seeds: _z_seeds})
            d2s.append(np.mean(d2))
            d2_losses.append(np.mean(1 - o2))
            train_guess_scores.extend(scores)
            train_gold_scores.extend(scores_train[i + 2])
        d1 = np.mean(d1s)
        d2 = np.mean(d2s)
        d1_loss = np.mean(d1_losses)
        d2_loss = np.mean(d2_losses)
        end = timer()
        print "Epoch %d: Time: %s; D1=%6.3f (L1=%6.3f); D2=%6.3f (L2=%6.3f)" % (epoch + 1, end - start, d1, 1 - d1_loss, d2, 1 - d2_loss)
        devel_guess_scores = []
        devel_gold_scores = []
        for i in xrange(len(vectors_devel)):
            _x_vectors = _z_rolled_vectors = np.asarray(vectors_devel[i])
            _x_scores = _z_gold_scores = scores_devel[i]
            _z_seeds = np.random.normal(mu, sigma, (hps.batch_size, 1))
            _z_guess_scores = sess.run([z_scores], {x_vectors: _x_vectors, x_scores: np.asarray(_x_scores), z_rolled_vectors: _z_vectors, z_seeds: _z_seeds})
            devel_guess_scores.extend(_z_guess_scores[0])
            devel_gold_scores.extend(_x_scores)
        train_evals = evaluate_lists(train_guess_scores, train_gold_scores).tolist()
        train_evals.insert(0, 'Train')
        #print "Devel guess scores", devel_guess_scores
        #print "Devel gold scores", devel_gold_scores
        devel_evals = evaluate_lists(devel_guess_scores, devel_gold_scores).tolist()
        devel_evals.insert(0, 'Devel')
        print tabulate([train_evals, devel_evals], headers=['', 'NDCG@10', 'NDCG@100', 'P@10', 'MAP'], tablefmt='fancy_grid')
        
        t = list(zip(vectors_train, scores_train))
        random.shuffle(t)
        vectors_train, scores_train = zip(*t)

Number of batches = 599
Number of batches = 199
Indices Tensor("GAN/decider/sort:1", shape=(10, 128), dtype=int32)
Indices dim 1 Tensor("GAN/decider/stack:0", shape=(10, 128, 1), dtype=int32)
Slice indices Tensor("GAN/decider/full_indices:0", shape=(10, 128, 2), dtype=int32)
Sorted rolled scores Tensor("GAN/decider/sorted_scores:0", shape=(10, 128), dtype=float32)
Rolled vectors Tensor("GAN/decider/x_vectors:0", shape=(10, 128, 136), dtype=float32)
Sorted rolled vectors Tensor("GAN/decider/sorted_vectors:0", shape=(10, 128, 136), dtype=float32)
Expanded sorted rolled vectors Tensor("GAN/decider/ExpandDims_1:0", shape=(10, 128, 1), dtype=float32)
Rolled input Tensor("GAN/decider/rolled_input:0", shape=(10, 128, 137), dtype=float32)
Indices Tensor("GAN/decider/sort_1:1", shape=(10, 128), dtype=int32)
Indices dim 1 Tensor("GAN/decider/stack_1:0", shape=(10, 128, 1), dtype=int32)
Slice indices Tensor("GAN/decider/full_indices_1:0", shape=(10, 128, 2), dtype=int32)
Sorted rolled scores Tens

In [1]:
from sklearn.datasets import load_svmlight_file
from timeit import default_timer as timer
import numpy as np

# Data parsing
def load_letor_data(path):
    # Initially, vectors are flat
    start = timer()
    X_flat, y_flat, qid_flat = load_svmlight_file(path, dtype=np.float32, query_id=True)
    end = timer()
    print "Loaded SVMLight file in %6.3f seconds" % (end - start)
    
    # Flatten to numpy arrays
    start = timer()
    X_flat = X_flat.toarray()
    y_flat
    qid_flat
    end = timer()
    print "Converted to flat numpy arrays in %6.3f seconds" % (end - start)
    
    # We need to group them into buckets, one for each qid
    start = timer()
    qids, index, counts = np.unique(qid_flat, return_index=True, return_counts=True)
    n_qids = len(qids)
    X = []
    y = []
    qid = []
    for i in xrange(n_qids):
        start_index = index[i]
        end_index = index[i] + counts[i]
        assert np.all(qid_flat[start_index:end_index] == qids[i]), "Found QIDs in bucket[%d:%d] with mismatched QIDs = %s (should be %d)" % (start_index, end_index, qid_flat[start_index:end_index], qids[i])
        X.append(X_flat[start_index:end_index])
        y.append(y_flat[start_index:end_index])
        qid.append(qids[i])
    end = timer()
    print "Bucketed SVMLight vectors in %6.3f seconds"  % (end - start)
    return np.array(X), np.array(y), np.array(qid)

X_train, y_train, qid_train = load_letor_data('/shared/aifiles/disk1/travis/data/corpora/letor/letor_4/MSLR-WEB10K/Fold1/train.txt')    
X_devel, y_devel, qid_devel = load_letor_data('/shared/aifiles/disk1/travis/data/corpora/letor/letor_4/MSLR-WEB10K/Fold1/vali.txt')

Loaded SVMLight file in 116.278 seconds
Converted to flat numpy arrays in  1.569 seconds
Bucketed SVMLight vectors in  0.147 seconds
Loaded SVMLight file in 37.663 seconds
Converted to flat numpy arrays in  0.514 seconds
Bucketed SVMLight vectors in  0.044 seconds


In [5]:
# Convert to batches
def group_into_batches(X, y, qid, batch_size, max_len, num_feats):
    X_batched = []
    y_batched = []
    qid_batched = []
    for i in xrange(0, len(X) - batch_size, batch_size):
        X_batch = []
        y_batch = []
        qid_batch = qid[i:i+batch_size]
        for j in xrange(batch_size):
            _X = np.zeros([max_len, num_feats], dtype=np.float32)
            _y = np.zeros([max_len], dtype=np.float32)
            n = min(X[i + j].shape[0], max_len)
            assert X[i + j].shape[0] == y[i + j].shape[0]
            _X[0:n] = X[i + j][0:n]
            _y[0:n] = y[i + j][0:n]
            X_batch.append(_X)
            y_batch.append(_y)
        X_batched.append(X_batch)
        y_batched.append(y_batch)
        qid_batched.append(qid_batch)
    num_batches = len(X_batched)
    assert len(X_batched) == len(y_batched) == len(qid_batched)
    print "Number of batches = %d" % num_batches
    X_batched = np.asarray(X_batched)
    y_batched = np.asarray(y_batched)
    qid_batched = np.asarray(qid_batched)
#    X_batched.resize(num_batches, batch_size, max_len, num_feats)
#    y_batched.resize(num_batches, batch_size, max_len)
    return X_batched, y_batched, qid_batched

In [None]:
def make_soft_qids(qids):

In [None]:
print X_.shape
print y_.shape
print qid_.shape

print "X:", X_.shape
print "1st batch of X", X_[0].shape
print "1st sample of 1st batch of X", X_[0][0].shape
print "1st document in 1st sample of 1st batch of X", X_[0][0][0].shape
print "1st feature in 1st document in 1st sample of 1st batch of X", X_[0][0][0][0]