In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
import tensorflow as tf
from typing import NamedTuple, List, Tuple, Sequence,Mapping

import numpy as np

In [60]:
def to_tensors(data:Sequence[Mapping[str,float]], targets:Mapping[str,float]):
    """
    Turns dictionaries of instance features and target features into numpy arrays.
    """
    vocab = {}
    def to_vector(instance:Mapping[str,float]):
        vector = np.ndarray(len(vocab))
        for key, value in instance.items():
            vector[vocab[key]] = value
        return vector
        
    for instance in data:
        for key,value in instance.items():
            if key not in vocab:
                vocab[key] = len(vocab)
    data_vectors = []
    for instance in data:
        data_vectors.append(to_vector(instance))
        
    data_matrix = np.stack(data_vectors)
    
    target_vector = to_vector(targets)

    print(data_matrix)
    
    return data_matrix, target_vector


[[ 1.  1.]
 [ 0.  1.]]


In [82]:
def estimate_correction_weights(data:Sequence[Mapping[str,float]], targets:Mapping[str,float], 
                                reg_lambda = 0.0, debug=False):
    """
    Calculates a sequence of instance weights such that the total sum of their features
    equals/is close to the target vector.
    Args:
        data: list of feature vectors in sparse dictionary format
        targets: feature vector with target total counts
    Returns:
        `instance_weights`, `total` where instance_weights is a list of weights corresponding
        to the instances in `data` and `total` is the total population vector approximation.
    """
    data_matrix, target_vector = to_tensors(data,targets)
    
    instance_weights = tf.Variable(initial_value=tf.zeros([len(data)]))
    data_placeholder = tf.placeholder(tf.float32, shape=data_matrix.shape)
    target_placeholder = tf.placeholder(tf.float32, shape=target_vector.shape)
    total = tf.einsum("ij,j -> i", data_placeholder, instance_weights)
    target_loss = tf.nn.l2_loss(total - target_placeholder)
    regularizer = tf.nn.l2_loss(instance_weights - 1)
    total_loss = target_loss + reg_lambda * regularizer

    sess = tf.Session()
    optimizer = tf.train.AdamOptimizer(0.1)
    opt_op = optimizer.minimize(total_loss)
    project_nonnegative = tf.assign(instance_weights, tf.maximum(0.0,instance_weights))

    sess.run(tf.global_variables_initializer())
    for i in range(0,100):
        feed_dict = {data_placeholder:data_matrix, 
              target_placeholder:target_vector}
        sess.run(opt_op,feed_dict)
        sess.run(project_nonnegative)
        result = sess.run({'total':total,
              'loss':total_loss, 
              'regularizer':regularizer, 
              'weights':instance_weights,
              'target':target_placeholder},feed_dict)
        if debug:
            print(result['loss'])
            print(result['weights'])
            print(result['total'])
    return result['weights'],result['total']

In [117]:
data = [{"ent_premise_China":1, "ent_hyp_China":1},{"ent_premise_China":1, "ent_hyp_China":0}]
targets = {"ent_premise_China":1, "ent_hyp_China":1}

estimate_correction_weights(data, targets, reg_lambda=0.1, debug=True)

[[ 1.  1.]
 [ 0.  1.]]
0.806
[ 0.09999997  0.09999999]
[ 0.19999996  0.09999999]
0.565831
[ 0.19890024  0.19926944]
[ 0.3981697   0.19926944]
0.379527
[ 0.29551122  0.29710737]
[ 0.59261858  0.29710737]
0.245664
[ 0.38814718  0.39260525]
[ 0.78075242  0.39260525]
0.160732
[ 0.47452283  0.48461699]
[ 0.95913982  0.48461699]
0.118539
[ 0.55177951  0.57176262]
[ 1.12354207  0.57176262]
0.110016
[ 0.61678004  0.65248877]
[ 1.26926875  0.65248877]
0.123891
[ 0.66672748  0.72520626]
[ 1.39193368  0.72520626]
0.148367
[ 0.69988877  0.7884959 ]
[ 1.48838472  0.7884959 ]
0.17318
[ 0.71598929  0.84132487]
[ 1.55731416  0.84132487]
0.191089
[ 0.71605974  0.88319659]
[ 1.59925628  0.88319659]
0.1983
[ 0.70194262  0.91418767]
[ 1.61613035  0.91418767]
0.194053
[ 0.6758008   0.93488008]
[ 1.61068082  0.93488008]
0.179799
[ 0.63980412  0.94623655]
[ 1.58604074  0.94623655]
0.158329
[ 0.59599143  0.94946963]
[ 1.54546106  0.94946963]
0.133022
[ 0.54624265  0.94593447]
[ 1.49217713  0.94593447]
0.10724

(array([ 0.15402147,  0.92377818], dtype=float32),
 array([ 1.07779968,  0.92377818], dtype=float32))

In [78]:
def to_vector(instance:Mapping[str,float],vocab):
    vector = np.ndarray(len(vocab))
    for key, value in instance.items():
        vector[vocab[key]] = value
    return vector

def to_sparse_tensors(data:Sequence[Mapping[str,float]], targets:Mapping[str,float]):
    """
    Turns dictionaries of instance features and target features into numpy arrays.
    """
    vocab = {}

        
    for instance in data:
        for key,value in instance.items():
            if key not in vocab:
                vocab[key] = len(vocab)
                
    data_vectors = []
    data_indices = []
    data_values = []
    for instance_nr,instance in enumerate(data):
        for key, value in instance.items():
            data_indices.append((instance_nr,vocab[key]))
            data_values.append(value)
        
    data_matrix = tf.SparseTensorValue(data_indices, data_values, [len(data),len(vocab)])
    
    target_vector = to_vector(targets,vocab)

    print(data_matrix)
    
    return data_matrix, target_vector

In [127]:
def estimate_correction_weights_sparse(data:Sequence[Mapping[str,float]], 
                                       targets:Mapping[str,float], 
                                       reg_lambda = 0.0, debug=False,
                                       max_iterations=100):
    """
    Calculates a sequence of instance weights such that the total sum of their features
    equals/is close to the target vector.
    Args:
        data: list of feature vectors in sparse dictionary format
        targets: feature vector with target total counts
    Returns:
        `instance_weights`, `total` where instance_weights is a list of weights corresponding
        to the instances in `data` and `total` is the total population vector approximation.
    """
    data_matrix, target_vector = to_sparse_tensors(data,targets)
    
    instance_weights = tf.Variable(initial_value=tf.zeros([len(data),1]))
    data_placeholder = tf.sparse_placeholder(tf.float32)
    target_placeholder = tf.placeholder(tf.float32, shape=target_vector.shape)
    total = tf.sparse_tensor_dense_matmul(data_placeholder,instance_weights,adjoint_a=True)
    target_loss = tf.nn.l2_loss(total - target_placeholder)
    regularizer = tf.nn.l2_loss(instance_weights - 1)
    total_loss = target_loss + reg_lambda * regularizer

    sess = tf.Session()
    optimizer = tf.train.AdamOptimizer(0.1)
    opt_op = optimizer.minimize(total_loss)
    project_nonnegative = tf.assign(instance_weights, tf.maximum(0.0,instance_weights))

    sess.run(tf.global_variables_initializer())
    for i in range(0,max_iterations):
        feed_dict = {data_placeholder:data_matrix, 
              target_placeholder:target_vector}
        sess.run(opt_op,feed_dict)
        sess.run(project_nonnegative)
        result = sess.run({'total':total,
              'loss':total_loss, 
              'regularizer':regularizer, 
              'weights':instance_weights,
              'target':target_placeholder},feed_dict)
        if debug:
            print(result['loss'])
            print(result['weights'])
            print(result['total'])
    return result['weights'][:,0],result['total'][:,0]

In [128]:
data = [{"ent_premise_China":1.0, "ent_hyp_China":1.0},
        {"ent_premise_China":1.0, "ent_hyp_China":1.0}]
targets = {"ent_premise_China":1.0, "ent_hyp_China":1.0}

estimate_correction_weights_sparse(data, targets, reg_lambda=0.0,debug=False)

SparseTensorValue(indices=[(0, 0), (0, 1), (1, 0), (1, 1)], values=[1.0, 1.0, 1.0, 1.0], dense_shape=[2, 2])


(array([ 0.5,  0.5], dtype=float32), array([ 1.,  1.], dtype=float32))