# Install and load required packages

In [40]:
Pkg.add("Dataframes")
Pkg.add("Optim") # For L-BFGS <https://github.com/JuliaOpt/Optim.jl#basic-api-introduction>
#Pkg.add("NLopt")
#Pkg.add("Orchestra")
Pkg.add("ProfileView")

INFO: Nothing to be done
INFO: Nothing to be done
INFO: Nothing to be done


In [4]:
#Pkg.update()

In [42]:
using DataArrays, DataFrames
using Optim
#using NLopt # Nonlinear optimization library http://ab-initio.mit.edu/wiki/index.php/NLopt
using ProfileView

In [45]:
# Set parallel processing cores
nprocs()==CPU_CORES || addprocs(CPU_CORES-1)
nprocs()

8

# Load the "Adult" dataset
The Adult dataset is from [here](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/).
In categorical data, missing data is handled as just another category. This data set does not contain NA-values in the continuous features.

In [7]:
filename_orig_data = "data/adult.data"
filename_orig_test = "data/adult.test"
# NOTE: if the data set is very large, we should save the preprocessed data to avoid preprocessing cost.
#filename_data = "data/processed.adult.data"
#filename_test = "data/processed.adult.test"

# missing_data_marker = "?" # can be a string

df_orig_data = readtable(filename_orig_data);
df_orig_test = readtable(filename_orig_test);

# Not in use, let NA values go through as "?", handled as yet another category
#df_orig_test = readtable(filename_orig_test, nastrings = ["", "NA", "?"]);

# df_orig_test[ 1:3, :age ] # Example: selecting subset of rows (1 to 3), from a certain column

names(df_orig_data)

15-element Array{Symbol,1}:
 :age           
 :workclass     
 :fnlwgt        
 :education     
 :education_num 
 :marital_status
 :occupation    
 :relationship  
 :race          
 :sex           
 :capital_gain  
 :capital_loss  
 :hours_per_week
 :native_country
 :classification

## Parameters for our model
Set $K$ to the number of prototypes. The paper does not discuss how this should be chosen. Here we use the dimension of the original data, when sensitive and classification features are removed.

In [8]:
# Columns to encode with OneHot encoding aka Dummy variables
# Remember to not encode sensitive_column_name and classification_column_name
# Has to be an array
columns_to_encode = [:workclass, :education, :marital_status, :occupation, :relationship, :race, :native_country]

# Sensitive column is "gender" as in the paper
# NOTE: This column must be have only two possible values (be a binary variable)
#       as the method in the paper assumes this ("protected or not protected").
sensitive_column_name = :sex

# NOTE: This column must be have only two possible values (be a binary variable)
#       as the method in the paper assumes this (binary classification).
classification_column_name = :classification

# Size or data matrix. -1 for classification/target column and -1 for sensitive column
K = size(df_orig_data, 2) - 2

13

## Auxiliary helper functions

In [9]:
# Handling the categorical variables, using OneHot encoding
# https://groups.google.com/forum/#!topic/julia-users/7-Vtpi8w4YI
# Would be nice to use DataFrames pool, but couldn't figure out how to get the OneHot-encoded ModelMatrix out nicely.
# http://dataframesjl.readthedocs.org/en/latest/pooling.html
# http://stackoverflow.com/questions/29158626/dummy-variables-in-julia

function getdummy{R}(df::DataFrame, cname::Symbol, ::Type{R})
    darr = df[cname]
    vals = sort(levels(darr))[2:end]
    #namedict = Dict(vals, 1:length(vals))
    namedict = Dict(zip(vals,1:length(vals)))
    arr = zeros(R, length(darr), length(namedict))
    for i=1:length(darr)
        if haskey(namedict, darr[i])
            arr[i, namedict[darr[i]]] = 1
        end
    end
    newdf = convert(DataFrame, arr)
    names!(newdf, [symbol("$(cname)_$k") for k in vals])
    return newdf
end

# Conversion to dummy variables / OneHot encoding
function convertdummy{R}(df::DataFrame, cnames::Array{Symbol}, ::Type{R})
    # consider every variable from cnames as categorical
    # and convert them into set of dummy variables,
    # return new dataframe
    newdf = DataFrame()
    for cname in names(df)
        if !in(cname, cnames)
            newdf[cname] = df[cname]
        else
            dummydf = getdummy(df, cname, R)
            for dummyname in names(dummydf)
                newdf[dummyname] = dummydf[dummyname]
            end
        end
    end
    return newdf
end

# Two parameter version
convertdummy(df::DataFrame, cnames::Array{Symbol}) = convertdummy(df, cnames, Int32)


# A nice Unicode named summation function. Not really necessary, just something Julia can do.
#∑(from::Integer, to::Integer, inner::Function) = sum(inner, colon(from,to))
# Test:
#f(x, k) = x*k
#∑(1, 3, (k) -> f(1, k))

# Partition a matrix two, according to given indices or indicator vector.
# Your matrices need to be column-major, as this is the Julia memory layout.
function partition{T<:Integer}(x::Vector, indices::Vector{T})
    return x[ indices ], x[ setdiff(1:length(x), indices) ]
end
function partition{T<:Integer}(X::Matrix, indices::Vector{T})
    return X[ :, indices ], X[ :, setdiff(1:size(X,2), indices) ]
end
function partition{T<:Integer,U<:Any}(X::SharedArray{U,2}, indices::Vector{T})
    return X[ :, indices ], X[ :, setdiff(1:size(X,2), indices) ]
end
function partition{T<:Bool}(x::Vector, indicator::Vector{T})
    return partition(x, find(indicator))
end
function partition{T<:Bool}(X::Matrix, indicator::Vector{T})
    return partition(X, find(indicator))
end
function partition{T<:Bool,U<:Any}(X::SharedArray{U,2}, indicator::Vector{T})
    return partition(X, find(indicator))
end
# Test:
# @which partition([:first, :second, :third, :fourth], [true, false, true, false])
# @which partition([:first, :second, :third, :fourth], [1,3])

partition (generic function with 6 methods)

## Use OneHot encoding (dummy variables) for categorical features

In [10]:
# Vertically concatenate to get the whole dataset
df_orig_all = vcat(df_orig_data, df_orig_test)
print("Original data ", size(df_orig_data), size(df_orig_test), size(df_orig_all), "\n")
# Do the One-Hot-Encoding / Dummy variables conversion.
df_all = convertdummy(df_orig_all, columns_to_encode)

### Map sensitive and classification/target columns to appropriate types.

# Map Sensitive column "Male"/"Female" to true/false.
# Change this if you change the sensitive column
# (maybe do something that automatically just picks one category to be true and other to be false)
df_all[ sensitive_column_name ] = map(gender -> gender == "Female" ? true : false, df_all[ sensitive_column_name ])
df_all[ sensitive_column_name ] = convert(DataArrays.DataArray{Bool,1}, df_all[ sensitive_column_name ])
# Map classification column values to 0 and 1
df_all[ classification_column_name ] = map(class -> class == ">50K" ? 1 : 0, df_all[ classification_column_name ])
df_all[ classification_column_name ] = convert(DataArrays.DataArray{Integer,1}, df_all[ classification_column_name ])

# Read out the converted data back to data and test sets.
len_data = size(df_orig_data, 1)
len_test = size(df_orig_test, 1)
df_data = df_all[1:len_data, :]
df_test = df_all[len_data+1:len_data+len_test, :]
print("Encoded data ", size(df_data), size(df_test), size(df_all), "\n")

Original data (32561,15)(16281,15)(48842,15)
Encoded data (

# Code for the model

## Notation/Definitions in paper

- $X$ denotes the entire data set of individuals. Each $x \in X$ is a vector of length $D$ where each component of the vector describes some attribute of the person.
- $S$ is a binary random variable representing whether or not a given individual is a member of the protected set; we assume the system has access to this attribute.
- $X_0$ denotes the training set of individuals.
- $X^+ \subset X$, $X_0^+ \subset X_0$ denotes the subset of individuals (from the whole set and the training set respectively) that are members of the protected set (i.e., $S = 1$), and $X^−$ and $X_0^−$ denotes the subsets that are not members of the protected set, i.e., $S = 0$.
- $Z$ is a multinomial random variable, where each of the $K$ values represents one of the intermediate set of ”prototypes”. Associated with each prototype is a vector $\mathbf{v}_k$ in the same space as the individuals $\mathbf{x}$.
- $Y$ is the binary random variable representing the classification decision for an individual, and $f : X \rightarrow Y$ is the desired classification function.
- $d$ is a distance measure on $X$, e.g., simple Euclidean distance: $d(\mathbf{x}_n , \mathbf{v}_k ) = \Vert\mathbf{x}_n − \mathbf{v}_k \Vert_2$.

## Our changes and clarifications

We will differ a bit from the definitions in the paper. The definitions we use are:
- $\mathbf{X}$ denotes the entire data set, a $(N \times D)$ matrix. The rows of the matrix are the feature vectors $\mathbf{x}_n$ representing attributes of an individual. $\mathbf{X}$ contains neither the classification information (target) column, nor the sensitive column.
- $S$ is a binary variable representing whether or not a given individual is a member of the "protected group". For the user of the algorithm, this is a decision that is done before running the algorithm by setting `sensitive_column_name` in the parameters.
- $\mathbf{X}_{train}$ denotes the training set.
- $\mathbf{X}_{test}$ denotes the test set.
- $\mathbf{X}^+$ denotes the subset of individuals that are members of the "protected group" i.e. individuals for whom $S=1$. Similarly $\mathbf{X}^-$ denotes the subset of individuals for whom $S=0$. It's worthwhile to note that for the algorithm it actually doesn't matter if you flip the groups of who is "protected" and who is "non-protected", the result will be the same due to symmetry of statistical parity. So don't get too attached to the terminology.
- Define $\mathbf{X}_{train}^+$, $\mathbf{X}_{train}^-$, $\mathbf{X}_{test}^+$ and $\mathbf{X}_{test}^-$ similarly as above.
- $d$ is a distance measure on $\mathbf{X}$ (e.g. euclidean distance).
- $K$ is the number of prototypes.
- $Z$ is a random integer from the set $\left\{1,\dots,K\right\}$.
- $Y$ is a binary variable representing the classification decision (we consider binary classification only).

Let $Z$ be a random integer from the set $\left\{1,\dots,K\right\}$. Now we can denote the probability that a datapoint $\mathbf{x}$ maps to a particular prototype $k$ with $\mathbb{P}(Z=k \mid \mathbf{x})$ i.e. given a datapoint $\mathbf{x}$, the probability that $Z$, the index of the prototype for that data point, is $k$.

## Definitions in code

From the definitions above, we map some to code and also define additional stuff.

Julia is Column-Majored, so our matrices will be altered accordingly.

- $\mathbf{X}$ is just `𝐗`, but $(D \times N)$ instead of $(N \times D)$.
- $\mathbf{X}_{train}$ is `𝐗train`, $\mathbf{X}_{test}$ is `𝐗test`.
- Grouped versions are `𝐗⁺train`, `𝐗⁻train`, `𝐗⁺test`, and `𝐗⁻test` respectively.
- $S$ is defined as multiple vectors `S_<someset>`, each containing the sensitive column for `<someset>`, e.g. `S_𝐗`.
- $d$ is defined as a lambda function `d` and plain function `de`.
  - The functions implemented here have versions that default to euclidean distance, and versions that accept a user defined distance function.
- $Z$ is replaced by $\mathbf{Z}$, a matrix of probability vectors $\mathbf{z}$.
- The classification information is contained in `𝐲`, `𝐲train`, and `𝐲test`.

Additionally:
- Denote the tuple of prototypes $\mathbf{V} = \left(\mathbf{v}_1,...,\mathbf{v}_K\right)$. Since a single prototype $\mathbf{v}_k$ is a vector of length $D$, $\mathbf{V}$ can be expressed as a ($D \times K$) matrix. This is our optimization variable `𝐕`.
- `A` contains the hyperparameters.

In [11]:
# Sensitive indices for training and test data
S_𝐗train = convert(Array{Bool}, df_data[ sensitive_column_name ])
S_𝐗test = convert(Array{Bool}, df_test[ sensitive_column_name ])

# Classification vectors for training and test data
𝐲train = convert(Array, df_data[ classification_column_name ])
𝐲test = convert(Array, df_test[ classification_column_name ])

# Drop sensitive and classification columns
idxs_left = setdiff(names(df_data), [sensitive_column_name, classification_column_name])
𝐗train = transpose(convert(Matrix{Float64}, df_data[idxs_left]))
𝐗test = transpose(convert(Matrix{Float64}, df_test[idxs_left]))
# 𝐗train = transpose(convert(Matrix, df_data[idxs_left]))
# 𝐗test = transpose(convert(Matrix, df_test[idxs_left]))


# Standardize the features with mean 0 variance 1
# Otherwise exponentiation gets quickly out of hand, e^-800 is already NaN on Float64
# But only standardize non-one-hot-encoded features

# Features that are not one-hot-encoded
non_encoded = setdiff(names(df_orig_data), [sensitive_column_name, classification_column_name])
non_encoded = setdiff(non_encoded, columns_to_encode)
# Find indices of those features
non_encoded_idxs = find(symbol -> in(symbol, non_encoded), names(df_data[idxs_left]))
# Calculate mean and var from training set
train_mean = mean(𝐗train,2)[:]
train_var = var(𝐗train,2)[:]

# # Standardize everything
# 𝐗train = 𝐗train .- train_mean # substraction of vector
# 𝐗train = 𝐗train ./ train_var # division by vector
# # We need to use the same variance and mean for our test set as in the training set,
# # otherwise they would not be comparable.
# 𝐗test = 𝐗test .- train_mean # same mean as in training, on purpose
# 𝐗test = 𝐗test ./ train_var # same variance as in training, on purpose

# Only standardize non-one-hot-encoded features
for n_idx in non_encoded_idxs
    𝐗train[n_idx,:] = 𝐗train[n_idx,:] .- train_mean[n_idx] # substraction
    𝐗train[n_idx,:] = 𝐗train[n_idx,:] ./ train_var[n_idx] # division
    # We need to use the same variance and mean for our test set as in the training set,
    # otherwise they would not be comparable.
    𝐗test[n_idx,:] = 𝐗test[n_idx,:] .- train_mean[n_idx] # same mean as in training, on purpose
    𝐗test[n_idx,:] = 𝐗test[n_idx,:] ./ train_var[n_idx] # same variance as in training, on purpose
end

# Reconstruct full dataset
𝐗 = hcat(𝐗train, 𝐗test)
S_𝐗 = vcat(S_𝐗train, S_𝐗test)
𝐲 = vcat(𝐲train, 𝐲test)

# Dimensions
D = size(𝐗, 1)
N = size(𝐗, 2)
Ntrain = size(𝐗train, 2)
Ntest = size(𝐗test, 2)

### Distance function
# Lambda
d = (𝐚::Vector, 𝐛::Vector) -> vecnorm(𝐚 - 𝐛) # Euclidean distance
# Non-lambda is slightly faster for calculations, but has to be defined
# for all processes with @everywhere
@everywhere de(𝐚::Vector{Float64}, 𝐛::Vector{Float64}) = vecnorm(𝐚 - 𝐛)
# With alphas
#@everywhere dalpha(𝐚::Vector{Float64}, 𝐛::Vector{Float64}, 𝛂::Vector{Float64}) = sum(i -> 𝛂[i]*(𝐚[i]-𝐛[i])^2, 1:D) vecnorm(𝐚 - 𝐛)
@everywhere function dalpha(𝐚::Vector{Float64}, 𝐛::Vector{Float64}, 𝛂::Vector{Float64})
    # Does this function bind us to use K=D? See Eq. 12. Can't think straight right now.
    localD = length(𝛂)
    sum = 0.0
    @inbounds for i in 1:localD  # (Eq. 12)
        sum += 𝛂[i]*((𝐚[i]-𝐛[i])^2)
    end
    return sum
    # Slower alternatives:
    # return 𝛂 ⋅ ((𝐚-𝐛).^2) # (Eq. 12)
    # return sum(i -> 𝛂[i]*(𝐚[i]-𝐛[i])^2, 1:localD) # (Eq. 12)
end
@everywhere function dalpha_test(𝐚::Vector{Float64}, 𝐛::Vector{Float64}, 𝛂::Vector{Float64})
    # For testing purposes, ignores alphas and returns vector norm
    return vecnorm(𝐚 - 𝐛)
end

### Optimization variables
# Main optimization variable, matrix holding the prototype vectors.
# Initialized to random Float64 matrix, normal distribution 0 mean 1 variance
𝐕 = randn(D, K)
# # Weights or "prototype label predictions" (probabilities)
𝐰 = rand(K) # floats in [0,1)
# # Alphas
𝛂⁺ = rand(D)
𝛂⁻ = rand(D)

### Hyperparameters
A = Dict(:z=> 1000, :x=> 0.0001, :y=> 0.1)

print("Done.")

32561,101)(16281,101)(48842,101)
Done.

In [12]:
maximum(𝐗train), minimum(𝐗train), maximum(𝐗test), minimum(𝐗test), maximum(𝐕), minimum(𝐕)

(1.0,-1.3719338843612912,1.0,-1.3719338843612912,3.2262304410946294,-3.336553142375075)

Divide training and test sets to groups according to whether the individuals are "protected" or not.

In [13]:
(𝐗⁺train, 𝐗⁻train) = partition(𝐗train, S_𝐗train)
(𝐗⁺test, 𝐗⁻test) = partition(𝐗test, S_𝐗test)
"Training:",size(𝐗⁺train), size(𝐗⁻train), "Test:", size(𝐗⁺test), size(𝐗⁻test)

("Training:",(99,10771),(99,21790),"Test:",(99,5421),(99,10860))

## Mapping $\mathbf{X} \rightarrow \mathbf{Z}$
Now we can define a mapping from the original dataset $\mathbf{X}$ to probabilities via the *softmax function*. [Wikipedia](https://en.wikipedia.org/wiki/Softmax_function):
> Softmax function "squashes" a $K$-dimensional vector $\mathbf{z}$ of arbitrary real values to a $K$-dimensional vector $\sigma(\mathbf{z})$ of real values in the range $[0, 1]$ that add up to 1.

Most notably `softmax` returns a probability vector. We will define a modified version that maps a $D$-dimensional vector $\mathbf{x}$ to a $K$-dimensional vector $\sigma(\mathbf{x})$, i.e. the mapping won't necessarily preserve the dimensionality of $\mathbf{x}$.

Also from [Wikipedia](https://en.wikipedia.org/wiki/Multinomial_logistic_regression):
> $$\operatorname{softmax}(k,x_1,\ldots,x_n) = \frac{e^{x_k}}{\sum_{i=1}^n e^{x_i}}$$
> is referred to as the [*softmax function*](https://en.wikipedia.org/wiki/Softmax_function).  The reason is that the effect of exponentiating the values $x_1,\ldots,x_n$ is to exaggerate the differences between them.  As a result, $\operatorname{softmax}(k,x_1,\ldots,x_n)$ will return a value close to 0 whenever $x_k$ is significantly less than the maximum of all the values, and will return a value close to 1 when applied to the maximum value, unless it is extremely close to the next-largest value.  Thus, the softmax function can be used to construct a weighted average that behaves as a smooth function (which can be conveniently differentiated, etc.) and which approximates the [indicator function](https://en.wikipedia.org/wiki/Indicator_function).

So we define, as in the paper equation (2), $$\mathbb{P}(Z=k \mid \mathbf{x}) = \frac{e^{-d(\mathbf{x}, \mathbf{v}_k)}}{\sum_{j=1}^K e^{-d(\mathbf{x}, \mathbf{v}_j)}}$$
where
- $\mathbb{P}(Z=k \mid \mathbf{x})$ is described in [definitions](#Definitions)
- $\mathbf{x}$ is the datapoint
- $\mathbf{v}_k$ is a vector associated with the $k$th prototype
- $d$ is a distance measure between $\mathbf{x}$ and $\mathbf{v}_k$ (e.g. the euclidean distance)

This means that since we have replaced $x_k$ in the softmax with the negative distance between $\mathbf{x}$ and prototype $\mathbf{v}_k$, the softmax returns a value close to 0 whenever the distance from $\mathbf{x}$ to the prototype $\mathbf{v}_k$ is significantly higher than $\min_{j\in\{1,\dots,K\}}\, d(\mathbf{x}, \mathbf{v}_i)$, and close to 1 when applied to the minimum value.

"Mapping from X to Z" in the paper means mapping the vector $\mathbf{x}$ to a probability vector $\mathbf{z}$ of length $K$ via the softmax function. These probability vectors are then used directly for training the classifier.

**In code** this means that $\mathbb{P}(Z=k \mid \mathbf{x})$ is represented by a function taking
- the data point $\mathbf{x}$ which is a `Vector` of length $D$
- $(D \times K)$ `Matrix` of prototypes $\mathbf{V}$ containing all $K$ prototypes $\mathbf{v}_k$ i.e. each prototype is a `Vector` (remember, Column-Major order on matrices)
- a distance measure on $\mathbf{X}$

and returning
- a `Vector` $\mathbf{z}$ of length $K$ representing a [probability vector](https://en.wikipedia.org/wiki/Probability_vector), where each value $z_k$ of the probability vector $\mathbf{z}$ tells how probable it is that $\mathbf{x}$ maps to $\mathbf{v}_k$. Since $\mathbf{z}$ is a probability vector, $\sum_{i=k}^K z_k = 1$.

We will name this function `softmax` and define it as follows:

In [59]:
# TODO: clean up duplicate code

### FOR VECTORS; give in 𝐱, get 𝐳

# This is same as Eq (2) in paper.
function softmax_dist{T<:Number}(𝐱::Vector{T}, 𝐕::Matrix{T}, distanceMeasure::Function)
    K = size(𝐕, 2)
    res = Vector{Float64}(K)
    denominator = Float64(0.0)
    # Use one loop to calculate both numerator and denominator
    @inbounds for k in 1:K
        res[k] = exp(- distanceMeasure(𝐱, 𝐕[:,k]) )
        denominator += res[k]
    end
    denom = inv(denominator)
    res .* denom
end

function softmax_euclidean{T<:Number}(𝐱::Vector{T}, 𝐕::Matrix{T})
    K = size(𝐕, 2)
    res = Vector{Float64}(K)
    denominator = Float64(0.0)
    # Use one loop to calculate both numerator and denominator
    @inbounds for k in 1:K
        res[k] = exp(- vecnorm(𝐱 - 𝐕[:,k]) )
        denominator += res[k]
    end
    denom = inv(denominator)
    res .* denom
end

### FOR MATRICES; give in 𝐗, get 𝐙

function softmax_dist{T<:Number}(𝐗::Matrix{T}, 𝐕::Matrix{T}, distanceMeasure::Function)
    N = size(𝐗, 2)
    K = size(𝐕, 2)
    # Preallocate result matrix, no need to zero it
    res = Matrix{Float64}(K, N)
    @inbounds for n in 1:N
        res[:,n] = softmax_dist(𝐗[:,n], 𝐕, distanceMeasure)
    end
    res
end

# Version that accepts alphas for distance function
function softmax_dist_alpha{T<:Number,U<:Number}(𝐗::Matrix{T}, 𝐕::Matrix{T}, distanceMeasure::Function, 𝛂::Vector{U})
    #nprocs()==CPU_CORES || addprocs(CPU_CORES-1)    
    N = size(𝐗, 2)
    K = size(𝐕, 2)
    # Preallocate result matrix, no need to zero it
    res = Matrix{Float64}(K, N)
    @inbounds @simd for n in 1:N
        @inbounds @simd for k in 1:K
            res[k,n] = exp(- distanceMeasure(𝐗[:,n], 𝐕[:,k], 𝛂) )
        end
        res[:,n] = res[:,n] .* inv(sum(res[:,n]))
    end
    res
end

# Parallel version
function softmax_dist_par{T<:Number}(𝐗::Matrix{T}, 𝐕::Matrix{T}, distanceMeasure::Function)
    #nprocs()==CPU_CORES || addprocs(CPU_CORES-1)    
    N = size(𝐗, 2)
    K = size(𝐕, 2)
    # Preallocate result matrix, no need to zero it
    res = SharedArray(Float64, (K, N))
    @sync @parallel for n in 1:N
        for k in 1:K
            res[k,n] = exp(- distanceMeasure(𝐗[:,n], 𝐕[:,k]) )
        end
        res[:,n] = res[:,n] .* inv(sum(res[:,n]))
    end
    res
end

# Parallel version that accepts alphas for distance function
function softmax_dist_alpha_par{T<:Number,U<:Number}(𝐗::Matrix{T}, 𝐕::Matrix{T}, distanceMeasure::Function, 𝛂::Vector{U})
    #nprocs()==CPU_CORES || addprocs(CPU_CORES-1)    
    N = size(𝐗, 2)
    K = size(𝐕, 2)
    # Preallocate result matrix, no need to zero it
    res = SharedArray(Float64, (K, N))
    @sync @parallel for n in 1:N
        for k in 1:K
            res[k,n] = exp(- distanceMeasure(𝐗[:,n], 𝐕[:,k], 𝛂) )
        end
        res[:,n] = res[:,n] .* inv(sum(res[:,n]))
    end
    res
end

function softmax_euclidean{T<:Number}(𝐗::Matrix{T}, 𝐕::Matrix{T})
    N = size(𝐗, 2)
    K = size(𝐕, 2)
    # Preallocate result matrix, no need to zero it
    res = Matrix{Float64}(K, N)
    @inbounds for n in 1:N
        denominator = Float64(0.0)
        # Use one loop to calculate both numerator and denominator
        for k in 1:K
            res[k,n] = exp(- vecnorm(𝐗[:,n] - 𝐕[:,k]) )
            denominator += res[k,n]
        end
        res[:,n] = res[:,n] .* inv(denominator)
    end
    res
end

# Parallel version
function softmax_euclidean_par{T<:Number}(𝐗::Matrix{T}, 𝐕::Matrix{T})
    #nprocs()==CPU_CORES || addprocs(CPU_CORES-1)
    N = size(𝐗, 2)
    K = size(𝐕, 2)
    # Preallocate result matrix, no need to zero it
    res = SharedArray(Float64, (K, N))
    @sync @parallel for n in 1:N
        for k in 1:K
            res[k,n] = exp(- vecnorm(𝐗[:,n] - 𝐕[:,k]) )
        end
        res[:,n] = res[:,n] .* inv(sum(res[:,n]))
    end
    res
end

softmax_euclidean_par (generic function with 1 method)

In [15]:
# For development
𝐙pre = softmax_euclidean_par(𝐗train, 𝐕)
𝐙pre⁺ = softmax_euclidean_par(𝐗⁺train, 𝐕)
𝐙pre⁻ = softmax_euclidean_par(𝐗⁻train, 𝐕);

This approach is akin to using a funky [multinomial logistic regression](https://en.wikipedia.org/wiki/Multinomial_logistic_regression) to "predict the prototype" (category) where a data point $\mathbf{x}$ maps to.
Wikipedia:
> These are all statistical classification problems. They all have in common a dependent variable to be predicted that comes from one of a limited set of items which cannot be meaningfully ordered, as well as a set of independent variables (also known as features, explanators, etc.), which are used to predict the dependent variable. Multinomial logit regression is a particular solution to the classification problem that assumes that a linear combination of the observed features and some problem-specific parameters can be used to determine the probability of each particular outcome of the dependent variable. The best values of the parameters for a given problem are usually determined from some training data.


## Parts of the optimization objective

### $L_z$ &mdash; statistical parity
In code, we will denote $L_z$ with function `Lz`.

In [61]:
### "NAIVE" VERSION FOR UNDERSTANDING THE IMPLEMENTATION

function LzNaive{T<:Number}(𝐗⁺::Matrix{T}, 𝐗⁻::Matrix{T}, 𝐕::Matrix{T}, dist::Function)
    # Operate on matrices and take mean from sample dimension N
    meanp = mean( softmax_dist_par(𝐗⁺, 𝐕, dist), 2 ) # Eq (6)
    meann = mean( softmax_dist_par(𝐗⁻, 𝐕, dist), 2 ) # Similarly for M_k^-
    sum(abs(meanp - meann)) # Eq (7), sum is from k=1 to K
end

### VERSIONS TAKING IN THE DATA SET AND PROTOTYPES
# Optionally a distance measure function can be passed as an argument

function Lz{T<:Number}(𝐗⁺::Matrix{T}, 𝐗⁻::Matrix{T}, 𝐕::Matrix{T})
    return Lz(sdata(softmax_euclidean_par(𝐗⁺, 𝐕)), sdata(softmax_euclidean_par(𝐗⁻, 𝐕)))
end

function Lz{T<:Number}(𝐗⁺::Matrix{T}, 𝐗⁻::Matrix{T}, 𝐕::Matrix{T}, dist::Function)
    return Lz(sdata(softmax_dist_par(𝐗⁺, 𝐕, dist)), sdata(softmax_dist_par(𝐗⁻, 𝐕, dist)))
end

# # TODO: use parallel version of softmax_dist_alpha?
# function Lz{T<:Number,U<:Number}(𝐗⁺::Matrix{T}, 𝐗⁻::Matrix{T}, 𝐕::Matrix{T}, dist::Function, 𝛂::Vector{U})
#     return Lz(softmax_dist_alpha(𝐗⁺, 𝐕, dist, 𝛂), softmax_dist_alpha(𝐗⁻, 𝐕, dist, 𝛂))
# end

### VERSION FOR PRECALCULATED 𝐙⁺ and 𝐙⁻
# Note we have only a version for matrices. This is because during performance
# testing I noticed that
#
#   ZZZp = sdata(𝐙shared⁺)
#   ZZZn = sdata(𝐙shared⁻)
#   Lz(ZZZp, ZZZn)
#
# is faster than
#
#   Lz(𝐙shared⁺, 𝐙shared⁻)
#
# So use the Matrix version always and if necessary lift the matrices out
# of the SharedArray with sdata().
#
# TODO: is there way to make a faster parallel version?

function Lz{T<:Number}(𝐙⁺::Matrix{T}, 𝐙⁻::Matrix{T})
    # Operate on matrices and take mean from sample dimension N
    meanp = mean( 𝐙⁺, 2 )[:] # Eq (6)
    meann = mean( 𝐙⁻, 2 )[:] # Similarly for M_k^-
    sum(abs(meanp - meann)) # Eq (7), sum is from k=1 to K
end

Lz (generic function with 3 methods)

In [17]:
# Test (e.g. somewhere between 0.02 and 0.06, depending on 𝐕 randomization)
LZtrain = Lz(𝐗⁺test, 𝐗⁻test, 𝐕)

0.05724699945304177

### $L_x$ &mdash; information loss
In code, we will denote $L_x$ with function `Lx`.

In [18]:
# Symbols: 𝐗 ⁺ ⁻ ∑ 𝐕 𝐱 𝐲 𝐙 𝐳

Note .* elementwise multiplication of softmax_dist() and V, there is no \cdot in the paper in Eq (9), dot product would return a scalar.

In [62]:
### NAIVE VERSION FOR UNDERSTANDING THE IMPLEMENTATION

function LxNaive{T<:Number,U<:Number}(𝐗::Matrix{T}, 𝐕::Matrix{U}, dist::Function)
    D = size(𝐗, 1)
    N = size(𝐗, 2)
    K = size(𝐕, 2)
    𝐗hat = zeros(Float64, (D,N))
    sum = Float64(0.0)
    for n in 1:N
        𝐳_n = softmax_dist(𝐗[:,n], 𝐕, dist) # prob that x_n maps to protos (Array{13,1})
        for k in 1:K # Eq (9)
            𝐗hat[:,n] = 𝐗hat[:,n] + (𝐳_n[k] * 𝐕[:,k])
        end
        sum += (𝐗[:,n] - 𝐗hat[:,n]) ⋅ (𝐗[:,n] - 𝐗hat[:,n]) # Eq (8)
    end
    return sum, 𝐗hat
end

### VERSIONS TAKING IN THE DATA SET AND PROTOTYPES
# Optionally a distance measure function can be passed as an argument

function Lx{T<:Number,U<:Number}(𝐗::Matrix{T}, 𝐕::Matrix{U})
    return Lx(softmax_euclidean_par(𝐗, 𝐕), 𝐗, 𝐕)
end

function Lx{T<:Number,U<:Number}(𝐗::Matrix{T}, 𝐕::Matrix{U}, dist::Function)
    return Lx(softmax_dist_par(𝐗, 𝐕, dist), 𝐗, 𝐕)
end

### VERSION FOR PRECALCULATED 𝐙

function Lx{T<:Number}(𝐙::SharedArray{T,2}, 𝐗::Matrix{T}, 𝐕::Matrix{T})
    D = size(𝐕, 1)
    K = size(𝐕, 2)
    N = size(𝐙, 2)
    # Keep 𝐙 as SharedArray, will be faster than taking sdata() when fed to the following @parallel loop
    sum = @parallel (+) for n in 1:N # Eq (8)
        𝐱hat_n = zeros(Float64, D)
        for k in 1:K # Eq (9)
            𝐱hat_n = 𝐱hat_n + (𝐙[k,n] * 𝐕[:,k]) # We are constructing a vector of length D
        end
        (𝐗[:,n] - 𝐱hat_n) ⋅ (𝐗[:,n] - 𝐱hat_n) # "simple squared error"
    end
    return sum
end

# TODO parallel?
function Lx{T<:Number}(𝐙::Matrix{T}, 𝐗::Matrix{T}, 𝐕::Matrix{T})
    D = size(𝐕, 1)
    K = size(𝐕, 2)
    N = size(𝐙, 2)
#    # Keep 𝐙 as SharedArray, will be faster than taking sdata() when fed to the following @parallel loop
    sum::Float64 = 0.0
    @inbounds @simd for n in 1:N # Eq (8)
        𝐱hat_n = zeros(Float64, D)
        for k in 1:K # Eq (9)
            𝐱hat_n = 𝐱hat_n + (𝐙[k,n] * 𝐕[:,k]) # We are constructing a vector of length D
        end
        sum += (𝐗[:,n] - 𝐱hat_n) ⋅ (𝐗[:,n] - 𝐱hat_n) # "simple squared error"
    end
    return sum
end

# TODO: test if making V into sharedarray increases performance, probably not since V is usually small

Lx (generic function with 4 methods)

In [20]:
# Test (e.g. 423475.8479283692)
LXtrain = Lx(𝐗train, 𝐕, d)

611049.2235646215

### $L_y$ &mdash; prediction accuracy
In code, we will denote $L_y$ with function `Ly`.

Essentially here we are letting the optimization pick both the prototypes (i.e. feature vectors) and their predictions (i.e. labels), and the predictions don't have to be discrete 0 and 1, but can be from the range $[0,1]$ and thus themselves can be viewed as probabilities. E.g. let's say that for prototype $v_k$ it's prediction $w_k = 0.82$, then "there is a 82% chance prototype $\mathbf{v}_k$ gets label 1"

In [63]:
### NAIVE VERSION TO HELP UNDERSTAND THE IMPLEMENTATION

function LyNaive{T1<:Number,T2<:Number,T3<:Number}(
        𝐗::Matrix{T1}, 𝐕::Matrix{T1}, 𝐲::Vector{T2}, 𝐰::Vector{T3}, dist::Function
    )
    D = size(𝐗, 1)
    N = size(𝐗, 2)
    K = size(𝐕, 2)
    𝐲hat = zeros(Float64, N)
    sum = Float64(0.0)
    # Replace 𝐲hat in Eq (10) with Eq (11), then you get this for loop
    for n in 1:N
        𝐙_n = softmax_dist(𝐗[:,n], 𝐕, dist) # Vector of length K
        for k in 1:K
            𝐲hat[n] = 𝐲hat[n] + (𝐙_n[k] * 𝐰[k])
        end
        # The following line could be replaced with
        # if 𝐲[n] == 1
        #    sum -= log(𝐲hat[n])
        # else # 𝐲[n] == 0
        #    sum -= log(1 - 𝐲hat[n])
        # end
        sum += -𝐲[n] * log(𝐲hat[n])  -  (1 - 𝐲[n]) * log(1 - 𝐲hat[n])
    end
    #return sum, 𝐲hat
    return sum
end

### VERSIONS TAKING IN THE DATA SET AND PROTOTYPES
# Optionally a distance measure function can be passed as an argument

function Ly{T1<:Number,T2<:Number,T3<:Number}(
        𝐗::Matrix{T1}, 𝐕::Matrix{T1}, 𝐲::Vector{T2}, 𝐰::Vector{T3}
    )
    # 𝐙 = softmax_euclidean_par(𝐗, 𝐕)
    return Ly(softmax_euclidean_par(𝐗, 𝐕), 𝐲, 𝐰)
end

function Ly{T1<:Number,T2<:Number,T3<:Number}(
        𝐗::Matrix{T1}, 𝐕::Matrix{T1}, 𝐲::Vector{T2}, 𝐰::Vector{T3}, dist::Function
    )
    # 𝐙 = softmax_dist_par(𝐗, 𝐕, dist)
    return Ly(softmax_dist_par(𝐗, 𝐕, dist), 𝐲, 𝐰)
end

### VERSION FOR PRECALCULATED 𝐙

# function Ly{T1<:Number,T2<:Number,T3<:Number}(
#         𝐙::Matrix{T1}, 𝐲::Vector{T2}, 𝐰::Vector{T3}
#     )
#     # Copy to shared memory
#     𝐙shared = convert(SharedArray{T1, 2}, 𝐙)
#     return Ly(𝐙shared, 𝐲, 𝐰)
# end

function Ly{T1<:Number,T2<:Number,T3<:Number}(
        𝐙::SharedArray{T1,2}, 𝐲::Vector{T2}, 𝐰::Vector{T3}
    )
    N = size(𝐙, 2)
    # Keep 𝐙 as SharedArray, will be faster than taking sdata() when fed to the following @parallel loop
    sum = @parallel (+) for n in 1:N # Eq (10)
        yhat_n = 𝐙[:,n] ⋅ 𝐰 # Eq (11)
        - 𝐲[n] * log(yhat_n) - (1 - 𝐲[n]) * log(1 - yhat_n)
    end
    return sum
end

function Ly{T1<:Number,T2<:Number,T3<:Number}(𝐙::Matrix{T1}, 𝐲::Vector{T2}, 𝐰::Vector{T3})
    N = size(𝐙, 2)
#     # Keep 𝐙 as SharedArray, will be faster than taking sdata() when fed to the following @parallel loop
    sum::Float64 = 0.0
    @inbounds @simd for n in 1:N # Eq (10)
        yhat_n = 𝐙[:,n] ⋅ 𝐰 # Eq (11)
        sum += - 𝐲[n] * log(yhat_n) - (1 - 𝐲[n]) * log(1 - yhat_n)
    end
    return sum
end

Ly (generic function with 4 methods)

In [22]:
# Test (e.g. 20572.03833866735)
LYtrain = Ly(𝐗train, 𝐕, 𝐲train, 𝐰)

20547.574072108637

## Optimization objective function

In [64]:
# Overall objective function
objective_euclidean(𝐗, 𝐗⁺, 𝐗⁻, 𝐕, 𝐲, 𝐰, A) = A[:z]*Lz(𝐗⁺, 𝐗⁻, 𝐕) + A[:x]*Lx(𝐗, 𝐕) + A[:y]*Ly(𝐗, 𝐕, 𝐲, 𝐰)
objective_dist(𝐗, 𝐗⁺, 𝐗⁻, 𝐕, 𝐲, 𝐰, A, dist) = A[:z]*Lz(𝐗⁺, 𝐗⁻, 𝐕, dist) + A[:x]*Lx(𝐗, 𝐕, dist) + A[:y]*Ly(𝐗, 𝐕, 𝐲, 𝐰, dist)
function objective_pre(𝐗::Matrix, S::Vector{Bool}, 𝐕::Matrix, 𝐲::Vector, 𝐰::Vector, A::Dict)
    # Calculate 𝐙 and partitions once
    𝐙 = softmax_euclidean_par(𝐗, 𝐕)
    (𝐙⁺, 𝐙⁻) = partition(𝐙, S)
    # Use functions that accept precalculated 𝐙
    return A[:z]*Lz(𝐙⁺, 𝐙⁻) + A[:x]*Lx(𝐙, 𝐗, 𝐕) + A[:y]*Ly(𝐙, 𝐲, 𝐰)
end
function objective_pre_alphadist(𝐗::Matrix, S::Vector{Bool}, 𝐕::Matrix, 𝐲::Vector, 𝐰::Vector, A::Dict, 𝛂⁺::Vector, 𝛂⁻::Vector)
    # Calculate 𝐙 and partitions
    (𝐗⁺, 𝐗⁻) = partition(𝐗, S)
    (𝐲⁺, 𝐲⁻) = partition(𝐲, S)
    𝐙⁺ = softmax_dist_alpha_par(𝐗⁺, 𝐕, dalpha, 𝛂⁺) # Can put dalpha_test to make sure result is same as objective_pre
    𝐙⁻ = softmax_dist_alpha_par(𝐗⁻, 𝐕, dalpha, 𝛂⁻) # Can put dalpha_test to make sure result is same as objective_pre
    # Use functions that accept precalculated 𝐙
    return A[:z]*Lz(sdata(𝐙⁺), sdata(𝐙⁻)) + A[:x]*Lx(𝐙⁺, 𝐗⁺, 𝐕) + A[:x]*Lx(𝐙⁻, 𝐗⁻, 𝐕) + A[:y]*Ly(𝐙⁺, 𝐲⁺, 𝐰) + A[:y]*Ly(𝐙⁻, 𝐲⁻, 𝐰)
end
function objective_pre_alphadist_nonpar(𝐗::Matrix, S::Vector{Bool}, 𝐕::Matrix, 𝐲::Vector, 𝐰::Vector, A::Dict, 𝛂⁺::Vector, 𝛂⁻::Vector)
    # Calculate 𝐙 and partitions
    (𝐗⁺, 𝐗⁻) = partition(𝐗, S)
    (𝐲⁺, 𝐲⁻) = partition(𝐲, S)
    𝐙⁺ = softmax_dist_alpha(𝐗⁺, 𝐕, dalpha, 𝛂⁺) # Can put dalpha_test to make sure result is same as objective_pre
    𝐙⁻ = softmax_dist_alpha(𝐗⁻, 𝐕, dalpha, 𝛂⁻) # Can put dalpha_test to make sure result is same as objective_pre
    # Use functions that accept precalculated 𝐙
    return A[:z]*Lz(𝐙⁺, 𝐙⁻) + A[:x]*Lx(𝐙⁺, 𝐗⁺, 𝐕) + A[:x]*Lx(𝐙⁻, 𝐗⁻, 𝐕) + A[:y]*Ly(𝐙⁺, 𝐲⁺, 𝐰) + A[:y]*Ly(𝐙⁻, 𝐲⁻, 𝐰)
end

objective_pre_alphadist_nonpar (generic function with 1 method)

In [24]:
objective_euclidean(𝐗train, 𝐗⁺train, 𝐗⁻train, 𝐕, 𝐲train, 𝐰, A)

2173.422892653416

In [25]:
objective_pre(𝐗train, S_𝐗train, 𝐕, 𝐲train, 𝐰, A)

2173.422892653416

In [26]:
objective_pre_alphadist(𝐗train, S_𝐗train, 𝐕, 𝐲train, 𝐰, A, 𝛂⁺, 𝛂⁻)

3067.9043622279164

In [65]:
objective_pre_alphadist_nonpar(𝐗train, S_𝐗train, 𝐕, 𝐲train, 𝐰, A, 𝛂⁺, 𝛂⁻)

3067.904362227916

In [67]:
function fun_to_profile()
    objective_pre_alphadist_nonpar(𝐗train, S_𝐗train, 𝐕, 𝐲train, 𝐰, A, 𝛂⁺, 𝛂⁻)
end

function fun_to_profile_loop()
   for i in 1:50
        fun_to_profile()
    end
end

# Profiling
profiling_logfile = "profile.bin"
function benchmark()
    # Any setup code goes here.

    # Run once, to force compilation.
    println("======================= First run:")
    srand(666)
    @time fun_to_profile()

    # Run a second time, with profiling.
    println("\n\n======================= Second run:")
    srand(666)
    Profile.init(delay=0.01)
    Profile.clear()
    Profile.clear_malloc_data()
    @profile @time fun_to_profile_loop()

    # Write profile results to profile.bin.
    r = Profile.retrieve()
    f = open(profiling_logfile, "w")
    serialize(f, r)
    close(f)
end

function show_profiling()
    f = open(profiling_logfile)
    r = deserialize(f);
    ProfileView.view(r[1], lidict=r[2])
end

show_profiling (generic function with 1 method)

In [None]:
benchmark()

  

In [None]:
#show_profiling()

In [66]:
# @time for i in 1:10
#     objective_euclidean(𝐗train, 𝐗⁺train, 𝐗⁻train, 𝐕, 𝐲train, 𝐰, A)
# end
# @time for i in 1:10
#     objective_pre(𝐗train, S_𝐗train, 𝐕, 𝐲train, 𝐰, A)
# end
# @time for i in 1:10
#     objective_pre_alphadist(𝐗train, S_𝐗train, 𝐕, 𝐲train, 𝐰, A, 𝛂⁺, 𝛂⁻)
# end
# @time for i in 1:10
#     objective_pre_alphadist_nonpar(𝐗train, S_𝐗train, 𝐕, 𝐲train, 𝐰, A, 𝛂⁺, 𝛂⁻)
# end
#   7.533158 seconds (5.70 M allocations: 301.744 MB, 1.88% gc time)
#   4.280320 seconds (5.09 M allocations: 301.643 MB, 1.56% gc time)
#   4.707826 seconds (5.64 M allocations: 577.336 MB, 2.26% gc time)
#   7.008553 seconds (74.51 M allocations: 20.287 GB, 33.03% gc time)

  7.008553 seconds (74.51 M allocations: 20.287 GB, 33.03% gc time)


In [None]:
objective_euclidean(𝐗test, 𝐗⁺test, 𝐗⁻test, 𝐕, 𝐲test, 𝐰, A)

## Test optimization run

In [None]:
count = 0 # keep track of # function evaluations
#function obj_func(x::Vector, grad::Vector)
function obj_func(x::Vector)
    # Increase count
    global count
    count::Int += 1
    
    # Print progress
    if count % 50 == 0
        @printf("Round %d\n", count)
    end
    
    # Get 𝐕, 𝐰, 𝛂⁺, 𝛂⁻
    loc𝐕 = reshape(x[1:length(𝐕)], size(𝐕)) # Uses the global 𝐕 for size
    cursor = length(𝐕)
    loc𝛂⁺ = x[cursor+1:cursor+length(𝛂⁺)]
    cursor += length(𝛂⁺)
    loc𝛂⁻ = x[cursor+1:cursor+length(𝛂⁻)]
    cursor += length(𝛂⁻)
    loc𝐰 = x[cursor+1:cursor+length(𝐰)]
    #return size(loc𝐕), size(loc𝛂⁺), size(loc𝛂⁻), size(loc𝐰)
    return objective_pre_alphadist(𝐗train, S_𝐗train, loc𝐕, 𝐲train, loc𝐰, A, loc𝛂⁺, loc𝛂⁻)
#     if length(grad) > 0:
#         ...set grad to gradient, in-place...
#     return ...value of f(x)...
end

In [None]:
?reshape

In [None]:
size(𝐕),size(𝐰'),size(𝛂⁺),size(𝛂⁻)

In [None]:
optVarInit = vcat(reshape(𝐕, length(𝐕)), reshape(𝛂⁺, length(𝛂⁺)), reshape(𝛂⁻, length(𝛂⁻)), reshape(𝐰, length(𝐰)));

In [None]:
bounds_lower = vcat(fill(-Inf, length(𝐕)), zeros(length(𝛂⁺) + length(𝛂⁻) + length(𝐰)));
bounds_upper = vcat(fill(+Inf, length(𝐕)), ones(length(𝛂⁺) + length(𝛂⁻) + length(𝐰)));
length(optVarInit), length(bounds_lower), length(bounds_upper)

In [None]:
# # Call L-BFGS
res = optimize(obj_func,
    optVarInit,
    method = :l_bfgs,
    xtol = 1e-4,
    grtol = 1e-12,
    iterations = 15000,
    store_trace = true,
    show_trace = false)

# TODO: Can we use upper and lower limits for 𝐕? Will it speed up the optimizer?
#       The prototypes need to lie inside the smallest hypercube that contains all original datapoints, yes?

# d1 = DifferentiableFunction(obj_func)
# # Note that d1 above will use central finite differencing to approximate the gradient.

# @elapsed res = fminbox(d1,
#     optVarInit,
#     bounds_lower,
#     bounds_upper,
#     method = :l_bfgs,
#     xtol = 1e-4,
#     grtol = 1e-12,
#     iterations = 15000,
#     store_trace = true,
#     show_trace = false)
# #fminbox(d4, x0, l, u)

In [None]:
#opt = Opt(:LD_MMA, 2)
opt = Opt(:LN_SBPLX, length(optVarInit)) # Use a derivative-free optimization algorithm (instead of L-BFGS)

# Lower and upper bounds for alphas and 𝐰
lower_bounds!(opt, bounds_lower)
upper_bounds!(opt, bounds_upper)

# Tolerance
xtol_rel!(opt,1e-4)

# Stop when the number of function evaluations exceeds the second argument.
#(0 or negative for no limit.)
maxeval!(opt, 15000)

# Stop when the optimization time (in seconds) exceeds the second argument.
#(0 or negative for no limit.)
maxtime!(opt, 60*2)

# Minimize
min_objective!(opt, obj_func)

(minf,minx,ret) = optimize(opt, optVarInit)
println("got $minf at $minx after $count iterations (returned $ret)")

# Optimization, running the algorithm
Hyperparameters for the objective function.
In the paper they use grid search to find the parameters. The sets defined here are the same as in the paper.

In [None]:
# Sets of hyperparameters as in paper, for grid search
gridA = Dict(:z => Set([0.1, 0.5, 1.0, 5.0, 10.0]), :x => Set([0, 0.01]), :y => Set([0.1, 0.5, 1.0, 5.0, 10.0]))
# An example of selected hyperparameters, for development
A = Dict(:z => 0.01, :x => 0.5, :y => 1.0)

# TODO
- Overall process with pictures

# Problems/Cons/Notes:
- Let's say that there is a column/feature "Religion" in the dataset.
- Now this paper says we can only say that "Is a member of protected group" or "Is not a member of protected group".
- You have to decide what is the "protected" group, and what is the "normal/non protected" group. You have to decide based on some external criteria who are discriminated against and who are not.
- Let's say we have a dataset with a feature "Religion" and we have 5 different religions represented.
- Now we have to choose which ones are protected and which ones are not.
- The problem of course is that some might be in general discriminated against more than others. There is not necessary even split between the different groups that are discriminated against.

- What we would like to say is that "Religion" is a sensitive feature, and we should not infer _anything_ from it, regardless what it is.

- Does running the algo multiple times help, changing the binary classification each time? Can we extend it so that $S \in {1,...,C}$ where $C$ is the number of categories in the sensitive column.
  - We can extend, just split $L_z$ to multiple cases and the optimization is done to all of them. There will be $c = \frac{(C-1)C}{2} \approx O(C^2)$ pairs. Whether this is computationally still feasible is another question. In the objective function $L_z$ is replaced by $A_{z_1} \cdot L_{z_1} + A_{z_2} \cdot L_{z_2} + \dots + A_{z_c} \cdot L_{z_c}$.

- On the current case where $S \in \left\{0,1\right\}$ once we have set for which rows $S=1$ and $S=0$, we can flip them around without changing anything. This is because we are using statistical parity. This means that from the algorithm's perspective saying that group0 is non-protected and groups 1..4 are protected is the same as saying group1 is protected and other non-protected.