In [1]:
# Show the kernel's current working directory (IPython line magic).
# NOTE(review): presumably a check that the notebook is run from the repository
# root so the `cdlsurvey` imports below resolve -- confirm, and consider clearing
# the saved output since it records an absolute local path.
%pwd

'/Users/ryandevera/data-science/umn_environments/Constrained-Deep-Learning-Survey'

In [2]:
# stdlib
import math
import random

# third party
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow.compat.v1 as tf
import tensorflow_constrained_optimization as tfco
import warnings

# first party
from cdlsurvey.data import get_data
from cdlsurvey.models import Model
from cdlsurvey.utils import training_helper

# Disable eager execution on the tensorflow.compat.v1 API imported above as `tf`.
tf.disable_eager_execution()

# Install a blanket 'ignore' filter so Python warnings are not shown.
# NOTE(review): this also hides deprecation warnings from every library --
# confirm that is intended rather than filtering a specific category.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

2024-01-18 14:02:45.760884: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Columns identifying the protected groups; handed to
# Model(protected_columns=...) in the training cells below.
PROTECTED_COLUMNS = [
    'gender_Female',
    'gender_Male',
    'race_White',
    'race_Black',
]

In [4]:
# Column names listed as categorical.
CATEGORICAL_COLUMNS = [
    'workclass', 'education', 'marital_status', 'occupation', 'relationship',
    'race', 'gender', 'native_country'
]
# Column names listed as continuous.
CONTINUOUS_COLUMNS = [
    'age', 'capital_gain', 'capital_loss', 'hours_per_week', 'education_num'
]
# Complete column list: every categorical and continuous column above, plus
# 'fnlwgt' and 'income_bracket', which appear in neither of those lists.
# NOTE(review): presumably the raw column order of the source dataset -- confirm.
COLUMNS = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'gender',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'income_bracket'
]
# Name of the label column read from train_df / test_df and passed to Model.
LABEL_COLUMN = 'label'

# PROTECTED_COLUMNS is defined once, in the preceding cell; the identical
# re-definition that used to sit here was a duplicate and has been removed.

In [5]:
# Gather the data: get_data() (from cdlsurvey.data) returns three values,
# unpacked here as the train data, the test data, and FEATURE_NAMES (later
# passed to Model(feature_names=...)).
# NOTE(review): called with no arguments, so the data source and preprocessing
# are decided inside cdlsurvey.data -- see that module for details.
train_df, test_df, FEATURE_NAMES = get_data()

In [6]:
# Make sure there are positive labels in both splits: fail fast if either
# split has none, then display the (train, test) counts as before.
positive_label_counts = (train_df['label'].sum(), test_df['label'].sum())
assert all(count > 0 for count in positive_label_counts), (
    "expected at least one positive label in both train_df and test_df"
)
positive_label_counts

(7841, 3846)

In [7]:
# def training_generator(model,
#                        train_df,
#                        test_df,
#                        minibatch_size,
#                        num_iterations_per_loop=1,
#                        num_loops=1):
#     random.seed(31337)
#     num_rows = train_df.shape[0]
#     minibatch_size = min(minibatch_size, num_rows)
#     permutation = list(range(train_df.shape[0]))
#     random.shuffle(permutation)

#     session = tf.Session()
#     session.run((tf.global_variables_initializer(),
#                tf.local_variables_initializer()))

#     minibatch_start_index = 0
#     for n in range(num_loops):
#         for _ in range(num_iterations_per_loop):
#             minibatch_indices = []
#             while len(minibatch_indices) < minibatch_size:
#                 minibatch_end_index = (
#                 minibatch_start_index + minibatch_size - len(minibatch_indices))
#                 if minibatch_end_index >= num_rows:
#                     minibatch_indices += range(minibatch_start_index, num_rows)
#                     minibatch_start_index = 0
#                 else:
#                     minibatch_indices += range(minibatch_start_index, minibatch_end_index)
#                     minibatch_start_index = minibatch_end_index
                    
#             session.run(
#                   model.train_op,
#                   feed_dict=model.feed_dict_helper(
#                       train_df.iloc[[permutation[ii] for ii in minibatch_indices]]))

#         train_predictions = session.run(
#             model.predictions_tensor,
#             feed_dict=model.feed_dict_helper(train_df))
#         test_predictions = session.run(
#             model.predictions_tensor,
#             feed_dict=model.feed_dict_helper(test_df))

#         yield (train_predictions, test_predictions)

In [8]:
# def error_rate(predictions, labels):
#     signed_labels = (
#       (labels > 0).astype(np.float32) - (labels <= 0).astype(np.float32))
#     numerator = (np.multiply(signed_labels.values, predictions.values) <= 0).sum()
#     denominator = predictions.shape[0]
#     return float(numerator) / float(denominator)


# def positive_prediction_rate(predictions, subset):
#     numerator = np.multiply((predictions > 0).astype(np.float32),
#                           (subset > 0).astype(np.float32)).sum()
#     denominator = (subset > 0).sum()
#     return float(numerator) / float(denominator)

# def tpr(df):
#     """Measure the true positive rate."""
#     fp = sum((df['predictions'] >= 0.0) & (df[LABEL_COLUMN] > 0.5))
#     ln = sum(df[LABEL_COLUMN] > 0.5)
#     return float(fp) / float(ln)

# def _get_error_rate_and_constraints(df, tpr_max_diff):
#     """Computes the error and fairness violations."""
#     error_rate_local = error_rate(df[['predictions']], df[[LABEL_COLUMN]])
#     overall_tpr = tpr(df)
#     return error_rate_local, [(overall_tpr - tpr_max_diff) - tpr(df[df[protected_attribute] > 0.5]) for protected_attribute in PROTECTED_COLUMNS]

# def _get_exp_error_rate_constraints(cand_dist, error_rates_vector, constraints_matrix):
#     """Computes the expected error and fairness violations on a randomized solution."""
#     expected_error_rate = np.dot(cand_dist, error_rates_vector)
#     expected_constraints = np.matmul(cand_dist, constraints_matrix)
#     return expected_error_rate, expected_constraints

# def training_helper(model,
#                     train_df,
#                     test_df,
#                     minibatch_size,
#                     num_iterations_per_loop=1,
#                     num_loops=1):
#     train_error_rate_vector = []
#     train_constraints_matrix = []
#     test_error_rate_vector = []
#     test_constraints_matrix = []
#     for train, test in training_generator(
#       model, train_df, test_df, minibatch_size, num_iterations_per_loop,
#       num_loops):
#         train_df['predictions'] = train
#         test_df['predictions'] = test

#         train_error_rate, train_constraints = _get_error_rate_and_constraints(
#           train_df, model.tpr_max_diff)
#         train_error_rate_vector.append(train_error_rate)
#         train_constraints_matrix.append(train_constraints)

#         test_error_rate, test_constraints = _get_error_rate_and_constraints(
#             test_df, model.tpr_max_diff)
#         test_error_rate_vector.append(test_error_rate)
#         test_constraints_matrix.append(test_constraints)

#     return (train_error_rate_vector, train_constraints_matrix, test_error_rate_vector, test_constraints_matrix)

In [9]:
# Unconstrained baseline: the train op is built with unconstrained=True.
# NOTE(review): in the saved output this cell fails with
# "AttributeError: 'tuple' object has no attribute 'values'". The debugger
# session in the next cell shows the failure inside error_rate() in
# cdlsurvey/metrics.py, where `signed_labels` is a 1-tuple wrapping a
# DataFrame -- the fix belongs in that module, not in this notebook.
model = Model(
    label_column=LABEL_COLUMN,
    feature_names=FEATURE_NAMES,
    protected_columns=PROTECTED_COLUMNS,
    tpr_max_diff=0.05,
)
model.build_train_op(0.01, unconstrained=True)

# training_helper returns the list of errors and violations over each epoch;
# keep the 4-tuple, then unpack it into the names used by the cells below.
unconstrained_history = training_helper(
    model, train_df, test_df, 100,
    num_iterations_per_loop=326,
    num_loops=40,
)
train_errors, train_violations, test_errors, test_violations = unconstrained_history

AttributeError: 'tuple' object has no attribute 'values'

In [10]:
# Post-mortem debugger for the AttributeError raised by the previous cell.
# NOTE(review): this magic is interactive and will stall a "Restart & Run All";
# remove this cell once the failure in cdlsurvey/metrics.py is fixed.
%debug

> [0;32m/Users/ryandevera/data-science/umn_environments/Constrained-Deep-Learning-Survey/cdlsurvey/metrics.py[0m(19)[0;36merror_rate[0;34m()[0m
[0;32m     17 [0;31m[0;34m[0m[0m
[0m[0;32m     18 [0;31m    [0;31m# Assign the numerator[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 19 [0;31m    [0mnumerator[0m [0;34m=[0m [0;34m([0m[0mnp[0m[0;34m.[0m[0mmultiply[0m[0;34m([0m[0msigned_labels[0m[0;34m.[0m[0mvalues[0m[0;34m,[0m [0mpredictions[0m[0;34m.[0m[0mvalues[0m[0;34m)[0m [0;34m<=[0m [0;36m0[0m[0;34m)[0m[0;34m.[0m[0msum[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     20 [0;31m[0;34m[0m[0m
[0m[0;32m     21 [0;31m    [0;31m# Assign the denominator[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  signed_labels


(       predictions
0             -1.0
1              1.0
2             -1.0
3             -1.0
4              1.0
...            ...
32556         -1.0
32557         -1.0
32558         -1.0
32559         -1.0
32560          1.0

[32561 rows x 1 columns],)


ipdb>  type(signed_labels)


<class 'tuple'>


ipdb>  exit


In [None]:
# Report the last recorded epoch of the preceding training run: the final
# error, and the largest entry of the final violations row, train then test.
unconstrained_train_error = train_errors[-1]
print("Train Error", unconstrained_train_error)
unconstrained_train_violation = max(train_violations[-1])
print("Train Violation", unconstrained_train_violation)
print()
unconstrained_test_error = test_errors[-1]
print("Test Error", unconstrained_test_error)
unconstrained_test_violation = max(test_violations[-1])
print("Test Violation", unconstrained_test_violation)

In [None]:
# Constrained run: same Model arguments as the unconstrained cell above, but
# the train op is built with unconstrained=False.
# NOTE: this rebinds `model` and the four history lists, replacing the results
# of the unconstrained run.
model = Model(
    label_column=LABEL_COLUMN,
    feature_names=FEATURE_NAMES,
    protected_columns=PROTECTED_COLUMNS,
    tpr_max_diff=0.05,
)
model.build_train_op(0.01, unconstrained=False)

# training_helper returns the list of errors and violations over each epoch;
# keep the 4-tuple, then unpack it into the names used by the cells below.
constrained_history = training_helper(
    model, train_df, test_df, 100,
    num_iterations_per_loop=326,
    num_loops=40,
)
train_errors, train_violations, test_errors, test_violations = constrained_history

In [None]:
# Report the last recorded epoch of the constrained run: the final error, and
# the largest entry of the final violations row, train then test.
constrained_train_error = train_errors[-1]
print("Train Error", constrained_train_error)
constrained_train_violation = max(train_violations[-1])
print("Train Violation", constrained_train_violation)
print()
constrained_test_error = test_errors[-1]
print("Test Error", constrained_test_error)
constrained_test_violation = max(test_violations[-1])
print("Test Violation", constrained_test_violation)

In [None]:
# Report metrics averaged over all recorded epochs: the mean error, and the
# largest of the per-column mean violations (mean taken along axis 0).
mean_train_error = np.mean(train_errors)
print("Train Error", mean_train_error)
mean_train_violations = np.mean(train_violations, axis=0)
print("Train Violation", max(mean_train_violations))
print()
mean_test_error = np.mean(test_errors)
print("Test Error", mean_test_error)
mean_test_violations = np.mean(test_violations, axis=0)
print("Test Violation", max(mean_test_violations))

In [None]:
# Ask TFCO for a distribution over the recorded epochs, given their train
# errors and train constraint violations, then print it. The next cell uses
# `cand_dist` as weights over the per-epoch errors and violations.
# NOTE(review): assumes one entry per epoch in both arguments, in matching
# order -- confirm against training_helper's return value.
cand_dist = tfco.find_best_candidate_distribution(train_errors, train_violations)
print(cand_dist)

In [None]:
def _get_exp_error_rate_constraints(cand_dist, error_rates_vector, constraints_matrix):
    """Computes the expected error and fairness violations on a randomized solution.

    Defined here because the notebook neither imports this helper nor defines
    it in a live cell (its only other definition is commented out), so calling
    it raised NameError on a fresh kernel.

    Args:
        cand_dist: weights over the candidate solutions, one per epoch.
        error_rates_vector: error rate of each candidate, same length as
            `cand_dist`.
        constraints_matrix: constraint violations of each candidate, one row
            per candidate.

    Returns:
        (expected_error_rate, expected_constraints): the dot product of
        `cand_dist` with the error rates, and the matrix product of
        `cand_dist` with the constraints matrix.
    """
    expected_error_rate = np.dot(cand_dist, error_rates_vector)
    expected_constraints = np.matmul(cand_dist, constraints_matrix)
    return expected_error_rate, expected_constraints


# Expected metrics of the stochastic model that draws an epoch's solution
# according to cand_dist, evaluated on train and on test.
m_stoch_error_train, m_stoch_violations_train = _get_exp_error_rate_constraints(cand_dist, train_errors, train_violations)
m_stoch_error_test, m_stoch_violations_test = _get_exp_error_rate_constraints(cand_dist, test_errors, test_violations)

print("Train Error", m_stoch_error_train)
print("Train Violation", max(m_stoch_violations_train))
print()
print("Test Error", m_stoch_error_test)
print("Test Violation", max(m_stoch_violations_test))