In [1]:
import torch
import sys
import pandas as pd
import urllib.request

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import math
# import tqdm
import torch
import gpytorch
import pandas as pd
from matplotlib import pyplot as plt
import random
import numpy as np
import urllib.request
import os
from math import floor

# Width of each input row fed to the first linear layer of FCN below.
# NOTE(review): presumably the number of feature columns left in the data once
# 'Unnamed: 0' and 'income' are dropped — confirm it is still 103 for this CSV.
data_dim = 103

class FCN(torch.nn.Sequential):
    """Small fully connected feature extractor: Linear -> ReLU -> Linear.

    Args:
        input_dim: Number of input features. ``None`` (the default) falls back
            to the module-level ``data_dim``, so ``FCN()`` behaves as before.
        hidden_dim: Width of the hidden layer (default 50).
        output_dim: Number of extracted features (default 2).
    """

    def __init__(self, input_dim=None, hidden_dim=50, output_dim=2):
        super().__init__()
        if input_dim is None:
            # Looked up at call time so the notebook-level constant still applies.
            input_dim = data_dim
        self.add_module('linear1', torch.nn.Linear(input_dim, hidden_dim))
        self.add_module('relu1', torch.nn.ReLU())
        self.add_module('linear2', torch.nn.Linear(hidden_dim, output_dim))

# Module-level instance of the feature-extraction network.
feature_extractor = FCN()

class GPRegressionModel(gpytorch.models.ExactGP):
    """Exact GP regression whose kernel operates on learned 2-D features.

    Inputs are passed through an ``FCN`` network, rescaled to [-1, 1], and then
    fed to a constant mean and a grid-interpolated, scaled RBF kernel.

    Args:
        train_x: Training inputs handed to ``ExactGP``.
        train_y: Training targets handed to ``ExactGP``.
        likelihood: Likelihood handed to ``ExactGP``.
    """

    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.GridInterpolationKernel(
            gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel(ard_num_dims=2)),
            num_dims=2, grid_size=100
        )
        # Each model owns its network. Sharing one module-level instance meant
        # that training a second model silently rewrote the weights the first
        # model's kernel hyperparameters had been fitted against.
        self.feature_extractor = FCN()

        # This module will scale the NN features so that they're nice values
        self.scale_to_bounds = gpytorch.utils.grid.ScaleToBounds(-1., 1.)

    def forward(self, x):
        """Return the GP prior ``MultivariateNormal`` over the extracted features of ``x``."""
        # Deep-net projection first, then squash into the kernel's grid bounds.
        projected_x = self.feature_extractor(x)
        projected_x = self.scale_to_bounds(projected_x)

        mean_x = self.mean_module(projected_x)
        covar_x = self.covar_module(projected_x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)


def train(train_x, train_y, num_device=0, training_iterations=60, lr=0.01):
    """Fit a ``GPRegressionModel`` with Adam on the exact marginal log likelihood.

    Args:
        train_x: Training inputs (floating-point tensor).
        train_y: Training targets; cast to ``train_x``'s dtype so integer class
            labels can be passed directly.
        num_device: Currently unused; training always runs on the CPU. Kept so
            existing calls keep working.
        training_iterations: Number of optimizer steps (default 60).
        lr: Adam learning rate (default 0.01).

    Returns:
        ``(model, likelihood)``, both switched to eval mode.
    """
    device = torch.device('cpu')

    # Move/cast once up front instead of on every iteration.
    train_x = train_x.to(device)
    train_y = train_y.to(device=device, dtype=train_x.dtype)

    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = GPRegressionModel(train_x, train_y, likelihood)

    model.to(device)
    likelihood.to(device)

    optimizer = torch.optim.Adam([
        {'params': model.feature_extractor.parameters()},
        {'params': model.covar_module.parameters()},
        {'params': model.mean_module.parameters()},
        {'params': model.likelihood.parameters()},
    ], lr=lr)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    model.train()
    likelihood.train()
    for _ in range(training_iterations):
        # Zero backprop gradients
        optimizer.zero_grad()
        # Get output from model
        output = model(train_x)

        # Calc loss and backprop derivatives
        loss = -mll(output, train_y)
        loss.backward()
        optimizer.step()

    # Switch to prediction mode once, after optimisation has finished.
    model.eval()
    likelihood.eval()

    return model, likelihood


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd

# Fetch the dataset CSV and discard its 'Unnamed: 0' column.
link = 'https://drive.google.com/uc?export=download&id=1BSMYXhJbHqLtwnkapdXvcQD_qid9gdNF'
df_base = pd.read_csv(link).drop(columns=['Unnamed: 0'])

In [4]:
# Save a local copy of the loaded frame. to_csv writes the row index by default,
# so the file gains a leading unnamed column.
# NOTE(review): the relative directory ../csv_files is assumed to exist — confirm.
df_base.to_csv("../csv_files/data1.csv")

In [6]:
# Hold out 30% of the rows, then fit a logistic-regression classifier for 'income'.
data_train, data_test = train_test_split(df_base, test_size=0.3, random_state=0)

Y_train = data_train['income']
Y_test = data_test['income']
data_train = data_train.drop(columns=['income'])
data_test = data_test.drop(columns=['income'])

clf = LogisticRegression(random_state=0)
clf.fit(data_train, Y_train)
# Accuracy on the held-out split.
clf.score(data_test, Y_test)

0.8410112773641926

In [7]:
# Download the dataset again from the same URL as the first load, replacing
# df_base with an unmodified frame.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# df_base may still be the earlier, already-trimmed frame — confirm.
link = 'https://drive.google.com/uc?export=download&id=1BSMYXhJbHqLtwnkapdXvcQD_qid9gdNF'
df_base = pd.read_csv(link)

KeyboardInterrupt: 

In [None]:
# Keep only the model inputs, then split the rows by the 'gender_Female' indicator.
# The drop is non-mutating and tolerant of missing columns, so this cell can be
# re-run (or run against a frame that already lost 'Unnamed: 0') without a KeyError.
df_base = df_base.drop(columns=['Unnamed: 0', 'income'], errors='ignore')
df_base_pos = df_base[df_base['gender_Female'] > 0]
df_base_neg = df_base[df_base['gender_Female'] <= 0]

In [8]:
# Renumber both group frames from 0 so row labels equal row positions.
df_base_neg, df_base_pos = (
    frame.reset_index(drop=True) for frame in (df_base_neg, df_base_pos)
)

In [9]:
# Module-level handle for the CPU device.
device = torch.device("cpu")

In [10]:
# Working copies of each group; rows are removed from these as they get queried.
temp_pos, temp_neg = df_base_pos.copy(), df_base_neg.copy()

In [9]:
##### Cold start: random sampling #####
# Seed each group's labelled pool with 1000 random rows and remove those rows
# from the candidate pool.
pos_rows = temp_pos.sample(1000)
pos_data, pos_indices = pos_rows.values, pos_rows.index
temp_pos = temp_pos.drop(index=pos_indices).reset_index(drop=True)
pos_queried = [pos_data]

neg_rows = temp_neg.sample(1000)
neg_data, neg_indices = neg_rows.values, neg_rows.index
temp_neg = temp_neg.drop(index=neg_indices).reset_index(drop=True)
neg_queried = [neg_data]

# Label the DataFrame rows rather than the raw arrays: this matches how the
# query loop calls clf.predict and keeps the column names the classifier was
# fitted with.
pos_labels = [clf.predict(pos_rows)]
neg_labels = [clf.predict(neg_rows)]



In [10]:
#### training the GP ####
# One GP per group, fitted on everything queried so far for that group.
_pos_inputs = np.concatenate(pos_queried, axis=0).astype(np.float32)
_pos_targets = np.concatenate(pos_labels, axis=0)
model_pos, likelihood_pos = train(
    torch.tensor(_pos_inputs), torch.tensor(_pos_targets).flatten(), num_device=0
)

_neg_inputs = np.concatenate(neg_queried, axis=0).astype(np.float32)
_neg_targets = np.concatenate(neg_labels, axis=0)
model_neg, likelihood_neg = train(
    torch.tensor(_neg_inputs), torch.tensor(_neg_targets).flatten(), num_device=0
)

In [11]:
# Ten query rounds: per group, score 1000 random candidates by predictive
# variance and move the 100 most uncertain ones into the labelled pool.
# NOTE(review): model_pos/model_neg are not refit inside this loop, so every
# round scores candidates with the models trained before it — confirm intended.
for _ in range(10):
    #### sampling new points for comparison ####
    pos_rows = temp_pos.sample(1000)
    pos_data = pos_rows.values
    neg_rows = temp_neg.sample(1000)
    neg_data = neg_rows.values

    #### calculating variance as score ####
    # Inference only: no_grad avoids building an autograd graph for each batch.
    with torch.no_grad():
        var_pos = likelihood_pos(model_pos(torch.tensor(pos_data.astype(np.float32)))).variance
        var_neg = likelihood_neg(model_neg(torch.tensor(neg_data.astype(np.float32)))).variance

    # Positions, within the sampled batch, of the 100 highest-variance rows.
    # Converted to NumPy so pandas is indexed with a plain integer array
    # instead of a torch tensor.
    top_pos = var_pos.sort(descending=True).indices[:100].numpy()
    top_neg = var_neg.sort(descending=True).indices[:100].numpy()

    selected_pos = pos_rows.iloc[top_pos]
    selected_neg = neg_rows.iloc[top_neg]

    # Queried rows leave the candidate pool; renumber so labels stay positional.
    temp_pos = temp_pos.drop(index=selected_pos.index).reset_index(drop=True)
    temp_neg = temp_neg.drop(index=selected_neg.index).reset_index(drop=True)

    pos_queried.append(selected_pos.values)
    pos_labels.append(clf.predict(selected_pos))
    neg_queried.append(selected_neg.values)
    neg_labels.append(clf.predict(selected_neg))

In [18]:
# Flatten the per-round label arrays into one array per group.
y_pos, y_neg = (np.concatenate(chunks) for chunks in (pos_labels, neg_labels))

In [19]:
# Squared gap between the two groups' mean predicted label on the queried rows.
estimated = np.square(y_pos.mean() - y_neg.mean())

In [20]:
# Classifier predictions for every row of each group in the full frame.
y_pos_real = clf.predict(df_base.loc[df_base['gender_Female'] > 0])
y_neg_real = clf.predict(df_base.loc[df_base['gender_Female'] <= 0])

In [21]:
# Squared gap between the two groups' mean predicted label on the full frame.
real = np.square(y_pos_real.mean() - y_neg_real.mean())

In [22]:
# Full-data value first, then the estimate from the queried rows.
print(real, estimated, sep='\n')

0.03471984052886321
0.030450249999999995


In [47]:
# Total number of labelled rows across both groups.
len(y_pos) + len(y_neg)

4000

In [45]:
# Baseline: the same squared gap measured on 4000 randomly sampled rows.
random_samples = df_base.sample(4000)
y_pos_random = clf.predict(random_samples.loc[random_samples['gender_Female'] > 0])
y_neg_random = clf.predict(random_samples.loc[random_samples['gender_Female'] <= 0])
parity_random = np.square(y_neg_random.mean() - y_pos_random.mean())

In [46]:
# Display the squared group gap measured on the random sample.
parity_random

0.03413215421321091