In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from utils import *

ModuleNotFoundError: No module named 'pyreadstat'

## Datasets for Classification

Details for each dataset are provided in utils/data_loader.py

Packages required to load USA_literacy:
sas7bdat

In [2]:
dir_data = '../data/clss'
names_data = ['India_distress', 'USA_literacy',  'USA_kidney', 'India_HIV', 'Zambia_perception_mc1', 
              'Angola_maternal', 'Congo_fever', 'Sjogren_ModelA', 'Sjogren_ModelB', 'USA_obesity', 
              'SouthAmerica_tuberculosis', 'Infection', 'Qatar_antibodies', 'Maternal_deaths']
for name_data in names_data:
    X, y, names_covariates = load_clss_data(name_data, dir_data)
    print(name_data, X.shape)

[33m[USA_literacy_numeracy.sas7bdat] column count mismatch[0m


India_distress (40286, 26)
USA_literacy (4661, 40)
USA_kidney (42464, 28)
India_HIV (9666, 17)
Zambia_perception_mc1 (1082, 16)
Angola_maternal (10010, 9)
Congo_fever (956, 11)
Sjogren_ModelA (92016, 3)
Sjogren_ModelB (92016, 4)
USA_obesity (13160, 9)
SouthAmerica_tuberculosis (478, 14)
Infection (288, 15)
Qatar_antibodies (1894, 10)
Maternal_deaths (9258, 12)


## Datasets for Regression

Details for each dataset are provided in utils/data_loader.py

In [5]:
dir_data = '../data/regr'

names_data = ['Dutch_drinking_inh', 'Dutch_drinking_wm', 'Dutch_drinking_sha', 'Brazil_health_heart', 
              'Brazil_health_stroke', 'Korea_grip', 'China_glucose_women2', 'China_glucose_men2', 
              'Spain_Hair', 'China_HIV']

for name_data in names_data:
    X, y, names_covariates = load_regr_data(name_data, dir_data)
    y = y.astype(np.float)

    B = np.column_stack([np.min(X, axis = 0), np.max(X, axis = 0)])
    vol = box_intersection(B, B)

    print(name_data, X.shape, vol)

Dutch_drinking_inh (12121, 16)
Dutch_drinking_wm (12131, 16)
Dutch_drinking_sha (12098, 16)
Brazil_health_heart (7728, 6)
Brazil_health_stroke (9675, 6)
Korea_grip (1022, 11)
China_glucose_women2 (4568, 11)
China_glucose_men2 (4360, 11)
Spain_Hair (529, 5)
China_HIV (2410, 27)


In [15]:
X, y, names_covariates = load_regr_data('Dutch_drinking_wm', dir_data)

In [16]:
print(names_covariates)
print(X[0,:])
print(y[0])

['Imputation__1.0' 'Imputation__2.0' 'Imputation__3.0' 'Imputation__4.0'
 'Imputation__5.0' 't1wm' 'sex' 't1age' 't1ses' 't1mat_alcohol'
 't1pat_alcohol' 't1ysr_del' 't3year_cannabis' 't4year_cannabis'
 't3daily_smoking' 't4month_smoking']
[ 0.          0.          0.          0.          0.          2.03194643
  1.02428287 -0.33311002  1.26949067 -1.41801865 -1.78799688 -0.53252065
 -0.61682165 -0.6857144  -0.42757434 -0.84370994]
1.4743969524630376


In [46]:
X, y, names_covariates = load_regr_data('Brazil_health_stroke', dir_data)

In [21]:
names_covariates

['Year', 'ESFProportion', 'ACSProportion', 'Population', 'GDP', 'DHI Value']

In [22]:
np.ptp(X, axis=0)

array([ 3.24037035,  2.42753081,  2.41139135, 26.18918286, 19.91949403,
        7.18713784])

In [29]:
n = len(y)

beta = np.linalg.solve(X.T @ X, X.T @ y)
res  = np.abs(X @ beta - y)
core_ind = np.argsort(res)[:int(n / 10)]

core_X = X[core_ind]
core_y = y[core_ind]

In [30]:
T = 10
for i in range(T):
    new_beta = np.linalg.solve(core_X.T @ core_X, core_X.T @ core_y)
    if np.linalg.norm(new_beta - beta) < 1e-3:
        print(f'Found stable core ({i} iterations)')
        break

    else:
        res      = np.abs(X @ new_beta - y)
        new_core_ind = np.argsort(res)[:int(n / 10)]
        core_X = X[core_ind]
        core_y = y[core_ind]
        beta = new_beta

Found stable core (1 iterations)


In [43]:
def proj(x, lb, ub):
    p = x.copy()

    for i in range(len(x)):
        if x[i] < lb[i]:
            p[i] = lb[i]
        elif x[i] > ub[i]:
            p[i] = ub[i]

    return p


def in_box(x, lb, ub):
    if (x == proj(x, lb, ub)).all():
        return 1
    else:
        return 0


def fit(X, Y):
    n = len(Y)
    X_copy = X.copy()
    # X_copy = np.append(X_copy, np.ones((n, 1)), axis = 1)

    XTX = X_copy.T @ X_copy
    beta_hat = np.linalg.solve(XTX, X_copy.T @ Y)
    min_eig = min(np.linalg.eigvals(XTX)) / len(X_copy)
    print(beta_hat)
    return beta_hat, min_eig


def cutoff(std, min_eig, n_core, n_full, alpha, x):
    d = len(x)
    return std * (np.linalg.norm(x) * np.sqrt(d * np.log(4 * d / alpha) / n_core) / min_eig + np.sqrt(2 * np.log(4 * n_full / alpha)))


def res(x, y, beta):
    # return abs(y - np.dot(beta[:-1], x) - beta[-1])
    return abs(y - np.dot(x, beta))



In [41]:
def directed_infty_norm(x, S):
    best = 0
    for j in range(len(x)):
        if S[j] != set():
            best = max(best, max([x[j] * s for s in S[j]]))
    return best


def largest_box_heuristic(X, B):
    # B is a d x 2 array containing the maximum allowed box.
    # B must contain the origin and the origin must be contained in the final selected region.
    # Any valid point x must have B[i,0] <= x[i] <= B[i,1].
    # All of the points in X should be within this box.
    n, d = X.shape
    X2 = X.copy()
    B2 = B.copy()
    S = [set([-1,1]) for j in range(d)]
    # print(S)

    while X2.any():
        directed_infty_norms = [directed_infty_norm(x, S) for x in X2]
        i = np.argmin(directed_infty_norms) # i = point which supports the new side
        j = list(np.abs(X2[i]) == directed_infty_norms[i]).index(True) # j = dimension which is being supported
        sign = int(np.sign(X2[i, j]))

        S[j].remove(sign)
        B2[j, int((sign + 1)/2)] = X2[i, j]
        # print(X2[i,j])
        # print(B2[j, int((sign + 1)/2)])

        X2 = X2[[k for k in range(len(X2)) if sign * X2[k, j] < directed_infty_norms[i]]]

    return B2

In [47]:
core_beta, min_eig = fit(core_X, core_y)

s_hat = np.sqrt(np.sum((core_X @ core_beta - core_y) ** 2) / (len(core_y) - len(X[0]) - 1))
alpha = 0.05
B = np.column_stack([np.min(X, axis = 0), np.max(X, axis = 0)])

excluded = np.zeros(n)
for k in range(n):
    if res(X[k], y[k], core_beta) > cutoff(s_hat, min_eig, int(n / 10), n, alpha, x):
        excluded[k] = 1

approx_region = largest_box_heuristic(X[excluded == 1], B)

[-2.89423907  0.15342818  1.48291035 -8.463658   -2.58521654 -2.86654597]
(967, 6)
(6,)


In [48]:
print(B)
print(approx_region)

[[-1.62018517  1.62018517]
 [-1.03130083  1.39622999]
 [-1.17729285  1.2340985 ]
 [-0.13821648 26.05096638]
 [-0.87601977 19.04347426]
 [-3.16185809  4.02527975]]
[[-1.62018517  1.62018517]
 [-1.03130083  1.39622999]
 [-1.17729285  1.2340985 ]
 [-0.13821648 22.66954056]
 [-0.87601977 19.04347426]
 [-3.16185809  4.02527975]]
