In [1]:
import pandas as pd
import numpy as np
import bisect

# Import Training, Testing, and Validation Sets

In [2]:
def _read_split(x_path, y_path):
    """Load one data split from disk.

    Reads the feature CSV at ``x_path`` into an ``np.matrix`` and the
    'Life Expectancy' column of the target CSV at ``y_path`` into a plain list.
    Returns the pair ``(features, targets)``.
    """
    features = np.matrix(pd.read_csv(x_path).to_numpy())
    targets = pd.read_csv(y_path)['Life Expectancy'].tolist()
    return features, targets


# Training split: loaded step by step so the row count can be echoed before and
# after each conversion (DataFrame -> matrix, DataFrame -> list).
train_x_frame = pd.read_csv('train_X.csv')
print("ori train ", train_x_frame.shape[0])
poi_train_x = np.matrix(train_x_frame.to_numpy())
print("matrix train ", len(poi_train_x))

train_y_frame = pd.read_csv('train_y.csv')
print("ori train y ", train_y_frame.shape[0])
poi_train_y = train_y_frame['Life Expectancy'].tolist()
print("matrix train y ", len(poi_train_y))

# Test and validation splits: same conversions, no row-count echo.
poi_test_x, poi_test_y = _read_split('test_X.csv', 'test_y.csv')
poi_val_x, poi_val_y = _read_split('val_X.csv', 'val_y.csv')

# Show the first training sample (feature row and its target) as a sanity check.
print(poi_train_x[0])
print(poi_train_y[0])

ori train  1319
matrix train  1319
ori train y  1319
matrix train y  1319
[[2.00900000e+03 0.00000000e+00 2.52077562e-01 2.22222222e-03
  1.14221725e-01 6.87422107e-04 9.79591837e-01 9.89711711e-05
  2.10892236e-01 1.60000000e-03 9.79166667e-01 1.74114916e-01
  9.79381443e-01 0.00000000e+00 1.80413125e-03 1.54066216e-05
  5.50724638e-01 5.36842105e-01 7.75316456e-01 6.47342995e-01]]
0.6736242884250474


# Define Main Arguments

In [3]:
# Number of poisoning points to generate.
poison_ct = 300

# Row counts of the training and test feature sets loaded in the previous cell.
train_ct, test_ct = map(len, (poi_train_x, poi_test_x))

# Fraction of the combined (clean training + poison) set made up of poison points.
total_prop = poison_ct / (train_ct + poison_ct)

print(
    "Posion count: ", poison_ct,
    " Train count: ", train_ct,
    " Test count: ", test_ct,
    " Proportion of poisoning: ", total_prop,
)

Posion count:  300  Train count:  1319  Test count:  165  Proportion of poisoning:  0.1852995676343422


# Inf_flip Function

In [4]:
# Inf-flip style selection of poisoning points.
#
# Reads (from earlier cells): poi_train_x (np.matrix of training features),
# poi_train_y (list of training targets) and poison_ct (number of points to draw).
# Produces: x_pois (poison_ct sampled feature rows) and y_pois (their flipped labels).

# Dedicated, seeded generator so that re-running the notebook selects the same
# poisoning points. Change poison_seed to obtain a different draw.
poison_seed = 0
rng = np.random.default_rng(poison_seed)

# Gram matrix X^T X of the training features.
dot_product = np.dot(poi_train_x.T, poi_train_x)
# Ridge term 0.01 * I, sized to the number of feature columns.
scal_id_matrix = 0.01 * np.eye(poi_train_x.shape[1])
# Inverse of the regularised Gram matrix. np.linalg.inv is used instead of
# `** -1`, because `** -1` is a matrix inverse only for np.matrix operands and
# an element-wise reciprocal for plain ndarrays.
inv_cov = np.linalg.inv(dot_product + scal_id_matrix)
# H = X (X^T X + 0.01 I)^-1 X^T, one row and one column per training sample.
H = poi_train_x @ inv_cov @ poi_train_x.T
# Row sums of H.
# NOTE(review): H and row_sum do not enter the sampling weights below; they are
# kept in case later cells read them. Confirm whether `stats` was meant to use
# row_sum.
row_sum = np.sum(H, axis=1)

train_y_arr = np.array(poi_train_y)
# Weight magnitude per sample: |y - 0.4| + 0.4.
y_uncert = np.abs(train_y_arr - 0.4) + 0.4
# Flipped label: 1 - floor(y + 0.5). For targets in [0, 1] this is 1 where
# y < 0.5 and 0 where y >= 0.5 (presumably the targets are scaled to [0, 1] --
# confirm against the preprocessing step).
y_opp = 1 - np.floor(0.5 + train_y_arr)
print("y_opp",y_opp)

# Sampling weight per training row. Rows whose flipped label is 0 get weight 0,
# so effectively only rows with flipped label 1 can be drawn.
stats = (y_uncert * y_opp).flatten()
print("stats",stats)

# Cumulative weights in a single pass; all_prob[i] == stats[0] + ... + stats[i].
all_prob = np.cumsum(stats)
# Total weight, taken from the last cumulative entry so that the upper bound of
# the uniform draws coincides exactly with the end of the cumulative table.
total_prob = all_prob[-1] if all_prob.size else 0.0
print("total_prob", total_prob)
if not total_prob > 0:
    # With no positive weight every draw would land on index 0, which would
    # silently produce meaningless poisoning points.
    raise ValueError("No training row has a positive sampling weight; cannot select poisoning points.")

# Weighted sampling with replacement: draw uniformly on [0, total_prob) and
# locate each draw in the cumulative table (side="left" mirrors bisect.bisect_left).
draws = rng.uniform(low=0, high=total_prob, size=poison_ct)
poi_idx = np.searchsorted(all_prob, draws, side="left").tolist()

# Selected feature rows and their flipped labels.
x_pois = poi_train_x[poi_idx]
y_pois = [y_opp[i] for i in poi_idx]

print("x_pois len: ", len(x_pois))
print("x_pois col ct:",x_pois.shape[1])
print("y_pois len: ", len(y_pois))

y_opp [0. 0. 0. ... 0. 0. 0.]
stats [0. 0. 0. ... 0. 0. 0.]
total_prob 132.78026565464893
x_pois len:  300
x_pois col ct: 20
y_pois len:  300
