# Feature Processing

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from implementations import *
from helpers import *

In [2]:
# Import full data
# load_csv_data is not defined in this notebook (presumably provided by one of the
# star-imports in the first cell - TODO confirm). Each call is unpacked into three
# values, and sub_sample = False is passed for both the train and the test file.
(yb_t, input_data_t, ids_t) = load_csv_data('data/train.csv', sub_sample = False)
(yb_test, input_data_test, ids_test) = load_csv_data('data/test.csv',sub_sample = False)

# Replace missing data with nan
# clean_data is also defined outside this notebook; the cells below rely on np.isnan
# to find the missing entries in the *_clean feature arrays.
# NOTE(review): yb_t / yb_test (presumably the labels) are passed through clean_data
# as well - verify that this is intended.
input_data_t_clean = clean_data(input_data_t)
yb_t_clean = clean_data(yb_t)
input_data_test_clean = clean_data(input_data_test)
yb_test_clean = clean_data(yb_test)

In [4]:
# Eliminate data points with missing data
def delDataPoints(input_tx_train, input_y_train, input_tx_test, input_y_test):
    """Drop every training sample that contains at least one NaN feature.

    Only the training set is filtered; the test arrays are handed back untouched.

    Args:
        input_tx_train: 2-D array (samples x features) that may contain NaN.
        input_y_train: array indexed along its first axis by the same samples
            as ``input_tx_train``.
        input_tx_test: test features, returned as-is.
        input_y_test: test targets, returned as-is.

    Returns:
        Tuple ``(tx_train, y_train, tx_test, y_test)`` where ``tx_train`` and
        ``y_train`` keep only the rows of the training set without any NaN.
    """
    # Keep the mask as a plain 1-D boolean array. Wrapping it in a list is read by
    # NumPy >= 1.23 as a (1, n_samples) boolean index, which raises IndexError.
    complete_rows = ~np.isnan(input_tx_train).any(axis=1)
    tx_train = input_tx_train[complete_rows]
    y_train = input_y_train[complete_rows]
    return tx_train, y_train, input_tx_test, input_y_test

# Filter the cleaned training set with delDataPoints and report the resulting shapes
# (the recorded run below shows 68114 rows x 30 columns left in tx_train).
# NOTE: tx_train / y_train / tx_test / y_test are assigned again by the delFeatures
# cell further down, so their content depends on which of the two cells ran last.
tx_train, y_train, tx_test, y_test = delDataPoints(input_data_t_clean, yb_t_clean, input_data_test_clean, yb_test_clean)
print("tx_train.shape = ", tx_train.shape)
print("y_train.shape = ", y_train.shape)

tx_train.shape =  (68114, 30)
y_train.shape =  (68114,)


In [6]:
# Eliminate features with missing data for some points (must consider both training and test sets)
def delFeatures(input_tx_train, input_y_train, input_tx_test, input_y_test):
    """Keep only the feature columns that are NaN-free in both data sets.

    The training and test features are stacked row-wise so that a column is
    discarded as soon as a single sample of either set has a NaN in it.
    The boolean column mask (True = kept) is printed as a side effect.

    Args:
        input_tx_train: 2-D training features (samples x features).
        input_y_train: training targets, returned unchanged.
        input_tx_test: 2-D test features with the same number of columns.
        input_y_test: test targets, returned unchanged.

    Returns:
        Tuple ``(tx_train, y_train, tx_test, y_test)`` with the same columns
        removed from both feature arrays.
    """
    stacked = np.concatenate((input_tx_train, input_tx_test), axis=0)
    column_has_nan = np.isnan(stacked).any(axis=0)
    mask = np.ravel(~column_has_nan)
    print(mask)
    return input_tx_train[:, mask], input_y_train, input_tx_test[:, mask], input_y_test

# Apply delFeatures to the cleaned (not row-filtered) arrays: the arguments are the
# *_clean arrays, not the output of the delDataPoints cell. This rebinds tx_train,
# y_train, tx_test and y_test. In the recorded run 19 feature columns are kept.
tx_train, y_train, tx_test, y_test = delFeatures(input_data_t_clean, yb_t_clean, input_data_test_clean, yb_test_clean)
print("tx_train.shape = ", tx_train.shape)
print("tx_test.shape = ", tx_test.shape)

[False  True  True  True False False False  True  True  True  True  True
 False  True  True  True  True  True  True  True  True  True  True False
 False False False False False  True]
tx_train.shape =  (250000, 19)
tx_test.shape =  (568238, 19)


In [15]:
# Replacing missing values by their respective mean for each feature (computing for the TRAINING set only!)
def replaceByMeanPerFeature(input_tx_train, input_y_train, input_tx_test, input_y_test):
    """Fill NaN entries with the per-feature mean of the training set.

    The means are computed on the training features only (ignoring NaN) and are
    used to fill the gaps of both the training and the test features.
    The inputs are NOT modified: the filled arrays are fresh copies, so the cell
    can be re-run and the original cleaned arrays stay available to other cells.

    Args:
        input_tx_train: 2-D training features (samples x features), may hold NaN.
        input_y_train: training targets, returned unchanged.
        input_tx_test: 2-D test features with the same number of columns.
        input_y_test: test targets, returned unchanged.

    Returns:
        Tuple ``(tx_train, y_train, tx_test, y_test)`` where the feature arrays
        are copies of the inputs with every NaN replaced by the training mean of
        its column.

    Note:
        A column that is entirely NaN in the training set has a NaN mean
        (``np.nanmean`` warns about it), so its missing values stay NaN.
    """
    # Work on copies so the caller's arrays keep their NaN markers.
    tx_train = np.array(input_tx_train, copy=True)
    tx_test = np.array(input_tx_test, copy=True)

    # Per-column mean of the training set, skipping NaN entries.
    train_mean = np.nanmean(tx_train, axis=0)

    for features in (tx_train, tx_test):
        # Coordinates of every missing entry; each one gets its column's mean.
        rows, cols = np.nonzero(np.isnan(features))
        features[rows, cols] = train_mean[cols]

    return tx_train, input_y_train, tx_test, input_y_test
       
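# Example call (the function takes the targets as well and returns four values):
# tx_train, y_train, tx_test, y_test = replaceByMeanPerFeature(input_data_t_clean, yb_t_clean, input_data_test_clean, yb_test_clean)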

In [16]:
# test for replaceByMeanPerFeature
A = np.array([[np.nan, 0, 4], [3, np.nan, 4], [2, 1, 4]])
y_A = np.array([1, 1, 1])  # one label per row of A
print("A = ", A)

B = np.array([[np.nan, 2, 3],[4, np.nan, 6]])
y_B = np.array([1, 1])
print("B = ", B)

C, y_C, D, y_D = replaceByMeanPerFeature(A, y_A, B, y_B)
print("A_processed = ", C)
print("B_processed = ", D)

# The NaN-ignoring column means of A are [2.5, 0.5, 4.]; the gaps of BOTH arrays
# must be filled with these training means, and the targets must pass through.
np.testing.assert_allclose(C, [[2.5, 0, 4], [3, 0.5, 4], [2, 1, 4]])
np.testing.assert_allclose(D, [[2.5, 2, 3], [4, 0.5, 6]])
assert y_C is y_A and y_D is y_B

A =  [[ nan   0.   4.]
 [  3.  nan   4.]
 [  2.   1.   4.]]
B =  [[ nan   2.   3.]
 [  4.  nan   6.]]
A_processed =  [[ 2.5  0.   4. ]
 [ 3.   0.5  4. ]
 [ 2.   1.   4. ]]
B_processed =  [[ 2.5  2.   3. ]
 [ 4.   0.5  6. ]]


# Feature augmentation

In [34]:
def build_poly(x, degree):
    """Polynomial expansion of x without the constant term.

    Returns a float array whose columns are x**1, x**2, ..., x**degree, in that
    order. A column of ones is used as the first block of the stack and then
    dropped, so the result has no degree-0 column; with degree=0 the result has
    zero columns.
    """
    ones_column = np.ones((len(x), 1))
    powers = [np.power(x, exponent) for exponent in range(1, degree + 1)]
    stacked = np.column_stack([ones_column] + powers)
    return stacked[:, 1:]

def augment_feat(input_tx_train, input_tx_test, degree):
    """Append polynomial terms of every original feature to both data sets.

    For each of the original columns (in order) the output of
    ``build_poly(column, degree)`` is appended to the right of the features,
    identically for the training and the test array.

    Note: build_poly starts at power 1, so each original column appears a second
    time among the appended columns.

    Returns:
        Tuple ``(tx_train_augmented, tx_test_augmented)``.
    """
    n_features = len(input_tx_train[0])
    if n_features == 0:
        # Nothing to expand: hand the inputs back untouched.
        return input_tx_train, input_tx_test

    train_blocks = [input_tx_train]
    test_blocks = [input_tx_test]
    for col in range(n_features):
        train_blocks.append(build_poly(input_tx_train[:, col], degree))
        test_blocks.append(build_poly(input_tx_test[:, col], degree))

    # Stack once at the end instead of re-allocating the arrays per feature.
    return np.column_stack(train_blocks), np.column_stack(test_blocks)
    

In [37]:
# Smoke test for augment_feat with degree 3 on two identical 3x3 integer arrays.
# In the recorded output every row of C is
#   [1 2 3 | 1 1 1 | 2 4 8 | 3 9 27]   (as floats)
# i.e. the three original columns followed by powers 1..3 of each column, so the
# power-1 term repeats the original feature.
A = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
print("A = ", A)

B = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])

# D (the augmented copy of B) is computed but not displayed.
C, D = augment_feat(A, B, 3)
print(C)

A =  [[1 2 3]
 [1 2 3]
 [1 2 3]]
[[  1.   2.   3.   1.   1.   1.   2.   4.   8.   3.   9.  27.]
 [  1.   2.   3.   1.   1.   1.   2.   4.   8.   3.   9.  27.]
 [  1.   2.   3.   1.   1.   1.   2.   4.   8.   3.   9.  27.]]
