# Feature Processing

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from implementations import *
from helpers import *

In [3]:
# Load the complete (non-subsampled) training and test CSV files.
# Each call yields three values: labels, feature matrix, sample ids.
yb_t, input_data_t, ids_t = load_csv_data('data/train.csv', sub_sample=False)
yb_test, input_data_test, ids_test = load_csv_data('data/test.csv', sub_sample=False)

# Run every loaded array through clean_data so that missing entries are
# represented as NaN for the processing steps below.
# NOTE(review): clean_data is applied to the label vectors as well as the
# feature matrices -- confirm this is intended.
input_data_t_clean = clean_data(input_data_t)
yb_t_clean = clean_data(yb_t)
input_data_test_clean = clean_data(input_data_test)
yb_test_clean = clean_data(yb_test)

In [27]:
# Eliminate data points with missing data
def delDataPoints(input_tx, input_y):
    """Drop every data point (row) whose feature vector contains a NaN.

    Args:
        input_tx: 2-D array of features, one row per data point.
        input_y: array of labels whose first axis is aligned with the rows
            of ``input_tx``.

    Returns:
        A tuple ``(tx, y)`` holding only the rows of ``input_tx`` that are
        free of NaN and the matching entries of ``input_y``. The inputs are
        not modified (boolean indexing returns copies).
    """
    # Use the 1-D boolean mask directly. Wrapping it in a Python list makes
    # recent NumPy versions interpret it as a (1, N) boolean array index,
    # which raises IndexError for both the feature matrix and the labels.
    complete_rows = ~np.isnan(input_tx).any(axis=1)
    tx = input_tx[complete_rows]
    y = input_y[complete_rows]
    return tx, y
    
# Filter the cleaned training features/labels and report how many
# complete data points remain.
tx, y = delDataPoints(input_data_t_clean, yb_t_clean)
print("tx.shape = ", tx.shape)
print("y.shape = ", y.shape)

tx.shape =  (68114, 30)
y.shape =  (68114,)


In [89]:
# Eliminate features with missing data for some points (must consider both training and test sets)
def delFeatures(input_tx_train, input_tx_test):
    """Remove every feature (column) that has a NaN in either data set.

    The training and test matrices are stacked row-wise so that a column is
    kept only if it is complete in both of them. The boolean keep-mask is
    printed as a side effect.

    Args:
        input_tx_train: 2-D training feature matrix.
        input_tx_test: 2-D test feature matrix with the same columns.

    Returns:
        A tuple ``(tx_train, tx_test)`` restricted to the complete columns.
    """
    stacked = np.concatenate((input_tx_train, input_tx_test), axis=0)
    has_missing = np.isnan(stacked).any(axis=0)
    # Flatten so the mask is always a 1-D boolean vector over the columns.
    keep_columns = np.ravel(np.logical_not(has_missing))
    print(keep_columns)
    return input_tx_train[:, keep_columns], input_tx_test[:, keep_columns]

# Keep only the features that are complete in both the cleaned training
# and test matrices, then report the resulting shapes.
tx_train, tx_test = delFeatures(input_data_t_clean, input_data_test_clean)
print("tx_train.shape = ", tx_train.shape)
print("tx_test.shape = ", tx_test.shape)

[False  True  True  True False False False  True  True  True  True  True
 False  True  True  True  True  True  True  True  True  True  True False
 False False False False False  True]
tx_train.shape =  (250000, 19)
tx_test.shape =  (568238, 19)


In [132]:
# Replace missing values with the mean of their feature (the means are computed from the TRAINING set only!)
def replaceByMeanPerFeature(input_tx_train, input_tx_test):
    """Fill NaN entries with the per-feature mean of the training set.

    The column means are computed from ``input_tx_train`` only (ignoring
    NaN) and are used to fill the gaps in both matrices.

    NOTE: both input arrays are modified in place; the returned objects are
    the very same arrays that were passed in.

    Args:
        input_tx_train: 2-D training feature matrix, may contain NaN.
        input_tx_test: 2-D test feature matrix, may contain NaN.

    Returns:
        The tuple ``(input_tx_train, input_tx_test)`` after filling.
    """
    column_means = np.nanmean(input_tx_train, axis=0)
    for col, fill_value in enumerate(column_means):
        # Training matrix first, then test matrix, one column at a time.
        for data in (input_tx_train, input_tx_test):
            column = data[:, col]  # basic slice -> view onto the caller's array
            column[np.isnan(column)] = fill_value
    return input_tx_train, input_tx_test
       
#replaceByMeanPerFeature(input_data_t_clean, input_data_test_clean)  

In [133]:
# Quick manual check of replaceByMeanPerFeature on small matrices.
# A plays the role of the training set, B of the test set.
# Ignoring NaN, the column means of A are 2.5, 0.5 and 4, so the NaN
# entries of B should be replaced by 2.5 (column 0) and 0.5 (column 1).
A = np.array([[np.nan, 0, 4], [3, np.nan, 4], [2, 1, 4]])
print(A)
B = np.array([[np.nan, 2, 3],[4, np.nan, 6]])
print(B)
A, B = replaceByMeanPerFeature(A, B)
print(B)

[[ nan   0.   4.]
 [  3.  nan   4.]
 [  2.   1.   4.]]
[[ nan   2.   3.]
 [  4.  nan   6.]]
[[ 2.5  2.   3. ]
 [ 4.   0.5  6. ]]
