# Imputing data

### Support Functions

In [1]:
import numpy as np
import matplotlib.pyplot as plt
    
# For make_features_labels
from tools.feature_format import featureFormat, targetFeatureSplit

def make_features_labels(dataset, feature_names, remove_nan=False):
    """
    Split a dataset into a feature array and a label array.

    ``dataset`` and ``feature_names`` are handed to ``featureFormat`` with
    ``sort_keys=True`` and ``remove_all_zeroes=False``; ``remove_nan`` is
    forwarded as its ``remove_NaN`` argument. The formatted data is then
    split by ``targetFeatureSplit``.

    Returns a ``(features, labels)`` tuple of numpy arrays.
    """
    formatted = featureFormat(
        dataset,
        feature_names,
        sort_keys=True,
        remove_NaN=remove_nan,
        remove_all_zeroes=False
    )
    # targetFeatureSplit hands back the labels first, then the features
    label_list, feature_list = targetFeatureSplit(formatted)

    return np.array(feature_list), np.array(label_list)

### Get necessary data and variables from [feature_selection](feature_selection.ipynb)

In [2]:
import numbers
import pickle

# Load the raw dataset; the loops below treat it as a dict of per-record
# dicts mapping a feature name to its value.
with open('data/final_project_dataset.pkl', 'rb') as f:
    fin_data = pickle.load(f)

# Remove 'TOTAL' from data
del fin_data['TOTAL']

# Convert negative numbers in data to zero.
# Only real numbers are compared against zero: ordering a non-numeric value
# (e.g. a string) against an int raises TypeError on Python 3 and only
# "works" on Python 2 through arbitrary cross-type ordering.
count = 0
for item in fin_data.values():
    for key2, value in item.items():
        if isinstance(value, numbers.Real) and value < 0:
            # Overwriting an existing key does not resize the dict, so this is
            # safe while iterating over its items.
            item[key2] = 0
            count += 1
# Single-argument print(...) behaves the same on Python 2 and Python 3
print('%i negative values changed to zero' % count)

with open('selected_feature_names.pkl', 'rb') as f:
    selected_feature_names = pickle.load(f)

# Features/labels with missing values left as NaN (remove_nan defaults to
# False); this is the baseline every imputation strategy starts from.
selected_features, selected_labels = make_features_labels(fin_data, selected_feature_names)
imputer_dict = {'original': (selected_features, selected_labels)}

66 negative values changed to zero


In [3]:
# Zero imputation: build a new array in which every NaN entry of
# selected_features is replaced by 0 and all other entries are kept.
nan_mask = np.isnan(selected_features)
zero_features = np.where(nan_mask, 0, selected_features)
imputer_dict['zero'] = (zero_features, selected_labels)

In [4]:
# Sanity check: replacing NaN with zero by hand should give exactly the same
# features as asking make_features_labels to do it via remove_nan=True.
test_features, test_labels = make_features_labels(
    fin_data, selected_feature_names, remove_nan=True
)
np.array_equal(zero_features, test_features)

True

In [5]:
# Imputer lives in sklearn.preprocessing on older scikit-learn releases; newer
# releases removed it in favour of sklearn.impute.SimpleImputer, which accepts
# the same `strategy` keyword and also imputes NaN column-wise by default.
try:
    from sklearn.preprocessing import Imputer
except ImportError:
    from sklearn.impute import SimpleImputer as Imputer

# Build one imputed copy of selected_features per strategy and register it
# in imputer_dict under the strategy's name.
imputer_strategy_list = ['mean', 'median', 'most_frequent']
for strategy in imputer_strategy_list:
    imp = Imputer(strategy=strategy)
    new_features = imp.fit_transform(selected_features)
    # Pair with selected_labels, exactly as the 'original' and 'zero' entries
    # do, rather than with the labels of the separate remove_nan sanity check
    # (which would make this cell depend on that check having been run).
    imputer_dict[strategy] = (new_features, selected_labels)

In [8]:
# For every stored variant, print its name, the shapes of its feature and
# label arrays, and whether its features are array_equal to the un-imputed
# selected_features, followed by a blank separator line.
for key, (variant_features, variant_labels) in imputer_dict.items():
    print(key)
    print('%s %s' % (variant_features.shape, variant_labels.shape))
    print(np.array_equal(selected_features, variant_features))
    print('')

zero
(145, 11) (145,)
False

most_frequent
(145, 11) (145,)
False

median
(145, 11) (145,)
False

original
(145, 11) (145,)
False

mean
(145, 11) (145,)
False



In [10]:
# Expect False even though the array is compared with itself: the un-imputed
# features contain NaN entries, and NaN never compares equal to NaN.
np.array_equiv(selected_features, selected_features)

False

In [11]:
# Persist every (features, labels) variant collected in imputer_dict so it
# can be loaded again without re-running the imputation.
with open('imputer_dict.pkl', 'wb') as out_file:
    pickle.dump(imputer_dict, out_file)