# Prep CSV

In [1]:
import pandas as pd
import os
import numpy as np

# matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# Directory the input CSV splits are read from; the prepared CSVs are written here as well.
HOUSING_PATH = "raw_data"

In [3]:
def load_housing_data(housing_path=HOUSING_PATH, ds_type=""):
    """Read one housing dataset split from disk into a DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory that contains the CSV files.
    ds_type : str
        Which split to load. "test" reads "raw_cali_housing_test.csv";
        any other value reads "split_<ds_type>.csv".

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the selected CSV file.
    """
    # The test set keeps its raw file name; every other split shares the "split_" prefix.
    file_name = (
        "raw_cali_housing_test.csv"
        if ds_type == "test"
        else "split_" + ds_type + ".csv"
    )
    return pd.read_csv(os.path.join(housing_path, file_name))

In [4]:
# Load the training split and print its column / dtype / non-null summary
housing = load_housing_data(ds_type="train")
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13600 entries, 0 to 13599
Data columns (total 10 columns):
Unnamed: 0            13600 non-null int64
longitude             13600 non-null float64
latitude              13600 non-null float64
housing_median_age    13600 non-null float64
total_rooms           13600 non-null float64
total_bedrooms        13600 non-null float64
population            13600 non-null float64
households            13600 non-null float64
median_income         13600 non-null float64
median_house_value    13600 non-null float64
dtypes: float64(9), int64(1)
memory usage: 1.0 MB


In [5]:
# Header used for the exported CSVs: every column name except the leading
# "Unnamed: 0" id column.
HEADER_COLS = housing.columns[1:].tolist()
print(HEADER_COLS)

['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']


In [6]:
# We need a custom transformer to select columns from a pandas DataFrame.
# NOTE: scikit-learn's `ColumnTransformer` (sklearn.compose, added in 0.20) may replace this in the future.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts a fixed set of columns from a DataFrame.

    Parameters
    ----------
    attribute_names : list
        Column labels to select; used to index the input in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.attribute_names].values`` (the selected columns as an array)."""
        selected_columns = X[self.attribute_names]
        return selected_columns.values

In [7]:
# Split the column names into the feature columns and the label column.
temp = list(housing)
label_val = [temp[-1]]      # last column is the label ("answer")
num_attribs = temp[1:-1]    # skip the leading id column and the trailing label

# Numerical pipeline: select feature columns -> fill gaps with the median -> standardize.
# NOTE(review): `Imputer` was removed in scikit-learn 0.22; newer versions
# provide `sklearn.impute.SimpleImputer` instead.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Label passthrough: the label column is selected but not imputed or scaled.
label_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(label_val)),
])

# Complete pipeline: prepared features followed by the untouched label column.
full_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("label_pipeline", label_pipeline),
])



In [8]:
# Fit the full pipeline on the training split and transform it in the same call
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared[1]  # display one prepared row (features followed by the label)

array([-1.19173712e+00,  7.70060454e-01, -4.51560797e-02,  1.06772223e+00,
        1.22704143e+00,  1.22333017e+00,  1.27324618e+00,  6.91416926e-02,
        2.73900000e+05])

In [10]:
# Export the prepared training rows; HEADER_COLS supplies the column names.
train_csv_path = os.path.join(HOUSING_PATH, "final_train.csv")
train_df = pd.DataFrame(housing_prepared)
train_df.to_csv(train_csv_path, header=HEADER_COLS)

In [11]:
# Validation split: transform only (no re-fit on this split), then export under the same header.
housing_val = load_housing_data(ds_type="valid")
val_transform = full_pipeline.transform(housing_val)
val_csv_path = os.path.join(HOUSING_PATH, "final_validation.csv")
val_df = pd.DataFrame(val_transform)
val_df.to_csv(val_csv_path, header=HEADER_COLS)

In [12]:
# Test split: transform only (no re-fit on this split), then export under the same header.
housing_test = load_housing_data(ds_type="test")
test_transform = full_pipeline.transform(housing_test)
test_csv_path = os.path.join(HOUSING_PATH, "final_test.csv")
test_df = pd.DataFrame(test_transform)
test_df.to_csv(test_csv_path, header=HEADER_COLS)