In [1]:
import pandas as pd
import os
import numpy as np
import tensorflow as tf

# matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# Relative directory holding the prepared "final_<split>.csv" files that this
# notebook reads and converts.
HOUSING_PATH = "raw_data"

In [3]:
def load_housing_data(housing_path=HOUSING_PATH, ds_type=""):
    """Read one prepared housing CSV into a pandas DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory that contains the CSV files.
    ds_type : str
        Split suffix; the file that gets read is ``final_<ds_type>.csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    file_name = "final_" + ds_type + ".csv"
    full_path = os.path.join(housing_path, file_name)
    frame = pd.read_csv(full_path)
    return frame

In [4]:
# Load the training split and summarise its columns, dtypes and non-null counts.
housing = load_housing_data(ds_type="train")
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13600 entries, 0 to 13599
Data columns (total 10 columns):
Unnamed: 0            13600 non-null int64
longitude             13600 non-null float64
latitude              13600 non-null float64
housing_median_age    13600 non-null float64
total_rooms           13600 non-null float64
total_bedrooms        13600 non-null float64
population            13600 non-null float64
households            13600 non-null float64
median_income         13600 non-null float64
median_house_value    13600 non-null float64
dtypes: float64(9), int64(1)
memory usage: 1.0 MB


In [5]:
# Show the first row of the training frame as a quick sanity check of the values.
housing.iloc[0]

Unnamed: 0                 0.000000
longitude                  1.255990
latitude                  -1.179384
housing_median_age        -0.839725
total_rooms                0.784693
total_bedrooms             0.232195
population                 0.571559
households                 0.445669
median_income              0.835270
median_house_value    223500.000000
Name: 0, dtype: float64

^ Everything looks as expected here. TODO: the id column (`Unnamed: 0`) is generated during data preparation. The original id could be reverse-engineered from it, but the preparation step should probably be changed so that the original id is retained directly.

In [6]:
# Column names of the training frame, in the order they appear in the CSV.
housing_feat_labels = housing.columns.tolist()
# NOTE: this counts every column, i.e. it includes the leading id column and
# the trailing label column, not only the model input features.
NUM_FEATS = len(housing_feat_labels)
print(housing_feat_labels)
print(NUM_FEATS)

['Unnamed: 0', 'longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']
10


In [7]:
def _int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` backed by an ``Int64List``."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)

def _bytes_feature(value):
    """Wrap a single bytes value in a ``tf.train.Feature`` backed by a ``BytesList``.

    Not called anywhere in the visible part of this notebook.
    """
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def _floats_feature(value):
    """Wrap a single float in a ``tf.train.Feature`` backed by a one-element ``FloatList``."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _floats_features(value):
    """Wrap an iterable of floats in a ``tf.train.Feature`` backed by a ``FloatList``.

    The iterable is copied into a new list before being handed to TensorFlow.
    """
    float_list = tf.train.FloatList(value=list(value))
    return tf.train.Feature(float_list=float_list)

In [8]:
def generate_tfrecords(input_filename, output_filename, num_feats):
    """Convert a CSV file with a header row into a TFRecord file.

    Every data row is read as ``id, feature, ..., feature, label``:
    column 0 is parsed as an integer id, columns ``1`` to ``num_feats - 1``
    as float features, and column ``num_feats`` as the float label. Each row
    becomes one serialized ``tf.train.Example`` with the keys ``'/iid'``,
    ``'/features'`` and ``'/label'``.

    Args:
        input_filename: Path of the CSV file to read. Its first line is
            treated as a header and skipped; blank lines are ignored.
        output_filename: Path of the TFRecord file to write.
        num_feats: Index of the label column. It is also the exclusive upper
            bound of the feature slice, so ``num_feats - 1`` feature values
            are stored per row.

    Raises:
        ValueError: If a field cannot be parsed as a number.
        IndexError: If a non-blank row has fewer than ``num_feats + 1`` columns.
    """
    print("Start to convert {} to {}".format(input_filename, output_filename))

    # Open the input first so a missing CSV does not leave an empty output file.
    with open(input_filename, "r") as csv_file:
        writer = tf.python_io.TFRecordWriter(output_filename)
        try:
            # Ignore first line (header)
            next(csv_file, None)

            for line in csv_file:
                # A blank (e.g. trailing) line carries no record; skip it
                # instead of failing on the column lookup below.
                if not line.strip():
                    continue

                data = line.split(",")
                label = float(data[num_feats])
                features = [float(i) for i in data[1:num_feats]]
                iid = int(data[0])

                # create features
                feature = {'/iid': _int64_feature(iid),
                           '/features': _floats_features(features),
                           '/label': _floats_feature(label)}

                # create example protocol buffer
                example = tf.train.Example(features=tf.train.Features(feature=feature))

                writer.write(example.SerializeToString())
        finally:
            # Always flush and release the writer, even when a row fails to parse.
            writer.close()

    print("Successfully convert {} to {}".format(input_filename, output_filename))

In [9]:
# Convert every prepared "final_*.csv" file found in HOUSING_PATH into a
# TFRecord file of the same base name under ../data.
output_dir = os.path.join("..", "data")
# The TFRecord writer does not create missing parent directories, so make sure
# the destination exists before converting anything.
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

for file in os.listdir(HOUSING_PATH):
    # only the prepared csv files
    if file.endswith(".csv") and file.startswith("final_"):
        dataset_type = file.split(".")[0]
        input_path = os.path.join(".", HOUSING_PATH, file)
        output_path = os.path.join(output_dir, dataset_type+".tfrecords")
        # NUM_FEATS counts all columns (id + features + label), so NUM_FEATS-1
        # is the index of the last column, which holds the label.
        generate_tfrecords(input_path, output_path, NUM_FEATS-1)

Start to convert ./raw_data/final_train.csv to ../data/final_train.tfrecords
Successfully convert ./raw_data/final_train.csv to ../data/final_train.tfrecords
Start to convert ./raw_data/final_test.csv to ../data/final_test.tfrecords
Successfully convert ./raw_data/final_test.csv to ../data/final_test.tfrecords
Start to convert ./raw_data/final_validation.csv to ../data/final_validation.tfrecords
Successfully convert ./raw_data/final_validation.csv to ../data/final_validation.tfrecords
