In [1]:
# Capstone 2 - Pre-processing and Training Data Development
# Identify categorical and numeric variables.
# Create dummy data (categorical)
# Scale numeric data (numeric)
# Split the dataset into training and testing subsets.

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv("data/raw/kc_house_data.csv")  
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [4]:
df.dtypes

id                 int64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

In [5]:
# Drop 'id', not predictive
df.drop('id', axis=1, inplace=True)

# Convert zipcode to categorical
df['zipcode'] = df['zipcode'].astype('category')

# Check types
df.dtypes

date               object
price             float64
bedrooms            int64
bathrooms         float64
sqft_living         int64
sqft_lot            int64
floors            float64
waterfront          int64
view                int64
condition           int64
grade               int64
sqft_above          int64
sqft_basement       int64
yr_built            int64
yr_renovated        int64
zipcode          category
lat               float64
long              float64
sqft_living15       int64
sqft_lot15          int64
dtype: object

In [6]:
# Identify data types
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols, numeric_cols

(Index(['date', 'zipcode'], dtype='object'),
 Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
        'waterfront', 'view', 'condition', 'grade', 'sqft_above',
        'sqft_basement', 'yr_built', 'yr_renovated', 'lat', 'long',
        'sqft_living15', 'sqft_lot15'],
       dtype='object'))

In [7]:
# Create dummy data
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df_encoded.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,zipcode_98146,zipcode_98148,zipcode_98155,zipcode_98166,zipcode_98168,zipcode_98177,zipcode_98178,zipcode_98188,zipcode_98198,zipcode_98199
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,...,False,False,False,False,False,False,True,False,False,False
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,...,False,False,False,False,False,False,False,False,False,False
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,...,False,False,False,False,False,False,False,False,False,False
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,...,False,False,False,False,False,False,False,False,False,False
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,...,False,False,False,False,False,False,False,False,False,False


In [8]:
# Scaling
numeric_cols = df_encoded.select_dtypes(include=['int64','float64']).columns
numeric_cols.dtype
target_col = "price"
numeric_cols = numeric_cols.drop(target_col)
scaler = StandardScaler()
df_encoded[numeric_cols] = scaler.fit_transform(df_encoded[numeric_cols])
df_encoded.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,zipcode_98146,zipcode_98148,zipcode_98155,zipcode_98166,zipcode_98168,zipcode_98177,zipcode_98178,zipcode_98188,zipcode_98198,zipcode_98199
0,221900.0,-0.398737,-1.447464,-0.979835,-0.228321,-0.915427,-0.087173,-0.305759,-0.629187,-0.558836,...,False,False,False,False,False,False,True,False,False,False
1,538000.0,-0.398737,0.175607,0.533634,-0.189885,0.936506,-0.087173,-0.305759,-0.629187,-0.558836,...,False,False,False,False,False,False,False,False,False,False
2,180000.0,-1.473959,-1.447464,-1.426254,-0.123298,-0.915427,-0.087173,-0.305759,-0.629187,-1.409587,...,False,False,False,False,False,False,False,False,False,False
3,604000.0,0.676485,1.149449,-0.13055,-0.244014,-0.915427,-0.087173,-0.305759,2.444294,-0.558836,...,False,False,False,False,False,False,False,False,False,False
4,510000.0,-0.398737,-0.149007,-0.435422,-0.169653,-0.915427,-0.087173,-0.305759,-0.629187,0.291916,...,False,False,False,False,False,False,False,False,False,False


In [9]:
# Define X and y
X = df_encoded.drop(target_col, axis=1)
y = df_encoded[target_col]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape


((17290, 457), (4323, 457))

In [10]:
df_encoded.to_csv("data/raw/housing_preprocessed.csv", index=False)