In [1]:
from sklearn import linear_model, metrics
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook display configuration: set the pandas column-display option and
# render matplotlib figures inline in the notebook output.
pd.set_option('display.max_columns', 0)
%matplotlib inline

In [2]:
# Load the training data; the first CSV column is used as the row index.
df = pd.read_csv(
    filepath_or_buffer='kc_house_data_train.csv',
    index_col=0,
)

In [3]:
# Treat a bedroom count of 33 as a typo for 3; every other value is kept.
df['bedrooms'] = df['bedrooms'].replace(to_replace=33, value=3)

In [None]:
# One-hot encode `view`.
# A rating of 2 is folded into rating 1 first so both share one indicator;
# the first level is dropped and the merged indicator is renamed accordingly.
df['view'] = df['view'].mask(df['view'] == 2, 1)
df = (
    pd.get_dummies(df, columns=['view'], drop_first=True)
    .rename(columns={'view_1': 'view_1_or_2'})
)

In [None]:
# One-hot encode `zipcode`, dropping the first level as the reference category.
df = pd.get_dummies(
    data=df,
    columns=['zipcode'],
    drop_first=True,
)

In [4]:
# Cap all sqft features to remove outliers.
# Each entry maps a column to its upper bound: values above the bound are
# replaced by the bound itself, everything else is left untouched. Keeping
# the bounds in one mapping avoids six copy-pasted statements and makes the
# thresholds easy to review and tune.
sqft_caps = {
    'sqft_above': 7000,
    'sqft_basement': 3000,
    'sqft_living': 8000,
    'sqft_living15': 6000,
    'sqft_lot': 100000,
    'sqft_lot15': 100000,
}
for sqft_col, sqft_cap in sqft_caps.items():
    # clip(upper=...) is the vectorised equivalent of
    # np.where(col > cap, cap, col) and is safe to re-run.
    df[sqft_col] = df[sqft_col].clip(upper=sqft_cap)

In [5]:
# Engineer feature for date in terms of years old (since built or renovated) at time of sale

# Get latter of year built/renovated: keep yr_built where it is greater than
# yr_renovated, otherwise take yr_renovated.
# The result is assigned back to the frame rather than calling
# `.where(..., inplace=True)` on `df[...]`: that form updates a temporary
# Series, which pandas warns about and which leaves `df` unchanged when
# Copy-on-Write is enabled.
df['yr_built_or_renovated'] = df['yr_built'].where(
    df['yr_built'] > df['yr_renovated'],
    df['yr_renovated'],
)

# Subtract year built/renovated from date sold
# (the first four characters of `date` are parsed as the sale year).
df['years_old'] = df['date'].map(lambda x: int(x[:4])) - df['yr_built_or_renovated']

# Add age feature to df: squared distance of the age from 57 years.
df['age_feature'] = (df['years_old'] - 57) ** 2

In [None]:
# Basement indicator: 1 when sqft_basement is truthy (non-zero), otherwise 0.
df['basement'] = df['sqft_basement'].astype(bool).astype(int)

In [None]:
# Ratio of the property's sqft_lot to its sqft_lot15 counterpart.
df['sqft_lot_relative_to_nabe'] = df['sqft_lot'].div(df['sqft_lot15'])

In [None]:
# Ratio of the property's sqft_living to its sqft_living15 counterpart.
df['sqft_living_relative_to_nabe'] = df['sqft_living'].div(df['sqft_living15'])

In [None]:
# Renovation indicator: 1 when yr_renovated is truthy (non-zero), otherwise 0.
df['renovated'] = df['yr_renovated'].astype(bool).astype(int)

In [None]:
# Above-ground square footage divided by the number of floors.
df['sq_ft_per_flr'] = df['sqft_above'].div(df['floors'])

In [None]:
# Bathroom indicator: 1 when bathrooms is truthy (non-zero), otherwise 0.
df['has_bathroom'] = df['bathrooms'].astype(bool).astype(int)

In [None]:
# Replace grade with its squared distance from 9.
# NOTE: this overwrites the column it reads, so re-running the cell
# transforms the already-transformed values again.
df['grade'] = df['grade'].sub(9).pow(2)

In [6]:
# Remove the id and date columns from the frame in place.
df.drop(['id', 'date'], axis='columns', inplace=True)

In [7]:
# Persist the prepared frame to disk for the modelling step.
df.to_csv(path_or_buf='prepared_data.csv')

In [8]:
# Show the (rows, columns) dimensions of the prepared frame.
df.shape

(17290, 22)

In [None]:
# Preview the first rows of the prepared frame.
df.head()

In [None]:
# Pairwise correlation matrix of the prepared frame's columns.
df.corr()