In [60]:
# Loading Packages
import datetime
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil.relativedelta import relativedelta
from geopy.distance import distance
from scipy.stats import norm

from sklearn import linear_model
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from utils import *

%load_ext autoreload
%autoreload 2

pd.set_option('display.max_columns', 300)
sns.set(style='white', context='notebook', palette='deep')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Read in the holdout data, the fitted scaler, and the best model

In [61]:
# Raw holdout features; every later cell transforms this frame step by step.
holdout = pd.read_csv(filepath_or_buffer='kc_house_data_test_features.csv')

In [62]:
# Load the fitted scaler and the trained model from their pickle files.
# The context managers close each file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only unpickle files that you
# produced yourself or otherwise trust.
with open('scaler.pickle', 'rb') as scaler_file:
    final_scaler = pickle.load(scaler_file)

with open('model.pickle', 'rb') as model_file:
    final_model = pickle.load(model_file)

Feature Engineering for holdout set

In [63]:
# Drop features that are highly correlated with others, together with the
# 'id' and 'Unnamed: 0' columns.
# Reassigning the result (instead of inplace=True) avoids mutating the frame
# in place, which pandas discourages and which hides state between cells.
holdout = holdout.drop(
    columns=['sqft_above', 'sqft_living15', 'sqft_lot15', 'id', 'Unnamed: 0']
)

In [64]:
# Column groups used by the feature-engineering cells below.

# Categorical columns (handed to to_dummies later on).
categorical_variables = [
    'condition',
    'view',
    'floors',
    'grade',
    'waterfront',
    'zipcode',
]

# Continuous columns.
contin_variables = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'sqft_basement',
]

In [65]:
# Row-wise clean-up and location feature, expressed as one chain:
#   1. map_bed_bath (from utils) is applied to every row to clean the data;
#   2. dist_dntwn is computed per row by distance_to_dwntwn (from utils) --
#      per the original note, the distance from the house to downtown in miles;
#   3. the raw coordinates are dropped once the distance has been derived.
holdout = (
    holdout
    .apply(map_bed_bath, axis=1)
    .assign(dist_dntwn=lambda frame: frame.apply(distance_to_dwntwn, axis=1))
    .drop(columns=['lat', 'long'])
)

In [66]:
# Binary flag derived from sqft_basement: 1 when the value is not 0, else 0.
holdout['basement'] = holdout['sqft_basement'].ne(0).astype('int64')

# The new flag joins the categorical columns, and the whole list is then
# encoded with the to_dummies helper from utils.
categorical_variables = [*categorical_variables, 'basement']
holdout = to_dummies(holdout, categorical_variables)

In [67]:
# Age features derived from the construction year.
# NOTE(review): the reference year is the literal 2021, while 'yrs_from_renov'
# below is measured from the current date -- confirm both match the training
# notebook.
holdout['years_old'] = 2021 - holdout['yr_built']

# Absolute distance of each house's age from the (rounded) mean age.
# The mean is computed once here rather than once per row inside a lambda.
mean_years_old = round(holdout['years_old'].mean())
holdout['abs_years_old'] = (holdout['years_old'] - mean_years_old).abs()

# format_yr_renov (from utils) converts 'yr_renovated' to datetime format;
# 'yrs_from_renov' is then the number of whole years from that date to now.
holdout = holdout.apply(format_yr_renov, axis=1)

# Take a single timestamp so every row is measured against the same instant.
now = datetime.datetime.now()
holdout['yrs_from_renov'] = holdout['yr_renovated'].map(
    lambda renovated: relativedelta(now, renovated).years
)

In [68]:
# Remove columns that are no longer needed: 'date', plus the two year columns
# that the age features above were derived from.
holdout = holdout.drop(['date', 'yr_built', 'yr_renovated'], axis='columns')

In [69]:
# Quantitative features that receive a degree-2 polynomial expansion;
# the dummy columns are deliberately left out of this list.
fts_for_poly = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'sqft_basement',
    'dist_dntwn',
    'years_old',
    'yrs_from_renov',
    'abs_years_old',
]

# Degree-2 polynomial features, built by the create_poly_df helper from utils.
df_poly2 = create_poly_df(holdout[fts_for_poly], 2)

# Every other column (the dummies) is carried over unchanged, in original order.
not_poly_fts = holdout.columns[~holdout.columns.isin(fts_for_poly)].tolist()

# Index-aligned merge of the polynomial block with the untouched columns.
train_poly2 = df_poly2.merge(
    holdout[not_poly_fts],
    left_index=True,
    right_index=True,
)

In [70]:
# Apply the scaler loaded from scaler.pickle to the engineered holdout features.
# NOTE(review): presumably the scaler was fitted on training features with the
# same columns in the same order -- confirm against the training notebook.
transformed_holdout = final_scaler.transform(train_poly2)

In [71]:
# Predicting the holdout set

In [72]:
# Predict from the scaled features produced by final_scaler; passing the
# unscaled train_poly2 would leave transformed_holdout unused and feed the
# model inputs on a different scale.
# NOTE(review): assumes the model was fitted on final_scaler output -- confirm
# against the training notebook.
final_answers = final_model.predict(transformed_holdout)

In [73]:
# Wrap the raw predictions in a DataFrame so they can be written out as CSV.
df = pd.DataFrame(data=final_answers)

In [74]:
# Exporting your predictions

In [75]:
# Write the predictions (with the default index and header) to disk.
df.to_csv(path_or_buf='housing_preds_ivan.csv')