# Predictions on the Holdout Set

In [1]:
#importing libraries
import pandas as pd
import numpy as np 

# Show up to 300 columns when rendering wide DataFrames.
pd.options.display.max_columns = 300

In [2]:
#importing data - the first column of the CSV is used as the row index
holdout_csv = 'Data/kc_house_data_test_features.csv'
df = pd.read_csv(holdout_csv, index_col=0)
print(df.shape)
df.head()

(4322, 20)


Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,1974300020,20140827T000000,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
1,1974300020,20150218T000000,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
2,3630020380,20141107T000000,3,2.5,1470,1779,2.0,0,0,3,8,1160,310,2005,0,98029,47.5472,-121.998,1470,1576
3,1771000290,20141203T000000,3,1.75,1280,16200,1.0,0,0,3,8,1030,250,1976,0,98077,47.7427,-122.071,1160,10565
4,5126310470,20150115T000000,4,2.75,2830,8126,2.0,0,0,3,8,2830,0,2005,0,98059,47.4863,-122.14,2830,7916


In [3]:
#Checking for null values
#NOTE: only the counts for the first five columns are displayed below;
#the remaining columns are not shown by this check.
null_counts = df.isna().sum()
null_counts.head()

id             0
date           0
bedrooms       0
bathrooms      0
sqft_living    0
dtype: int64

There appear to be no null values in the columns shown, so I will proceed to bring in my model and features.

## Model

In [4]:
import pickle

In [5]:
#importing the model
#The context manager guarantees the file handle is closed even if
#unpickling raises.
#NOTE: pickle.load can execute arbitrary code; only load a pickle file that
#comes from a trusted source.
with open("Data/model.pickle", 'rb') as infile:
    model = pickle.load(infile)

In [6]:
#quick look at the loaded model: its intercept, then how many coefficients it holds
print(model.intercept_, len(model.coef_), sep='\n')

11.85382445291674
88


# Cleaning the New Data

In [7]:
#bedrooms - keep values within the range 1 to 6
df['bedrooms'] = df['bedrooms'].clip(lower=1, upper=6)

#bathrooms - keep values within the range 1 to 5
df['bathrooms'] = df['bathrooms'].clip(lower=1, upper=5)

#square footage columns - cap each one at its upper limit
upper_caps = {
    'sqft_living': 8000,
    'sqft_lot': 600000,
    'sqft_living15': 5000,
    'sqft_lot15': 300000,
}
for column, cap in upper_caps.items():
    df[column] = df[column].clip(upper=cap)

#new_grades - splitting the grades column into a manageable 3 groups
#1 is Below Average (grade <= 6), 2 is Average (7 to 9), 3 is Above (>= 10)
grade = df['grade']
grade_bands = [grade <= 6, grade <= 9, grade >= 10]
band_labels = [1, 2, 3]
df['new_grades'] = np.select(grade_bands, band_labels)

## Generating the Features

In [8]:
#reno_size - a new interaction feature between size and renovation

#been_renovated_1 is a 0/1 flag: 1 when yr_renovated is anything other than 0.
#Building the flag directly (rather than via get_dummies with drop_first) keeps
#the column present even when every home in the data shares the same value,
#and gives it an integer dtype regardless of the pandas version.
df['been_renovated_1'] = (df['yr_renovated'] != 0).astype(int)

#living area for renovated homes, 0 for homes that were never renovated
df['reno_size'] = df['been_renovated_1']*df['sqft_living']


#amazon, microsoft, and boeing distance columns

#pairing each home's latitude and longitude into a column of (lat, long) tuples
df['lat_long'] = list(zip(df['lat'], df['long']))

#trig helpers used by the distance function defined below
from math import radians, sin, cos, atan2, sqrt

#(lat, long) coordinates of the Amazon, Boeing, and Microsoft Headquarters, in that order
amazon_hq = (47.622288, -122.336494)
boeing_hq = (47.532531, -122.311605)
microsoft_hq = (47.639323, -122.128383)
businesses = [amazon_hq, boeing_hq, microsoft_hq]

#the homes' coordinate tuples as a plain list
lat_long = df['lat_long'].tolist()

def distance_from(home, business):
    """
    Uses the haversine formula to calculate the great-circle distance between
    a single home and a single business, then converts the distance into miles.

    Parameters
    ----------
    home : sequence of two numbers
        (latitude, longitude) of the home, in decimal degrees.
    business : sequence of two numbers
        (latitude, longitude) of the business, in decimal degrees.

    Returns
    -------
    float
        Distance between the two points, in miles.
    """
    #radius of earth in km
    R = 6373

    lat1 = radians(business[0])
    long1 = radians(business[1])
    lat2 = radians(home[0])
    long2 = radians(home[1])

    dlon = long2 - long1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    #floating point rounding can push a marginally above 1 for points that are
    #nearly antipodal, which would make sqrt(1 - a) raise a ValueError
    a = min(1.0, a)
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    #divide by 1.609 to convert into miles
    distance = R*c / 1.609
    return distance

#distance (in miles) from every home to each workplace, stored as one flat
#list: all homes for the first business, then the second, then the third
distances = [
    distance_from(home, business)
    for business in businesses
    for home in lat_long
]

#slicing the flat list into one column per workplace
n_homes = df.shape[0]
df['amazon'] = np.array(distances[:n_homes])
df['boeing'] = np.array(distances[n_homes:n_homes*2])
df['microsoft'] = np.array(distances[n_homes*2:])



#polynomial feature of yard_size squared

#flagging homes with a basement: 1 when sqft_basement is non-zero, else 0
df['has_basement'] = (df['sqft_basement'] != 0).astype(int)

#counting the basement as one additional floor
df['floor_base'] = df['floors'].add(df['has_basement'])

#approximate living space of a single floor, with the basement counted as a
#floor so that the total living space is accounted for
single_floor_area = df['sqft_living'].div(df['floor_base'])

#the yard is the part of the lot left over after removing that single floor
df['yard_size'] = df['sqft_lot'].sub(single_floor_area)

#multiplying the column by itself to get the polynomial feature
df['yard_size2'] = df['yard_size'].mul(df['yard_size'])

## Getting Dummies

In [9]:
#getting dummies
#drop_first is deliberately left off: on this holdout data it would drop
#whichever category happens to sort first here, which is not guaranteed to be
#the baseline category left out of final_columns. Keeping every dummy is safe
#because only the columns named in final_columns are passed to the model.
df = pd.get_dummies(df, columns=['zipcode', 'waterfront', 'condition', 'new_grades'])

## Final Columns

In [10]:
#column order matters: df[final_columns] is what gets passed to model.predict

#distance, engineered, size, and grade features
base_features = [
    'amazon', 'boeing', 'microsoft', 'reno_size', 'yard_size2', 'bathrooms',
    'waterfront_1', 'new_grades_2', 'sqft_living', 'sqft_lot',
    'sqft_living15', 'sqft_lot15', 'sqft_basement', 'sqft_above',
    'new_grades_3',
]

#condition levels that have a dummy column
condition_levels = [2, 3, 4, 5]

#zipcodes that have a dummy column
zipcodes = [
    98002, 98003,
    98004, 98005, 98006, 98007,
    98008, 98010, 98011, 98014,
    98019, 98022, 98023, 98024,
    98027, 98028, 98029, 98030,
    98031, 98032, 98033, 98034,
    98038, 98039, 98040, 98042,
    98045, 98052, 98053, 98055,
    98056, 98058, 98059, 98065,
    98070, 98072, 98074, 98075,
    98077, 98092, 98102, 98103,
    98105, 98106, 98107, 98108,
    98109, 98112, 98115, 98116,
    98117, 98118, 98119, 98122,
    98125, 98126, 98133, 98136,
    98144, 98146, 98148, 98155,
    98166, 98168, 98177, 98178,
    98188, 98198, 98199,
]

final_columns = (
    base_features
    + ['condition_{}'.format(level) for level in condition_levels]
    + ['zipcode_{}'.format(zipcode) for zipcode in zipcodes]
)

## Bringing in the Model

In [11]:
#Selecting the model's feature columns from the holdout data.
#A one-hot column is absent whenever its category never occurs in this data
#(for example a zipcode with no holdout homes); such columns are filled with 0
#instead of raising a KeyError. Any other missing column is a real problem,
#so it is reported rather than silently filled.
dummy_prefixes = ('zipcode_', 'waterfront_', 'condition_', 'new_grades_')
missing_columns = [col for col in final_columns if col not in df.columns]
unexpected_missing = [col for col in missing_columns if not col.startswith(dummy_prefixes)]
if unexpected_missing:
    raise KeyError('Missing required feature columns: {}'.format(unexpected_missing))

holdout_features = df.reindex(columns=final_columns, fill_value=0)

#the raw predictions are exponentiated to produce the final answers
#(presumably the model was fit on a log-transformed target - confirm)
final_answers = model.predict(holdout_features)
final_answer_exp = np.exp(final_answers)

## Exporting Predictions

In [12]:
#Writing the predictions out under their own name so the feature DataFrame df
#is not overwritten and the prediction cell can still be re-run afterwards.
#The CSV that is written is the same as before.
predictions = pd.DataFrame(final_answer_exp)
predictions.to_csv('Data/housing_preds_zachary_greenberg.csv')