## Importing the Dependencies

In [393]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, ShuffleSplit, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from pathlib import Path
import pickle
import json
import warnings

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (15,10)
warnings.filterwarnings('ignore', category=UserWarning)

## Loading the Dataset

In [7]:
df = pd.read_csv('C:/Users/admin/Desktop/PortfolioProjects/MachineLearning/house_price_prediction/data/data.csv')
df.head(3)

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA


In [11]:
df1 = df.drop(['date', 'street', 'country', 'statezip'], axis='columns')
df1.head(3)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city
0,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,Shoreline
1,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,Seattle
2,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,Kent


In [13]:
df1.shape

(4600, 14)

## Descriptive Statistics of the Data

### Data type

In [16]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          4600 non-null   float64
 1   bedrooms       4600 non-null   float64
 2   bathrooms      4600 non-null   float64
 3   sqft_living    4600 non-null   int64  
 4   sqft_lot       4600 non-null   int64  
 5   floors         4600 non-null   float64
 6   waterfront     4600 non-null   int64  
 7   view           4600 non-null   int64  
 8   condition      4600 non-null   int64  
 9   sqft_above     4600 non-null   int64  
 10  sqft_basement  4600 non-null   int64  
 11  yr_built       4600 non-null   int64  
 12  yr_renovated   4600 non-null   int64  
 13  city           4600 non-null   object 
dtypes: float64(4), int64(9), object(1)
memory usage: 503.3+ KB


### General Statistics

In [18]:
df1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,4600.0,551962.988473,563834.702547,0.0,322875.0,460943.461539,654962.5,26590000.0
bedrooms,4600.0,3.40087,0.908848,0.0,3.0,3.0,4.0,9.0
bathrooms,4600.0,2.160815,0.783781,0.0,1.75,2.25,2.5,8.0
sqft_living,4600.0,2139.346957,963.206916,370.0,1460.0,1980.0,2620.0,13540.0
sqft_lot,4600.0,14852.516087,35884.436145,638.0,5000.75,7683.0,11001.25,1074218.0
floors,4600.0,1.512065,0.538288,1.0,1.0,1.5,2.0,3.5
waterfront,4600.0,0.007174,0.084404,0.0,0.0,0.0,0.0,1.0
view,4600.0,0.240652,0.778405,0.0,0.0,0.0,0.0,4.0
condition,4600.0,3.451739,0.67723,1.0,3.0,3.0,4.0,5.0
sqft_above,4600.0,1827.265435,862.168977,370.0,1190.0,1590.0,2300.0,9410.0


## Data Preprocessing

In [25]:
# Checking for missing values

df1.isnull().sum()

price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
city             0
dtype: int64

In [38]:
# Checking the distribution of houses in the dataset across different cities

df1.city = df1.city.apply(lambda x: x.strip())
location_stats = df1.city.value_counts()
location_stats

city
Seattle                1573
Renton                  293
Bellevue                286
Redmond                 235
Issaquah                187
Kirkland                187
Kent                    185
Auburn                  176
Sammamish               175
Federal Way             148
Shoreline               123
Woodinville             115
Maple Valley             96
Mercer Island            86
Burien                   74
Snoqualmie               71
Kenmore                  66
Des Moines               58
North Bend               50
Covington                43
Duvall                   42
Lake Forest Park         36
Bothell                  33
Newcastle                33
SeaTac                   29
Tukwila                  29
Vashon                   29
Enumclaw                 28
Carnation                22
Normandy Park            18
Clyde Hill               11
Medina                   11
Fall City                11
Black Diamond             9
Ravensdale                7
Pacific        

In [40]:
location_stats.values.sum()

4600

In [46]:
location_stats_less_than_10 = location_stats[location_stats<10]
location_stats_less_than_10, len(location_stats_less_than_10), location_stats_less_than_10.values.sum()

(city
 Black Diamond          9
 Ravensdale             7
 Pacific                6
 Algona                 5
 Yarrow Point           4
 Skykomish              3
 Preston                2
 Milton                 2
 Inglewood-Finn Hill    1
 Snoqualmie Pass        1
 Beaux Arts Village     1
 Name: count, dtype: int64,
 11,
 41)

### Dimensionality Reduction

Changing the name of the cities with datapoints < 10 to `others`

In [53]:
df1.city = df1.city.apply(lambda x: 'others' if x in location_stats_less_than_10 else x)
df1.city.value_counts()

city
Seattle             1573
Renton               293
Bellevue             286
Redmond              235
Issaquah             187
Kirkland             187
Kent                 185
Auburn               176
Sammamish            175
Federal Way          148
Shoreline            123
Woodinville          115
Maple Valley          96
Mercer Island         86
Burien                74
Snoqualmie            71
Kenmore               66
Des Moines            58
North Bend            50
Covington             43
Duvall                42
others                41
Lake Forest Park      36
Newcastle             33
Bothell               33
SeaTac                29
Tukwila               29
Vashon                29
Enumclaw              28
Carnation             22
Normandy Park         18
Clyde Hill            11
Medina                11
Fall City             11
Name: count, dtype: int64

In [55]:
# Checking the number of bedrooms across the dataset

df1.bedrooms.unique()

array([3., 5., 4., 2., 6., 7., 9., 1., 8., 0.])

In [57]:
# Checking the number of bathrooms across the dataset

df1.bathrooms.unique()

array([1.5 , 2.5 , 2.  , 2.25, 1.  , 1.75, 2.75, 3.  , 3.25, 3.5 , 8.  ,
       4.25, 4.  , 3.75, 5.  , 4.5 , 5.75, 1.25, 6.5 , 4.75, 0.75, 5.25,
       5.5 , 6.25, 0.  , 6.75])

In [76]:
bathroom_stats = df1.groupby('bathrooms')['bathrooms'].agg('count')
bathroom_stats

bathrooms
0.00       2
0.75      17
1.00     743
1.25       3
1.50     291
1.75     629
2.00     427
2.25     419
2.50    1189
2.75     276
3.00     167
3.25     136
3.50     162
3.75      37
4.00      23
4.25      23
4.50      29
4.75       7
5.00       6
5.25       4
5.50       4
5.75       1
6.25       2
6.50       1
6.75       1
8.00       1
Name: bathrooms, dtype: int64

**Number of bathrooms should be a discrete quantity but in the dataset there are number of rooms taking up decimal values and they have to be removed**

In [91]:
df2 = df1[df1['bathrooms']%1 == 0]
df2.bathrooms.value_counts()

bathrooms
1.0    743
2.0    427
3.0    167
4.0     23
5.0      6
0.0      2
8.0      1
Name: count, dtype: int64

In [81]:
df1.iloc[4516, 2].dtype

dtype('float64')

In [93]:
# Checking distribution of floors in the dataset

df2.floors.value_counts()

floors
1.0    937
1.5    212
2.0    189
3.0     20
2.5     10
3.5      1
Name: count, dtype: int64

**Some datapoints have number of floors which isn't a discrete value and as such need to be removed**

In [96]:
df3 = df2[df2.floors % 1 == 0]
df3.floors.value_counts()

floors
1.0    937
2.0    189
3.0     20
Name: count, dtype: int64

In [98]:
df3.shape

(1146, 14)

In [104]:
df3.sample(3)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city
2929,450000.0,3.0,2.0,1430,3480,1.0,0,0,3,980,450,1947,2012,Seattle
4589,182805.0,3.0,1.0,1040,8892,1.0,0,0,4,800,240,1958,1972,Federal Way
1297,355000.0,3.0,2.0,1220,1186,3.0,0,0,3,1220,0,2007,0,Seattle


### Feature Engineering

Adding a new column - price per sqft of the entire lot

In [106]:
df3['price_per_sqft_lot'] = df3['price']/df3['sqft_lot']
df3.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['price_per_sqft_lot'] = df3['price']/df3['sqft_lot']


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city,price_per_sqft_lot
2,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,Kent,28.626433
5,490000.0,2.0,1.0,880,6380,1.0,0,0,3,880,0,1938,1994,Seattle,76.802508
6,335000.0,2.0,2.0,1350,2560,1.0,0,0,3,1350,0,1976,0,Redmond,130.859375


In [108]:
df3['price_per_sqft_lot'].value_counts()

price_per_sqft_lot
0.000000      14
40.000000      3
62.500000      3
25.000000      3
30.303030      2
              ..
63.157895      1
63.878828      1
112.962963     1
62.403698      1
30.678733      1
Name: count, Length: 1112, dtype: int64

In [110]:
df3[df3.price_per_sqft_lot == 0]

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city,price_per_sqft_lot
4362,0.0,4.0,4.0,3680,18804,2.0,0,0,3,3680,0,1990,2009,Sammamish,0.0
4383,0.0,5.0,4.0,4430,9000,2.0,0,0,3,4430,0,2013,1923,Bellevue,0.0
4412,0.0,3.0,3.0,1860,7440,1.0,0,0,5,1040,820,1954,0,Seattle,0.0
4413,0.0,4.0,3.0,1990,6180,2.0,0,0,3,1990,0,1990,2009,Federal Way,0.0
4420,0.0,4.0,1.0,1360,13372,1.0,0,0,3,1360,0,1955,2005,Kenmore,0.0
4442,0.0,1.0,1.0,720,6000,1.0,0,0,3,720,0,1940,1996,Seattle,0.0
4453,0.0,3.0,1.0,1300,6710,1.0,0,0,4,1300,0,1952,0,Mercer Island,0.0
4479,0.0,5.0,2.0,1910,7200,1.0,0,0,4,1110,800,1951,1999,Seattle,0.0
4509,0.0,6.0,3.0,3020,13783,2.0,0,0,3,3020,0,1952,2002,Mercer Island,0.0
4521,0.0,4.0,1.0,1810,7500,1.0,0,0,2,1410,400,1959,0,Seattle,0.0


### Dropping datapoints which don't have a price / have a price of 0.0

In [115]:
df4 = df3[~(df3.price == 0.0)]
df4.shape

(1132, 15)

In [119]:
# Checking the general statistics for the price per sqft lot feature

df4['price_per_sqft_lot'].describe().T

count    1132.000000
mean       71.448297
std       120.918204
min         0.384012
25%        27.914382
50%        44.203173
75%        87.306317
max      3412.036443
Name: price_per_sqft_lot, dtype: float64

**As seen in the data above the minimum price per sqft lot is `less than 0.5` and the maximum price per sqft lot is `3412`. So there's need for outlier removal for each city using the mean and standard deviation of the dataset.**

### Outlier Removal Using Mean and Standard Deviation For Each City

In [126]:
def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('city'):
        m = np.mean(subdf.price_per_sqft_lot)
        st = np.std(subdf.price_per_sqft_lot)
        reduced_df = subdf[(subdf.price_per_sqft_lot>(m-st)) & (subdf.price_per_sqft_lot<=(m+st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out

df5 = remove_pps_outliers(df4)
df5.shape

(908, 15)

In [128]:
df5.price_per_sqft_lot.describe().T

count    908.000000
mean      58.050346
std       40.942412
min        0.384012
25%       29.450003
50%       42.733062
75%       79.182692
max      220.032841
Name: price_per_sqft_lot, dtype: float64

### Outlier Removal Using Business Logic

**Shifting focus to the minimum price per sqft lot in the dataset. While prices for lots were cheaper in 2014 which is the period which this data was collected from, a quick Google search shows that the price was around `30` for a sqft lot. So it's imperative to check out how many datapoints have prices per sqft lot less than `20` which is the criteria I choose to use for this project.**

In [146]:
len(df5[df5['price_per_sqft_lot']<20])

85

In [151]:
df6 = df5[~(df5['price_per_sqft_lot']<20)]
df6.shape

(823, 15)

In [153]:
df6.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city,price_per_sqft_lot
2,225000.0,3.0,1.0,1660,7210,1.0,0,0,3,1100,560,1963,2008,Auburn,31.206657
3,257200.0,3.0,2.0,1850,8250,1.0,0,0,4,1150,700,1952,0,Auburn,31.175758
5,234000.0,4.0,2.0,1630,9010,1.0,0,0,4,1050,580,1975,0,Auburn,25.971143
6,204700.0,4.0,2.0,1670,9987,1.0,0,0,3,1670,0,1967,2011,Auburn,20.496646
7,206000.0,4.0,2.0,1700,6025,1.0,0,0,3,1700,0,1978,0,Auburn,34.190871


### Feature Engineering 

Adding a new column - years since last renovation

**Before doing this, some columns in the yr_renovated column have a value of 0 and while this indicates the property hasn't been renovated, it doesn't tell the year the property was renovated last which in this case should be the year it was built.**

In [163]:
df6.loc[df6['yr_renovated']==0, 'yr_renovated'] = df6['yr_built']
df6.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city,price_per_sqft_lot
2,225000.0,3.0,1.0,1660,7210,1.0,0,0,3,1100,560,1963,2008,Auburn,31.206657
3,257200.0,3.0,2.0,1850,8250,1.0,0,0,4,1150,700,1952,1952,Auburn,31.175758
5,234000.0,4.0,2.0,1630,9010,1.0,0,0,4,1050,580,1975,1975,Auburn,25.971143
6,204700.0,4.0,2.0,1670,9987,1.0,0,0,3,1670,0,1967,2011,Auburn,20.496646
7,206000.0,4.0,2.0,1700,6025,1.0,0,0,3,1700,0,1978,1978,Auburn,34.190871


In [173]:
current_year = pd.Timestamp.now().year
df6['yrs_since_last_renovation'] = current_year - df6['yr_renovated']
df6.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df6['yrs_since_last_renovation'] = current_year - df6['yr_renovated']


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city,price_per_sqft_lot,yrs_since_last_renovation
2,225000.0,3.0,1.0,1660,7210,1.0,0,0,3,1100,560,1963,2008,Auburn,31.206657,16
3,257200.0,3.0,2.0,1850,8250,1.0,0,0,4,1150,700,1952,1952,Auburn,31.175758,72
5,234000.0,4.0,2.0,1630,9010,1.0,0,0,4,1050,580,1975,1975,Auburn,25.971143,49


**Another feature which I believe would be more useful is presence or absence of a basement in the property, so this will be added as a new feature.**

Key:

0 ----> No basement

1 ----> Property has a basement

In [178]:
df6['basement'] = df6['sqft_basement'].apply(lambda x: 0 if x == 0 else 1)
df6.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df6['basement'] = df6['sqft_basement'].apply(lambda x: 0 if x == 0 else 1)


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city,price_per_sqft_lot,yrs_since_last_renovation,basement
2,225000.0,3.0,1.0,1660,7210,1.0,0,0,3,1100,560,1963,2008,Auburn,31.206657,16,1
3,257200.0,3.0,2.0,1850,8250,1.0,0,0,4,1150,700,1952,1952,Auburn,31.175758,72,1
5,234000.0,4.0,2.0,1630,9010,1.0,0,0,4,1050,580,1975,1975,Auburn,25.971143,49,1


In [183]:
# Dropping some more features

df7 = df6.drop(columns=['waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'price_per_sqft_lot'],
              axis='columns')

In [185]:
df7.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,city,yrs_since_last_renovation,basement
2,225000.0,3.0,1.0,1660,7210,1.0,Auburn,16,1
3,257200.0,3.0,2.0,1850,8250,1.0,Auburn,72,1
5,234000.0,4.0,2.0,1630,9010,1.0,Auburn,49,1
6,204700.0,4.0,2.0,1670,9987,1.0,Auburn,13,0
7,206000.0,4.0,2.0,1700,6025,1.0,Auburn,46,0


### One Hot Encoding For Cities

In [281]:
df8 = pd.get_dummies(df7, columns=['city']).astype(int)
df8

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,yrs_since_last_renovation,basement,city_Auburn,city_Bellevue,...,city_Renton,city_Sammamish,city_SeaTac,city_Seattle,city_Shoreline,city_Snoqualmie,city_Tukwila,city_Vashon,city_Woodinville,city_others
2,225000,3,1,1660,7210,1,16,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,257200,3,2,1850,8250,1,72,1,1,0,...,0,0,0,0,0,0,0,0,0,0
5,234000,4,2,1630,9010,1,49,1,1,0,...,0,0,0,0,0,0,0,0,0,0
6,204700,4,2,1670,9987,1,13,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,206000,4,2,1700,6025,1,46,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
901,248000,4,3,2163,5883,2,18,0,0,0,...,0,0,0,0,0,0,0,0,0,1
902,234950,3,1,1360,9948,1,20,0,0,0,...,0,0,0,0,0,0,0,0,0,1
903,260000,3,2,1630,8018,1,21,0,0,0,...,0,0,0,0,0,0,0,0,0,1
904,196440,3,2,1560,7352,1,32,0,0,0,...,0,0,0,0,0,0,0,0,0,1


### Reseting Index of Dataframe

In [284]:
df9 = df8.reset_index().drop('index', axis=1)
df9.head(1)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,yrs_since_last_renovation,basement,city_Auburn,city_Bellevue,...,city_Renton,city_Sammamish,city_SeaTac,city_Seattle,city_Shoreline,city_Snoqualmie,city_Tukwila,city_Vashon,city_Woodinville,city_others
0,225000,3,1,1660,7210,1,16,1,1,0,...,0,0,0,0,0,0,0,0,0,0


## Spliting Data into Training and Test Data

In [306]:
X = df9.drop('price', axis=1)
y = df9.price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

## Model Selection

In [309]:
models = [LinearRegression(), Lasso(), RandomForestRegressor(random_state=1234)]
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=1234)

for model in models:
    scores = cross_val_score(model, X, y, cv=cv)
    avg_score = np.sum(scores)/len(scores)
    print(f"Model: {model} | Accuracies: {scores} | Average Accuracy: {avg_score}")

Model: LinearRegression() | Accuracies: [0.71180601 0.64634169 0.61460477 0.66934713 0.69588898] | Average Accuracy: 0.6675977140612606


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Model: Lasso() | Accuracies: [0.71181908 0.64633595 0.61602769 0.66936144 0.69589993] | Average Accuracy: 0.6678888182963479
Model: RandomForestRegressor(random_state=1234) | Accuracies: [0.65432772 0.6342855  0.5360843  0.66345643 0.5785845 ] | Average Accuracy: 0.6133476921505242


## Modeling

In [311]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

## Model Evaluation on Training and Test Data Using R Squared Error

In [315]:
train_preds = regressor.predict(X_train)
train_acc = r2_score(y_train, train_preds)
perc_train_acc = round(train_acc*100, 2)
print(f"The accuracy of the model on the training data is {perc_train_acc}% \n")

test_preds = regressor.predict(X_test)
test_acc = r2_score(y_test, test_preds)
perc_test_acc = round(test_acc*100,2)
print(f"The accuracy of the model on the test data is {perc_test_acc}%")

The accuracy of the model on the training data is 67.57% 

The accuracy of the model on the test data is 71.18%


## Saving The Model

In [387]:
PATH = Path('model')
PATH.mkdir(parents=True, exist_ok=True)
MODEL_NAME = 'house_price_prediction_model.pickle'
MODEL_PATH = PATH / MODEL_NAME

with open(MODEL_PATH, 'wb') as f:
    pickle.dump(obj=regressor, file=f)

## Saving the Column Names

In [349]:
X.columns

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'yrs_since_last_renovation', 'basement', 'city_Auburn', 'city_Bellevue',
       'city_Bothell', 'city_Burien', 'city_Carnation', 'city_Clyde Hill',
       'city_Covington', 'city_Des Moines', 'city_Duvall', 'city_Enumclaw',
       'city_Federal Way', 'city_Issaquah', 'city_Kenmore', 'city_Kent',
       'city_Kirkland', 'city_Lake Forest Park', 'city_Maple Valley',
       'city_Mercer Island', 'city_Newcastle', 'city_Normandy Park',
       'city_North Bend', 'city_Redmond', 'city_Renton', 'city_Sammamish',
       'city_SeaTac', 'city_Seattle', 'city_Shoreline', 'city_Snoqualmie',
       'city_Tukwila', 'city_Vashon', 'city_Woodinville', 'city_others'],
      dtype='object')

In [357]:
columns = {
    'data_columns': [col.lower() for col in X.columns]
}
with open('columns.json', 'w') as f:
    f.write(json.dumps(columns))

## Building a Predictive System

In [344]:
def predict_price(
    n_bedrooms,
    n_bathrooms,
    sqft_living,
    sqft_lot,
    floors,
    yrs_since_last_renovation,
    basement,
    city
):
    """
        Predicts the price of a property given some features of the property.

        Args:
            n_bedrooms = Number of bedrooms in the property
            n_bathrooms = Number of bathroooms in the property
            sqft_living = Square feet of the living area
            sqft_lot = Sqaure feet of the lot
            floors = Number of floors on the property, Value of 1 if only the ground floor
            yrs_since_last_renovation = How many years since the property was last renovated(or built is no renovation), i.e. current_year - that year
            basement = Does the property have a basement?
            city = In which city is the property located

        Returns:
            Price prediction of the property.
    """

    if f"city_{city}" in X.columns:
        city_index = np.where(X.columns == f"city_{city}")[0][0]
    else:
        city_index = 38
    x = np.zeros(len(X.columns))
    x[0] = n_bedrooms
    x[1] = n_bathrooms
    x[2] = sqft_living
    x[3] = sqft_lot
    x[4] = floors
    x[5] = yrs_since_last_renovation
    if basement == 'Yes' or basement == 'yes':
        x[6] = 1
    x[city_index] = 1


    return int(regressor.predict([x])[0])

In [359]:
df9.sample(2)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,yrs_since_last_renovation,basement,city_Auburn,city_Bellevue,...,city_Renton,city_Sammamish,city_SeaTac,city_Seattle,city_Shoreline,city_Snoqualmie,city_Tukwila,city_Vashon,city_Woodinville,city_others
131,560000,3,2,1860,13374,1,39,0,0,0,...,0,0,0,0,0,0,0,0,0,0
250,259950,4,2,2030,9300,1,32,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [383]:
predict_price(3,2,1860,13374,1,39,'No','Issaquah'), df9.iloc[131, 0]

(559571, 560000)

In [385]:
predict_price(4,2,2030,9300,1,32,'No','Redmond'), df9.iloc[250,0]

(505809, 259950)

In [395]:
df9.sample(3)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,yrs_since_last_renovation,basement,city_Auburn,city_Bellevue,...,city_Renton,city_Sammamish,city_SeaTac,city_Seattle,city_Shoreline,city_Snoqualmie,city_Tukwila,city_Vashon,city_Woodinville,city_others
715,700000,3,1,1410,7200,2,123,0,0,0,...,0,0,0,1,0,0,0,0,0,0
272,305000,4,1,2100,9288,1,56,1,0,0,...,1,0,0,0,0,0,0,0,0,0
63,285000,3,1,1090,8640,1,51,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [403]:
df9.iloc[715, :].T

price                        700000
bedrooms                          3
bathrooms                         1
sqft_living                    1410
sqft_lot                       7200
floors                            2
yrs_since_last_renovation       123
basement                          0
city_Auburn                       0
city_Bellevue                     0
city_Bothell                      0
city_Burien                       0
city_Carnation                    0
city_Clyde Hill                   0
city_Covington                    0
city_Des Moines                   0
city_Duvall                       0
city_Enumclaw                     0
city_Federal Way                  0
city_Issaquah                     0
city_Kenmore                      0
city_Kent                         0
city_Kirkland                     0
city_Lake Forest Park             0
city_Maple Valley                 0
city_Mercer Island                0
city_Newcastle                    0
city_Normandy Park          

In [407]:
print(predict_price(3,1,1410,7200,2,123,'No','Seattle'))
print(predict_price(4,1,2100,9288,1,56,'Yes','Renton'))
print(predict_price(3,1,1090,8640,1,51,'No','Burien'))

466111
358265
195321


In [422]:
df9.floors.unique()

array([1, 2, 3])