# King County house price prediction. The dataset contains the details of houses sold in King County, USA, in 2014 and 2015. From the given features, predict the price of a house.

In [1]:
# Import required libraries

import pandas as pd
import numpy as np
import warnings
# NOTE(review): with no category or module given, this filter silences every
# warning for the rest of the notebook (including pandas' SettingWithCopyWarning
# and library deprecation warnings) — consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
# Load the King County house-sales CSV into a DataFrame and preview it
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where the file exists at exactly this location.
kc_house_csv = r"C:\Users\udayk\Downloads\housesalesprediction\kc_house_data.csv"
house_df = pd.read_csv(kc_house_csv)
house_df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [3]:
# Determine the size of the data as (number of rows, number of columns)
house_df.shape

(21613, 21)

In [4]:
# List the names of all columns available in the raw data
house_df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [5]:
# Summarize every column: its non-null count and dtype
# (useful for spotting missing values and non-numeric columns such as 'date')
house_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

# Feature Engineering

In [6]:
# Select the target ('price') together with the features used for modelling.
# .copy() makes house_data_df an independent DataFrame rather than a slice of
# house_df, so the column assignments in the following cell write to it
# directly instead of triggering pandas' SettingWithCopyWarning.
selected_columns = [
    "price", "date", "bedrooms",
    "bathrooms", "sqft_living", "floors",
    "waterfront", "view", "condition", "grade", "zipcode",
]
house_data_df = house_df[selected_columns].copy()

In [7]:
# Extract the sale year and month from the date strings, which look like
# '20141013T000000': characters 0-3 are the year, characters 4-5 the month.
# Both are kept as strings; they are treated as categories later on.
# assign() returns a new DataFrame instead of writing into a frame that was
# sliced out of house_df, which avoids chained-assignment problems.
house_data_df = house_data_df.assign(
    year=house_data_df["date"].str[0:4],
    month=house_data_df["date"].str[4:6],
)


In [8]:
# Preview the frame now that the 'year' and 'month' columns have been added
house_data_df.head()

Unnamed: 0,price,date,bedrooms,bathrooms,sqft_living,floors,waterfront,view,condition,grade,zipcode,year,month
0,221900.0,20141013T000000,3,1.0,1180,1.0,0,0,3,7,98178,2014,10
1,538000.0,20141209T000000,3,2.25,2570,2.0,0,0,3,7,98125,2014,12
2,180000.0,20150225T000000,2,1.0,770,1.0,0,0,3,6,98028,2015,2
3,604000.0,20141209T000000,4,3.0,1960,1.0,0,0,5,7,98136,2014,12
4,510000.0,20150218T000000,3,2.0,1680,1.0,0,0,3,8,98074,2015,2


In [9]:

# The raw 'date' string is no longer needed once year and month are extracted
redundant_column = "date"
house_data_df = house_data_df.drop(columns=redundant_column)

In [10]:
# Preview the frame again to confirm the 'date' column has been removed
house_data_df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,floors,waterfront,view,condition,grade,zipcode,year,month
0,221900.0,3,1.0,1180,1.0,0,0,3,7,98178,2014,10
1,538000.0,3,2.25,2570,2.0,0,0,3,7,98125,2014,12
2,180000.0,2,1.0,770,1.0,0,0,3,6,98028,2015,2
3,604000.0,4,3.0,1960,1.0,0,0,5,7,98136,2014,12
4,510000.0,3,2.0,1680,1.0,0,0,3,8,98074,2015,2


In [11]:
# One-hot encode the categorical columns: each listed column is replaced by
# one indicator column per distinct value it takes.
cat_features = [
    "waterfront",
    "view",
    "condition",
    "grade",
    "year",
    "month",
    "zipcode",
]
house_data_df = pd.get_dummies(data=house_data_df, columns=cat_features)


In [12]:
# Normalizing the continuous numerical features
# The features with continuous numerical values are standardized (zero mean,
# unit variance) so that each of them spans a common range of values.
from sklearn.preprocessing import StandardScaler

# Single list of the columns being scaled. 'price' is listed first, so it is
# column 0 of the scaled array and of the scaler's fitted statistics.
numeric_columns = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'floors']

# NOTE(review): the scaler is fitted on every row, before any train/test split,
# and on the target 'price' as well. Test-set statistics therefore influence
# the preprocessing, and downstream error metrics are in standardized units.
scaler = StandardScaler()
# fit_transform learns each column's mean and std deviation, then scales the
# columns to a common range in a single call; `scaler` stays fitted for reuse.
house_data_normalized = scaler.fit_transform(house_data_df[numeric_columns])
house_data_normalized


array([[-0.86671733, -0.39873715, -1.44746357, -0.97983502, -0.915427  ],
       [-0.00568792, -0.39873715,  0.1756067 ,  0.53363434,  0.93650577],
       [-0.98084935, -1.47395936, -1.44746357, -1.42625404, -0.915427  ],
       ...,
       [-0.37586519, -1.47395936, -1.77207762, -1.15404732,  0.93650577],
       [-0.38158814, -0.39873715,  0.50022075, -0.52252773,  0.93650577],
       [-0.58588173, -1.47395936, -1.77207762, -1.15404732,  0.93650577]])

In [13]:
# Replacing the numerical columns with their normalized values

# Columns contained in house_data_normalized, in the order they were scaled
scaled_columns = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'floors']

# Reuse house_data_df's index for the scaled values. DataFrame.join aligns rows
# by index label, so a fresh 0..n-1 index would misalign rows (or introduce
# NaNs) whenever house_data_df does not have a default index, e.g. after rows
# have been filtered out.
house_data_df_normalized = pd.DataFrame(
    house_data_normalized,
    columns=scaled_columns,
    index=house_data_df.index,
)
# Append all remaining (one-hot encoded) columns unchanged
house_data_df_normalized = house_data_df_normalized.join(
    house_data_df.drop(columns=scaled_columns)
)

# Selecting the features and the target

In [14]:
# Split the normalized frame into the target (Y) and the feature matrix (X)

target_column = 'price'
Y = house_data_df_normalized[target_column]
# Everything except the target is used as a feature
X = house_data_df_normalized.drop(columns=target_column)
print(X.shape)


(21613, 112)


In [15]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
(x_train.shape, x_test.shape)

((17290, 112), (4323, 112))

In [16]:
# Fit a linear regression model and report its R-squared on both splits
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training are chained
model = LinearRegression().fit(x_train, y_train)

# For a regressor, score() is the coefficient of determination (R-squared)
train_score = model.score(x_train, y_train)
test_score = model.score(x_test, y_test)
for label, value in (
    ('Train score (R-squared): ', train_score),
    (' Test score (R-squared): ', test_score),
):
    print(label, value)

Train score (R-squared):  0.8329720155926127
 Test score (R-squared):  0.8343019007915471


# Evaluating the model performance using RMSE

In [17]:
# Evaluating the model performance using RMSE

from sklearn.metrics import mean_squared_error

# Root mean square error (RMSE) on the train data
train_predictions = model.predict(x_train)
train_RMSE = mean_squared_error(y_train, train_predictions) ** 0.5
# The original, misspelled name is kept so any later cell using it still works
train_RSME = train_RMSE
# Root mean square error (RMSE) on the test data
test_predictions = model.predict(x_test)
test_RMSE = mean_squared_error(y_test, test_predictions) ** 0.5

print(' Train RMSE :', train_RMSE)
print(' Test RMSE :', test_RMSE)

# The target was standardized together with the numeric features, so the
# values above are in units of price standard deviations. 'price' was the first
# column given to the scaler, so scale_[0] is its standard deviation;
# multiplying by it expresses the error in the original price units.
price_std = scaler.scale_[0]
print(' Train RMSE (original price units) :', train_RMSE * price_std)
print(' Test RMSE (original price units) :', test_RMSE * price_std)

 Train RMSE : 0.4023838230912359
 Test RMSE : 0.4311165555539107
