In [30]:
# Importing basic libraries 

import numpy as np
import pandas as pd

# Importing visualization libraries

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# splitting the data

from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

# Importing the models
from sklearn.ensemble import GradientBoostingRegressor


# Importing the metrics
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

# Importing pickle file
import pickle

import warnings

# Silence every DeprecationWarning for the rest of the session
warnings.simplefilter("ignore", category=DeprecationWarning)

In [2]:
# Read the house-price CSV (relative path, one directory up) and preview the first four rows

df = pd.read_csv(filepath_or_buffer='../HousePricePredictiondataset.csv')
df.head(n=4)

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,02-05-2014 00:00,313000.0,3,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,02-05-2014 00:00,2384000.0,5,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,02-05-2014 00:00,342000.0,3,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,02-05-2014 00:00,420000.0,3,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA


In [3]:
# Checking the shape of dataset: (number of rows, number of columns)

df.shape

(4600, 18)

In [4]:
# Looking at the datatypes and the non-null count of every column

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   int64  
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [5]:
# Count the missing (null) values in each column

df.isna().sum()

date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
street           0
city             0
statezip         0
country          0
dtype: int64

In [6]:
# Checking for duplicate values in the data: number of rows that repeat another row in full

df.duplicated().sum()

0

In [7]:
# Number of unique values in all columns (a column with a single unique value carries no information)

df.nunique()

date               70
price            1741
bedrooms           10
bathrooms          26
sqft_living       566
sqft_lot         3113
floors              6
waterfront          2
view                5
condition           5
sqft_above        511
sqft_basement     207
yr_built          115
yr_renovated       60
street           4525
city               44
statezip           77
country             1
dtype: int64

In [8]:
# Remove the columns that are not used as model features

df.drop(columns=["street", "city", "country", "date"], inplace=True)

In [9]:
# Turn "statezip" (e.g. "WA 98133") into a numeric ZIP code.
# The digit run is extracted with a vectorized string operation, so the result
# does not depend on the state prefix being exactly two characters long or on
# int() tolerating the leading space that a plain [2:] slice leaves behind.
# astype(str) first keeps the cell re-runnable once the column is already numeric.

df["statezip"] = (
    df["statezip"]
    .astype(str)
    .str.extract(r"(\d+)", expand=False)
    .astype(np.int64)
)

In [10]:
# Remove the rows whose price is 0 (the later np.log(price) would turn them into -inf).
# Rows are selected by index label: np.where() returns positions, which only
# coincide with the labels df.drop() expects while the index is an untouched RangeIndex.
df.drop(index=df.index[df["price"] == 0], inplace=True)

In [11]:
# Reduce the skew of sqft_living with a natural-log transform

skew_before = df["sqft_living"].skew()
print("Before transformation skew : ", skew_before)

df["sqft_living"] = df["sqft_living"].pipe(np.log)

skew_after = df["sqft_living"].skew()
print("After transformation skew : ", skew_after)

Before transformation skew :  1.7188875411513491
After transformation skew :  -0.05370929015223324


In [12]:
# Reduce the skew of sqft_basement with a cube-root transform (defined at 0, unlike log)

skew_before = df["sqft_basement"].skew()
print("Before transformation skew : ", skew_before)

df["sqft_basement"] = df["sqft_basement"].pipe(np.cbrt)

skew_after = df["sqft_basement"].skew()
print("After transformation skew : ", skew_after)

Before transformation skew :  1.6550756371892148
After transformation skew :  0.5650940779006431


In [13]:
# Reduce the skew of sqft_lot with a natural-log transform

skew_before = df["sqft_lot"].skew()
print("Before transformation skew : ", skew_before)

df["sqft_lot"] = df["sqft_lot"].pipe(np.log)

skew_after = df["sqft_lot"].skew()
print("After transformation skew : ", skew_after)

Before transformation skew :  11.329014386009023
After transformation skew :  0.8412624258021195


In [14]:
# Reduce the skew of sqft_above with a natural-log transform

skew_before = df["sqft_above"].skew()
print("Before transformation skew : ", skew_before)

df["sqft_above"] = df["sqft_above"].pipe(np.log)

skew_after = df["sqft_above"].skew()
print("After transformation skew : ", skew_after)

Before transformation skew :  1.4539329178421054
After transformation skew :  0.23924533231027634


In [15]:
# Reduce the skew of the target (price) with a natural-log transform;
# from here on price, and every metric computed on it, is in log units
skew_before = df["price"].skew()
print("Before transformation skew : ", skew_before)

df["price"] = df["price"].pipe(np.log)

skew_after = df["price"].skew()
print("After transformation skew : ", skew_after)

Before transformation skew :  25.02381726204781
After transformation skew :  0.32998138379856273


In [16]:
# Reduce the skew of view with a cube-root transform (defined at 0, unlike log)

skew_before = df["view"].skew()
print("Before transformation skew : ", skew_before)

df["view"] = df["view"].pipe(np.cbrt)

skew_after = df["view"].skew()
print("After transformation skew : ", skew_after)

Before transformation skew :  3.3733978634952058
After transformation skew :  2.7925334595507962


In [17]:
# Drop sqft_above: it is the column most highly correlated with sqft_living (0.86 corr)

df.drop(columns="sqft_above", inplace=True)

In [18]:
# Separate the target column (price) from the feature columns

Y = df["price"]
X = df.drop(columns="price")

print("Shape of x : ", X.shape)
print("Shape of y : ", Y.shape)

Shape of x :  (4551, 12)
Shape of y :  (4551,)


In [19]:
# Standardize every feature column of X (zero mean, unit variance)

scaler = StandardScaler()
Xscaled = scaler.fit(X).transform(X)

In [20]:
# Splitting the data in training and testing.
# The split is made on the unscaled features and the scaler is then fitted on
# the training rows only, so that test-set statistics do not leak into the
# preprocessing (nor into the scaler object that is pickled later on).
# random_state and test_size are unchanged, so the row partition is the same.

Xtrain_raw, Xtest_raw, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

scaler = StandardScaler().fit(Xtrain_raw)
Xtrain = scaler.transform(Xtrain_raw)
Xtest = scaler.transform(Xtest_raw)

In [21]:
# Baseline model: gradient boosting with default hyperparameters and a fixed seed

gradient = GradientBoostingRegressor(random_state=42)
gradient.fit(X=Xtrain, y=Ytrain)

GradientBoostingRegressor(random_state=42)

In [22]:
# Baseline predictions for the held-out test rows
Ypred = gradient.predict(X=Xtest)

In [23]:
# Baseline metrics (price was log-transformed earlier, so the errors are in log units)

test_mse = mean_squared_error(Ytest, Ypred)
train_pred = gradient.predict(Xtrain)

print("mse : ", test_mse)
print("rmse : ", np.sqrt(test_mse))
print("mae : ", mean_absolute_error(Ytest, Ypred))
print("train r2score : ", r2_score(Ytrain, train_pred))
print("test r2score : ", r2_score(Ytest, Ypred))

mse :  0.09481253901538607
rmse :  0.30791644810790164
mae :  0.21513970326733928
train r2score :  0.7817068398857432
test r2score :  0.6763250485979948


In [24]:
# Search space sampled by the randomized hyperparameter search

parameter = dict(
    learning_rate=[0.05, 0.08, 0.1, 0.04],
    n_estimators=[100, 200, 300, 400],
    min_samples_split=[4, 6, 8, 9, 11],
    min_samples_leaf=[3, 5, 7, 9],
    max_depth=[5, 7, 9, 11],
)

In [25]:
# Hyperparameter tuning: randomized search over `parameter` with 5-fold cross-validation.
# The estimator is seeded like the baseline model; with only the search seeded,
# the boosted trees themselves were still unseeded and re-runs could differ.
# NOTE: the result keeps the name `random` because later cells reference it.

gradient = GradientBoostingRegressor(random_state=42)
random = RandomizedSearchCV(gradient, parameter, cv=5, random_state=42)
random.fit(Xtrain, Ytrain)

# Best sampled combination and its mean cross-validated score
best_parameters = random.best_params_
best_score = random.best_score_
print(best_parameters)
print(best_score)

{'n_estimators': 300, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_depth': 5, 'learning_rate': 0.08}
0.755163817721155


In [26]:
# Test-set metrics for the tuned search object (same log-price units as the baseline metrics)
Ypred = random.predict(Xtest)
tuned_mse = mean_squared_error(Ytest, Ypred)

print("mse : ", tuned_mse)
print("rmse : ", np.sqrt(tuned_mse))
print("mae : ", mean_absolute_error(Ytest, Ypred))
print("test r2score : ", r2_score(Ytest, Ypred))

mse :  0.0747733149863314
rmse :  0.2734470972351533
mae :  0.17726771679331563
test r2score :  0.7447357770849262


In [31]:
# Creating the pickle files: dump the fitted scaler and the tuned search object.
# The directory is written as a raw string; the previous non-raw literal only
# worked because its backslash sequences happen to be invalid escapes that
# Python passes through unchanged (and warns about in newer versions).
# The `with` blocks make sure each file is flushed and closed after the dump.

model_dir = r"G:\STUDY MATERIAL\THEORY\Internship\Assignment - 8\House price prediction\Model"

with open(model_dir + r"\scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

with open(model_dir + r"\random.pkl", "wb") as model_file:
    pickle.dump(random, model_file)


In [33]:
# Feature names, in the column order of X
X.columns

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'sqft_basement', 'yr_built',
       'yr_renovated', 'statezip'],
      dtype='object')