# Advance House Price Prediction Using Pytorch

In [1]:
# Lets import torch
import torch

In [2]:
# Version check
torch.__version__

'1.5.1+cpu'

### Loading the dataset

In [3]:
import pandas as pd
df=pd.read_csv('houseprice.csv',usecols=["SalePrice", "MSSubClass", "MSZoning", "LotFrontage", "LotArea",
                                         "Street", "YearBuilt", "LotShape", "1stFlrSF", "2ndFlrSF"]).dropna()

In [4]:
# Lets see the first five rows
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,YearBuilt,1stFlrSF,2ndFlrSF,SalePrice
0,60,RL,65.0,8450,Pave,Reg,2003,856,854,208500
1,20,RL,80.0,9600,Pave,Reg,1976,1262,0,181500
2,60,RL,68.0,11250,Pave,IR1,2001,920,866,223500
3,70,RL,60.0,9550,Pave,IR1,1915,961,756,140000
4,60,RL,84.0,14260,Pave,IR1,2000,1145,1053,250000


In [5]:
# Lets check some informations about the features
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1201 entries, 0 to 1459
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MSSubClass   1201 non-null   int64  
 1   MSZoning     1201 non-null   object 
 2   LotFrontage  1201 non-null   float64
 3   LotArea      1201 non-null   int64  
 4   Street       1201 non-null   object 
 5   LotShape     1201 non-null   object 
 6   YearBuilt    1201 non-null   int64  
 7   1stFlrSF     1201 non-null   int64  
 8   2ndFlrSF     1201 non-null   int64  
 9   SalePrice    1201 non-null   int64  
dtypes: float64(1), int64(6), object(3)
memory usage: 103.2+ KB


### Feature Engineering

In [6]:
# Lets see the unique values of all features
for i in df.columns:
    print("Column name is {} and unique values are {}".format(i, len(df[i].unique())))

Column name is MSSubClass and unique values are 15
Column name is MSZoning and unique values are 5
Column name is LotFrontage and unique values are 110
Column name is LotArea and unique values are 869
Column name is Street and unique values are 2
Column name is LotShape and unique values are 4
Column name is YearBuilt and unique values are 112
Column name is 1stFlrSF and unique values are 678
Column name is 2ndFlrSF and unique values are 368
Column name is SalePrice and unique values are 597


In [7]:
# Lets handle the Year column
import datetime
datetime.datetime.now().year

2021

In [8]:
df["Total Years"] = datetime.datetime.now().year - df["YearBuilt"]

In [9]:
# Lets drop the YearBuilt feature as it is not that important now
df.drop("YearBuilt", axis = 1, inplace = True)

In [10]:
# Lets see the features now
df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', '1stFlrSF', '2ndFlrSF', 'SalePrice', 'Total Years'],
      dtype='object')

In [11]:
# Lets seperate the categorical features and target feature
cat_features = ["MSSubClass", "MSZoning", "Street", "LotShape"]
out_feature = "SalePrice"

In [12]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder
lbl_encoders = {}
for feature in cat_features:
    lbl_encoders[feature] = LabelEncoder()
    df[feature] = lbl_encoders[feature].fit_transform(df[feature])

In [13]:
# Lets see the data
df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,1stFlrSF,2ndFlrSF,SalePrice,Total Years
0,5,3,65.0,8450,1,3,856,854,208500,18
1,0,3,80.0,9600,1,3,1262,0,181500,45
2,5,3,68.0,11250,1,0,920,866,223500,20
3,6,3,60.0,9550,1,0,961,756,140000,106
4,5,3,84.0,14260,1,0,1145,1053,250000,21
...,...,...,...,...,...,...,...,...,...,...
1455,5,3,62.0,7917,1,3,953,694,175000,22
1456,0,3,85.0,13175,1,3,2073,0,210000,43
1457,6,3,66.0,9042,1,3,1188,1152,266500,80
1458,0,3,68.0,9717,1,3,1078,0,142125,71


In [14]:
# Stacking and converting into tensors
import numpy as np
cat_features = np.stack([df["MSSubClass"], df["MSZoning"], df["Street"], df["LotShape"]], 1)
cat_features

array([[5, 3, 1, 3],
       [0, 3, 1, 3],
       [5, 3, 1, 0],
       ...,
       [6, 3, 1, 3],
       [0, 3, 1, 3],
       [0, 3, 1, 3]], dtype=int64)

In [15]:
# Converting numpy to tensors
cat_features = torch.tensor(cat_features, dtype = torch.int64)
cat_features

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        ...,
        [6, 3, 1, 3],
        [0, 3, 1, 3],
        [0, 3, 1, 3]])

In [16]:
# Lets seperate the continuous features
cont_features = []
for i in df.columns:
    if i in ["MSSubClass", "MSZoning", "Street", "LotShape", "SalePrice"]:
        pass
    else:
        cont_features.append(i)

In [17]:
# Lets check the continuous features
cont_features

['LotFrontage', 'LotArea', '1stFlrSF', '2ndFlrSF', 'Total Years']

In [18]:
# Stacking continuous features into tensor
cont_values = np.stack([df[i].values for i in cont_features], axis = 1)
cont_values = torch.tensor(cont_values, dtype = torch.float)
cont_values

tensor([[   65.,  8450.,   856.,   854.,    18.],
        [   80.,  9600.,  1262.,     0.,    45.],
        [   68., 11250.,   920.,   866.,    20.],
        ...,
        [   66.,  9042.,  1188.,  1152.,    80.],
        [   68.,  9717.,  1078.,     0.,    71.],
        [   75.,  9937.,  1256.,     0.,    56.]])

In [19]:
# Lets check the datatype
cont_values.dtype

torch.float32

In [20]:
# Lets convert the target feature into tensor
y = torch.tensor(df["SalePrice"].values, dtype = torch.float).reshape(-1,1)
y

tensor([[208500.],
        [181500.],
        [223500.],
        ...,
        [266500.],
        [142125.],
        [147500.]])

In [21]:
# Lets see the information of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1201 entries, 0 to 1459
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MSSubClass   1201 non-null   int64  
 1   MSZoning     1201 non-null   int32  
 2   LotFrontage  1201 non-null   float64
 3   LotArea      1201 non-null   int64  
 4   Street       1201 non-null   int32  
 5   LotShape     1201 non-null   int32  
 6   1stFlrSF     1201 non-null   int64  
 7   2ndFlrSF     1201 non-null   int64  
 8   SalePrice    1201 non-null   int64  
 9   Total Years  1201 non-null   int64  
dtypes: float64(1), int32(3), int64(6)
memory usage: 89.1 KB


In [22]:
# Lets see the shapes of the categorical, continuous and the target features
cat_features.shape, cont_values.shape, y.shape

(torch.Size([1201, 4]), torch.Size([1201, 5]), torch.Size([1201, 1]))

### Embedding size for categorical columns

In [23]:
# Lets see the unique values of each categorical feature
cat_dims = [len(df[col].unique()) for col in ["MSSubClass", "MSZoning", "Street", "LotShape"]]
cat_dims

[15, 5, 2, 4]

In [24]:
# Lets select the input and the output dimensions

embedding_dim = [(x, min(50, (x + 1) // 2)) for x in cat_dims]
embedding_dim

[(15, 8), (5, 3), (2, 1), (4, 2)]

In [25]:
# Lets import the torch libraries
import torch.nn as nn
import torch.nn.functional as F
embed_representation=nn.ModuleList([nn.Embedding(inp,out) for inp,out in embedding_dim])
embed_representation

ModuleList(
  (0): Embedding(15, 8)
  (1): Embedding(5, 3)
  (2): Embedding(2, 1)
  (3): Embedding(4, 2)
)

In [26]:
# Categorical features
cat_features

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        ...,
        [6, 3, 1, 3],
        [0, 3, 1, 3],
        [0, 3, 1, 3]])

In [27]:
# Lets see the first four rows
catfeatures = cat_features[:4]
catfeatures

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        [6, 3, 1, 0]])

In [28]:
# Embedding
pd.set_option('display.max_rows', 500)
embedding_val=[]
for i,e in enumerate(embed_representation):
    embedding_val.append(e(cat_features[:,i]))

In [29]:
# Lets see the embedding_val
embedding_val

[tensor([[-1.5918, -0.0265,  0.8832,  ..., -0.5375, -0.9647, -0.4923],
         [-0.6486,  3.8315,  0.1318,  ...,  0.7655,  0.1837,  0.1132],
         [-1.5918, -0.0265,  0.8832,  ..., -0.5375, -0.9647, -0.4923],
         ...,
         [ 0.7877, -0.8106,  0.5398,  ...,  1.5288,  0.9763, -0.2729],
         [-0.6486,  3.8315,  0.1318,  ...,  0.7655,  0.1837,  0.1132],
         [-0.6486,  3.8315,  0.1318,  ...,  0.7655,  0.1837,  0.1132]],
        grad_fn=<EmbeddingBackward>),
 tensor([[-1.0007, -1.1900, -0.7499],
         [-1.0007, -1.1900, -0.7499],
         [-1.0007, -1.1900, -0.7499],
         ...,
         [-1.0007, -1.1900, -0.7499],
         [-1.0007, -1.1900, -0.7499],
         [-1.0007, -1.1900, -0.7499]], grad_fn=<EmbeddingBackward>),
 tensor([[0.2276],
         [0.2276],
         [0.2276],
         ...,
         [0.2276],
         [0.2276],
         [0.2276]], grad_fn=<EmbeddingBackward>),
 tensor([[ 0.0814,  0.1626],
         [ 0.0814,  0.1626],
         [-0.2939, -0.1433],
  

In [30]:
# Lets stack them column wise
z = torch.cat(embedding_val, axis = 1)
z

tensor([[-1.5918, -0.0265,  0.8832,  ...,  0.2276,  0.0814,  0.1626],
        [-0.6486,  3.8315,  0.1318,  ...,  0.2276,  0.0814,  0.1626],
        [-1.5918, -0.0265,  0.8832,  ...,  0.2276, -0.2939, -0.1433],
        ...,
        [ 0.7877, -0.8106,  0.5398,  ...,  0.2276,  0.0814,  0.1626],
        [-0.6486,  3.8315,  0.1318,  ...,  0.2276,  0.0814,  0.1626],
        [-0.6486,  3.8315,  0.1318,  ...,  0.2276,  0.0814,  0.1626]],
       grad_fn=<CatBackward>)

In [31]:
# Lets implement dropout
dropout = nn.Dropout(0.4)

In [32]:
# Lets add the dropout to z
final_embed = dropout(z)
final_embed

tensor([[-2.6529, -0.0441,  0.0000,  ...,  0.3793,  0.0000,  0.2711],
        [-1.0810,  0.0000,  0.2196,  ...,  0.0000,  0.1357,  0.2711],
        [-0.0000, -0.0000,  1.4721,  ...,  0.0000, -0.0000, -0.2389],
        ...,
        [ 1.3128, -1.3510,  0.0000,  ...,  0.3793,  0.0000,  0.0000],
        [-0.0000,  6.3858,  0.0000,  ...,  0.0000,  0.0000,  0.2711],
        [-1.0810,  6.3858,  0.0000,  ...,  0.3793,  0.1357,  0.0000]],
       grad_fn=<MulBackward0>)

### Creating a Feed Forward Neural Network

In [33]:
class FeedForwardNN(nn.Module):
    
    def __init__(self, embedding_dim, n_cont, out_sz, layers, p = 0.5 ):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(inp,out) for inp,out in embedding_dim])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        
        layerlist = []
        n_emb = sum((out for inp,out in embedding_dim))
        n_in = n_emb + n_cont
        
        for i in layers:
            layerlist.append(nn.Linear(n_in,i))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(p))
            n_in = i
            
        layerlist.append(nn.Linear(layers[-1],out_sz))
        
        self.layers = nn.Sequential(*layerlist)
        
        
    def forward(self, x_cat, x_cont):
        embeddings = []
        for i,e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:,i]))
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)
        
        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        x = self.layers(x)
        return x

In [34]:
torch.manual_seed(100)
model = FeedForwardNN(embedding_dim,len(cont_features),1,[100,50],p=.4)

In [35]:
# Lets see the model
model

FeedForwardNN(
  (embeds): ModuleList(
    (0): Embedding(15, 8)
    (1): Embedding(5, 3)
    (2): Embedding(2, 1)
    (3): Embedding(4, 2)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (bn_cont): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=19, out_features=100, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=100, out_features=50, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=50, out_features=1, bias=True)
  )
)

In [36]:
# Lets define loss and optimizer
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)

In [37]:
# train test split
batch_size = 1200
test_size = int(batch_size * 0.15)
train_categorical = cat_features[: batch_size - test_size]
test_categorical = cat_features[batch_size - test_size : batch_size]
train_cont = cont_values[: batch_size - test_size]
test_cont = cont_values[batch_size - test_size : batch_size]
y_train = y[: batch_size - test_size]
y_test = y[batch_size - test_size :batch_size]

In [38]:
# Lets see the lengths of the above variables
len(train_categorical), len(test_categorical), len(train_cont), len(test_cont), len(y_train), len(y_test)

(1020, 180, 1020, 180, 1020, 180)

In [39]:
# Model training
epochs = 3000
final_losses = []
for i in range(epochs):
    i = i + 1
    y_pred = model(train_categorical, train_cont)
    loss = torch.sqrt(loss_function(y_pred, y_train))
    final_losses.append(loss)
    if i%10 == 1:
        print("Epoch number: {} and the Loss: {}".format(i,loss.item()))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Epoch number: 1 and the Loss: 200496.78125
Epoch number: 11 and the Loss: 200496.375
Epoch number: 21 and the Loss: 200496.03125
Epoch number: 31 and the Loss: 200495.75
Epoch number: 41 and the Loss: 200495.515625
Epoch number: 51 and the Loss: 200495.125
Epoch number: 61 and the Loss: 200494.8125
Epoch number: 71 and the Loss: 200494.515625
Epoch number: 81 and the Loss: 200494.21875
Epoch number: 91 and the Loss: 200493.875
Epoch number: 101 and the Loss: 200493.484375
Epoch number: 111 and the Loss: 200493.03125
Epoch number: 121 and the Loss: 200492.78125
Epoch number: 131 and the Loss: 200492.390625
Epoch number: 141 and the Loss: 200491.953125
Epoch number: 151 and the Loss: 200491.546875
Epoch number: 161 and the Loss: 200491.046875
Epoch number: 171 and the Loss: 200490.5625
Epoch number: 181 and the Loss: 200490.109375
Epoch number: 191 and the Loss: 200489.671875
Epoch number: 201 and the Loss: 200489.046875
Epoch number: 211 and the Loss: 200488.640625
Epoch number: 221 and

Epoch number: 1811 and the Loss: 199998.984375
Epoch number: 1821 and the Loss: 199990.609375
Epoch number: 1831 and the Loss: 199989.578125
Epoch number: 1841 and the Loss: 199982.90625
Epoch number: 1851 and the Loss: 199968.375
Epoch number: 1861 and the Loss: 199971.453125
Epoch number: 1871 and the Loss: 199961.796875
Epoch number: 1881 and the Loss: 199952.453125
Epoch number: 1891 and the Loss: 199942.125
Epoch number: 1901 and the Loss: 199942.75
Epoch number: 1911 and the Loss: 199944.3125
Epoch number: 1921 and the Loss: 199932.234375
Epoch number: 1931 and the Loss: 199924.90625
Epoch number: 1941 and the Loss: 199915.140625
Epoch number: 1951 and the Loss: 199913.6875
Epoch number: 1961 and the Loss: 199907.265625
Epoch number: 1971 and the Loss: 199899.484375
Epoch number: 1981 and the Loss: 199901.078125
Epoch number: 1991 and the Loss: 199885.0625
Epoch number: 2001 and the Loss: 199881.703125
Epoch number: 2011 and the Loss: 199868.4375
Epoch number: 2021 and the Loss: 

In [42]:
# Validating the test data
y_pred = ""
with torch.no_grad():
    y_pred = model(test_categorical, test_cont)
    loss = torch.sqrt(loss_function(y_pred,y_test))
print("RMSE: {}".format(loss))

RMSE: 189606.5


In [43]:
data_verify=pd.DataFrame(y_test.tolist(),columns=["Test"])

In [44]:
data_predicted=pd.DataFrame(y_pred.tolist(),columns=["Prediction"])
data_predicted

Unnamed: 0,Prediction
0,808.03125
1,1196.393066
2,511.113098
3,2467.094727
4,995.690613
5,1732.424561
6,1976.099121
7,2642.55127
8,392.039642
9,4648.277832


In [45]:
# Output
final_output=pd.concat([data_verify,data_predicted],axis=1)
final_output['Difference']=final_output['Test']-final_output['Prediction']
final_output.head()

Unnamed: 0,Test,Prediction,Difference
0,130000.0,808.03125,129191.96875
1,138887.0,1196.393066,137690.606934
2,175500.0,511.113098,174988.886902
3,195000.0,2467.094727,192532.905273
4,142500.0,995.690613,141504.309387


### Saving The Model

In [46]:
torch.save(model,'HousePrice.pt')

  "type " + obj.__name__ + ". It won't be checked "


In [47]:
torch.save(model.state_dict(),'HouseWeights.pt')

### Loading the saved model

In [48]:
embs_size=[(15, 8), (5, 3), (2, 1), (4, 2)]
model1=FeedForwardNN(embs_size,5,1,[100,50],p=0.4)

In [49]:
model1.load_state_dict(torch.load('HouseWeights.pt'))

<All keys matched successfully>

In [50]:
model1.eval()

FeedForwardNN(
  (embeds): ModuleList(
    (0): Embedding(15, 8)
    (1): Embedding(5, 3)
    (2): Embedding(2, 1)
    (3): Embedding(4, 2)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (bn_cont): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=19, out_features=100, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=100, out_features=50, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=50, out_features=1, bias=True)
  )
)