<a href="https://colab.research.google.com/github/ujjwalbhatnagar/Analysis-on-Weather-Data/blob/main/House_Prices_Advanced_Regression_Techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import files
uploaded = files.upload()

Saving train.csv to train.csv


In [4]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [6]:
train_data = pd.read_csv("train.csv")
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [8]:
x_train, y_train = train_data.drop(['SalePrice','Id'], axis=1), train_data['SalePrice']

In [9]:
num_cols = [x for x in x_train.columns if train_data[x].dtype in ['int64','float64']]
cat_cols = [x for x in x_train.columns if train_data[x].dtype in ['object']]


In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [12]:
num_transform = Pipeline(steps=[('scale', StandardScaler(with_mean=False)),
                        ('impute', SimpleImputer(strategy='constant'))])

cat_transform = Pipeline(steps=[
                                ('impute', SimpleImputer(strategy='most_frequent')),
                                ('onehot', OneHotEncoder(handle_unknown='ignore')),
                                ('scale', MaxAbsScaler())])

column_trans = ColumnTransformer(transformers=[
                                               ('cat', cat_transform, cat_cols),
                                               ('num', num_transform, num_cols)
])

x_train = column_trans.fit_transform(x_train)

In [19]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1)

In [21]:
x_train, y_train = torch.from_numpy(x_train), torch.from_numpy(y_train)
x_test, y_test = torch.from_numpy(x_test), torch.from_numpy(y_test)

In [22]:
x_train.shape

torch.Size([1182, 288])

In [23]:
import torch.nn as nn
model = nn.Sequential(nn.Linear(288,512),
                      nn.Dropout(0.3),
                      nn.Linear(512,1))

In [24]:
from torch.utils import data
batch_size = 100
train_dataset = data.TensorDataset(x_train, y_train)
train_loader = data.DataLoader(train_dataset, batch_size, shuffle=True)
valid_dataset = data.TensorDataset(x_test, y_test)
valid_loader = data.DataLoader(valid_dataset, batch_size, shuffle=False)

In [25]:
def log_rmse(yhat, labels):
  clipped_preds = torch.clamp(yhat,1,float('inf'))
  rmse = torch.sqrt(criterion(torch.log(clipped_preds),
                              torch.log(labels)))
  return rmse.item()

In [27]:
import torch.optim as optim
import sys
from torch.utils import data
from tqdm import tqdm

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=5)
n_epochs = 200

train_loss_epoch = []
valid_loss_epoch = []

for epoch in range(n_epochs):
  model.train()
  loss_record = []
  train_pbar = tqdm(train_loader, file=sys.stdout)

  for x,y in train_pbar:
    x,y = x.float(), y.float()
    yhat = model(x)
    loss = criterion(yhat,y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    loss_record.append(log_rmse(yhat,y))
    train_pbar.set_description(f"Epoch [{epoch + 1}/{n_epochs}]")
    train_pbar.set_postfix({"loss ": loss_record[-1]})
    train_mean_loss = sum(loss_record) / len(loss_record)
    train_loss_epoch.append(train_mean_loss)

    model.eval()
    loss_record = []
    with torch.no_grad():
      for x,y in valid_loader:
        x,y = x.float(), y.float()
        yhat = model(x)
        loss = criterion(yhat,y)
        loss_record.append(log_rmse(yhat,y))

    valid_mean_loss = sum(loss_record) / len(loss_record)
    valid_loss_epoch.append(valid_mean_loss)
    print(f"Epoch [{epoch + 1}/{n_epochs}] Train Loss: {train_mean_loss:.3f} Valid Loss: {valid_mean_loss:.3f}")




Epoch [1/200]:   0%|          | 0/12 [00:00<?, ?it/s, loss =10.9]Epoch [1/200] Train Loss: 10.890 Valid Loss: 1.851
Epoch [1/200]:   8%|▊         | 1/12 [00:00<00:02,  4.19it/s, loss =1.84]Epoch [1/200] Train Loss: 1.847 Valid Loss: 11.955
Epoch [1/200]:   8%|▊         | 1/12 [00:00<00:02,  4.19it/s, loss =12.1]

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/200] Train Loss: 11.988 Valid Loss: 11.955
Epoch [1/200]:   8%|▊         | 1/12 [00:00<00:02,  4.19it/s, loss =12]  Epoch [1/200] Train Loss: 11.970 Valid Loss: 5.122
Epoch [1/200]:   8%|▊         | 1/12 [00:00<00:02,  4.19it/s, loss =4.98]Epoch [1/200] Train Loss: 5.075 Valid Loss: 4.648
Epoch [1/200]:  42%|████▏     | 5/12 [00:00<00:00, 17.27it/s, loss =4.61]Epoch [1/200] Train Loss: 4.635 Valid Loss: 3.480
Epoch [1/200]:  42%|████▏     | 5/12 [00:00<00:00, 17.27it/s, loss =3.35]Epoch [1/200] Train Loss: 3.435 Valid Loss: 11.955
Epoch [1/200]:  42%|████▏     | 5/12 [00:00<00:00, 17.27it/s, loss =12.1]Epoch [1/200] Train Loss: 11.990 Valid Loss: 11.955
Epoch [1/200]:  42%|████▏     | 5/12 [00:00<00:00, 17.27it/s, loss =12]  Epoch [1/200] Train Loss: 11.954 Valid Loss: 11.955
Epoch [1/200]:  42%|████▏     | 5/12 [00:00<00:00, 17.27it/s, loss =12]Epoch [1/200] Train Loss: 11.986 Valid Loss: 3.088
Epoch [1/200]:  83%|████████▎ | 10/12 [00:00<00:00, 25.70it/s, loss =3.02]Epoch [1

  return F.mse_loss(input, target, reduction=self.reduction)



Epoch [2/200]:   0%|          | 0/12 [00:00<?, ?it/s, loss =3.75]Epoch [2/200] Train Loss: 3.751 Valid Loss: 1.085
Epoch [2/200]:   0%|          | 0/12 [00:00<?, ?it/s, loss =1.05]Epoch [2/200] Train Loss: 1.073 Valid Loss: 11.955
Epoch [2/200]:   0%|          | 0/12 [00:00<?, ?it/s, loss =12]  Epoch [2/200] Train Loss: 11.966 Valid Loss: 11.955
Epoch [2/200]:   0%|          | 0/12 [00:00<?, ?it/s, loss =12]Epoch [2/200] Train Loss: 11.985 Valid Loss: 2.840
Epoch [2/200]:  33%|███▎      | 4/12 [00:00<00:00, 35.30it/s, loss =2.82]Epoch [2/200] Train Loss: 2.834 Valid Loss: 4.001
Epoch [2/200]:  33%|███▎      | 4/12 [00:00<00:00, 35.30it/s, loss =3.91]Epoch [2/200] Train Loss: 3.970 Valid Loss: 1.932
Epoch [2/200]:  33%|███▎      | 4/12 [00:00<00:00, 35.30it/s, loss =1.93]Epoch [2/200] Train Loss: 1.933 Valid Loss: 11.955
Epoch [2/200]:  33%|███▎      | 4/12 [00:00<00:00, 35.30it/s, loss =12.1]Epoch [2/200] Train Loss: 11.989 Valid Loss: 11.955
Epoch [2/200]:  33%|███▎      | 4/12 [00:0

In [30]:
from google.colab import files
uploaded = files.upload()
test_data = pd.read_csv("test.csv")
test_data

Saving test.csv to test (1).csv


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [31]:
x_test = column_trans.transform(test_data)
x_test = x_test.toarray()
x_test = torch.from_numpy(x_test).float()
with torch.no_grad():
  preds = model(x_test)

test_data['SalePrice'] = pd.Series(preds.reshape(1,-1)[0])
submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
submission.to_csv('submission.csv', index=False)