# Model Building

In [1]:
import os
import requests
import zipfile
import tarfile
import hashlib
from sklearn.preprocessing import StandardScaler
#from google.colab import  drive
#drive.mount("/drive", force_remount=True)
#@save
# Registry of downloadable datasets: maps a dataset name to a
# (url, sha1_hash) tuple, which is how `download` unpacks each entry.
DATA_HUB = dict()
# Base URL that the dataset file names registered below are appended to.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'

from d2l import torch as d2l
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('Solarize_Light2')
%matplotlib inline

## Downloading data

In [2]:
def download(name, cache_dir=os.path.join('..', 'data')):  #@save
    """Download a file registered in DATA_HUB and return the local filename.

    The file is stored in ``cache_dir`` under the last path component of its
    URL. If a file with that name already exists and its SHA-1 digest matches
    the one registered in DATA_HUB, the cached copy is reused and nothing is
    downloaded.

    Args:
        name: Key into ``DATA_HUB``; the entry is a ``(url, sha1_hash)`` tuple.
        cache_dir: Directory for downloaded files; created if missing.

    Returns:
        Path of the local file.

    Raises:
        AssertionError: If ``name`` is not registered in ``DATA_HUB``.
        requests.HTTPError: If the server answers with an error status.
    """
    assert name in DATA_HUB, f"{name} does not exist in {DATA_HUB}."
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    chunk_size = 1048576  # 1 MiB, used for both hashing and downloading
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            # Hash in chunks so large files are never fully held in memory.
            for chunk in iter(lambda: f.read(chunk_size), b''):
                sha1.update(chunk)
        if sha1.hexdigest() == sha1_hash:
            return fname  # Hit cache
    print(f'Downloading {fname} from {url}...')
    # Write to a temporary name first so an interrupted download never leaves
    # a truncated file under the final name.
    tmp_fname = fname + '.part'
    with requests.get(url, stream=True, verify=True) as r:
        # Without this check an HTTP error page would be saved as the dataset.
        r.raise_for_status()
        with open(tmp_fname, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
    os.replace(tmp_fname, fname)
    return fname

def download_extract(name, folder=None):  #@save
    """Download and extract a zip/tar file.

    The archive is fetched with ``download`` and unpacked into the directory
    that contains it.

    Args:
        name: Dataset key passed on to ``download``.
        folder: Optional sub-directory name, relative to the archive's
            directory, to return instead of the default.

    Returns:
        ``<archive dir>/<folder>`` when ``folder`` is given, otherwise the
        archive path with its last extension removed (for ``x.tar.gz`` that
        is ``x.tar``, because only one suffix is stripped).

    Raises:
        AssertionError: If the file extension is not .zip, .tar or .gz.
    """
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        open_archive = zipfile.ZipFile
    elif ext in ('.tar', '.gz'):
        open_archive = tarfile.open
    else:
        # Raised explicitly (not `assert False`) so the check survives
        # `python -O`, while callers still see an AssertionError.
        raise AssertionError('Only zip/tar files can be extracted.')
    # NOTE(review): extractall trusts member paths inside the archive; only
    # use this with archives from a trusted source.
    with open_archive(fname, 'r') as fp:  # closes the archive handle
        fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir

def download_all():  #@save
    """Fetch every dataset that has been registered in DATA_HUB.

    Each registered name is passed to ``download``; the returned local
    paths are discarded.
    """
    for dataset_name in DATA_HUB.keys():
        download(dataset_name)

# Register the training CSV: (download URL, expected SHA-1 of the file).
DATA_HUB['kaggle_house_train'] = (  #@save
    DATA_URL + 'kaggle_house_pred_train.csv',
    '585e9cc93e70b39160e7921475f9bcd7d31219ce')

# Register the test CSV the same way.
DATA_HUB['kaggle_house_test'] = (  #@save
    DATA_URL + 'kaggle_house_pred_test.csv',
    'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')
# Download (or reuse the cached copy of) each CSV and load it into a DataFrame.
train_data = pd.read_csv(download('kaggle_house_train'))
test_data = pd.read_csv(download('kaggle_house_test'))
print(train_data.shape)
print(test_data.shape)
# Derived feature: YrSold minus YearRemodAdd, added to both frames
# (presumably the number of years between the last remodel and the sale).
train_data['mod_yr_built'] = train_data.YrSold - train_data.YearRemodAdd
test_data['mod_yr_built'] = test_data.YrSold - test_data.YearRemodAdd

(1460, 81)
(1459, 80)


## Feature Engineering

In [16]:
# Columns used by the model, grouped by how they are preprocessed.
numeric_features = ['GrLivArea', 'LowQualFinSF', '2ndFlrSF', 'LotArea',
                    'mod_yr_built', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd',
                    'GarageArea', 'KitchenAbvGr']
categorical_features = ['GarageQual', 'KitchenQual', 'MoSold', 'MSSubClass',
                        'MSZoning', 'SaleCondition']
output_feature = ['SalePrice']
all_features = numeric_features + categorical_features + output_feature

# Work on a copy so `train_data` keeps the raw values.
model_train_data = train_data.loc[:, all_features].copy()

# Treat every categorical column as a string so get_dummies one-hot encodes
# it (this includes the integer-coded MoSold and MSSubClass). Missing values
# become the literal string 'nan' and therefore get their own dummy column.
# Plain column assignment replaces the columns outright; `.loc[:, cols] = ...`
# would instead try to write the new dtype into the existing columns.
model_train_data[categorical_features] = (
    model_train_data[categorical_features].astype(str))

# Separate scalers so predictions can later be mapped back to prices with
# `scaler_output.inverse_transform`.
scaler_output = StandardScaler()
scaler_numerical = StandardScaler()
model_train_data[output_feature] = scaler_output.fit_transform(
    train_data.loc[:, output_feature])
model_train_data[numeric_features] = scaler_numerical.fit_transform(
    train_data.loc[:, numeric_features])

# dtype=float keeps the frame purely numeric; with the default dtype newer
# pandas emits bool dummies, which turns `.values` into an object array.
model_train_data = pd.get_dummies(model_train_data, dummy_na=False, dtype=float)

if model_train_data.isna().any().any():
    print('Null Value present')

# From here on `all_features` names the model inputs: every encoded column
# except the target.
all_features = list(model_train_data.columns)
all_features.remove('SalePrice')

In [19]:
# Convert to float32 explicitly: if the frame mixes float and bool columns,
# `.values` is an object array, which torch.tensor cannot convert.
train_features = torch.tensor(
    model_train_data.loc[:, all_features].to_numpy(dtype=np.float32))
# Labels as a column vector of shape (n_samples, 1) to match the net output.
train_labels = torch.tensor(
    model_train_data['SalePrice'].to_numpy(dtype=np.float32).reshape(-1, 1))

In [29]:
# Sanity check: undoing the label scaling should give back the original
# SalePrice values that `scaler_output` was fitted on.
scaler_output.inverse_transform(train_labels)

array([[208500.],
       [181500.],
       [223500.],
       ...,
       [266500.],
       [142125.],
       [147500.]], dtype=float32)

In [20]:
# Mean-squared-error criterion shared by `train` and `log_rmse`.
loss = nn.MSELoss()
# Number of model inputs = number of columns in the feature tensor.
in_features = train_features.shape[1]
def get_net(num_inputs=None):
    """Build a fresh linear-regression model with a single output.

    Args:
        num_inputs: Width of the input layer. When omitted, the module-level
            ``in_features`` is used, which is what ``get_net()`` did before
            this parameter existed.

    Returns:
        An ``nn.Sequential`` holding one ``nn.Linear(num_inputs, 1)`` layer.
    """
    if num_inputs is None:
        # Looked up at call time so that re-running the feature cell is
        # picked up by later calls.
        num_inputs = in_features
    return nn.Sequential(nn.Linear(num_inputs, 1))

In [40]:
def log_rmse(net, features, labels):
    """Return the RMSE between log predictions and log labels.

    Both the network output and ``labels`` are passed through
    ``scaler_output.inverse_transform`` before the logarithm is taken.

    Args:
        net: Model that is called on ``features``.
        features: Input tensor for ``net``.
        labels: Targets in the scale produced by ``scaler_output``.

    Returns:
        The root of ``loss(log(preds), log(labels))`` as a Python float.
    """
    scaled_preds = net(features).detach().numpy()
    preds = torch.tensor(scaler_output.inverse_transform(scaled_preds))
    # Values below 1 are raised to 1 so the logarithm stays stable.
    preds = torch.clamp(preds, 1, float('inf'))
    targets = torch.tensor(scaler_output.inverse_transform(labels))

    log_mse = loss(torch.log(preds), torch.log(targets))
    return torch.sqrt(log_mse).item()

In [43]:
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Fit ``net`` with Adam and record the log-RMSE after every epoch.

    Args:
        net: Model to optimise in place.
        train_features, train_labels: Training tensors.
        test_features, test_labels: Held-out tensors; evaluation is skipped
            when ``test_labels`` is None.
        num_epochs: Number of passes over the training data.
        learning_rate: Adam step size.
        weight_decay: Weight-decay coefficient passed to Adam.
        batch_size: Mini-batch size of the training iterator.

    Returns:
        ``(train_ls, test_ls)``: per-epoch ``log_rmse`` values; ``test_ls``
        stays empty when no test labels are given.
    """
    train_ls = []
    test_ls = []
    batches = d2l.load_array((train_features, train_labels), batch_size)
    adam = torch.optim.Adam(net.parameters(), lr=learning_rate,
                            weight_decay=weight_decay)
    evaluate_on_test = test_labels is not None
    for _ in range(num_epochs):
        for batch_features, batch_labels in batches:
            adam.zero_grad()
            batch_loss = loss(net(batch_features), batch_labels)
            batch_loss.backward()
            adam.step()
        # Metrics are computed on the full sets once per epoch.
        train_ls.append(log_rmse(net, train_features, train_labels))
        if evaluate_on_test:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls

def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat([X_train, X_part], 0)
            y_train = torch.cat([y_train, y_part], 0)
    return X_train, y_train, X_valid, y_valid

In [38]:
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    """Run k-fold cross-validation and average the final-epoch metrics.

    For each fold a new network from ``get_net`` is trained with ``train``
    on the split returned by ``get_k_fold_data``. The learning curves of
    the first fold are plotted, and the final-epoch values of every fold
    are printed.

    Returns:
        ``(mean_train, mean_valid)``: last-epoch training and validation
        log-RMSE averaged over the ``k`` folds.
    """
    train_total = 0
    valid_total = 0
    for fold in range(k):
        split = get_k_fold_data(k, fold, X_train, y_train)
        model = get_net()
        train_curve, valid_curve = train(model, *split, num_epochs,
                                         learning_rate, weight_decay,
                                         batch_size)
        last_train = train_curve[-1]
        last_valid = valid_curve[-1]
        train_total += last_train
        valid_total += last_valid
        if fold == 0:
            # Only the first fold's curves are drawn.
            epochs = list(range(1, num_epochs + 1))
            d2l.plot(epochs, [train_curve, valid_curve],
                     xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                     legend=['train', 'valid'], yscale='log')
        print(f'fold {fold + 1}, train log rmse {float(last_train):f}, '
              f'valid log rmse {float(last_valid):f}')
    return train_total / k, valid_total / k

In [None]:
# Cross-validation settings, one per line so each is easy to tune.
k = 5
num_epochs = 1000
lr = 0.01
weight_decay = 1.5
batch_size = 64

# Train one model per fold and report the averaged final-epoch metrics.
train_l, valid_l = k_fold(
    k, train_features, train_labels,
    num_epochs, lr, weight_decay, batch_size,
)
print(f'{k}-fold validation: avg train log rmse: {float(train_l):f}, '
      f'avg valid log rmse: {float(valid_l):f}')

fold 1, train log rmse 0.210221, valid log rmse 0.190780
fold 2, train log rmse 0.214954, valid log rmse 0.215419
fold 3, train log rmse 0.209936, valid log rmse 0.212605
fold 4, train log rmse 0.210509, valid log rmse 0.211416
fold 5, train log rmse 0.201906, valid log rmse 0.225748
5-fold validation: avg train log rmse: 0.209505, avg valid log rmse: 0.211193
